Add codespell
Mistobaan committed Feb 23, 2022
1 parent 0885768 commit 83a1bfb
Showing 26 changed files with 57 additions and 44 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -27,3 +27,12 @@ repos:
hooks:
- id: black
language_version: python3.8
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
args: [
'--ignore-words-list=reord', # Word used in error messages that need rewording
--check-filenames,
--check-hidden,
]
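The new hook can be run on the whole tree with `pre-commit run codespell --all-files`. For contributors who want the same check outside of pre-commit, a minimal sketch (assuming `codespell` is installed on PATH; the target paths below are illustrative, not taken from this commit) could look like:

```python
# Hedged sketch: invoke codespell with the same arguments the hook above uses.
import subprocess
import sys

result = subprocess.run(
    [
        "codespell",
        "--ignore-words-list=reord",
        "--check-filenames",
        "--check-hidden",
        "megatron",
        "tools",
        "README.md",
    ],
    check=False,
)
sys.exit(result.returncode)
```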
2 changes: 1 addition & 1 deletion README.md
@@ -277,7 +277,7 @@ The general usage pattern is:
python ./deepy.py train.py [path/to/config1.yml] [path/to/config2.yml] ...
```

You can pass in an arbritrary number of configs which will all be merged at runtime.
You can pass in an arbitrary number of configs which will all be merged at runtime.

You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and append that prefix to their path.
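As a rough illustration of the merge-at-runtime behaviour described here — a sketch only, since the actual precedence rules live in `deepy.py`/`NeoXArgs` and may resolve conflicts differently (`yaml` is PyYAML):

```python
# Hedged sketch: merge several YAML config files into one dict, with later
# files shallowly overriding earlier ones key-by-key.
import yaml

def merge_configs(paths):
    merged = {}
    for path in paths:
        with open(path) as f:
            merged.update(yaml.safe_load(f) or {})
    return merged

# Example: merge_configs(["configs/125M.yml", "configs/local_setup.yml"])
```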

2 changes: 1 addition & 1 deletion configs/neox_arguments.md
@@ -27,7 +27,7 @@ LR Scheduler Arguments

Default = 0.0

Minumum value for learning rate. The scheduler clips values below this threshold.
Minimum value for learning rate. The scheduler clips values below this threshold.
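A one-line sketch of the clipping this setting describes (illustrative, not the scheduler's actual code):

```python
def apply_min_lr(lr: float, min_lr: float = 0.0) -> float:
    # Clip learning-rate values that fall below the configured floor.
    return max(lr, min_lr)
```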



2 changes: 1 addition & 1 deletion megatron/checkpointing.py
@@ -37,7 +37,7 @@

def check_checkpoint_args(neox_args, checkpoint_args):
"""Ensure fixed arguments for a model are the same for the input
arguments and the one retreived frm checkpoint."""
arguments and the one retrieved frm checkpoint."""

assert isinstance(checkpoint_args, dict), "args stored in checkpoint is a dict"
for checkpoint_arg_name, checkpoint_arg_value in checkpoint_args.items():
2 changes: 1 addition & 1 deletion megatron/data/data_utils.py
@@ -13,7 +13,7 @@


def make_data_loader(dataset, neox_args):
"""Buld dataloader given an input dataset."""
"""Build dataloader given an input dataset."""
if dataset is None:
return None
# Data parallel arguments.
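For orientation, a simplified data-parallel loader along the lines of what `make_data_loader` sets up might look like the sketch below; the parameter names (`micro_batch_size`, `num_workers`) are assumptions, and the real implementation uses Megatron's own batch sampler and `neox_args` fields.

```python
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_simple_loader(dataset, micro_batch_size, num_workers=2):
    """Hedged sketch only; requires torch.distributed to be initialised."""
    if dataset is None:
        return None
    sampler = DistributedSampler(dataset)  # shard samples across data-parallel ranks
    return DataLoader(
        dataset,
        batch_size=micro_batch_size,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True,
    )
```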
2 changes: 1 addition & 1 deletion megatron/data/gpt2_dataset.py
@@ -172,7 +172,7 @@ def _build_index_mappings(
)
# shuffle-idx.
start_time = time.time()
# -1 is due to data structure used to retieve the index:
# -1 is due to data structure used to retrieve the index:
# sample i --> [sample_idx[i], sample_idx[i+1])
shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
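To see why one is subtracted above, here is a toy 1-D stand-in for `sample_idx` (the real array stores `[doc index, doc offset]` pairs, but the interval logic is the same):

```python
import numpy as np

# N + 1 boundary entries describe only N samples, because sample i is read
# from the half-open interval [sample_idx[i], sample_idx[i + 1]).
sample_idx = np.array([0, 3, 7, 12])                 # 4 boundaries -> 3 samples
num_samples = sample_idx.shape[0] - 1                # the "-1" in question
shuffle_idx = np.random.default_rng(0).permutation(num_samples)
for i in shuffle_idx:
    print(f"sample {i} spans positions [{sample_idx[i]}, {sample_idx[i + 1]})")
```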
10 changes: 5 additions & 5 deletions megatron/data/helpers.cpp
@@ -39,7 +39,7 @@ void build_blending_indices(py::array_t<uint8_t>& dataset_index,
const bool verbose)
{
/* Given multiple datasets and a weighting array, build samples
such that it follows those wieghts.*/
such that it follows those weights.*/

if (verbose) { std::cout << "> building indices for blendable datasets ..." << std::endl; }

@@ -124,7 +124,7 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
int64_t sample_index = 0;
// Index into doc_idx.
int64_t doc_idx_index = 0;
// Begining offset for each document.
// Beginning offset for each document.
int32_t doc_offset = 0;
// Start with first document and no offset.
sample_idx[2 * sample_index] = doc_idx_index;
@@ -148,7 +148,7 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
doc_offset += (remaining_seq_length + doc_length - 1);
remaining_seq_length = 0;
} else {
// Otherwise, start from the begining of the next document.
// Otherwise, start from the beginning of the next document.
++doc_idx_index;
doc_offset = 0;
}
@@ -268,7 +268,7 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
const auto sent_index_first = docs[doc];
const auto sent_index_last = docs[doc + 1];

// At the begining of the document previous index is the
// At the beginning of the document previous index is the
// start index.
auto prev_start_index = sent_index_first;

@@ -508,7 +508,7 @@ py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
const auto sent_index_last = docs[doc + 1];
const auto target_seq_len = max_seq_length - titles_sizes[doc];

// At the begining of the document previous index is the
// At the beginning of the document previous index is the
// start index.
auto prev_start_index = sent_index_first;

9 changes: 6 additions & 3 deletions megatron/fused_kernels/__init__.py
@@ -22,20 +22,23 @@
srcpath = Path(__file__).parent.absolute()

# Setting this param to a list has a problem of generating different
# compilation commands (with diferent order of architectures) and
# compilation commands (with different order of architectures) and
# leading to recompilation of fused kernels. Set it to empty string
# to avoid recompilation and assign arch flags explicity in
# to avoid recompilation and assign arch flags explicitly in
# extra_cuda_cflags below
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


def load_fused_kernels():
try:
import scaled_upper_triang_masked_softmax_cuda
import scaled_masked_softmax_cuda
except (ImportError, ModuleNotFoundError):
print("\n")
print("=" * 100)
print(f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them')
print(
f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them'
)
print("=" * 100)
exit()
return
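The comment about `TORCH_CUDA_ARCH_LIST` above reflects a common workaround for PyTorch's JIT extension cache: keep the environment variable empty and pin architectures explicitly. A hedged sketch of that pattern is shown below; the source file names and arch flags are placeholders, not the repo's actual build script.

```python
import os
from torch.utils.cpp_extension import load

# Keep TORCH_CUDA_ARCH_LIST empty so the cached build key is stable, and pin
# the target architectures explicitly instead.
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
scaled_masked_softmax_cuda = load(
    name="scaled_masked_softmax_cuda",
    sources=["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"],
    extra_cuda_cflags=["-gencode", "arch=compute_80,code=sm_80"],
    verbose=True,
)
```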
2 changes: 1 addition & 1 deletion megatron/fused_kernels/compat.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/

/*This code is copied fron NVIDIA apex:
/*This code is copied from NVIDIA apex:
* https://github.com/NVIDIA/apex
* with minor changes. */

4 changes: 3 additions & 1 deletion megatron/initialize.py
@@ -193,6 +193,7 @@ def _initialize_distributed(neox_args):
# Init DeepSpeed Activation Checkpointing Features
setup_deepspeed_random_and_activation_checkpointing(neox_args=neox_args)


def _init_autoresume(neox_args):
"""Set autoresume start time."""

@@ -211,8 +212,9 @@ def _init_autoresume(neox_args):
neox_args.adlr_autoresume_object.init()
torch.distributed.barrier()


def _set_random_seed(seed):
"""Set random seed for reproducability."""
"""Set random seed for reproducibility."""
if seed is not None and seed > 0:
random.seed(seed)
np.random.seed(seed)
3 changes: 1 addition & 2 deletions megatron/model/fused_softmax.py
@@ -18,7 +18,6 @@
from ..fused_kernels import load_fused_kernels



class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
Fused operation which performs following three operations in sequence
@@ -122,7 +121,7 @@ def __init__(

if fusion_type != SoftmaxFusionTypes.none:
load_fused_kernels() # check fused kernels are installed

self.upper_triang_mask_fusion = fusion_type == SoftmaxFusionTypes.upper_triang
self.general_mask_fusion = fusion_type == SoftmaxFusionTypes.general
self.fusion = fusion_type != SoftmaxFusionTypes.none
2 changes: 1 addition & 1 deletion megatron/model/gpt2_model.py
@@ -303,7 +303,7 @@ def _set_parallel_output(self, value):

def inference_mode(self, use_cache=True):
"""
Sets up the model for inference by turning on k/v caching (if specificied) and setting `parallel output` of the final layer to false,
Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false,
so logits are gathered across model parallel ranks.
:param cache: (bool) True if you want to use caching during inference, False otherwise
2 changes: 1 addition & 1 deletion megatron/model/init_functions.py
@@ -123,6 +123,6 @@ def _get(name):
elif name == "small_init":
return small_init_init_method(args.hidden_size)
else:
raise NotImplementedError(f"Unkown init method {name}")
raise NotImplementedError(f"Unknown init method {name}")

return _get(args.init_method), _get(args.output_layer_init_method)
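For reference, `small_init` usually refers to the Nguyen & Salazar (2019) scheme with a standard deviation of sqrt(2 / (5 * hidden_size)); the sketch below shows that convention and is not guaranteed to match `init_functions.py` exactly.

```python
import math
import torch

def small_init_method_sketch(hidden_size):
    std = math.sqrt(2 / (5 * hidden_size))

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_
```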
2 changes: 1 addition & 1 deletion megatron/model/utils.py
@@ -107,7 +107,7 @@ def _is_checkpointable(self, funcs):

def inference_mode(self, use_cache=True):
"""
Sets up the model for inference by turning on k/v caching (if specificied) and setting `parallel output` of the final layer to false,
Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false,
so logits are gathered across model parallel ranks.
:param cache: (bool) True if you want to use caching during inference, False otherwise
6 changes: 3 additions & 3 deletions megatron/mpu/cross_entropy.py
@@ -35,7 +35,7 @@ def forward(ctx, vocab_parallel_logits, target):
# Subtract the maximum value.
vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))

# Get the partition's vocab indecies
# Get the partition's vocab indices
get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
partition_vocab_size = vocab_parallel_logits.size()[-1]
rank = get_model_parallel_rank()
Expand Down Expand Up @@ -90,10 +90,10 @@ def forward(ctx, vocab_parallel_logits, target):
@staticmethod
def backward(ctx, grad_output):

# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target_1d = ctx.saved_tensors

# All the inputs have softmax as thier gradient.
# All the inputs have softmax as their gradient.
grad_input = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
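The "partition's vocab indices" above come from slicing the vocabulary evenly across model-parallel ranks; a hedged sketch of that utility (the real `VocabUtility` may handle edge cases differently):

```python
def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank):
    # Rank r owns the half-open vocab slice [r * size, (r + 1) * size).
    index_first = rank * per_partition_vocab_size
    index_last = index_first + per_partition_vocab_size
    return index_first, index_last

# e.g. with a 50304-token vocab split over 4 ranks, rank 1 owns [12576, 25152)
```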
4 changes: 2 additions & 2 deletions megatron/mpu/data.py
@@ -81,7 +81,7 @@ def broadcast_data(keys, data, datatype):
members of the same model parallel group.
Arguments:
keys: list of keys in the data disctionary to be broadcasted
keys: list of keys in the data dictionary to be broadcasted
data: data dictionary of string keys and cpu tensor values.
datatype: torch data type of all tensors in data associated
with keys.
@@ -103,7 +103,7 @@ def broadcast_data(keys, data, datatype):
total_numel, device=torch.cuda.current_device(), dtype=datatype
)

# Boradcast
# Broadcast
torch.distributed.broadcast(
flatten_data, get_model_parallel_src_rank(), group=get_model_parallel_group()
)
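A condensed sketch of the flatten → broadcast → unpack pattern used here. It assumes `torch.distributed` is already initialised; in the real `broadcast_data` only the source rank packs real values, while the other ranks allocate an empty buffer of the same total size.

```python
import torch
import torch.distributed as dist

def broadcast_flat(keys, data, total_numel, datatype, src_rank, group, is_src):
    if is_src:
        # Source rank flattens and concatenates the tensors for the given keys.
        flat = torch.cat(
            [data[k].contiguous().view(-1).to(datatype) for k in keys]
        ).cuda()
    else:
        # Receivers just allocate a buffer of matching total size.
        flat = torch.empty(
            total_numel, device=torch.cuda.current_device(), dtype=datatype
        )
    dist.broadcast(flat, src_rank, group=group)
    return flat
```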
4 changes: 2 additions & 2 deletions megatron/neox_arguments/__init__.py
@@ -26,8 +26,8 @@
* NeoX args (in ./arguments) inherits from the following subclasses: NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen
* The Subclasses group args according to their purpose
* The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be ommitted
* The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be ommitted
* The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be omitted
* The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be omitted
* calculated attributes (decorator '@property') are available as attribute, but would not be included in dataclass fields (e.g. NeoXArgs().__dataclass_fields__.items())
* refer to docstrings in code for more information
"""
2 changes: 1 addition & 1 deletion megatron/neox_arguments/arguments.py
@@ -854,7 +854,7 @@ def validate_values(self):
if self.hidden_size % self.num_attention_heads != 0:
error_message = (
self.__class__.__name__
+ ".validate_values() hidden_size must be divisable by num_attention_heads"
+ ".validate_values() hidden_size must be divisible by num_attention_heads"
)
logging.error(error_message)
raise ValueError(error_message)
4 changes: 2 additions & 2 deletions megatron/neox_arguments/neox_args.py
@@ -398,7 +398,7 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate):

min_lr: float = 0.0
"""
Minumum value for learning rate. The scheduler clips values below this threshold.
Minimum value for learning rate. The scheduler clips values below this threshold.
"""

warmup: float = 0.01
@@ -624,7 +624,7 @@ class NeoXArgsTokenizer(NeoXArgsTemplate):

tokenizer = None
"""
tokenizer object loaded into memory and accesible by other functions
tokenizer object loaded into memory and accessible by other functions
"""


4 changes: 2 additions & 2 deletions megatron/text_generation_utils.py
@@ -42,7 +42,7 @@ def get_batch(neox_args, context_tokens: torch.Tensor):

# Move to GPU.
tokens = context_tokens.contiguous().cuda()
# Get the attention mask and postition ids.
# Get the attention mask and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
data=tokens,
eod_token=neox_args.tokenizer.eod,
@@ -635,7 +635,7 @@ def generate_samples_unconditional(
number_of_samples (default 10): number of unconditional samples to be generated
output_file: file where generation results are to be stored in jsonl format. no file will be stored if ommitted
output_file: file where generation results are to be stored in jsonl format. no file will be stored if omitted
eos_token_id: end of text token at which completion is terminated, even if max_tokes count has not been reached
maximum_tokens: maximum number of tokens to be generated
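The `get_batch` hunk above mentions building the attention mask and position ids; a stripped-down left-to-right version looks like the sketch below. The repo's `get_ltor_masks_and_position_ids` additionally handles EOD resets and loss masking, so treat this as illustrative only.

```python
import torch

def simple_ltor_masks_and_position_ids(tokens):
    batch_size, seq_length = tokens.shape
    # Lower-triangular (causal) mask shared across the batch.
    attention_mask = torch.tril(
        torch.ones(1, seq_length, seq_length, dtype=torch.bool)
    )
    position_ids = torch.arange(seq_length).unsqueeze(0).expand(batch_size, -1)
    return attention_mask, position_ids
```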
4 changes: 2 additions & 2 deletions megatron/tokenizer/gpt2_tokenization.py
@@ -53,7 +53,7 @@ def bytes_to_unicode():
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a signficant percentage of your normal, say, 32K bpe vocab.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
@@ -154,7 +154,7 @@ def from_pretrained(
pretrained_model_name_or_path
in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
):
# if we're using a pretrained model, ensure the tokenizer wont index sequences longer
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
pretrained_model_name_or_path
4 changes: 2 additions & 2 deletions megatron/training.py
@@ -61,7 +61,7 @@
def pretrain(neox_args):
"""Main training program.
This function will run the followings in the order provided:
This function will run the following in the order provided:
1) initialize Megatron.
2) setup model, optimizer and lr schedule
3) call train_val_test_data_provider to get train/val/test datasets.
@@ -77,7 +77,7 @@ def pretrain(neox_args):
use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer
)

# Initalize and get arguments, timers, and Tensorboard writer.
# Initialize and get arguments, timers, and Tensorboard writer.
initialize_megatron(neox_args=neox_args)

# Model, optimizer, and learning rate.
2 changes: 1 addition & 1 deletion megatron/utils.py
@@ -273,7 +273,7 @@ def write(self, names, iteration, normalizer=1.0, reset=False):
"""Write timers to a tensorboard writer"""
# currently when using add_scalars,
# torch.utils.add_scalars makes each timer its own run, which
# polutes the runs list, so we just add each as a scalar
# pollutes the runs list, so we just add each as a scalar
assert normalizer > 0.0
for name in names:
value = self.timers[name].elapsed(reset=reset) / normalizer
2 changes: 1 addition & 1 deletion tests/neox_args/test_neoxargs_usage.py
@@ -16,7 +16,7 @@ def test_neoxargs_usage():
declared_all = True
neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys())

# we exlude a number of properties (implemented with the @property decorator) or functions that we know exists
# we exclude a number of properties (implemented with the @property decorator) or functions that we know exists
exclude = set(
[
"params_dtype",
4 changes: 2 additions & 2 deletions tools/inspect_checkpoints.py
@@ -40,7 +40,7 @@ def sizeof_fmt(num, suffix="B"):


def pretty_print(contents: dict):
"""Prints a nice summary of the top-level contens in a checkpoint dictionary."""
"""Prints a nice summary of the top-level contents in a checkpoint dictionary."""
col_size = max(len(str(k)) for k in contents)
for k, v in sorted(contents.items()):
key_length = len(str(k))
@@ -78,7 +78,7 @@ def common_entries(*dcts):


def pretty_print_double(contents1: dict, contents2: dict, args):
"""Prints a nice summary of the top-level contens in a checkpoint dictionary."""
"""Prints a nice summary of the top-level contents in a checkpoint dictionary."""
col_size = max(
max(len(str(k)) for k in contents1), max(len(str(k)) for k in contents2)
)