Add codespell
Mistobaan committed Feb 23, 2022
1 parent 0885768 commit 83a1bfb
Showing 26 changed files with 57 additions and 44 deletions.
9 changes: 9 additions & 0 deletions .pre-commit-config.yaml
@@ -27,3 +27,12 @@ repos:
hooks:
- id: black
language_version: python3.8
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
args: [
'--ignore-words-list=reord', # Word used in error messages that need rewording
--check-filenames,
--check-hidden,
]
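The new hook can be run on the whole tree with `pre-commit run codespell --all-files`. For contributors who want the same check outside of pre-commit, a minimal sketch (assuming `codespell` is installed on PATH; the target paths below are illustrative, not taken from this commit) could look like:

```python
# Hedged sketch: invoke codespell with the same arguments the hook above uses.
import subprocess
import sys

result = subprocess.run(
    [
        "codespell",
        "--ignore-words-list=reord",
        "--check-filenames",
        "--check-hidden",
        "megatron",
        "tools",
        "README.md",
    ],
    check=False,
)
sys.exit(result.returncode)
```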
2 changes: 1 addition & 1 deletion README.md
@@ -277,7 +277,7 @@ The general usage pattern is:
python ./deepy.py train.py [path/to/config1.yml] [path/to/config2.yml] ...
```

You can pass in an arbritrary number of configs which will all be merged at runtime.
You can pass in an arbitrary number of configs which will all be merged at runtime.

You can also optionally pass in a config prefix, which will assume all your configs are in the same folder and append that prefix to their path.
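As a rough illustration of the merge-at-runtime behaviour described here — a sketch only, since the actual precedence rules live in `deepy.py`/`NeoXArgs` and may resolve conflicts differently (`yaml` is PyYAML):

```python
# Hedged sketch: merge several YAML config files into one dict, with later
# files shallowly overriding earlier ones key-by-key.
import yaml

def merge_configs(paths):
    merged = {}
    for path in paths:
        with open(path) as f:
            merged.update(yaml.safe_load(f) or {})
    return merged

# Example: merge_configs(["configs/125M.yml", "configs/local_setup.yml"])
```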

2 changes: 1 addition & 1 deletion configs/neox_arguments.md
@@ -27,7 +27,7 @@ LR Scheduler Arguments

Default = 0.0

Minumum value for learning rate. The scheduler clips values below this threshold.
Minimum value for learning rate. The scheduler clips values below this threshold.
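A one-line sketch of the clipping this setting describes (illustrative, not the scheduler's actual code):

```python
def apply_min_lr(lr: float, min_lr: float = 0.0) -> float:
    # Clip learning-rate values that fall below the configured floor.
    return max(lr, min_lr)
```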



2 changes: 1 addition & 1 deletion megatron/checkpointing.py
@@ -37,7 +37,7 @@

def check_checkpoint_args(neox_args, checkpoint_args):
"""Ensure fixed arguments for a model are the same for the input
arguments and the one retreived frm checkpoint."""
arguments and the one retrieved frm checkpoint."""

assert isinstance(checkpoint_args, dict), "args stored in checkpoint is a dict"
for checkpoint_arg_name, checkpoint_arg_value in checkpoint_args.items():
2 changes: 1 addition & 1 deletion megatron/data/data_utils.py
@@ -13,7 +13,7 @@


def make_data_loader(dataset, neox_args):
"""Buld dataloader given an input dataset."""
"""Build dataloader given an input dataset."""
if dataset is None:
return None
# Data parallel arguments.
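For orientation, a simplified data-parallel loader along the lines of what `make_data_loader` sets up might look like the sketch below; the parameter names (`micro_batch_size`, `num_workers`) are assumptions, and the real implementation uses Megatron's own batch sampler and `neox_args` fields.

```python
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_simple_loader(dataset, micro_batch_size, num_workers=2):
    """Hedged sketch only; requires torch.distributed to be initialised."""
    if dataset is None:
        return None
    sampler = DistributedSampler(dataset)  # shard samples across data-parallel ranks
    return DataLoader(
        dataset,
        batch_size=micro_batch_size,
        sampler=sampler,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True,
    )
```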
2 changes: 1 addition & 1 deletion megatron/data/gpt2_dataset.py
@@ -172,7 +172,7 @@ def _build_index_mappings(
)
# shuffle-idx.
start_time = time.time()
# -1 is due to data structure used to retieve the index:
# -1 is due to data structure used to retrieve the index:
# sample i --> [sample_idx[i], sample_idx[i+1])
shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
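To see why one is subtracted above, here is a toy 1-D stand-in for `sample_idx` (the real array stores `[doc index, doc offset]` pairs, but the interval logic is the same):

```python
import numpy as np

# N + 1 boundary entries describe only N samples, because sample i is read
# from the half-open interval [sample_idx[i], sample_idx[i + 1]).
sample_idx = np.array([0, 3, 7, 12])                 # 4 boundaries -> 3 samples
num_samples = sample_idx.shape[0] - 1                # the "-1" in question
shuffle_idx = np.random.default_rng(0).permutation(num_samples)
for i in shuffle_idx:
    print(f"sample {i} spans positions [{sample_idx[i]}, {sample_idx[i + 1]})")
```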
10 changes: 5 additions & 5 deletions megatron/data/helpers.cpp
@@ -39,7 +39,7 @@ void build_blending_indices(py::array_t<uint8_t>& dataset_index,
const bool verbose)
{
/* Given multiple datasets and a weighting array, build samples
such that it follows those wieghts.*/
such that it follows those weights.*/

if (verbose) { std::cout << "> building indices for blendable datasets ..." << std::endl; }

@@ -124,7 +124,7 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
int64_t sample_index = 0;
// Index into doc_idx.
int64_t doc_idx_index = 0;
// Begining offset for each document.
// Beginning offset for each document.
int32_t doc_offset = 0;
// Start with first document and no offset.
sample_idx[2 * sample_index] = doc_idx_index;
@@ -148,7 +148,7 @@ py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
doc_offset += (remaining_seq_length + doc_length - 1);
remaining_seq_length = 0;
} else {
// Otherwise, start from the begining of the next document.
// Otherwise, start from the beginning of the next document.
++doc_idx_index;
doc_offset = 0;
}
@@ -268,7 +268,7 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
const auto sent_index_first = docs[doc];
const auto sent_index_last = docs[doc + 1];

// At the begining of the document previous index is the
// At the beginning of the document previous index is the
// start index.
auto prev_start_index = sent_index_first;

@@ -508,7 +508,7 @@ py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
const auto sent_index_last = docs[doc + 1];
const auto target_seq_len = max_seq_length - titles_sizes[doc];

// At the begining of the document previous index is the
// At the beginning of the document previous index is the
// start index.
auto prev_start_index = sent_index_first;

9 changes: 6 additions & 3 deletions megatron/fused_kernels/__init__.py
@@ -22,20 +22,23 @@
srcpath = Path(__file__).parent.absolute()

# Setting this param to a list has a problem of generating different
# compilation commands (with diferent order of architectures) and
# compilation commands (with different order of architectures) and
# leading to recompilation of fused kernels. Set it to empty string
# to avoid recompilation and assign arch flags explicity in
# to avoid recompilation and assign arch flags explicitly in
# extra_cuda_cflags below
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


def load_fused_kernels():
try:
import scaled_upper_triang_masked_softmax_cuda
import scaled_masked_softmax_cuda
except (ImportError, ModuleNotFoundError):
print("\n")
print("=" * 100)
print(f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them')
print(
f'ERROR: Fused kernels configured but not installed. Please run `python {str(srcpath / "setup.py")} install` to install them'
)
print("=" * 100)
exit()
return
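The comment about `TORCH_CUDA_ARCH_LIST` above reflects a common workaround for PyTorch's JIT extension cache: keep the environment variable empty and pin architectures explicitly. A hedged sketch of that pattern is shown below; the source file names and arch flags are placeholders, not the repo's actual build script.

```python
import os
from torch.utils.cpp_extension import load

# Keep TORCH_CUDA_ARCH_LIST empty so the cached build key is stable, and pin
# the target architectures explicitly instead.
os.environ["TORCH_CUDA_ARCH_LIST"] = ""
scaled_masked_softmax_cuda = load(
    name="scaled_masked_softmax_cuda",
    sources=["scaled_masked_softmax.cpp", "scaled_masked_softmax_cuda.cu"],
    extra_cuda_cflags=["-gencode", "arch=compute_80,code=sm_80"],
    verbose=True,
)
```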
2 changes: 1 addition & 1 deletion megatron/fused_kernels/compat.h
@@ -14,7 +14,7 @@
* limitations under the License.
*/

/*This code is copied fron NVIDIA apex:
/*This code is copied from NVIDIA apex:
* https://github.com/NVIDIA/apex
* with minor changes. */

4 changes: 3 additions & 1 deletion megatron/initialize.py
@@ -193,6 +193,7 @@ def _initialize_distributed(neox_args):
# Init DeepSpeed Activation Checkpointing Features
setup_deepspeed_random_and_activation_checkpointing(neox_args=neox_args)


def _init_autoresume(neox_args):
"""Set autoresume start time."""

@@ -211,8 +212,9 @@ def _init_autoresume(neox_args):
neox_args.adlr_autoresume_object.init()
torch.distributed.barrier()


def _set_random_seed(seed):
"""Set random seed for reproducability."""
"""Set random seed for reproducibility."""
if seed is not None and seed > 0:
random.seed(seed)
np.random.seed(seed)
3 changes: 1 addition & 2 deletions megatron/model/fused_softmax.py
@@ -18,7 +18,6 @@
from ..fused_kernels import load_fused_kernels



class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
"""
Fused operation which performs following three operations in sequence
@@ -122,7 +121,7 @@ def __init__(

if fusion_type != SoftmaxFusionTypes.none:
load_fused_kernels() # check fused kernels are installed

self.upper_triang_mask_fusion = fusion_type == SoftmaxFusionTypes.upper_triang
self.general_mask_fusion = fusion_type == SoftmaxFusionTypes.general
self.fusion = fusion_type != SoftmaxFusionTypes.none
2 changes: 1 addition & 1 deletion megatron/model/gpt2_model.py
@@ -303,7 +303,7 @@ def _set_parallel_output(self, value):

def inference_mode(self, use_cache=True):
"""
Sets up the model for inference by turning on k/v caching (if specificied) and setting `parallel output` of the final layer to false,
Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false,
so logits are gathered across model parallel ranks.
:param cache: (bool) True if you want to use caching during inference, False otherwise
2 changes: 1 addition & 1 deletion megatron/model/init_functions.py
@@ -123,6 +123,6 @@ def _get(name):
elif name == "small_init":
return small_init_init_method(args.hidden_size)
else:
raise NotImplementedError(f"Unkown init method {name}")
raise NotImplementedError(f"Unknown init method {name}")

return _get(args.init_method), _get(args.output_layer_init_method)
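For reference, `small_init` usually refers to the Nguyen & Salazar (2019) scheme with a standard deviation of sqrt(2 / (5 * hidden_size)); the sketch below shows that convention and is not guaranteed to match `init_functions.py` exactly.

```python
import math
import torch

def small_init_method_sketch(hidden_size):
    std = math.sqrt(2 / (5 * hidden_size))

    def init_(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=std)

    return init_
```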
2 changes: 1 addition & 1 deletion megatron/model/utils.py
@@ -107,7 +107,7 @@ def _is_checkpointable(self, funcs):

def inference_mode(self, use_cache=True):
"""
Sets up the model for inference by turning on k/v caching (if specificied) and setting `parallel output` of the final layer to false,
Sets up the model for inference by turning on k/v caching (if specified) and setting `parallel output` of the final layer to false,
so logits are gathered across model parallel ranks.
:param cache: (bool) True if you want to use caching during inference, False otherwise
6 changes: 3 additions & 3 deletions megatron/mpu/cross_entropy.py
@@ -35,7 +35,7 @@ def forward(ctx, vocab_parallel_logits, target):
# Subtract the maximum value.
vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))

# Get the partition's vocab indecies
# Get the partition's vocab indices
get_vocab_range = VocabUtility.vocab_range_from_per_partition_vocab_size
partition_vocab_size = vocab_parallel_logits.size()[-1]
rank = get_model_parallel_rank()
Expand Down Expand Up @@ -90,10 +90,10 @@ def forward(ctx, vocab_parallel_logits, target):
@staticmethod
def backward(ctx, grad_output):

# Retreive tensors from the forward path.
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target_1d = ctx.saved_tensors

# All the inputs have softmax as thier gradient.
# All the inputs have softmax as their gradient.
grad_input = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
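The "partition's vocab indices" above come from slicing the vocabulary evenly across model-parallel ranks; a hedged sketch of that utility (the real `VocabUtility` may handle edge cases differently):

```python
def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank):
    # Rank r owns the half-open vocab slice [r * size, (r + 1) * size).
    index_first = rank * per_partition_vocab_size
    index_last = index_first + per_partition_vocab_size
    return index_first, index_last

# e.g. with a 50304-token vocab split over 4 ranks, rank 1 owns [12576, 25152)
```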
4 changes: 2 additions & 2 deletions megatron/mpu/data.py
@@ -81,7 +81,7 @@ def broadcast_data(keys, data, datatype):
members of the same model parallel group.
Arguments:
keys: list of keys in the data disctionary to be broadcasted
keys: list of keys in the data dictionary to be broadcasted
data: data dictionary of string keys and cpu tensor values.
datatype: torch data type of all tensors in data associated
with keys.
@@ -103,7 +103,7 @@ def broadcast_data(keys, data, datatype):
total_numel, device=torch.cuda.current_device(), dtype=datatype
)

# Boradcast
# Broadcast
torch.distributed.broadcast(
flatten_data, get_model_parallel_src_rank(), group=get_model_parallel_group()
)
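A condensed sketch of the flatten → broadcast → unpack pattern used here. It assumes `torch.distributed` is already initialised; in the real `broadcast_data` only the source rank packs real values, while the other ranks allocate an empty buffer of the same total size.

```python
import torch
import torch.distributed as dist

def broadcast_flat(keys, data, total_numel, datatype, src_rank, group, is_src):
    if is_src:
        # Source rank flattens and concatenates the tensors for the given keys.
        flat = torch.cat(
            [data[k].contiguous().view(-1).to(datatype) for k in keys]
        ).cuda()
    else:
        # Receivers just allocate a buffer of matching total size.
        flat = torch.empty(
            total_numel, device=torch.cuda.current_device(), dtype=datatype
        )
    dist.broadcast(flat, src_rank, group=group)
    return flat
```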
4 changes: 2 additions & 2 deletions megatron/neox_arguments/__init__.py
@@ -26,8 +26,8 @@
* NeoX args (in ./arguments) inherits from the following subclasses: NeoXArgsDeepspeedRunner, NeoXArgsDeepspeedConfig, NeoXArgsModel, NeoXArgsTokenizer, NeoXArgsTraining, NeoXArgsParallelism, NeoXArgsLogging, NeoXArgsOther, NeoXArgsTextgen
* The Subclasses group args according to their purpose
* The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be ommitted
* The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be ommitted
* The attributes of NeoXArgsDeepspeedRunner are directly mapped to the expected command line args of deepspeed.launcher.runner.main; no attributes unknown to deepspeed should be included; no arguments relevant for deepspeed should be omitted
* The attributes of NeoXArgsDeepspeedConfig are directly mapped to the expected keys of the deepspeed config; no arguments relevant for deepspeed should be omitted
* calculated attributes (decorator '@property') are available as attribute, but would not be included in dataclass fields (e.g. NeoXArgs().__dataclass_fields__.items())
* refer to docstrings in code for more information
"""
2 changes: 1 addition & 1 deletion megatron/neox_arguments/arguments.py
@@ -854,7 +854,7 @@ def validate_values(self):
if self.hidden_size % self.num_attention_heads != 0:
error_message = (
self.__class__.__name__
+ ".validate_values() hidden_size must be divisable by num_attention_heads"
+ ".validate_values() hidden_size must be divisible by num_attention_heads"
)
logging.error(error_message)
raise ValueError(error_message)
4 changes: 2 additions & 2 deletions megatron/neox_arguments/neox_args.py
@@ -398,7 +398,7 @@ class NeoXArgsLRScheduler(NeoXArgsTemplate):

min_lr: float = 0.0
"""
Minumum value for learning rate. The scheduler clips values below this threshold.
Minimum value for learning rate. The scheduler clips values below this threshold.
"""

warmup: float = 0.01
@@ -624,7 +624,7 @@ class NeoXArgsTokenizer(NeoXArgsTemplate):

tokenizer = None
"""
tokenizer object loaded into memory and accesible by other functions
tokenizer object loaded into memory and accessible by other functions
"""


4 changes: 2 additions & 2 deletions megatron/text_generation_utils.py
@@ -42,7 +42,7 @@ def get_batch(neox_args, context_tokens: torch.Tensor):

# Move to GPU.
tokens = context_tokens.contiguous().cuda()
# Get the attention mask and postition ids.
# Get the attention mask and position ids.
attention_mask, _, position_ids = get_ltor_masks_and_position_ids(
data=tokens,
eod_token=neox_args.tokenizer.eod,
@@ -635,7 +635,7 @@ def generate_samples_unconditional(
number_of_samples (default 10): number of unconditional samples to be generated
output_file: file where generation results are to be stored in jsonl format. no file will be stored if ommitted
output_file: file where generation results are to be stored in jsonl format. no file will be stored if omitted
eos_token_id: end of text token at which completion is terminated, even if max_tokes count has not been reached
maximum_tokens: maximum number of tokens to be generated
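The `get_batch` hunk above mentions building the attention mask and position ids; a stripped-down left-to-right version looks like the sketch below. The repo's `get_ltor_masks_and_position_ids` additionally handles EOD resets and loss masking, so treat this as illustrative only.

```python
import torch

def simple_ltor_masks_and_position_ids(tokens):
    batch_size, seq_length = tokens.shape
    # Lower-triangular (causal) mask shared across the batch.
    attention_mask = torch.tril(
        torch.ones(1, seq_length, seq_length, dtype=torch.bool)
    )
    position_ids = torch.arange(seq_length).unsqueeze(0).expand(batch_size, -1)
    return attention_mask, position_ids
```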
4 changes: 2 additions & 2 deletions megatron/tokenizer/gpt2_tokenization.py
@@ -53,7 +53,7 @@ def bytes_to_unicode():
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a signficant percentage of your normal, say, 32K bpe vocab.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
@@ -154,7 +154,7 @@ def from_pretrained(
pretrained_model_name_or_path
in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
):
# if we're using a pretrained model, ensure the tokenizer wont index sequences longer
# if we're using a pretrained model, ensure the tokenizer won't index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
pretrained_model_name_or_path
4 changes: 2 additions & 2 deletions megatron/training.py
@@ -61,7 +61,7 @@
def pretrain(neox_args):
"""Main training program.
This function will run the followings in the order provided:
This function will run the following in the order provided:
1) initialize Megatron.
2) setup model, optimizer and lr schedule
3) call train_val_test_data_provider to get train/val/test datasets.
@@ -77,7 +77,7 @@ def pretrain(neox_args):
use_wandb=neox_args.use_wandb, tensorboard_writer=neox_args.tensorboard_writer
)

# Initalize and get arguments, timers, and Tensorboard writer.
# Initialize and get arguments, timers, and Tensorboard writer.
initialize_megatron(neox_args=neox_args)

# Model, optimizer, and learning rate.
2 changes: 1 addition & 1 deletion megatron/utils.py
@@ -273,7 +273,7 @@ def write(self, names, iteration, normalizer=1.0, reset=False):
"""Write timers to a tensorboard writer"""
# currently when using add_scalars,
# torch.utils.add_scalars makes each timer its own run, which
# polutes the runs list, so we just add each as a scalar
# pollutes the runs list, so we just add each as a scalar
assert normalizer > 0.0
for name in names:
value = self.timers[name].elapsed(reset=reset) / normalizer
2 changes: 1 addition & 1 deletion tests/neox_args/test_neoxargs_usage.py
@@ -16,7 +16,7 @@ def test_neoxargs_usage():
declared_all = True
neox_args_attributes = set(NeoXArgs.__dataclass_fields__.keys())

# we exlude a number of properties (implemented with the @property decorator) or functions that we know exists
# we exclude a number of properties (implemented with the @property decorator) or functions that we know exists
exclude = set(
[
"params_dtype",
4 changes: 2 additions & 2 deletions tools/inspect_checkpoints.py
@@ -40,7 +40,7 @@ def sizeof_fmt(num, suffix="B"):


def pretty_print(contents: dict):
"""Prints a nice summary of the top-level contens in a checkpoint dictionary."""
"""Prints a nice summary of the top-level contents in a checkpoint dictionary."""
col_size = max(len(str(k)) for k in contents)
for k, v in sorted(contents.items()):
key_length = len(str(k))
@@ -78,7 +78,7 @@ def common_entries(*dcts):


def pretty_print_double(contents1: dict, contents2: dict, args):
"""Prints a nice summary of the top-level contens in a checkpoint dictionary."""
"""Prints a nice summary of the top-level contents in a checkpoint dictionary."""
col_size = max(
max(len(str(k)) for k in contents1), max(len(str(k)) for k in contents2)
)