diff --git a/README.md b/README.md index 602ad8b74c..d4ad344875 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ The [`examples/pretrain_bert.sh`](./examples/pretrain_bert.sh) script runs singl The logging, checkpoint-saving, and evaluation interval options are specified. Note that the `--data-path` now includes the additional `_text_sentence` suffix added in preprocessing, but does not include the file extensions. -Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). +Further command line arguments are described in the source file [`arguments.py`](./megatron/training/arguments.py). To run `examples/pretrain_bert.sh`, make any desired modifications including setting the environment variables for `CHECKPOINT_PATH`, `VOCAB_FILE`, and `DATA_PATH`. Make sure to set these variables to their paths in the container. Then launch the container with Megatron and necessary paths mounted (as explained in [Setup](#setup)) and run the example script. @@ -167,7 +167,7 @@ The `examples/pretrain_gpt.sh` script runs single GPU 345M parameter GPT pretrai It follows largely the same format as the previous BERT script with a few notable differences: the tokenization scheme used is BPE (which requires a merge table and a `json` vocabulary file) instead of WordPiece, the model architecture allows for longer sequences (note that the max position embedding must be greater than or equal to the maximum sequence length), and the `--lr-decay-style` has been set to cosine decay. Note that the `--data-path` now includes the additional `_text_document` suffix added in preprocessing, but does not include the file extensions. -Further command line arguments are described in the source file [`arguments.py`](./megatron/arguments.py). +Further command line arguments are described in the source file [`arguments.py`](./megatron/training/arguments.py). `examples/pretrain_gpt.sh` can be launched the same way as described for BERT. Set the env vars and make any other modifications, launch the container with appropriate mounts, and run the script. @@ -290,7 +290,7 @@ python preprocess_data.py \ --workers 5 # works well for 10 CPU cores. Scale up accordingly. -2. Use a custom samples mapping function in place of `megatron/data/realm_dataset_utils.get_block_samples_mapping` if required. To do this, you will need to implement a new function in C++ inside of `megatron/data/helpers.cpp`. The samples mapping data structure is used to select the data that will constitute every training sample in advance of the training loop. +2. Use a custom samples mapping function in place of `megatron/legacy/data/realm_dataset_utils.get_block_samples_mapping` if required. To do this, you will need to implement a new function in C++ inside of `megatron/core/datasets/helpers.cpp`. The samples mapping data structure is used to select the data that will constitute every training sample in advance of the training loop. The samples mapping is responsible for holding all of the required metadata needed to construct the sample from one or more indexed datasets. In REALM, the samples mapping contains the start and end sentence indices, as well as the document index (to find the correct title for a body) and a unique ID for every block. 3. Pretrain a BERT language model using `pretrain_bert.py`, with the sequence length equal to the block size in token ids. This model should be trained on the same indexed dataset that is used to supply the blocks for the information retrieval task. 
In REALM, this is an uncased bert base model trained with the standard hyperparameters. @@ -384,7 +384,7 @@ You can also use CURL or any other tools to query the server directly: curl 'http://localhost:5000/api' -X 'PUT' -H 'Content-Type: application/json; charset=UTF-8' -d '{"prompts":["Hello world"], "tokens_to_generate":1}' -See [megatron/text_generation_server.py](megatron/text_generation_server.py) for more API options. +See [megatron/inference/text_generation_server.py](megatron/inference/text_generation_server.py) for more API options. ### Detoxify GPT via Self-generation We include an example in `examples/detxoify_lm/` to detoxify language models by leveraging the generative power of language models. @@ -531,10 +531,10 @@ The Llama-2 [family of models](https://ai.meta.com/llama/) are an open-source se The Llama-2 checkpoints can be loaded into Megatron for inference and finetuning. See documentation [here](docs/llama2.md). # Model Optimization and Deployment -Megatron-Core (MCore) `GPTModel` family supports advanced quantization algorithms and high-performance deployment through TensorRT-LLM. +Megatron-Core (MCore) `GPTModel` family supports advanced quantization algorithms and high-performance inference through TensorRT-LLM. ## Quantization and TensorRT-LLM Deployment -See [Megatron Model Optimization and Deployment](examples/deploy/README.md) for `llama2` and `nemotron3` examples. +See [Megatron Model Optimization and Deployment](examples/inference/README.md) for `llama2` and `nemotron3` examples. # Datasets We do not host any datasets for GPT or BERT training, however, we detail their collection so that our results may be reproduced. diff --git a/examples/detxoify_lm/finetune_gpt.py b/examples/detxoify_lm/finetune_gpt.py index f1bbba5bda..48154bcfd3 100644 --- a/examples/detxoify_lm/finetune_gpt.py +++ b/examples/detxoify_lm/finetune_gpt.py @@ -10,19 +10,19 @@ import sys sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) -from megatron import get_args -from megatron import get_timers -from megatron import get_tokenizer -from megatron import print_rank_0 +from megatron.training import get_args +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 from megatron.core import mpu from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.blended_megatron_dataset_config import GPTDatasetConfig from megatron.core.datasets.gpt_dataset import GPTDataset -from megatron.model import GPTModel +from megatron.legacy.model import GPTModel from megatron.core.enums import ModelType from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import get_ltor_masks_and_position_ids +from megatron.training.utils import average_losses_across_data_parallel_group def model_provider(pre_process=True, post_process=True): """Build the model.""" diff --git a/examples/detxoify_lm/generate_samples_gpt.py b/examples/detxoify_lm/generate_samples_gpt.py index da12bbd7dc..7e7b9a20b2 100644 --- a/examples/detxoify_lm/generate_samples_gpt.py +++ b/examples/detxoify_lm/generate_samples_gpt.py @@ -9,24 +9,24 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))) import torch -from megatron import get_args -from megatron import 
get_tokenizer -from megatron import print_rank_0 -from megatron.checkpointing import load_checkpoint +from megatron.training import get_args +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training.checkpointing import load_checkpoint from megatron.core import mpu -from megatron.initialize import initialize_megatron -from megatron.model import GPTModel +from megatron.training.initialize import initialize_megatron +from megatron.legacy.model import GPTModel from megatron.training import get_model -from megatron.text_generation import generate_and_post_process -from megatron.arguments import core_transformer_config_from_args +from megatron.inference.text_generation import generate_and_post_process +from megatron.training.arguments import core_transformer_config_from_args from megatron.core.models.gpt import GPTModel from typing import Union -import megatron.model +import megatron.legacy.model from megatron.core.transformer.spec_utils import import_module -from megatron.arguments import core_transformer_config_from_args +from megatron.training.arguments import core_transformer_config_from_args from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_layer_local_spec -def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.model.GPTModel]: +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: """Builds the model. If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. @@ -37,7 +37,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat Returns: - Union[GPTModel, megatron.model.GPTModel]: The returned model + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model """ args = get_args() @@ -83,7 +83,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat else: assert(args.context_parallel_size == 1), "Context parallelism is only supported with Megatron Core!" - model = megatron.model.GPTModel( + model = megatron.legacy.model.GPTModel( config, num_tokentypes=0, parallel_output=True, diff --git a/examples/deploy/README.md b/examples/inference/README.md similarity index 96% rename from examples/deploy/README.md rename to examples/inference/README.md index c63993e9ca..7251a8d015 100644 --- a/examples/deploy/README.md +++ b/examples/inference/README.md @@ -42,7 +42,7 @@ following checkpoint formats with some remedy: | GPTModel | sharded | remedy arguments | |-----------------------------------|---------|-----------------------------------------| -| megatron.model | | `--ammo-load-classic-megatron-to-mcore` | +| megatron.legacy.model | | `--ammo-load-classic-megatron-to-mcore` | | TE-Fused (default mcore gpt spec) | | `--ammo-convert-te-to-local-spec` | | TE-Fused (default mcore gpt spec) | x | | @@ -76,7 +76,7 @@ cd .. Now launch the PTQ + TensorRT-LLM export script, ``` -bash examples/deploy/ptq_trtllm_nemotron3_8b ./nemotron-3-8b-base-4k None +bash examples/inference/ptq_trtllm_nemotron3_8b ./nemotron-3-8b-base-4k None ``` By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers for simulating the quantization effect. The checkpoint will be saved optionally (with quantizers as additional states) and can @@ -112,7 +112,7 @@ The script expects `${CHECKPOINT_DIR}` (`./nemotron-3-8b-base-4k`) to have the f > that we support. 
```sh -bash examples/deploy/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} +bash examples/inference/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR} ``` The script expects `${CHECKPOINT_DIR}` to have the following structure: diff --git a/examples/deploy/ptq_trtllm_llama_7b.sh b/examples/inference/ptq_trtllm_llama_7b.sh similarity index 91% rename from examples/deploy/ptq_trtllm_llama_7b.sh rename to examples/inference/ptq_trtllm_llama_7b.sh index dc936c82ac..4b285f95f9 100644 --- a/examples/deploy/ptq_trtllm_llama_7b.sh +++ b/examples/inference/ptq_trtllm_llama_7b.sh @@ -73,7 +73,7 @@ python -c "import ammo.torch.quantization.extensions as ext; print(ext.cuda_ext) launch_config="--nproc_per_node=${TP}" # Launch multi-process with torchrun -torchrun ${launch_config} examples/deploy/text_generation_ptq.py ${options} ${additional_options} --load ${CHECKPOINT_LOAD_DIR} +torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options} --load ${CHECKPOINT_LOAD_DIR} # This script is using mpi4py which will fork multiple processes. 
-python examples/deploy/trtllm_text_generation.py ${trtllm_options} +python examples/inference/trtllm_text_generation.py ${trtllm_options} diff --git a/examples/deploy/text_generation_ptq.py b/examples/inference/text_generation_ptq.py similarity index 95% rename from examples/deploy/text_generation_ptq.py rename to examples/inference/text_generation_ptq.py index db25a5a4c7..85aa4d13db 100644 --- a/examples/deploy/text_generation_ptq.py +++ b/examples/inference/text_generation_ptq.py @@ -13,16 +13,16 @@ from datasets import load_dataset # [ModelOpt]: changing the default model provider to the AMMO version -from megatron import get_args, print_rank_0 -from megatron.checkpointing import load_checkpoint, save_checkpoint +from megatron.training import get_args, print_rank_0 +from megatron.training.checkpointing import load_checkpoint, save_checkpoint from megatron.core import mpu from megatron.core.dist_checkpointing import load -from megatron.deploy.arguments import add_ammo_args -from megatron.deploy.gpt.model_provider import model_provider -from megatron.initialize import initialize_megatron -from megatron.text_generation import generate_and_post_process +from megatron.inference.arguments import add_ammo_args +from megatron.inference.gpt.model_provider import model_provider +from megatron.training.initialize import initialize_megatron +from megatron.inference.text_generation import generate_and_post_process from megatron.training import get_model -from megatron.utils import unwrap_model +from megatron.training.utils import unwrap_model QUANT_CFG_CHOICES = { "int8": atq.INT8_DEFAULT_CFG, diff --git a/examples/deploy/trtllm_text_generation.py b/examples/inference/trtllm_text_generation.py similarity index 100% rename from examples/deploy/trtllm_text_generation.py rename to examples/inference/trtllm_text_generation.py diff --git a/megatron/core/deploy/__init__.py b/megatron/core/inference/__init__.py similarity index 100% rename from megatron/core/deploy/__init__.py rename to megatron/core/inference/__init__.py diff --git a/megatron/core/deploy/gpt/__init__.py b/megatron/core/inference/gpt/__init__.py similarity index 100% rename from megatron/core/deploy/gpt/__init__.py rename to megatron/core/inference/gpt/__init__.py diff --git a/megatron/core/deploy/gpt/model_specs.py b/megatron/core/inference/gpt/model_specs.py similarity index 100% rename from megatron/core/deploy/gpt/model_specs.py rename to megatron/core/inference/gpt/model_specs.py diff --git a/megatron/core/deploy/gpt/state_dict_hooks.py b/megatron/core/inference/gpt/state_dict_hooks.py similarity index 100% rename from megatron/core/deploy/gpt/state_dict_hooks.py rename to megatron/core/inference/gpt/state_dict_hooks.py diff --git a/megatron/deploy/__init__.py b/megatron/inference/__init__.py similarity index 100% rename from megatron/deploy/__init__.py rename to megatron/inference/__init__.py diff --git a/megatron/deploy/arguments.py b/megatron/inference/arguments.py similarity index 100% rename from megatron/deploy/arguments.py rename to megatron/inference/arguments.py diff --git a/megatron/deploy/gpt/__init__.py b/megatron/inference/gpt/__init__.py similarity index 100% rename from megatron/deploy/gpt/__init__.py rename to megatron/inference/gpt/__init__.py diff --git a/megatron/deploy/gpt/model_provider.py b/megatron/inference/gpt/model_provider.py similarity index 90% rename from megatron/deploy/gpt/model_provider.py rename to megatron/inference/gpt/model_provider.py index 39fb49f8c3..e0cc326861 100644 --- 
a/megatron/deploy/gpt/model_provider.py +++ b/megatron/inference/gpt/model_provider.py @@ -4,10 +4,10 @@ from typing import Union -from megatron import get_args, print_rank_0 -from megatron.arguments import core_transformer_config_from_args -from megatron.core.deploy.gpt.model_specs import get_gpt_layer_ammo_spec -from megatron.core.deploy.gpt.state_dict_hooks import ( +from megatron.training import get_args, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args +from megatron.core.inference.gpt.model_specs import get_gpt_layer_ammo_spec +from megatron.core.inference.gpt.state_dict_hooks import ( mcore_gpt_load_classic_state_dict_pre_hook, mcore_gpt_load_te_state_dict_pre_hook, ) diff --git a/megatron/static/index.html b/megatron/inference/static/index.html similarity index 100% rename from megatron/static/index.html rename to megatron/inference/static/index.html diff --git a/megatron/text_generation/__init__.py b/megatron/inference/text_generation/__init__.py similarity index 100% rename from megatron/text_generation/__init__.py rename to megatron/inference/text_generation/__init__.py diff --git a/megatron/text_generation/api.py b/megatron/inference/text_generation/api.py similarity index 100% rename from megatron/text_generation/api.py rename to megatron/inference/text_generation/api.py diff --git a/megatron/text_generation/beam_utils.py b/megatron/inference/text_generation/beam_utils.py similarity index 100% rename from megatron/text_generation/beam_utils.py rename to megatron/inference/text_generation/beam_utils.py diff --git a/megatron/text_generation/communication.py b/megatron/inference/text_generation/communication.py similarity index 100% rename from megatron/text_generation/communication.py rename to megatron/inference/text_generation/communication.py diff --git a/megatron/text_generation/forward_step.py b/megatron/inference/text_generation/forward_step.py similarity index 99% rename from megatron/text_generation/forward_step.py rename to megatron/inference/text_generation/forward_step.py index 6a88709a52..e6951966c6 100644 --- a/megatron/text_generation/forward_step.py +++ b/megatron/inference/text_generation/forward_step.py @@ -6,7 +6,7 @@ import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu, InferenceParams from .communication import ( send_to_next_pipeline_rank, diff --git a/megatron/text_generation/generation.py b/megatron/inference/text_generation/generation.py similarity index 99% rename from megatron/text_generation/generation.py rename to megatron/inference/text_generation/generation.py index 11dd9f436b..2abab71e0f 100644 --- a/megatron/text_generation/generation.py +++ b/megatron/inference/text_generation/generation.py @@ -5,9 +5,9 @@ import torch import torch.nn.functional as F -from megatron import get_args, get_tokenizer +from megatron.training import get_args, get_tokenizer from megatron.core import mpu -from megatron.utils import get_ltor_masks_and_position_ids +from megatron.training.utils import get_ltor_masks_and_position_ids from .communication import ( copy_from_last_to_first_pipeline_stage, broadcast_from_last_pipeline_stage, diff --git a/megatron/text_generation/sampling.py b/megatron/inference/text_generation/sampling.py similarity index 100% rename from megatron/text_generation/sampling.py rename to megatron/inference/text_generation/sampling.py diff --git a/megatron/text_generation/tokenization.py b/megatron/inference/text_generation/tokenization.py similarity 
index 98% rename from megatron/text_generation/tokenization.py rename to megatron/inference/text_generation/tokenization.py index 441add74f9..18cc077e2c 100644 --- a/megatron/text_generation/tokenization.py +++ b/megatron/inference/text_generation/tokenization.py @@ -6,7 +6,7 @@ import torch -from megatron import get_tokenizer, get_args +from megatron.training import get_tokenizer, get_args from .communication import broadcast_int_list, broadcast_tensor diff --git a/megatron/text_generation_server.py b/megatron/inference/text_generation_server.py similarity index 98% rename from megatron/text_generation_server.py rename to megatron/inference/text_generation_server.py index 6ce98000d3..2eba2e259e 100644 --- a/megatron/text_generation_server.py +++ b/megatron/inference/text_generation_server.py @@ -5,9 +5,9 @@ import threading from flask import Flask, request, jsonify, current_app from flask_restful import Resource, Api -from megatron import get_args -from megatron.text_generation import generate_and_post_process -from megatron.text_generation import beam_search_and_post_process +from megatron.training import get_args +from megatron.inference.text_generation import generate_and_post_process +from megatron.inference.text_generation import beam_search_and_post_process GENERATE_NUM = 0 diff --git a/megatron/data/__init__.py b/megatron/legacy/data/__init__.py similarity index 100% rename from megatron/data/__init__.py rename to megatron/legacy/data/__init__.py diff --git a/megatron/data/autoaugment.py b/megatron/legacy/data/autoaugment.py similarity index 100% rename from megatron/data/autoaugment.py rename to megatron/legacy/data/autoaugment.py diff --git a/megatron/data/biencoder_dataset_utils.py b/megatron/legacy/data/biencoder_dataset_utils.py similarity index 97% rename from megatron/data/biencoder_dataset_utils.py rename to megatron/legacy/data/biencoder_dataset_utils.py index 6e4de43c2f..4ea43cd087 100644 --- a/megatron/data/biencoder_dataset_utils.py +++ b/megatron/legacy/data/biencoder_dataset_utils.py @@ -4,11 +4,11 @@ import numpy as np import torch -from megatron import get_args, get_tokenizer, print_rank_0 +from megatron.training import get_args, get_tokenizer, print_rank_0 from megatron.core import mpu, tensor_parallel -from megatron.data.dataset_utils import create_masked_lm_predictions, \ +from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, \ pad_and_convert_to_numpy -from megatron.data.data_samplers import MegatronPretrainingSampler +from megatron.legacy.data.data_samplers import MegatronPretrainingSampler def make_attention_mask(source_block, target_block): """ diff --git a/megatron/data/data_samplers.py b/megatron/legacy/data/data_samplers.py similarity index 99% rename from megatron/data/data_samplers.py rename to megatron/legacy/data/data_samplers.py index 3e337ea5ab..78c7e1af41 100644 --- a/megatron/data/data_samplers.py +++ b/megatron/legacy/data/data_samplers.py @@ -7,7 +7,7 @@ import torch import numpy as np from torch.utils.data import Dataset -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu diff --git a/megatron/data/dataset_utils.py b/megatron/legacy/data/dataset_utils.py similarity index 99% rename from megatron/data/dataset_utils.py rename to megatron/legacy/data/dataset_utils.py index b164190bc5..f6ff472836 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/legacy/data/dataset_utils.py @@ -26,7 +26,7 @@ import numpy as np import torch -from megatron import ( +from megatron.training 
import ( get_args, print_rank_0 ) @@ -535,8 +535,8 @@ def build_dataset(name, data_prefix, max_num_samples, max_seq_length_dec, dataset_type='standard_bert', indexed_dataset=None): - from megatron.data.ict_dataset import ICTDataset - from megatron.data.multimodal_dataset import MultiModalDataset + from megatron.legacy.data.ict_dataset import ICTDataset + from megatron.legacy.data.multimodal_dataset import MultiModalDataset if dataset_type == DSET_TYPE_BERT or dataset_type == DSET_TYPE_T5: raise ValueError("The Megatron-LM BERT and T5 datasets are deprecated.") diff --git a/megatron/data/ict_dataset.py b/megatron/legacy/data/ict_dataset.py similarity index 96% rename from megatron/data/ict_dataset.py rename to megatron/legacy/data/ict_dataset.py index 6dac35ff9d..2c65f2ce92 100644 --- a/megatron/data/ict_dataset.py +++ b/megatron/legacy/data/ict_dataset.py @@ -4,10 +4,10 @@ import numpy as np from torch.utils.data import Dataset -from megatron import get_tokenizer -from megatron import get_args -from megatron.data.dataset_utils import get_indexed_dataset_ -from megatron.data.realm_dataset_utils import get_block_samples_mapping +from megatron.training import get_tokenizer +from megatron.training import get_args +from megatron.legacy.data.dataset_utils import get_indexed_dataset_ +from megatron.legacy.data.realm_dataset_utils import get_block_samples_mapping def make_attention_mask(source_block, target_block): """ diff --git a/megatron/data/image_folder.py b/megatron/legacy/data/image_folder.py similarity index 100% rename from megatron/data/image_folder.py rename to megatron/legacy/data/image_folder.py diff --git a/megatron/data/multimodal_dataset.py b/megatron/legacy/data/multimodal_dataset.py similarity index 100% rename from megatron/data/multimodal_dataset.py rename to megatron/legacy/data/multimodal_dataset.py diff --git a/megatron/data/orqa_wiki_dataset.py b/megatron/legacy/data/orqa_wiki_dataset.py similarity index 97% rename from megatron/data/orqa_wiki_dataset.py rename to megatron/legacy/data/orqa_wiki_dataset.py index 4019cd764c..99217d64b0 100644 --- a/megatron/data/orqa_wiki_dataset.py +++ b/megatron/legacy/data/orqa_wiki_dataset.py @@ -9,9 +9,9 @@ import torch from torch.utils.data import Dataset -from megatron import print_rank_0, get_args, get_tokenizer +from megatron.training import print_rank_0, get_args, get_tokenizer from megatron.core import tensor_parallel -from megatron.data.biencoder_dataset_utils import make_attention_mask +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask def get_open_retrieval_wiki_dataset(): args = get_args() diff --git a/megatron/data/realm_dataset_utils.py b/megatron/legacy/data/realm_dataset_utils.py similarity index 96% rename from megatron/data/realm_dataset_utils.py rename to megatron/legacy/data/realm_dataset_utils.py index ebd9ebc498..50bf9bd05d 100644 --- a/megatron/data/realm_dataset_utils.py +++ b/megatron/legacy/data/realm_dataset_utils.py @@ -4,10 +4,10 @@ import numpy as np import torch -from megatron import print_rank_0 +from megatron.training import print_rank_0 from megatron.core import mpu, tensor_parallel -from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy -from megatron import get_args, get_tokenizer, print_rank_0 +from megatron.legacy.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy +from megatron.training import get_args, get_tokenizer, print_rank_0 def get_one_epoch_dataloader(dataset, micro_batch_size=None): @@ -24,7 +24,7 
@@ def get_one_epoch_dataloader(dataset, micro_batch_size=None): sampler = torch.utils.data.SequentialSampler(dataset) # importantly, drop_last must be False to get all the data. assert False, 'DistributedBatchSampler deprecated, change the implementation' - from megatron.data.samplers import DistributedBatchSampler + from megatron.legacy.data.samplers import DistributedBatchSampler batch_sampler = DistributedBatchSampler(sampler, batch_size=global_batch_size, drop_last=False, diff --git a/megatron/data/realm_index.py b/megatron/legacy/data/realm_index.py similarity index 99% rename from megatron/data/realm_index.py rename to megatron/legacy/data/realm_index.py index 1fa4a309ed..2575af7ff0 100644 --- a/megatron/data/realm_index.py +++ b/megatron/legacy/data/realm_index.py @@ -6,7 +6,7 @@ import numpy as np import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu diff --git a/megatron/data/vit_dataset.py b/megatron/legacy/data/vit_dataset.py similarity index 97% rename from megatron/data/vit_dataset.py rename to megatron/legacy/data/vit_dataset.py index 82391e9157..e65c536c89 100644 --- a/megatron/data/vit_dataset.py +++ b/megatron/legacy/data/vit_dataset.py @@ -5,10 +5,10 @@ import torch import torchvision.transforms as T from torchvision import datasets -from megatron import get_args -from megatron.data.image_folder import ImageFolder -from megatron.data.autoaugment import ImageNetPolicy -from megatron.data.data_samplers import RandomSeedDataset +from megatron.training import get_args +from megatron.legacy.data.image_folder import ImageFolder +from megatron.legacy.data.autoaugment import ImageNetPolicy +from megatron.legacy.data.data_samplers import RandomSeedDataset from PIL import Image, ImageFilter, ImageOps diff --git a/megatron/fp16_deprecated/loss_scaler.py b/megatron/legacy/fp16_deprecated/loss_scaler.py similarity index 100% rename from megatron/fp16_deprecated/loss_scaler.py rename to megatron/legacy/fp16_deprecated/loss_scaler.py diff --git a/megatron/fused_kernels/__init__.py b/megatron/legacy/fused_kernels/__init__.py similarity index 100% rename from megatron/fused_kernels/__init__.py rename to megatron/legacy/fused_kernels/__init__.py diff --git a/megatron/fused_kernels/compat.h b/megatron/legacy/fused_kernels/compat.h similarity index 100% rename from megatron/fused_kernels/compat.h rename to megatron/legacy/fused_kernels/compat.h diff --git a/megatron/fused_kernels/tests/__init__.py b/megatron/legacy/fused_kernels/tests/__init__.py similarity index 100% rename from megatron/fused_kernels/tests/__init__.py rename to megatron/legacy/fused_kernels/tests/__init__.py diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/legacy/fused_kernels/tests/test_fused_kernels.py similarity index 97% rename from megatron/fused_kernels/tests/test_fused_kernels.py rename to megatron/legacy/fused_kernels/tests/test_fused_kernels.py index 74024c5020..adb9ac6f7d 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/legacy/fused_kernels/tests/test_fused_kernels.py @@ -3,11 +3,11 @@ import torch from torch.nn import LayerNorm -from megatron.model.enums import AttnMaskType -from megatron.model.fused_layer_norm import MixedFusedLayerNorm -from megatron.model.fused_softmax import FusedScaleMaskSoftmax -from megatron.model.utils import attention_mask_func -from megatron.fused_kernels import load +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.fused_layer_norm import 
MixedFusedLayerNorm +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.utils import attention_mask_func +from megatron.legacy.fused_kernels import load def test_load_fused_kernels(): try: diff --git a/megatron/fused_kernels/type_shim.h b/megatron/legacy/fused_kernels/type_shim.h similarity index 100% rename from megatron/fused_kernels/type_shim.h rename to megatron/legacy/fused_kernels/type_shim.h diff --git a/megatron/indexer.py b/megatron/legacy/indexer.py similarity index 89% rename from megatron/indexer.py rename to megatron/legacy/indexer.py index 45f530a7d4..75851ad70f 100644 --- a/megatron/indexer.py +++ b/megatron/legacy/indexer.py @@ -3,14 +3,14 @@ import torch import torch.distributed as dist -from megatron import get_args, print_rank_0 +from megatron.training import get_args, print_rank_0 from megatron.core import mpu -from megatron.checkpointing import load_biencoder_checkpoint -from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset -from megatron.data.orqa_wiki_dataset import get_open_retrieval_batch -from megatron.data.biencoder_dataset_utils import get_one_epoch_dataloader -from megatron.data.realm_index import detach, OpenRetreivalDataStore -from megatron.model.biencoder_model import get_model_provider +from megatron.training.checkpointing import load_biencoder_checkpoint +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_batch +from megatron.legacy.data.biencoder_dataset_utils import get_one_epoch_dataloader +from megatron.legacy.data.realm_index import detach, OpenRetreivalDataStore +from megatron.legacy.model.biencoder_model import get_model_provider from megatron.training import get_model diff --git a/megatron/model/__init__.py b/megatron/legacy/model/__init__.py similarity index 100% rename from megatron/model/__init__.py rename to megatron/legacy/model/__init__.py diff --git a/megatron/model/bert_model.py b/megatron/legacy/model/bert_model.py similarity index 94% rename from megatron/model/bert_model.py rename to megatron/legacy/model/bert_model.py index cd4bb35db7..4171791cbf 100644 --- a/megatron/model/bert_model.py +++ b/megatron/legacy/model/bert_model.py @@ -4,16 +4,16 @@ import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import tensor_parallel -from megatron.model.enums import AttnMaskType -from megatron.model.language_model import parallel_lm_logits -from megatron.model.language_model import get_language_model -from megatron.model.utils import get_norm -from megatron.model.utils import openai_gelu, erf_gelu -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import parallel_lm_logits +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_norm +from megatron.legacy.model.utils import openai_gelu, erf_gelu +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule @@ -169,7 +169,7 @@ def __init__(self, self._binary_head_key = 'binary_head' def set_input_tensor(self, input_tensor): - """See 
megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, bert_model_input, attention_mask, diff --git a/megatron/model/biencoder_model.py b/megatron/legacy/model/biencoder_model.py similarity index 94% rename from megatron/model/biencoder_model.py rename to megatron/legacy/model/biencoder_model.py index c910879dc8..8983cb5407 100644 --- a/megatron/model/biencoder_model.py +++ b/megatron/legacy/model/biencoder_model.py @@ -2,17 +2,17 @@ import torch import sys -from megatron import get_args, print_rank_0, get_tokenizer +from megatron.training import get_args, print_rank_0, get_tokenizer from megatron.core import mpu -from megatron.checkpointing import fix_query_key_value_ordering -from megatron.checkpointing import get_checkpoint_tracker_filename -from megatron.checkpointing import get_checkpoint_name -from megatron.model.bert_model import bert_position_ids -from megatron.model.enums import AttnMaskType -from megatron.model.language_model import get_language_model -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.training.checkpointing import fix_query_key_value_ordering +from megatron.training.checkpointing import get_checkpoint_tracker_filename +from megatron.training.checkpointing import get_checkpoint_name +from megatron.legacy.model.bert_model import bert_position_ids +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule def get_model_provider(only_query_model=False, only_context_model=False, @@ -104,7 +104,7 @@ def __init__(self, self._context_key = 'context_model' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" # this is just a placeholder and will be needed when model # parallelism will be used # self.language_model.set_input_tensor(input_tensor) @@ -201,7 +201,7 @@ def init_state_dict_from_bert(self): try: state_dict = torch.load(checkpoint_name, map_location='cpu') except ModuleNotFoundError: - from megatron.fp16_deprecated import loss_scaler + from megatron.legacy.fp16_deprecated import loss_scaler # For backward compatibility. 
print_rank_0(' > deserializing using the old code structure ...') sys.modules['fp16.loss_scaler'] = sys.modules[ diff --git a/megatron/model/classification.py b/megatron/legacy/model/classification.py similarity index 85% rename from megatron/model/classification.py rename to megatron/legacy/model/classification.py index bac50c54cd..c9fe165280 100644 --- a/megatron/model/classification.py +++ b/megatron/legacy/model/classification.py @@ -4,13 +4,13 @@ import torch -from megatron import get_args, print_rank_last -from megatron.model.enums import AttnMaskType -from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids -from megatron.model.language_model import get_language_model -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.training import get_args, print_rank_last +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule @@ -42,11 +42,11 @@ def __init__(self, self.classification_dropout = torch.nn.Dropout(args.hidden_dropout) self.classification_head = get_linear_layer(args.hidden_size, self.num_classes, - init_method) + config.init_method) self._classification_head_key = 'classification_head' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, model_input, attention_mask, tokentype_ids=None): diff --git a/megatron/model/enums.py b/megatron/legacy/model/enums.py similarity index 100% rename from megatron/model/enums.py rename to megatron/legacy/model/enums.py diff --git a/megatron/model/fused_bias_gelu.py b/megatron/legacy/model/fused_bias_gelu.py similarity index 100% rename from megatron/model/fused_bias_gelu.py rename to megatron/legacy/model/fused_bias_gelu.py diff --git a/megatron/model/fused_layer_norm.py b/megatron/legacy/model/fused_layer_norm.py similarity index 100% rename from megatron/model/fused_layer_norm.py rename to megatron/legacy/model/fused_layer_norm.py diff --git a/megatron/model/fused_softmax.py b/megatron/legacy/model/fused_softmax.py similarity index 99% rename from megatron/model/fused_softmax.py rename to megatron/legacy/model/fused_softmax.py index 9bacf33740..4a561b6897 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/legacy/model/fused_softmax.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn -from megatron.model.enums import AttnMaskType +from megatron.legacy.model.enums import AttnMaskType class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function): diff --git a/megatron/model/gpt_model.py b/megatron/legacy/model/gpt_model.py similarity index 97% rename from megatron/model/gpt_model.py rename to megatron/legacy/model/gpt_model.py index dd47188da4..8e380199db 100644 --- a/megatron/model/gpt_model.py +++ b/megatron/legacy/model/gpt_model.py @@ -4,7 +4,7 @@ import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import tensor_parallel from .module import MegatronModule @@ -70,7 +70,7 @@ def 
__init__(self, self.initialize_word_embeddings() def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, input_ids, position_ids, attention_mask, diff --git a/megatron/model/language_model.py b/megatron/legacy/model/language_model.py similarity index 99% rename from megatron/model/language_model.py rename to megatron/legacy/model/language_model.py index 948d1c3cc5..a6ee1cf563 100644 --- a/megatron/model/language_model.py +++ b/megatron/legacy/model/language_model.py @@ -5,7 +5,7 @@ import torch import torch.nn.functional as F -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding @@ -426,7 +426,7 @@ def __init__(self, self._output_layer_key = 'output_layer' def set_input_tensor(self, input_tensor): - """ See megatron.model.transformer.set_input_tensor()""" + """ See megatron.legacy.model.transformer.set_input_tensor()""" # This is usually handled in schedules.py but some inference code still # gives us non-lists or None diff --git a/megatron/model/module.py b/megatron/legacy/model/module.py similarity index 99% rename from megatron/model/module.py rename to megatron/legacy/model/module.py index cd0ef2a4e2..849fda7453 100644 --- a/megatron/model/module.py +++ b/megatron/legacy/model/module.py @@ -6,7 +6,7 @@ from torch.autograd import Variable from torch.nn.parameter import Parameter -from megatron import get_args +from megatron.training import get_args from megatron.core import mpu, tensor_parallel diff --git a/megatron/model/multiple_choice.py b/megatron/legacy/model/multiple_choice.py similarity index 88% rename from megatron/model/multiple_choice.py rename to megatron/legacy/model/multiple_choice.py index 41f8bb49f6..bec0548c40 100644 --- a/megatron/model/multiple_choice.py +++ b/megatron/legacy/model/multiple_choice.py @@ -4,13 +4,13 @@ import torch -from megatron import get_args, print_rank_last -from megatron.model.enums import AttnMaskType -from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids -from megatron.model.language_model import get_language_model -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.utils import scaled_init_method_normal +from megatron.training import get_args, print_rank_last +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.utils import scaled_init_method_normal from .module import MegatronModule @@ -43,7 +43,7 @@ def __init__(self, self._multichoice_head_key = 'multichoice_head' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, model_input, attention_mask, tokentype_ids=None): diff --git a/megatron/model/realm_model.py b/megatron/legacy/model/realm_model.py similarity index 93% rename from 
megatron/model/realm_model.py rename to megatron/legacy/model/realm_model.py index 654f2992f6..5b2859a7f2 100644 --- a/megatron/model/realm_model.py +++ b/megatron/legacy/model/realm_model.py @@ -1,17 +1,17 @@ import os import torch -from megatron import get_args, print_rank_0 -from megatron.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name -from megatron.model import BertModel +from megatron.training import get_args, print_rank_0 +from megatron.training.checkpointing import get_checkpoint_tracker_filename, get_checkpoint_name +from megatron.legacy.model import BertModel from .module import MegatronModule from megatron.core import mpu -from megatron.model.enums import AttnMaskType -from megatron.model.utils import get_linear_layer -from megatron.model.utils import init_method_normal -from megatron.model.language_model import get_language_model -from megatron.model.utils import scaled_init_method_normal -from megatron.model.bert_model import bert_extended_attention_mask, bert_position_ids +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.utils import init_method_normal +from megatron.legacy.model.language_model import get_language_model +from megatron.legacy.model.utils import scaled_init_method_normal +from megatron.legacy.model.bert_model import bert_extended_attention_mask, bert_position_ids def general_ict_model_provider(only_query_model=False, only_block_model=False): diff --git a/megatron/model/rms_norm.py b/megatron/legacy/model/rms_norm.py similarity index 100% rename from megatron/model/rms_norm.py rename to megatron/legacy/model/rms_norm.py diff --git a/megatron/model/t5_model.py b/megatron/legacy/model/t5_model.py similarity index 95% rename from megatron/model/t5_model.py rename to megatron/legacy/model/t5_model.py index f9fabd3401..c05ef23b0b 100644 --- a/megatron/model/t5_model.py +++ b/megatron/legacy/model/t5_model.py @@ -4,12 +4,12 @@ import torch -from megatron import get_args +from megatron.training import get_args from megatron.core import tensor_parallel -from megatron.model.enums import AttnMaskType -from megatron.model.language_model import parallel_lm_logits, get_language_model -from megatron.model import LayerNorm -from megatron.model.utils import ( +from megatron.legacy.model.enums import AttnMaskType +from megatron.legacy.model.language_model import parallel_lm_logits, get_language_model +from megatron.legacy.model import LayerNorm +from megatron.legacy.model.utils import ( openai_gelu, get_linear_layer ) @@ -101,7 +101,7 @@ def __init__(self, self._lm_head_key = 'lm_head' def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.language_model.set_input_tensor(input_tensor) def forward(self, encoder_input_ids, decoder_input_ids, encoder_attn_mask, diff --git a/megatron/model/transformer.py b/megatron/legacy/model/transformer.py similarity index 99% rename from megatron/model/transformer.py rename to megatron/legacy/model/transformer.py index be76fa9230..ef19656e00 100644 --- a/megatron/model/transformer.py +++ b/megatron/legacy/model/transformer.py @@ -9,15 +9,16 @@ import torch.nn.functional as F from typing import Optional -from megatron import get_timers, get_args, core, get_num_microbatches +from megatron import core +from megatron.training import get_timers, get_args, get_num_microbatches from .module import MegatronModule from 
megatron.core import mpu, tensor_parallel from megatron.core.enums import ModelType -from megatron.model.enums import AttnMaskType, LayerType, AttnType -from megatron.model.fused_softmax import FusedScaleMaskSoftmax -from megatron.model.fused_bias_gelu import bias_gelu_impl +from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType +from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax +from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding, apply_rotary_pos_emb -from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu, get_norm +from megatron.legacy.model.utils import attention_mask_func, openai_gelu, erf_gelu, get_norm from megatron.core.tensor_parallel import ( gather_from_sequence_parallel_region_to_moe, reduce_scatter_to_sequence_parallel_region_from_moe, diff --git a/megatron/model/utils.py b/megatron/legacy/model/utils.py similarity index 96% rename from megatron/model/utils.py rename to megatron/legacy/model/utils.py index ace7f346c4..5762000d5d 100644 --- a/megatron/model/utils.py +++ b/megatron/legacy/model/utils.py @@ -6,8 +6,8 @@ import torch -from megatron import get_args -from megatron.model import LayerNorm, RMSNorm +from megatron.training import get_args +from megatron.legacy.model import LayerNorm, RMSNorm from megatron.core.jit import jit_fuser def init_method_normal(sigma): diff --git a/megatron/model/vision/classification.py b/megatron/legacy/model/vision/classification.py similarity index 84% rename from megatron/model/vision/classification.py rename to megatron/legacy/model/vision/classification.py index 3d5c823df4..f9419c71de 100644 --- a/megatron/model/vision/classification.py +++ b/megatron/legacy/model/vision/classification.py @@ -4,11 +4,11 @@ import torch from torch.nn.init import trunc_normal_ -from megatron import get_args -from megatron.model.utils import get_linear_layer -from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead -from megatron.model.vision.mit_backbone import mit_b3_avg -from megatron.model.module import MegatronModule +from megatron.training import get_args +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone, VitMlpHead +from megatron.legacy.model.vision.mit_backbone import mit_b3_avg +from megatron.legacy.model.module import MegatronModule class VitClassificationModel(MegatronModule): """Vision Transformer Model.""" @@ -42,7 +42,7 @@ def __init__(self, config, num_classes, finetune=False, ) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.backbone.set_input_tensor(input_tensor) def forward(self, input): @@ -76,7 +76,7 @@ def _init_weights(self, m): torch.nn.init.constant_(m.bias, 0) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def forward(self, input): diff --git a/megatron/model/vision/dino.py b/megatron/legacy/model/vision/dino.py similarity index 96% rename from megatron/model/vision/dino.py rename to megatron/legacy/model/vision/dino.py index 151ec26647..20ca2100f6 100644 --- a/megatron/model/vision/dino.py +++ b/megatron/legacy/model/vision/dino.py @@ -12,12 +12,12 @@ import numpy as np import torch.nn.functional as F from torch.nn.init import trunc_normal_ -from 
megatron import get_args, print_rank_0 -from megatron.model.utils import get_linear_layer -from megatron.model.vision.vit_backbone import VitBackbone -from megatron.model.module import MegatronModule -from megatron.model.vision.mit_backbone import mit_b5_avg -from megatron.model.vision.esvit_swin_backbone import get_swin +from megatron.training import get_args, print_rank_0 +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.mit_backbone import mit_b5_avg +from megatron.legacy.model.vision.esvit_swin_backbone import get_swin class DINOLoss(torch.nn.Module): diff --git a/megatron/model/vision/esvit_swin_backbone.py b/megatron/legacy/model/vision/esvit_swin_backbone.py similarity index 99% rename from megatron/model/vision/esvit_swin_backbone.py rename to megatron/legacy/model/vision/esvit_swin_backbone.py index 70aee3db42..87932040cb 100644 --- a/megatron/model/vision/esvit_swin_backbone.py +++ b/megatron/legacy/model/vision/esvit_swin_backbone.py @@ -15,9 +15,9 @@ from functools import partial import torch.distributed as dist from torch.nn.init import trunc_normal_ -from megatron.model.transformer import DropPath -from megatron import get_args -from megatron.model import LayerNorm +from megatron.legacy.model.transformer import DropPath +from megatron.training import get_args +from megatron.legacy.model import LayerNorm import numpy as np from math import sqrt diff --git a/megatron/model/vision/inpainting.py b/megatron/legacy/model/vision/inpainting.py similarity index 91% rename from megatron/model/vision/inpainting.py rename to megatron/legacy/model/vision/inpainting.py index 6aae9658bc..f71f5e3209 100644 --- a/megatron/model/vision/inpainting.py +++ b/megatron/legacy/model/vision/inpainting.py @@ -8,12 +8,12 @@ import einops import torch import torch.nn.functional as F -from megatron import get_args, print_rank_0 -from megatron.model.utils import get_linear_layer -from megatron.model.vision.vit_backbone import VitBackbone -from megatron.model.module import MegatronModule -from megatron.model.vision.mit_backbone import mit_b3 -from megatron.model.vision.utils import resize +from megatron.training import get_args, print_rank_0 +from megatron.legacy.model.utils import get_linear_layer +from megatron.legacy.model.vision.vit_backbone import VitBackbone +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.mit_backbone import mit_b3 +from megatron.legacy.model.vision.utils import resize class VitInpaintingModel(MegatronModule): @@ -113,7 +113,7 @@ def __init__(self, pre_process=True, post_process=True): self.linear_pred = torch.nn.Conv2d(self.embedding_dim, self.flatten_dim, kernel_size=1) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def forward(self, input): diff --git a/megatron/model/vision/knn_monitor.py b/megatron/legacy/model/vision/knn_monitor.py similarity index 96% rename from megatron/model/vision/knn_monitor.py rename to megatron/legacy/model/vision/knn_monitor.py index a7d79854eb..ad796d1f2e 100644 --- a/megatron/model/vision/knn_monitor.py +++ b/megatron/legacy/model/vision/knn_monitor.py @@ -1,9 +1,9 @@ import torch.nn.functional as F import torch -from megatron import print_rank_0, get_args +from megatron.training import print_rank_0, get_args from 
megatron.core import mpu -from megatron.data.vit_dataset import ClassificationTransform -from megatron.data.image_folder import ImageFolder +from megatron.legacy.data.vit_dataset import ClassificationTransform +from megatron.legacy.data.image_folder import ImageFolder _FEATURE_BANK = None diff --git a/megatron/model/vision/mit_backbone.py b/megatron/legacy/model/vision/mit_backbone.py similarity index 99% rename from megatron/model/vision/mit_backbone.py rename to megatron/legacy/model/vision/mit_backbone.py index 6640b105df..3ca2303c30 100644 --- a/megatron/model/vision/mit_backbone.py +++ b/megatron/legacy/model/vision/mit_backbone.py @@ -6,8 +6,8 @@ import torch.nn.functional as F from functools import partial from torch.nn.init import trunc_normal_ -from megatron.model.transformer import DropPath -from megatron.model import LayerNorm +from megatron.legacy.model.transformer import DropPath +from megatron.legacy.model import LayerNorm class Mlp(nn.Module): diff --git a/megatron/model/vision/swin_backbone.py b/megatron/legacy/model/vision/swin_backbone.py similarity index 99% rename from megatron/model/vision/swin_backbone.py rename to megatron/legacy/model/vision/swin_backbone.py index 9a622c7070..231802c8f2 100644 --- a/megatron/model/vision/swin_backbone.py +++ b/megatron/legacy/model/vision/swin_backbone.py @@ -12,7 +12,7 @@ from timm.models.layers import DropPath, to_2tuple, trunc_normal_ from math import sqrt -from megatron import get_args +from megatron.training import get_args from functools import partial diff --git a/megatron/model/vision/utils.py b/megatron/legacy/model/vision/utils.py similarity index 100% rename from megatron/model/vision/utils.py rename to megatron/legacy/model/vision/utils.py diff --git a/megatron/model/vision/vit_backbone.py b/megatron/legacy/model/vision/vit_backbone.py similarity index 96% rename from megatron/model/vision/vit_backbone.py rename to megatron/legacy/model/vision/vit_backbone.py index 15cf75affc..7994afb838 100644 --- a/megatron/model/vision/vit_backbone.py +++ b/megatron/legacy/model/vision/vit_backbone.py @@ -7,14 +7,14 @@ import torch import apex import torch.nn.functional as F -from megatron import get_args -from megatron.model.transformer import ParallelTransformer -from megatron.model.utils import ( +from megatron.training import get_args +from megatron.legacy.model.transformer import ParallelTransformer +from megatron.legacy.model.utils import ( get_linear_layer, init_method_normal, scaled_init_method_normal, ) -from megatron.model.module import MegatronModule +from megatron.legacy.model.module import MegatronModule CLASS_TOKEN_LENGTH = 8 @@ -206,7 +206,7 @@ def __init__(self, ) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" self.transformer.set_input_tensor(input_tensor) def forward(self, input): diff --git a/megatron/mpu/tests/__init__.py b/megatron/legacy/mpu/tests/__init__.py similarity index 100% rename from megatron/mpu/tests/__init__.py rename to megatron/legacy/mpu/tests/__init__.py diff --git a/megatron/mpu/tests/commons.py b/megatron/legacy/mpu/tests/commons.py similarity index 100% rename from megatron/mpu/tests/commons.py rename to megatron/legacy/mpu/tests/commons.py diff --git a/megatron/mpu/tests/test_cross_entropy.py b/megatron/legacy/mpu/tests/test_cross_entropy.py similarity index 100% rename from megatron/mpu/tests/test_cross_entropy.py rename to megatron/legacy/mpu/tests/test_cross_entropy.py diff 
--git a/megatron/mpu/tests/test_data.py b/megatron/legacy/mpu/tests/test_data.py similarity index 100% rename from megatron/mpu/tests/test_data.py rename to megatron/legacy/mpu/tests/test_data.py diff --git a/megatron/mpu/tests/test_initialize.py b/megatron/legacy/mpu/tests/test_initialize.py similarity index 100% rename from megatron/mpu/tests/test_initialize.py rename to megatron/legacy/mpu/tests/test_initialize.py diff --git a/megatron/mpu/tests/test_layers.py b/megatron/legacy/mpu/tests/test_layers.py similarity index 100% rename from megatron/mpu/tests/test_layers.py rename to megatron/legacy/mpu/tests/test_layers.py diff --git a/megatron/mpu/tests/test_random.py b/megatron/legacy/mpu/tests/test_random.py similarity index 100% rename from megatron/mpu/tests/test_random.py rename to megatron/legacy/mpu/tests/test_random.py diff --git a/megatron/memory.py b/megatron/memory.py deleted file mode 100644 index a5fef75baa..0000000000 --- a/megatron/memory.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - - -import torch - - -# A dictionary of all the memory buffers allocated. -_MEM_BUFFS = dict() - - -def allocate_mem_buff(name, numel, dtype, track_usage): - """Allocate a memory buffer.""" - assert name not in _MEM_BUFFS, \ - 'memory buffer {} already allocated.'.format(name) - _MEM_BUFFS[name] = MemoryBuffer(name, numel, dtype, track_usage) - return _MEM_BUFFS[name] - - -def get_mem_buff(name): - """Get the memory buffer.""" - return _MEM_BUFFS[name] - - -class MemoryBuffer: - """Contiguous memory buffer. - Allocate a contiguous memory of type `dtype` and size `numel`. It is - used to reduce memory fragmentation. - - Usage: After the allocation, the `_start` index is set tot the first - index of the memory. A memory chunk starting from `_start` index - can be `allocated` for an input tensor, with the elements of the - tensor being coppied. The buffer can be reused by resetting the - `_start` index. - - """ - def __init__(self, name, numel, dtype, track_usage): - if torch.distributed.get_rank() == 0: - element_size = torch.tensor([], dtype=dtype).element_size() - print('> building the {} memory buffer with {} num elements ' - 'and {} dtype ({:.1f} MB)...'.format( - name, numel, dtype, numel*element_size/1024/1024), - flush=True) - self.name = name - self.numel = numel - self.dtype = dtype - self.data = torch.empty(self.numel, - dtype=self.dtype, - device=torch.cuda.current_device(), - requires_grad=False) - - # Index tracking the start of the free memory. - self._start = 0 - - # Values used for tracking usage. - self.track_usage = track_usage - if self.track_usage: - self.in_use_value = 0.0 - self.total_value = 0.0 - - - def reset(self): - """Reset the buffer start index to the beginning of the buffer.""" - self._start = 0 - - - def is_in_use(self): - """Whether the current buffer hold on to any memory.""" - return self._start > 0 - - - def numel_in_use(self): - """Return number of elements in use.""" - return self._start - - - def add(self, tensor): - """Allocate a chunk of memory from the buffer to tensor and copy - the values.""" - assert tensor.dtype == self.dtype, \ - 'Input tensor type {} different from buffer type {}'.format( - tensor.dtype, self.dtype) - # Number of elements of the input tensor. 
- tensor_numel = torch.numel(tensor) - new_start = self._start + tensor_numel - assert new_start <= self.numel, \ - 'Not enough memory left in the buffer ({} > {})'.format( - tensor_numel, self.numel - self._start) - # New tensor is a view into the memory. - new_tensor = self.data[self._start:new_start] - self._start = new_start - new_tensor = new_tensor.view(tensor.shape) - new_tensor.copy_(tensor) - # Return a pointer to the new tensor. - return new_tensor - - - def get_data(self): - """Return the data currently in use.""" - if self.track_usage: - self.in_use_value += float(self._start) - self.total_value += float(self.numel) - return self.data[:self._start] - - - def print_average_usage(self): - """Print memory usage average over time. We would like this value - to be as high as possible.""" - assert self.track_usage, 'You need to enable track usage.' - if torch.distributed.get_rank() == 0: - print(' > usage of {} memory buffer: {:.2f} %'.format( - self.name, self.in_use_value * 100.0 / self.total_value), - flush=True) - - - -class RingMemBuffer: - """A ring of memory buffers.""" - - def __init__(self, name, num_buffers, numel, dtype, track_usage): - self.num_buffers = num_buffers - self.buffers = [ - allocate_mem_buff(name+' {}'.format(i), numel, dtype, track_usage) - for i in range(num_buffers)] - self._index = -1 - - - def get_next_buffer(self): - self._index += 1 - self._index = self._index % self.num_buffers - buff = self.buffers[self._index] - assert not buff.is_in_use(), 'buffer is already in use.' - return buff diff --git a/megatron/__init__.py b/megatron/training/__init__.py similarity index 95% rename from megatron/__init__.py rename to megatron/training/__init__.py index 42c4518b5e..a539e5930f 100644 --- a/megatron/__init__.py +++ b/megatron/training/__init__.py @@ -14,6 +14,7 @@ from .global_vars import get_adlr_autoresume from .global_vars import get_timers from .initialize import initialize_megatron +from .training import pretrain from .utils import (print_rank_0, is_last_rank, diff --git a/megatron/arguments.py b/megatron/training/arguments.py similarity index 100% rename from megatron/arguments.py rename to megatron/training/arguments.py diff --git a/megatron/checkpointing.py b/megatron/training/checkpointing.py similarity index 99% rename from megatron/checkpointing.py rename to megatron/training/checkpointing.py index caebaae6d2..2d32a32ffe 100644 --- a/megatron/checkpointing.py +++ b/megatron/training/checkpointing.py @@ -9,9 +9,9 @@ import torch -from megatron import update_num_microbatches +from megatron.training import update_num_microbatches from megatron.core import mpu, tensor_parallel, dist_checkpointing -from .core.dist_checkpointing.mapping import ShardedObject +from ..core.dist_checkpointing.mapping import ShardedObject from .global_vars import get_args from .utils import (unwrap_model, print_rank_0) @@ -492,14 +492,14 @@ def _load_base_checkpoint(load_dir, rank0=False, sharded_state_dict=None, try: state_dict = torch.load(checkpoint_name, map_location='cpu') except ModuleNotFoundError: - from megatron.fp16_deprecated import loss_scaler + from megatron.legacy.fp16_deprecated import loss_scaler # For backward compatibility. 
if not rank0: print_rank_0(' > deserializing using the old code structure ...') sys.modules['fp16.loss_scaler'] = sys.modules[ - 'megatron.fp16_deprecated.loss_scaler'] + 'megatron.legacy.fp16_deprecated.loss_scaler'] sys.modules['megatron.fp16.loss_scaler'] = sys.modules[ - 'megatron.fp16_deprecated.loss_scaler'] + 'megatron.legacy.fp16_deprecated.loss_scaler'] state_dict = torch.load(checkpoint_name, map_location='cpu') sys.modules.pop('fp16.loss_scaler', None) sys.modules.pop('megatron.fp16.loss_scaler', None) diff --git a/megatron/dist_signal_handler.py b/megatron/training/dist_signal_handler.py similarity index 100% rename from megatron/dist_signal_handler.py rename to megatron/training/dist_signal_handler.py diff --git a/megatron/global_vars.py b/megatron/training/global_vars.py similarity index 98% rename from megatron/global_vars.py rename to megatron/training/global_vars.py index 89a20d6df3..ce68d8e04f 100644 --- a/megatron/global_vars.py +++ b/megatron/training/global_vars.py @@ -6,9 +6,9 @@ import sys import torch -from megatron import dist_signal_handler +from megatron.training import dist_signal_handler from megatron.core import Timers -from megatron.tokenizer import build_tokenizer +from megatron.training.tokenizer import build_tokenizer from .microbatches import build_num_microbatches_calculator _GLOBAL_ARGS = None diff --git a/megatron/initialize.py b/megatron/training/initialize.py similarity index 95% rename from megatron/initialize.py rename to megatron/training/initialize.py index 63d7066f56..8e99788731 100644 --- a/megatron/initialize.py +++ b/megatron/training/initialize.py @@ -10,17 +10,17 @@ import torch from datetime import timedelta -from megatron import fused_kernels -from megatron import get_adlr_autoresume -from megatron import get_args -from megatron import get_tensorboard_writer +from megatron.legacy import fused_kernels +from megatron.training import get_adlr_autoresume +from megatron.training import get_args +from megatron.training import get_tensorboard_writer from megatron.core import mpu, tensor_parallel -from megatron.arguments import parse_args, validate_args -from megatron.yaml_arguments import validate_yaml -from megatron.checkpointing import load_args_from_checkpoint -from megatron.global_vars import set_global_variables -from megatron.model.transformer import bias_dropout_add_fused_train -from megatron.model.fused_bias_gelu import bias_gelu +from megatron.training.arguments import parse_args, validate_args +from megatron.training.yaml_arguments import validate_yaml +from megatron.training.checkpointing import load_args_from_checkpoint +from megatron.training.global_vars import set_global_variables +from megatron.legacy.model.transformer import bias_dropout_add_fused_train +from megatron.legacy.model.fused_bias_gelu import bias_gelu def initialize_megatron( extra_args_provider=None, diff --git a/megatron/log_handler.py b/megatron/training/log_handler.py similarity index 100% rename from megatron/log_handler.py rename to megatron/training/log_handler.py diff --git a/megatron/microbatches.py b/megatron/training/microbatches.py similarity index 100% rename from megatron/microbatches.py rename to megatron/training/microbatches.py diff --git a/megatron/optimizer_param_scheduler.py b/megatron/training/optimizer_param_scheduler.py similarity index 99% rename from megatron/optimizer_param_scheduler.py rename to megatron/training/optimizer_param_scheduler.py index 0cf5fb1d8f..baed2b23ae 100644 --- a/megatron/optimizer_param_scheduler.py +++ 
b/megatron/training/optimizer_param_scheduler.py @@ -4,7 +4,7 @@ import math -from megatron import print_rank_0 +from .utils import print_rank_0 class OptimizerParamScheduler(object): """Anneals learning rate and weight decay""" diff --git a/megatron/theoretical_memory_usage.py b/megatron/training/theoretical_memory_usage.py similarity index 100% rename from megatron/theoretical_memory_usage.py rename to megatron/training/theoretical_memory_usage.py diff --git a/megatron/tokenizer/__init__.py b/megatron/training/tokenizer/__init__.py similarity index 100% rename from megatron/tokenizer/__init__.py rename to megatron/training/tokenizer/__init__.py diff --git a/megatron/tokenizer/bert_tokenization.py b/megatron/training/tokenizer/bert_tokenization.py similarity index 100% rename from megatron/tokenizer/bert_tokenization.py rename to megatron/training/tokenizer/bert_tokenization.py diff --git a/megatron/tokenizer/gpt2_tokenization.py b/megatron/training/tokenizer/gpt2_tokenization.py similarity index 100% rename from megatron/tokenizer/gpt2_tokenization.py rename to megatron/training/tokenizer/gpt2_tokenization.py diff --git a/megatron/tokenizer/tokenizer.py b/megatron/training/tokenizer/tokenizer.py similarity index 100% rename from megatron/tokenizer/tokenizer.py rename to megatron/training/tokenizer/tokenizer.py diff --git a/megatron/training.py b/megatron/training/training.py similarity index 98% rename from megatron/training.py rename to megatron/training/training.py index a02800211a..42f903d113 100644 --- a/megatron/training.py +++ b/megatron/training/training.py @@ -18,38 +18,40 @@ _TRAIN_START_TIME = time.time() import torch -from megatron import get_args -from megatron import get_signal_handler -from megatron import get_timers -from megatron import get_tensorboard_writer -from megatron import get_wandb_writer -from megatron import get_one_logger -from megatron import get_current_global_batch_size -from megatron import get_num_microbatches -from megatron import is_last_rank -from megatron import update_num_microbatches from megatron.core import mpu, tensor_parallel from megatron.core.utils import get_model_config -from megatron import print_rank_0 -from megatron import print_rank_last -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint -from megatron.model import Float16Module +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint +from megatron.legacy.model import Float16Module from megatron.core.distributed import DistributedDataParallel as DDP from megatron.core.distributed import finalize_model_grads from megatron.core.enums import ModelType from megatron.core.optimizer import get_megatron_optimizer, OptimizerConfig -from megatron.initialize import initialize_megatron -from megatron.initialize import write_args_to_tensorboard -from megatron.initialize import set_jit_fusion_options -from megatron.optimizer_param_scheduler import OptimizerParamScheduler -from megatron.utils import check_adlr_autoresume_termination -from megatron.utils import unwrap_model -from megatron.data.data_samplers import build_pretraining_data_loader -from megatron.utils import calc_params_l2_norm +from megatron.training.initialize import initialize_megatron +from megatron.training.initialize import write_args_to_tensorboard +from megatron.training.initialize import set_jit_fusion_options +from megatron.training.optimizer_param_scheduler import OptimizerParamScheduler +from 
megatron.legacy.data.data_samplers import build_pretraining_data_loader from megatron.core.pipeline_parallel import get_forward_backward_func -from megatron.utils import report_memory -from megatron.model.vision.knn_monitor import compute_feature_bank + +from .utils import ( + calc_params_l2_norm, + check_adlr_autoresume_termination, + is_last_rank, + print_rank_0, + print_rank_last, + report_memory, + unwrap_model) +from .global_vars import ( + get_args, + get_signal_handler, + get_timers, + get_tensorboard_writer, + get_wandb_writer, + get_one_logger, + get_current_global_batch_size, + get_num_microbatches, + update_num_microbatches) def print_datetime(string): @@ -1118,6 +1120,7 @@ def evaluate(forward_step_func, timers('evaluate', log_level=0).start(barrier=True) if args.vision_pretraining and args.vision_pretraining_type == "dino": + from megatron.legacy.model.vision.knn_monitor import compute_feature_bank compute_feature_bank(model) # Turn on evaluation mode which disables dropout. diff --git a/megatron/utils.py b/megatron/training/utils.py similarity index 98% rename from megatron/utils.py rename to megatron/training/utils.py index fcc72edaeb..220a8271ff 100644 --- a/megatron/utils.py +++ b/megatron/training/utils.py @@ -16,15 +16,15 @@ except ImportError: amp_C = None -from megatron import ( +from megatron.training import ( get_args, get_adlr_autoresume, ) from megatron.core import DistributedDataParallel as DDP from megatron.core import mpu from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate -from megatron.model import Float16Module -from megatron.model.module import param_is_not_shared +from megatron.legacy.model import Float16Module +from megatron.legacy.model.module import param_is_not_shared ALL_MODULE_WRAPPER_CLASSNAMES = (DDP, Float16Module) @@ -143,7 +143,7 @@ def print_params_min_max_norm(optimizer, iteration): def check_adlr_autoresume_termination(iteration, model, optimizer, opt_param_scheduler): """Check for autoresume signal and exit if it is received.""" - from megatron.checkpointing import save_checkpoint + from megatron.training.checkpointing import save_checkpoint args = get_args() autoresume = get_adlr_autoresume() diff --git a/megatron/yaml_arguments.py b/megatron/training/yaml_arguments.py similarity index 100% rename from megatron/yaml_arguments.py rename to megatron/training/yaml_arguments.py diff --git a/pretrain_bert.py b/pretrain_bert.py index e6b2f66896..0f95fabf4b 100644 --- a/pretrain_bert.py +++ b/pretrain_bert.py @@ -7,17 +7,17 @@ import torch import torch.nn.functional as F -from megatron import get_args -from megatron import get_tokenizer -from megatron import print_rank_0 -from megatron import get_timers +from megatron.training import get_args +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training import get_timers from megatron.core import tensor_parallel from megatron.core.enums import ModelType -import megatron.model +import megatron.legacy.model from megatron.core.models.bert.bert_model import BertModel from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group -from megatron.arguments import core_transformer_config_from_args +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.arguments import core_transformer_config_from_args from megatron.core.transformer.spec_utils import import_module from megatron.core.models.bert.bert_layer_specs import 
bert_layer_with_transformer_engine_spec, bert_layer_local_spec from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder @@ -58,7 +58,7 @@ def model_provider(pre_process=True, post_process=True): pre_process=pre_process, post_process=post_process) else: - model = megatron.model.BertModel( + model = megatron.legacy.model.BertModel( config=config, num_tokentypes=num_tokentypes, add_binary_head=args.bert_binary_head, diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 1d95a69c98..e7e556f1f7 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -5,33 +5,33 @@ import torch from functools import partial from typing import Union -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import get_tokenizer +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import get_tokenizer from megatron.core import mpu from megatron.core.enums import ModelType from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.gpt_dataset import GPTDatasetConfig from megatron.core.datasets.gpt_dataset import MockGPTDataset, GPTDataset -import megatron.model +import megatron.legacy.model from megatron.core.models.gpt import GPTModel from megatron.training import pretrain from megatron.core.transformer.spec_utils import import_module -from megatron.utils import ( +from megatron.training.utils import ( get_batch_on_this_cp_rank, get_batch_on_this_tp_rank, average_losses_across_data_parallel_group ) -from megatron.arguments import core_transformer_config_from_args -from megatron.yaml_arguments import core_transformer_config_from_yaml +from megatron.training.arguments import core_transformer_config_from_args +from megatron.training.yaml_arguments import core_transformer_config_from_yaml from megatron.core.models.gpt.gpt_layer_specs import ( get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, ) -def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.model.GPTModel]: +def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]: """Builds the model. If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model. @@ -42,7 +42,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat Returns: - Union[GPTModel, megatron.model.GPTModel]: The returned model + Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model """ args = get_args() use_te = args.transformer_impl == "transformer_engine" @@ -79,7 +79,7 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat else: assert(args.context_parallel_size == 1), "Context parallelism is only supported with Megatron Core!" 
- model = megatron.model.GPTModel( + model = megatron.legacy.model.GPTModel( config, num_tokentypes=0, parallel_output=True, diff --git a/pretrain_ict.py b/pretrain_ict.py index 50226d7375..0ae9059273 100644 --- a/pretrain_ict.py +++ b/pretrain_ict.py @@ -9,16 +9,16 @@ import torch.distributed as dist import torch.nn.functional as F -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers from megatron.core import mpu from megatron.core.enums import ModelType -from megatron.data.biencoder_dataset_utils import get_ict_batch -from megatron.data.dataset_utils import build_train_valid_test_datasets -from megatron.model.biencoder_model import biencoder_model_provider +from megatron.legacy.data.biencoder_dataset_utils import get_ict_batch +from megatron.legacy.data.dataset_utils import build_train_valid_test_datasets +from megatron.legacy.model.biencoder_model import biencoder_model_provider from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group def pretrain_ict_model_provider(pre_process=True, post_process=True): diff --git a/pretrain_retro.py b/pretrain_retro.py index ced2665431..8379ffd275 100644 --- a/pretrain_retro.py +++ b/pretrain_retro.py @@ -5,11 +5,11 @@ from functools import partial import torch -from megatron import get_args -from megatron import get_timers -from megatron import get_tokenizer -from megatron import print_rank_0 -from megatron.arguments import core_transformer_config_from_args +from megatron.training import get_args +from megatron.training import get_timers +from megatron.training import get_tokenizer +from megatron.training import print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args from megatron.core import tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.retro.query.retro_dataset import get_retro_datasets @@ -18,7 +18,7 @@ from megatron.core.models.retro import get_retro_decoder_block_spec, RetroConfig, RetroModel from megatron.core.models.retro.utils import get_all_true_mask from megatron.training import pretrain -from megatron.utils import get_ltor_masks_and_position_ids +from megatron.training.utils import get_ltor_masks_and_position_ids from pretrain_gpt import ( is_dataset_built_on_rank, loss_func, @@ -64,7 +64,7 @@ def model_provider(pre_process=True, post_process=True): """Build the model. Select between two different model classes: - 1. Default model (uses megatron/models/gpt_model.py). + 1. Default model (uses megatron/legacy/model/gpt_model.py). 2. Core model (uses megatron/core/models/retro/model.py).
""" diff --git a/pretrain_t5.py b/pretrain_t5.py index f6b93cabd5..122b50ea98 100644 --- a/pretrain_t5.py +++ b/pretrain_t5.py @@ -6,7 +6,7 @@ import torch -from megatron import ( +from megatron.training import ( get_args, get_timers, get_tokenizer, @@ -16,15 +16,15 @@ from megatron.core.enums import ModelType from megatron.core.models.T5 import T5Model from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group -from megatron.arguments import core_transformer_config_from_args +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.arguments import core_transformer_config_from_args from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.t5_dataset import T5MaskedWordPieceDataset, T5MaskedWordPieceDatasetConfig from megatron.core.models.T5.t5_spec import (get_t5_encoder_with_transformer_engine_block_spec, get_t5_decoder_with_transformer_engine_block_spec, get_t5_encoder_with_local_block_spec, get_t5_decoder_with_local_block_spec) -from megatron.model import T5Model as NonCoreT5Model +from megatron.legacy.model import T5Model as NonCoreT5Model """ Pipeline parallelism for T5 diff --git a/pretrain_vision_classify.py b/pretrain_vision_classify.py index e7dc2a7ee8..8d9b28baeb 100644 --- a/pretrain_vision_classify.py +++ b/pretrain_vision_classify.py @@ -5,14 +5,14 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers, print_rank_0 +from megatron.training import get_args, get_timers, print_rank_0 from megatron.core.enums import ModelType -from megatron.data.vit_dataset import build_train_valid_datasets -from megatron.model.vision.classification import VitClassificationModel -from megatron.model.vision.classification import MitClassificationModel +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.classification import VitClassificationModel +from megatron.legacy.model.vision.classification import MitClassificationModel from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group -from megatron.arguments import core_transformer_config_from_args +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.arguments import core_transformer_config_from_args def model_provider(pre_process=True, post_process=True): diff --git a/pretrain_vision_dino.py b/pretrain_vision_dino.py index 01efeab2b1..f75280c42d 100644 --- a/pretrain_vision_dino.py +++ b/pretrain_vision_dino.py @@ -6,14 +6,14 @@ import numpy as np import torch.distributed as dist from functools import partial -from megatron import get_args, get_timers, print_rank_0 +from megatron.training import get_args, get_timers, print_rank_0 from megatron.core.enums import ModelType -from megatron.data.vit_dataset import build_train_valid_datasets -from megatron.model.vision.dino import DINOPretrainModel -from megatron.model.vision.knn_monitor import knn_predict, get_feature_bank +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.dino import DINOPretrainModel +from megatron.legacy.model.vision.knn_monitor import knn_predict, get_feature_bank from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group, unwrap_model -from megatron.arguments import 
core_transformer_config_from_args +from megatron.training.utils import average_losses_across_data_parallel_group, unwrap_model +from megatron.training.arguments import core_transformer_config_from_args def model_provider(pre_process=True, post_process=True): """Build the model.""" diff --git a/pretrain_vision_inpaint.py b/pretrain_vision_inpaint.py index 1947a47faf..8570baab5b 100644 --- a/pretrain_vision_inpaint.py +++ b/pretrain_vision_inpaint.py @@ -5,15 +5,15 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers, print_rank_0, print_rank_last +from megatron.training import get_args, get_timers, print_rank_0, print_rank_last from megatron.core.enums import ModelType -from megatron.data.vit_dataset import build_train_valid_datasets -from megatron.model.vision.inpainting import VitInpaintingModel -from megatron.model.vision.inpainting import MitInpaintingModel +from megatron.legacy.data.vit_dataset import build_train_valid_datasets +from megatron.legacy.model.vision.inpainting import VitInpaintingModel +from megatron.legacy.model.vision.inpainting import MitInpaintingModel from megatron.training import pretrain -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group from tasks.vision.segmentation.metrics import SSIM, PSNR -from megatron.arguments import core_transformer_config_from_args +from megatron.training.arguments import core_transformer_config_from_args def model_provider(pre_process=True, post_process=True): """Build the model.""" diff --git a/pretrain_vlm.py b/pretrain_vlm.py index 00ce693861..7007c53591 100644 --- a/pretrain_vlm.py +++ b/pretrain_vlm.py @@ -5,8 +5,8 @@ import torch -from megatron import get_args, get_timers, get_tokenizer, print_rank_0 -from megatron.arguments import core_transformer_config_from_args +from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 +from megatron.training.arguments import core_transformer_config_from_args from megatron.core import tensor_parallel from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig diff --git a/report_theoretical_memory.py b/report_theoretical_memory.py index 34b8a7e0d6..79b483dd5d 100644 --- a/report_theoretical_memory.py +++ b/report_theoretical_memory.py @@ -3,9 +3,9 @@ """Computes theoretical memory footprint for model training without instantiating a model and running training iterations on GPU(s).""" -from megatron import get_args -from megatron.initialize import initialize_megatron -from megatron.theoretical_memory_usage import report_theoretical_memory +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron +from megatron.training.theoretical_memory_usage import report_theoretical_memory if __name__ == "__main__": initialize_megatron(allow_no_cuda=True, skip_mpu_initialization=True) diff --git a/tasks/eval_utils.py b/tasks/eval_utils.py index 98d1bfb2ed..be29b93f53 100644 --- a/tasks/eval_utils.py +++ b/tasks/eval_utils.py @@ -8,8 +8,8 @@ import torch -from megatron import get_args -from megatron import print_rank_last, is_last_rank +from megatron.training import get_args +from megatron.training import print_rank_last, is_last_rank from megatron.core import mpu from megatron.schedules import get_forward_backward_func from tasks.finetune_utils 
import build_data_loader diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py index b468ca8d20..b281b11739 100644 --- a/tasks/finetune_utils.py +++ b/tasks/finetune_utils.py @@ -6,20 +6,20 @@ import sys import torch -from megatron import get_args, get_num_microbatches -from megatron import print_rank_0 -from megatron import get_timers +from megatron.training import get_args, get_num_microbatches +from megatron.training import print_rank_0 +from megatron.training import get_timers from megatron.core import mpu from megatron.core.enums import ModelType -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint from megatron.training import evaluate_and_print_results from megatron.training import setup_model_and_optimizer from megatron.training import train_step from megatron.training import training_log -from megatron.utils import average_losses_across_data_parallel_group -from megatron.utils import calc_params_l2_norm -from megatron.utils import check_adlr_autoresume_termination +from megatron.training.utils import average_losses_across_data_parallel_group +from megatron.training.utils import calc_params_l2_norm +from megatron.training.utils import check_adlr_autoresume_termination def process_batch(batch): diff --git a/tasks/glue/data.py b/tasks/glue/data.py index d96f6962d9..3e2eeaa078 100644 --- a/tasks/glue/data.py +++ b/tasks/glue/data.py @@ -7,7 +7,7 @@ from torch.utils.data import Dataset -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import build_sample from tasks.data_utils import build_tokens_types_paddings_from_text diff --git a/tasks/glue/finetune.py b/tasks/glue/finetune.py index 306f24b7f1..7e89453dea 100644 --- a/tasks/glue/finetune.py +++ b/tasks/glue/finetune.py @@ -2,13 +2,13 @@ """GLUE finetuning/evaluation.""" -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_tokenizer -from megatron.model.classification import Classification +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.legacy.model.classification import Classification from tasks.eval_utils import accuracy_func_provider from tasks.finetune_utils import finetune -from megatron.arguments import core_transformer_config_from_args +from megatron.training.arguments import core_transformer_config_from_args def glue_classification(num_classes, Dataset, diff --git a/tasks/glue/mnli.py b/tasks/glue/mnli.py index 8cecc5911e..cd4b2d6176 100644 --- a/tasks/glue/mnli.py +++ b/tasks/glue/mnli.py @@ -2,7 +2,7 @@ """MNLI dataset.""" -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import clean_text from .data import GLUEAbstractDataset diff --git a/tasks/glue/qqp.py b/tasks/glue/qqp.py index 5409f5f746..f8a0e06ca0 100644 --- a/tasks/glue/qqp.py +++ b/tasks/glue/qqp.py @@ -2,7 +2,7 @@ """QQP dataset.""" -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import clean_text from .data import GLUEAbstractDataset diff --git a/tasks/main.py b/tasks/main.py index cf8226b3f5..7083c443f4 100644 --- a/tasks/main.py +++ b/tasks/main.py @@ -7,8 +7,8 @@ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))) -from megatron import get_args -from 
megatron.initialize import initialize_megatron +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron def get_tasks_args(parser): diff --git a/tasks/msdp/evaluate.py b/tasks/msdp/evaluate.py index b0631d7b8f..87cfbdbd70 100644 --- a/tasks/msdp/evaluate.py +++ b/tasks/msdp/evaluate.py @@ -2,8 +2,8 @@ """Model evaluation""" -from megatron import get_args -from megatron import print_rank_0 +from megatron.training import get_args +from megatron.training import print_rank_0 from tasks.msdp.metrics import F1Metric from tqdm import tqdm diff --git a/tasks/msdp/main.py b/tasks/msdp/main.py index 6ffd944207..a0068c7b06 100644 --- a/tasks/msdp/main.py +++ b/tasks/msdp/main.py @@ -6,8 +6,8 @@ import sys sys.path.append(os.path.abspath(os.path.join( os.path.join(os.path.dirname(__file__), os.path.pardir), os.path.pardir))) -from megatron import get_args -from megatron.initialize import initialize_megatron +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron def get_tasks_args(parser): diff --git a/tasks/msdp/prompt.py b/tasks/msdp/prompt.py index a4e777e0b8..c1d1651c34 100644 --- a/tasks/msdp/prompt.py +++ b/tasks/msdp/prompt.py @@ -6,15 +6,15 @@ import torch import requests from nltk import word_tokenize -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_tokenizer +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer from megatron.core import mpu -from megatron.model import GPTModel +from megatron.legacy.model import GPTModel from megatron.training import get_model -from megatron.checkpointing import load_checkpoint -from megatron.initialize import initialize_megatron -from megatron.text_generation import generate_and_post_process +from megatron.training.checkpointing import load_checkpoint +from megatron.training.initialize import initialize_megatron +from megatron.inference.text_generation import generate_and_post_process def call_model_api(inputs, tokens_to_generate): diff --git a/tasks/orqa/evaluate_orqa.py b/tasks/orqa/evaluate_orqa.py index 3bcc71ba44..f960425499 100644 --- a/tasks/orqa/evaluate_orqa.py +++ b/tasks/orqa/evaluate_orqa.py @@ -2,8 +2,8 @@ """Main tasks functionality.""" -from megatron import get_args, print_rank_0 -from megatron.indexer import IndexBuilder +from megatron.training import get_args, print_rank_0 +from megatron.legacy.indexer import IndexBuilder from tasks.orqa.evaluate_utils import ORQAEvaluator def main(): diff --git a/tasks/orqa/evaluate_utils.py b/tasks/orqa/evaluate_utils.py index 6d4ba786c0..b7ce3fcd8d 100644 --- a/tasks/orqa/evaluate_utils.py +++ b/tasks/orqa/evaluate_utils.py @@ -2,11 +2,11 @@ import torch -from megatron import get_args, print_rank_0 -from megatron.checkpointing import load_biencoder_checkpoint -from megatron.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset -from megatron.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex -from megatron.model.biencoder_model import get_model_provider +from megatron.training import get_args, print_rank_0 +from megatron.training.checkpointing import load_biencoder_checkpoint +from megatron.legacy.data.orqa_wiki_dataset import get_open_retrieval_wiki_dataset +from megatron.legacy.data.realm_index import OpenRetreivalDataStore, FaissMIPSIndex +from megatron.legacy.model.biencoder_model import get_model_provider from megatron.training import get_model from tasks.orqa.unsupervised.nq 
import get_nq_dataset from tasks.orqa.unsupervised.nq import get_one_epoch_nq_dataloader diff --git a/tasks/orqa/supervised/data.py b/tasks/orqa/supervised/data.py index eb99e2df82..89ae60c89e 100644 --- a/tasks/orqa/supervised/data.py +++ b/tasks/orqa/supervised/data.py @@ -10,8 +10,8 @@ import numpy as np from torch.utils.data import Dataset -from megatron import print_rank_0, get_args -from megatron.data.biencoder_dataset_utils import make_attention_mask +from megatron.training import print_rank_0, get_args +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask def build_token_types_from_context_list(ctx_list, tokenizer, max_seq_length): ctx_id_list, ctx_types_list = [], [] diff --git a/tasks/orqa/supervised/eval_utils.py b/tasks/orqa/supervised/eval_utils.py index 02966362c9..27af475c8d 100644 --- a/tasks/orqa/supervised/eval_utils.py +++ b/tasks/orqa/supervised/eval_utils.py @@ -9,9 +9,9 @@ import torch.nn.functional as F from torch.utils.data import DataLoader -from megatron import get_args, print_rank_0 +from megatron.training import get_args, print_rank_0 from megatron.core import mpu -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group from tasks.finetune_utils import build_data_loader def task_collate_fn(batch_data): diff --git a/tasks/orqa/supervised/finetune.py b/tasks/orqa/supervised/finetune.py index c186dcc518..f09c40365c 100644 --- a/tasks/orqa/supervised/finetune.py +++ b/tasks/orqa/supervised/finetune.py @@ -9,11 +9,11 @@ import torch import torch.nn.functional as F -from megatron import get_args, get_timers, get_tokenizer, print_rank_0 +from megatron.training import get_args, get_timers, get_tokenizer, print_rank_0 from megatron.core import mpu -from megatron.indexer import IndexBuilder -from megatron.model.biencoder_model import biencoder_model_provider -from megatron.utils import average_losses_across_data_parallel_group +from megatron.legacy.indexer import IndexBuilder +from megatron.legacy.model.biencoder_model import biencoder_model_provider +from megatron.training.utils import average_losses_across_data_parallel_group from pretrain_ict import get_group_world_size_rank from tasks.finetune_utils import finetune from tasks.orqa.supervised.eval_utils import accuracy_func_provider diff --git a/tasks/orqa/unsupervised/nq.py b/tasks/orqa/unsupervised/nq.py index 56fd77c12c..2d1bfca730 100644 --- a/tasks/orqa/unsupervised/nq.py +++ b/tasks/orqa/unsupervised/nq.py @@ -13,8 +13,8 @@ from torch.utils.data import DataLoader from torch.utils.data import Dataset, BatchSampler -from megatron import print_rank_0, get_args, get_tokenizer -from megatron.data.biencoder_dataset_utils import make_attention_mask +from megatron.training import print_rank_0, get_args, get_tokenizer +from megatron.legacy.data.biencoder_dataset_utils import make_attention_mask def get_nq_dataset(qa_data, split): args = get_args() diff --git a/tasks/race/data.py b/tasks/race/data.py index c4967a0842..0c22108daa 100644 --- a/tasks/race/data.py +++ b/tasks/race/data.py @@ -6,7 +6,7 @@ from torch.utils.data import Dataset -from megatron import print_rank_0 +from megatron.training import print_rank_0 from tasks.data_utils import build_sample from tasks.data_utils import build_tokens_types_paddings_from_ids from tasks.data_utils import clean_text diff --git a/tasks/race/finetune.py b/tasks/race/finetune.py index ec714a1b80..09d9e739b8 100644 --- a/tasks/race/finetune.py +++ 
b/tasks/race/finetune.py @@ -2,14 +2,14 @@ """Race.""" -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_tokenizer -from megatron.model.multiple_choice import MultipleChoice +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_tokenizer +from megatron.legacy.model.multiple_choice import MultipleChoice from tasks.eval_utils import accuracy_func_provider from tasks.finetune_utils import finetune from tasks.race.data import RaceDataset -from megatron.arguments import core_transformer_config_from_args +from megatron.training.arguments import core_transformer_config_from_args def train_valid_datasets_provider(): diff --git a/tasks/vision/classification/classification.py b/tasks/vision/classification/classification.py index cc8dbe629e..3398df8051 100644 --- a/tasks/vision/classification/classification.py +++ b/tasks/vision/classification/classification.py @@ -4,13 +4,13 @@ import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers -from megatron import print_rank_0 -from megatron.model.vision.classification import VitClassificationModel -from megatron.data.vit_dataset import build_train_valid_datasets +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0 +from megatron.legacy.model.vision.classification import VitClassificationModel +from megatron.legacy.data.vit_dataset import build_train_valid_datasets from tasks.vision.classification.eval_utils import accuracy_func_provider from tasks.vision.finetune_utils import finetune -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group def classification(): diff --git a/tasks/vision/classification/eval_utils.py b/tasks/vision/classification/eval_utils.py index d3eaec4850..45cc4ea708 100644 --- a/tasks/vision/classification/eval_utils.py +++ b/tasks/vision/classification/eval_utils.py @@ -7,8 +7,8 @@ import torch -from megatron import get_args -from megatron import print_rank_0, print_rank_last +from megatron.training import get_args +from megatron.training import print_rank_0, print_rank_last from megatron.core import mpu from megatron.schedules import get_forward_backward_func from tasks.vision.finetune_utils import build_data_loader diff --git a/tasks/vision/finetune_utils.py b/tasks/vision/finetune_utils.py index f7fb97db0c..ced2e674e6 100644 --- a/tasks/vision/finetune_utils.py +++ b/tasks/vision/finetune_utils.py @@ -4,19 +4,19 @@ import torch import torch.nn.functional as F -from megatron import get_args -from megatron import print_rank_0 -from megatron import get_timers -from megatron import utils +from megatron.training import get_args +from megatron.training import print_rank_0 +from megatron.training import get_timers +from megatron.training import utils from megatron.core import mpu -from megatron.checkpointing import load_checkpoint -from megatron.checkpointing import save_checkpoint +from megatron.training.checkpointing import load_checkpoint +from megatron.training.checkpointing import save_checkpoint from megatron.training import evaluate_and_print_results from megatron.training import setup_model_and_optimizer from megatron.training import train_step from megatron.training import training_log -from megatron.utils import check_adlr_autoresume_termination -from megatron.utils import average_losses_across_data_parallel_group, print_params_min_max_norm 
+from megatron.training.utils import check_adlr_autoresume_termination +from megatron.training.utils import average_losses_across_data_parallel_group, print_params_min_max_norm from megatron.core.enums import ModelType def process_batch(batch): diff --git a/tasks/vision/main.py b/tasks/vision/main.py index 7c1b738110..7975f6e9c1 100644 --- a/tasks/vision/main.py +++ b/tasks/vision/main.py @@ -13,8 +13,8 @@ ) ) ) -from megatron import get_args -from megatron.initialize import initialize_megatron +from megatron.training import get_args +from megatron.training.initialize import initialize_megatron def get_tasks_args(parser): """Provide extra arguments required for tasks.""" diff --git a/tasks/vision/segmentation/cityscapes.py b/tasks/vision/segmentation/cityscapes.py index 1a182288f2..af63a6f616 100644 --- a/tasks/vision/segmentation/cityscapes.py +++ b/tasks/vision/segmentation/cityscapes.py @@ -41,7 +41,7 @@ from torchvision.datasets.utils import extract_archive, verify_str_arg, iterable_to_str from torchvision.datasets import VisionDataset from PIL import Image -from megatron import print_rank_0 +from megatron.training import print_rank_0 class Cityscapes(VisionDataset): diff --git a/tasks/vision/segmentation/data.py b/tasks/vision/segmentation/data.py index 292e9cab33..a0ea612cfb 100644 --- a/tasks/vision/segmentation/data.py +++ b/tasks/vision/segmentation/data.py @@ -7,11 +7,11 @@ import torchvision.transforms as T from torchvision import datasets from torch.utils.data import Dataset -from megatron.data.autoaugment import ImageNetPolicy +from megatron.legacy.data.autoaugment import ImageNetPolicy from tasks.vision.segmentation.cityscapes import Cityscapes import tasks.vision.segmentation.transforms as ET -from megatron.data.autoaugment import ImageNetPolicy -from megatron import get_args +from megatron.legacy.data.autoaugment import ImageNetPolicy +from megatron.training import get_args from PIL import Image, ImageOps diff --git a/tasks/vision/segmentation/finetune_segformer.py b/tasks/vision/segmentation/finetune_segformer.py index 10a4085be4..300f107bb3 100644 --- a/tasks/vision/segmentation/finetune_segformer.py +++ b/tasks/vision/segmentation/finetune_segformer.py @@ -6,16 +6,16 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers -from megatron import print_rank_0, print_rank_last +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0, print_rank_last from megatron.core import mpu from tasks.vision.finetune_utils import finetune from tasks.vision.finetune_utils import build_data_loader -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group from megatron.schedules import get_forward_backward_func from tasks.vision.segmentation.data import build_train_valid_datasets from tasks.vision.segmentation.seg_models import SegformerSegmentationModel -from megatron.model.vision.utils import resize +from megatron.legacy.model.vision.utils import resize def calculate_iou(hist_data): diff --git a/tasks/vision/segmentation/finetune_setr.py b/tasks/vision/segmentation/finetune_setr.py index 7f3208d09a..10ff886c08 100644 --- a/tasks/vision/segmentation/finetune_setr.py +++ b/tasks/vision/segmentation/finetune_setr.py @@ -5,12 +5,12 @@ import torch import torch.nn.functional as F from functools import partial -from megatron import get_args, get_timers -from megatron import print_rank_0, 
print_rank_last +from megatron.training import get_args, get_timers +from megatron.training import print_rank_0, print_rank_last from megatron.core import mpu from tasks.vision.finetune_utils import finetune from tasks.vision.finetune_utils import build_data_loader -from megatron.utils import average_losses_across_data_parallel_group +from megatron.training.utils import average_losses_across_data_parallel_group from megatron.schedules import get_forward_backward_func from tasks.vision.segmentation.metrics import CFMatrix from tasks.vision.segmentation.data import build_train_valid_datasets diff --git a/tasks/vision/segmentation/seg_heads.py b/tasks/vision/segmentation/seg_heads.py index 61b16cdcbd..6d06cbca94 100644 --- a/tasks/vision/segmentation/seg_heads.py +++ b/tasks/vision/segmentation/seg_heads.py @@ -4,10 +4,10 @@ import torch import apex import torch.nn.functional as F -from megatron import get_args -from megatron.model import LayerNorm -from megatron.model.module import MegatronModule -from megatron.model.vision.utils import resize +from megatron.training import get_args +from megatron.legacy.model import LayerNorm +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.utils import resize class SetrSegmentationHead(MegatronModule): diff --git a/tasks/vision/segmentation/seg_models.py b/tasks/vision/segmentation/seg_models.py index 3bf0f48def..9b152d06ed 100644 --- a/tasks/vision/segmentation/seg_models.py +++ b/tasks/vision/segmentation/seg_models.py @@ -4,10 +4,10 @@ import torch import apex import torch.nn.functional as F -from megatron import get_args -from megatron.model.module import MegatronModule -from megatron.model.vision.vit_backbone import VitBackbone, VitMlpHead -from megatron.model.vision.mit_backbone import mit_b3, mit_b5 +from megatron.training import get_args +from megatron.legacy.model.module import MegatronModule +from megatron.legacy.model.vision.vit_backbone import VitBackbone, VitMlpHead +from megatron.legacy.model.vision.mit_backbone import mit_b3, mit_b5 from tasks.vision.segmentation.seg_heads import SetrSegmentationHead, SegformerSegmentationHead @@ -36,7 +36,7 @@ def __init__(self, ) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def forward(self, input): @@ -68,7 +68,7 @@ def __init__(self, ) def set_input_tensor(self, input_tensor): - """See megatron.model.transformer.set_input_tensor()""" + """See megatron.legacy.model.transformer.set_input_tensor()""" pass def forward(self, input): diff --git a/tasks/vision/segmentation/transforms.py b/tasks/vision/segmentation/transforms.py index 8506c53266..51e11abeca 100644 --- a/tasks/vision/segmentation/transforms.py +++ b/tasks/vision/segmentation/transforms.py @@ -12,8 +12,8 @@ import torchvision.transforms as T from torchvision import datasets from torch.utils.data import Dataset -from megatron import print_rank_0 -from megatron import get_args +from megatron.training import print_rank_0 +from megatron.training import get_args from PIL import Image, ImageOps, ImageEnhance import torchvision.transforms as torch_tr diff --git a/tasks/vision/segmentation/utils.py b/tasks/vision/segmentation/utils.py index dfc6a20148..f9cfb820cb 100644 --- a/tasks/vision/segmentation/utils.py +++ b/tasks/vision/segmentation/utils.py @@ -1,7 +1,7 @@ import math import torch import numpy as np -from megatron import get_args +from megatron.training import get_args def 
slidingcrops(img, mask): # img: [b c h w]
diff --git a/tasks/zeroshot_gpt/datasets.py b/tasks/zeroshot_gpt/datasets.py
index 92b7d78913..eafaa8dab1 100644
--- a/tasks/zeroshot_gpt/datasets.py
+++ b/tasks/zeroshot_gpt/datasets.py
@@ -8,9 +8,9 @@
 import numpy as np
 import torch
-from megatron import get_args
-from megatron import print_rank_0
-from megatron import get_tokenizer
+from megatron.training import get_args
+from megatron.training import print_rank_0
+from megatron.training import get_tokenizer
 from .detokenizer import get_detokenizer
diff --git a/tasks/zeroshot_gpt/evaluate.py b/tasks/zeroshot_gpt/evaluate.py
index f8fad0dac8..e42c776e83 100644
--- a/tasks/zeroshot_gpt/evaluate.py
+++ b/tasks/zeroshot_gpt/evaluate.py
@@ -6,16 +6,16 @@
 import torch
-from megatron import get_args
-from megatron import print_rank_0, is_last_rank
-from megatron import get_tokenizer
+from megatron.training import get_args
+from megatron.training import print_rank_0, is_last_rank
+from megatron.training import get_tokenizer
 from megatron.core import parallel_state, tensor_parallel
-from megatron.checkpointing import load_checkpoint
-from megatron.model import GPTModel
+from megatron.training.checkpointing import load_checkpoint
+from megatron.legacy.model import GPTModel
 from megatron.training import get_model
-from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
+from megatron.training.utils import get_ltor_masks_and_position_ids, unwrap_model
 from megatron.core.pipeline_parallel.p2p_communication import recv_forward, send_forward
-from megatron.arguments import core_transformer_config_from_args
+from megatron.training.arguments import core_transformer_config_from_args
 from tasks.finetune_utils import build_data_loader
 from .datasets import build_dataset
diff --git a/tests/unit_tests/data/test_preprocess_data.py b/tests/unit_tests/data/test_preprocess_data.py
index 708867c623..bfa3b6bee6 100644
--- a/tests/unit_tests/data/test_preprocess_data.py
+++ b/tests/unit_tests/data/test_preprocess_data.py
@@ -9,7 +9,7 @@
 import requests
 from megatron.core.datasets.indexed_dataset import IndexedDataset
-from megatron.tokenizer.gpt2_tokenization import (
+from megatron.training.tokenizer.gpt2_tokenization import (
     PRETRAINED_MERGES_ARCHIVE_MAP,
     PRETRAINED_VOCAB_ARCHIVE_MAP,
 )
diff --git a/tests/unit_tests/test_training.py b/tests/unit_tests/test_training.py
index 9479447f29..bc2f9ef40d 100644
--- a/tests/unit_tests/test_training.py
+++ b/tests/unit_tests/test_training.py
@@ -1,7 +1,7 @@
 from types import SimpleNamespace
-from megatron.global_vars import set_args
-from megatron.training import build_train_valid_test_data_iterators
+from megatron.training.global_vars import set_args
+from megatron.training.training import build_train_valid_test_data_iterators
 from tests.unit_tests.test_utilities import Utils
diff --git a/tests/unit_tests/transformer/moe/test_grouped_mlp.py b/tests/unit_tests/transformer/moe/test_grouped_mlp.py
index e443272db8..e62bac310a 100644
--- a/tests/unit_tests/transformer/moe/test_grouped_mlp.py
+++ b/tests/unit_tests/transformer/moe/test_grouped_mlp.py
@@ -5,13 +5,13 @@
 import torch
 import torch.nn.functional as F
-from megatron.arguments import parse_args
+from megatron.training.arguments import parse_args
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
 from megatron.core.transformer.moe import grouped_gemm_util as gg
 from megatron.core.transformer.moe.moe_layer import MoELayer
 from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.initialize import _set_random_seed
-from megatron.model import Float16Module
+from megatron.training.initialize import _set_random_seed
+from megatron.legacy.model import Float16Module
 from tests.unit_tests.test_utilities import Utils
 DEVICE_CAPABILITY = None
diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py
index f1db99f371..73e4a52fa1 100644
--- a/tests/unit_tests/transformer/moe/test_routers.py
+++ b/tests/unit_tests/transformer/moe/test_routers.py
@@ -5,7 +5,7 @@
 import torch
 from megatron.core.transformer.moe.router import Router
-from megatron.initialize import _set_random_seed
+from megatron.training.initialize import _set_random_seed
 from tests.unit_tests.test_utilities import Utils
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.moe.moe_layer import MoELayer
diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py
index ec067a41fb..633c1f64b9 100644
--- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py
+++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py
@@ -6,7 +6,7 @@
 from megatron.core.transformer.moe.router import Router, TopKRouter
 from megatron.core.transformer.moe.token_dispatcher import MoEDroplessTokenDispatcher
-from megatron.initialize import _set_random_seed
+from megatron.training.initialize import _set_random_seed
 from tests.unit_tests.test_utilities import Utils
 from megatron.core.transformer.transformer_config import TransformerConfig
diff --git a/tools/bert_embedding/dataset.py b/tools/bert_embedding/dataset.py
index 4b7bd97e06..da165b8b10 100644
--- a/tools/bert_embedding/dataset.py
+++ b/tools/bert_embedding/dataset.py
@@ -3,7 +3,7 @@
 import numpy as np
 import torch
-from megatron import get_args, get_tokenizer
+from megatron.training import get_args, get_tokenizer
 class BertEmbeddingDataset(torch.utils.data.Dataset):
diff --git a/tools/bert_embedding/embed.py b/tools/bert_embedding/embed.py
index b2fbd689dc..b1f7eb86f2 100644
--- a/tools/bert_embedding/embed.py
+++ b/tools/bert_embedding/embed.py
@@ -9,13 +9,13 @@
 from torch.utils.data._utils.collate import default_collate
 from tqdm import tqdm
-from megatron import get_args, get_tokenizer, print_rank_0
+from megatron.training import get_args, get_tokenizer, print_rank_0
 from megatron import core
-from megatron.arguments import core_transformer_config_from_args
+from megatron.training.arguments import core_transformer_config_from_args
 from megatron.core.datasets.retro.utils import get_blocks_by_rank
 from megatron.core.enums import ModelType
 from megatron.core.pipeline_parallel import get_forward_backward_func
-from megatron.model import BertModel
+from megatron.legacy.model import BertModel
 from megatron.training import setup_model_and_optimizer
 from pretrain_bert import model_provider, get_batch, loss_func, forward_step
diff --git a/tools/checkpoint/loader_llama2_hf.py b/tools/checkpoint/loader_llama2_hf.py
index 9b7209acca..969b9add95 100644
--- a/tools/checkpoint/loader_llama2_hf.py
+++ b/tools/checkpoint/loader_llama2_hf.py
@@ -158,12 +158,12 @@ def _load_checkpoint(queue, args):
     sys.path.insert(0, args.megatron_path)
     try:
-        from megatron.arguments import parse_args, validate_args
-        from megatron.global_vars import set_args, set_global_variables
-        from megatron.model import module
+        from megatron.training.arguments import parse_args, validate_args
+        from megatron.training.global_vars import set_args, set_global_variables
+        from megatron.legacy.model import module
         from megatron.core import mpu
         from megatron.core.enums import ModelType
-        from megatron import fused_kernels
+        from megatron.training import fused_kernels
     except ModuleNotFoundError:
         print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
         queue.put("exit")
diff --git a/tools/checkpoint/loader_mcore.py b/tools/checkpoint/loader_mcore.py
index d885375af3..0994898829 100644
--- a/tools/checkpoint/loader_mcore.py
+++ b/tools/checkpoint/loader_mcore.py
@@ -36,13 +36,13 @@ def _load_checkpoint(queue, args):
     sys.path.insert(0, args.megatron_path)
     try:
-        from megatron.arguments import parse_args, validate_args
-        from megatron.global_vars import set_args, set_global_variables
-        from megatron.checkpointing import load_args_from_checkpoint, load_checkpoint
-        from megatron.model import module
+        from megatron.training.arguments import parse_args, validate_args
+        from megatron.training.global_vars import set_args, set_global_variables
+        from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint
+        from megatron.legacy.model import module
         from megatron.core import mpu
         from megatron.core.enums import ModelType
-        from megatron import fused_kernels
+        from megatron.training import fused_kernels
     except ModuleNotFoundError:
         print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
         queue.put("exit")
diff --git a/tools/checkpoint/loader_megatron.py b/tools/checkpoint/loader_megatron.py
index f3924dfb1d..c059b3c16e 100644
--- a/tools/checkpoint/loader_megatron.py
+++ b/tools/checkpoint/loader_megatron.py
@@ -34,13 +34,13 @@ def _load_checkpoint(queue, args):
     sys.path.insert(0, args.megatron_path)
     try:
-        from megatron.arguments import parse_args, validate_args
-        from megatron.global_vars import set_args, set_global_variables
-        from megatron.checkpointing import load_args_from_checkpoint, load_checkpoint
-        from megatron.model import module
+        from megatron.training.arguments import parse_args, validate_args
+        from megatron.training.global_vars import set_args, set_global_variables
+        from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint
+        from megatron.legacy.model import module
         from megatron.core import mpu
         from megatron.core.enums import ModelType
-        from megatron import fused_kernels
+        from megatron.training import fused_kernels
     except ModuleNotFoundError:
         print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
         queue.put("exit")
diff --git a/tools/checkpoint/saver_mcore.py b/tools/checkpoint/saver_mcore.py
index a5507724a3..de63153494 100644
--- a/tools/checkpoint/saver_mcore.py
+++ b/tools/checkpoint/saver_mcore.py
@@ -228,12 +228,12 @@ def save_checkpoint(queue, args):
     sys.path.insert(0, args.megatron_path)
     try:
-        from megatron.arguments import (parse_args, validate_args)
-        from megatron.checkpointing import save_checkpoint
-        from megatron.global_vars import set_global_variables, get_args
+        from megatron.training.arguments import (parse_args, validate_args)
+        from megatron.training.checkpointing import save_checkpoint
+        from megatron.training.global_vars import set_global_variables, get_args
         from megatron.core.enums import ModelType
-        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
-        from megatron import fused_kernels
+        from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding
+        from megatron.training import fused_kernels
         from megatron.core import mpu
     except ModuleNotFoundError:
         print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
diff --git a/tools/checkpoint/saver_megatron.py b/tools/checkpoint/saver_megatron.py
index ae8a5a2c41..78dbd6dd05 100644
--- a/tools/checkpoint/saver_megatron.py
+++ b/tools/checkpoint/saver_megatron.py
@@ -29,12 +29,12 @@ def save_checkpoint(queue, args):
     sys.path.insert(0, args.megatron_path)
     try:
-        from megatron.arguments import (parse_args, validate_args)
-        from megatron.checkpointing import save_checkpoint
-        from megatron.global_vars import set_global_variables, get_args
+        from megatron.training.arguments import (parse_args, validate_args)
+        from megatron.training.checkpointing import save_checkpoint
+        from megatron.training.global_vars import set_global_variables, get_args
         from megatron.core.enums import ModelType
-        from megatron.tokenizer.tokenizer import _vocab_size_with_padding
-        from megatron import fused_kernels
+        from megatron.training.tokenizer.tokenizer import _vocab_size_with_padding
+        from megatron.training import fused_kernels
         from megatron.core import mpu
     except ModuleNotFoundError:
         print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py
index 19ffc567f2..55d9d6c856 100644
--- a/tools/preprocess_data.py
+++ b/tools/preprocess_data.py
@@ -20,7 +20,7 @@
 except ImportError:
     nltk_available = False
-from megatron.tokenizer import build_tokenizer
+from megatron.training.tokenizer import build_tokenizer
 from megatron.core.datasets import indexed_dataset
diff --git a/tools/preprocess_data_nmt.py b/tools/preprocess_data_nmt.py
index c36c954d18..13a04f6ee2 100644
--- a/tools/preprocess_data_nmt.py
+++ b/tools/preprocess_data_nmt.py
@@ -11,7 +11,7 @@
 os.path.pardir)))
 import time
 import torch
-from megatron.tokenizer import build_tokenizer
+from megatron.training.tokenizer import build_tokenizer
 from megatron.core.datasets import indexed_dataset
diff --git a/tools/preprocess_mmdata.py b/tools/preprocess_mmdata.py
index 255dad945a..247b66b4d1 100755
--- a/tools/preprocess_mmdata.py
+++ b/tools/preprocess_mmdata.py
@@ -21,7 +21,7 @@
 except ImportError:
     nltk_available = False
-from megatron.tokenizer import build_tokenizer
+from megatron.training.tokenizer import build_tokenizer
 from megatron.core.datasets.indexed_dataset import IndexedDatasetBuilder
diff --git a/tools/retro/cli/cli.py b/tools/retro/cli/cli.py
index ba6deb19af..18da6c7779 100644
--- a/tools/retro/cli/cli.py
+++ b/tools/retro/cli/cli.py
@@ -6,7 +6,7 @@
 import typing as T
 from types import SimpleNamespace
-from megatron.arguments import load_retro_config, parse_args, validate_args
+from megatron.training.arguments import load_retro_config, parse_args, validate_args
 from megatron.core.datasets.retro.db.dataset import DBDataset
 from megatron.core.datasets.retro.db.utils import (
     get_indexed_dataset_infos as get_db_indexed_dataset_infos,
diff --git a/tools/retro/sft/sft_retro.py b/tools/retro/sft/sft_retro.py
index fd95c05586..63d321b8d4 100644
--- a/tools/retro/sft/sft_retro.py
+++ b/tools/retro/sft/sft_retro.py
@@ -8,16 +8,16 @@
 sys.path.append(os.path.abspath(os.path.join(
     os.path.join(os.path.dirname(__file__), "../../../"))))
-from megatron import get_args, get_retro_args
-from megatron import print_rank_0
-from megatron import get_timers
-from megatron import get_tokenizer
+from megatron.training import get_args, get_retro_args
+from megatron.training import print_rank_0
+from megatron.training import get_timers
+from megatron.training import get_tokenizer
 from megatron.core import tensor_parallel
 from megatron.core.enums import ModelType
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.training import pretrain
-from megatron.utils import get_ltor_masks_and_position_ids
-from megatron.utils import average_losses_across_data_parallel_group
+from megatron.training.utils import get_ltor_masks_and_position_ids
+from megatron.training.utils import average_losses_across_data_parallel_group
 from pretrain_gpt import model_provider, is_dataset_built_on_rank
 from tools.retro.sft.dataset_conv import JsonQADataset, JsonQADatasetConfig, RetroJsonQADataset, RetroJsonQADatasetConfig
diff --git a/tools/retro/text_generation/retro_api.py b/tools/retro/text_generation/retro_api.py
index 9dd96587b5..b70677485d 100644
--- a/tools/retro/text_generation/retro_api.py
+++ b/tools/retro/text_generation/retro_api.py
@@ -5,13 +5,13 @@
 import numpy as np
 import torch
 from megatron.core import mpu
-from megatron import print_rank_0, get_retro_args, get_args, get_tokenizer
-from megatron.text_generation.communication import broadcast_float_list, broadcast_tensor, broadcast_int_list
-from megatron.text_generation.generation import (
+from megatron.training import print_rank_0, get_retro_args, get_args, get_tokenizer
+from megatron.inference.text_generation.communication import broadcast_float_list, broadcast_tensor, broadcast_int_list
+from megatron.inference.text_generation.generation import (
     score_and_return_on_first_stage)
 from tools.retro.text_generation.retro_generation import (
     retro_generate_tokens_probs_and_return_on_first_stage)
-from megatron.text_generation.tokenization import (
+from megatron.inference.text_generation.tokenization import (
     detokenize_generations)
diff --git a/tools/retro/text_generation/retro_generation.py b/tools/retro/text_generation/retro_generation.py
index e892856c5b..6ec4426789 100644
--- a/tools/retro/text_generation/retro_generation.py
+++ b/tools/retro/text_generation/retro_generation.py
@@ -4,16 +4,16 @@
 """Generation utilities."""
 import torch
 import torch.nn.functional as F
-from megatron import get_args, get_tokenizer
-from megatron import get_retro_args
+from megatron.training import get_args, get_tokenizer
+from megatron.training import get_retro_args
 from megatron.core import mpu
-from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
-from megatron.text_generation.communication import (
+from megatron.training.utils import get_ltor_masks_and_position_ids, unwrap_model
+from megatron.inference.text_generation.communication import (
     copy_from_last_to_first_pipeline_stage,
     broadcast_from_last_pipeline_stage,
     broadcast_from_last_to_first_pipeline_stage, broadcast_int_list, broadcast_tensor)
-from megatron.text_generation.generation import _build_attention_mask_and_position_ids
-from megatron.text_generation.sampling import sample
+from megatron.inference.text_generation.generation import _build_attention_mask_and_position_ids
+from megatron.inference.text_generation.sampling import sample
diff --git a/tools/retro/text_generation/retro_text_generation.py b/tools/retro/text_generation/retro_text_generation.py
index 6b456127e2..c1cdcafb79 100755
--- a/tools/retro/text_generation/retro_text_generation.py
+++ b/tools/retro/text_generation/retro_text_generation.py
@@ -8,11 +8,11 @@
 sys.path.append(os.path.abspath(os.path.join(
     os.path.join(os.path.dirname(__file__), "../../../"))))
-from megatron import get_args, get_retro_args
-from megatron import print_rank_0
-from megatron import get_tokenizer
-from megatron.checkpointing import load_checkpoint
-from megatron.initialize import initialize_megatron
+from megatron.training import get_args, get_retro_args
+from megatron.training import print_rank_0
+from megatron.training import get_tokenizer
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
 from megatron.core.models.gpt import GPTModel
 from megatron.training import get_model
 from tools.retro.text_generation.retro_api import retro_generate_and_post_process
@@ -20,12 +20,12 @@
 from tools.retro.sft.dataset_conv import reformat_prompt, preprocess, reformat_prompt_short
 import numpy as np
 import time
-import megatron.model
-from megatron.arguments import core_transformer_config_from_args
+import megatron.legacy.model
+from megatron.training.arguments import core_transformer_config_from_args
-def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.model.GPTModel]:
+def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
     """Builds the model.
     If you set the use_mcore_models to True, it will return the mcore GPT model and if not the legacy GPT model.
@@ -36,13 +36,13 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
     Returns:
-        Union[GPTModel, megatron.model.GPTModel]: The returned model
+        Union[GPTModel, megatron.legacy.model.GPTModel]: The returned model
     """
     print_rank_0('building GPT model ...')
     config = core_transformer_config_from_args(get_args())
     # not support core model yet
-    model = megatron.model.GPTModel(
+    model = megatron.legacy.model.GPTModel(
         config,
         num_tokentypes=0,
         parallel_output=False,
diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py
index da2f841364..28e0a32fa6 100644
--- a/tools/run_text_generation_server.py
+++ b/tools/run_text_generation_server.py
@@ -6,17 +6,17 @@
 sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
     os.path.pardir)))
 import socket
-from megatron import get_args
-from megatron import print_rank_0
+from megatron.training import get_args
+from megatron.training import print_rank_0
 from megatron.core import mpu
-from megatron.checkpointing import load_checkpoint
-from megatron.initialize import initialize_megatron
-from megatron.model import GPTModel
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
+from megatron.legacy.model import GPTModel
 from megatron.training import get_model
-from megatron.arguments import core_transformer_config_from_args
-from megatron.text_generation_server import MegatronServer
-from megatron.text_generation import generate_and_post_process
-from megatron.text_generation import beam_search_and_post_process
+from megatron.training.arguments import core_transformer_config_from_args
+from megatron.inference.text_generation_server import MegatronServer
+from megatron.inference.text_generation import generate_and_post_process
+from megatron.inference.text_generation import beam_search_and_post_process
 import torch
 def model_provider(pre_process=True, post_process=True):