Memory profiling #1153

Merged · 32 commits · Feb 21, 2024
Changes from 1 commit

Commits (32)
f5fd54c
Fixes distributed tests, and skips tests that are broken.
jahatef Feb 14, 2024
4a4a934
Merge branch 'main' of github.com:EleutherAI/gpt-neox into main
jahatef Feb 18, 2024
f63593b
memory profiling for gpt-neox. Only works for pp=0, pp=1+ needs DS co…
jahatef Feb 20, 2024
4ed9d42
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
89efc48
adds memory profiling for pipeline parallel
jahatef Feb 21, 2024
95f31f0
Merge branch 'memory_profiling' of github.com:EleutherAI/gpt-neox int…
jahatef Feb 21, 2024
9551afe
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
4135743
fix spacing
jahatef Feb 21, 2024
7b0cdaf
Merge branch 'memory_profiling' of github.com:EleutherAI/gpt-neox int…
jahatef Feb 21, 2024
45aea7a
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
3bff276
fix spacing again
jahatef Feb 21, 2024
2452697
Merge branch 'memory_profiling' of github.com:EleutherAI/gpt-neox int…
jahatef Feb 21, 2024
d9c7e4b
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
7af1c9d
get rid of unwanted changes
jahatef Feb 21, 2024
47f76af
Merge branch 'memory_profiling' of github.com:EleutherAI/gpt-neox int…
jahatef Feb 21, 2024
7994909
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
a2893db
get rid of file
jahatef Feb 21, 2024
db8b70b
Merge branch 'memory_profiling' of github.com:EleutherAI/gpt-neox int…
jahatef Feb 21, 2024
80b1e30
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
7467632
Merge branch 'main' into memory_profiling
Quentin-Anthony Feb 21, 2024
5c51c43
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
20bc950
add nsight systems support
jahatef Feb 21, 2024
fd0b471
Merge branch 'memory_profiling' of github.com:EleutherAI/gpt-neox int…
jahatef Feb 21, 2024
87bca9d
remove tests changes again
jahatef Feb 21, 2024
65ce859
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
49cf95d
add tests
jahatef Feb 21, 2024
ab8126d
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
ae2c61d
Update training.py
jahatef Feb 21, 2024
21eba94
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
edfcdaf
Add assertion message
Quentin-Anthony Feb 21, 2024
8669123
pre-commit
Quentin-Anthony Feb 21, 2024
80aa4cb
Update NeoXArgs docs automatically
invalid-email-address Feb 21, 2024
remove tests changes again
jahatef committed Feb 21, 2024
commit 87bca9dc8f84a4fac9cb140a443172869e49c0c9
434 changes: 97 additions & 337 deletions tests/common.py

Large diffs are not rendered by default.
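The tests/common.py diff is too large to render, but the import changes in the files below show its effect for this commit: the class-based `DistributedTest` helper is swapped back for a `distributed_test` decorator that runs a test body on several ranks. A minimal sketch of what such a decorator might look like, assuming it is built on `torch.multiprocessing` and `torch.distributed` (the repository's actual helper does more, e.g. free-port selection and NeoX/DeepSpeed setup):

```python
# Minimal, hypothetical sketch of a @distributed_test decorator like the one
# tests/common.py provides after this commit. Illustrative only.
import os
import functools
import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def distributed_test(world_size=2, backend="nccl"):
    def decorator(test_fn):
        @functools.wraps(test_fn)
        def wrapper(*args, **kwargs):
            # fork (not spawn) lets closure-style test bodies be passed to the
            # workers without pickling; see the note below about CUDA + fork.
            mp.start_processes(
                _dist_worker,
                args=(world_size, backend, test_fn, args, kwargs),
                nprocs=world_size,
                join=True,
                start_method="fork",
            )
        return wrapper
    return decorator


def _dist_worker(rank, world_size, backend, test_fn, args, kwargs):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend=backend, rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)
    try:
        test_fn(*args, **kwargs)  # the decorated test body runs on every rank
    finally:
        dist.destroy_process_group()
```

The fork start method is what allows the closure-style `wrapper()` test bodies below to be handed to the workers, but forking after CUDA has been initialized in the parent process is a classic failure mode, and is plausibly the "CUDA + torch multiprocessing issue" referenced in the skip reason further down.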

91 changes: 0 additions & 91 deletions tests/conftest.py

This file was deleted.

4 changes: 4 additions & 0 deletions tests/model/__init__.py
@@ -11,3 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .test_model_instantiation import run_test_model_instantiation
from .test_model_train import run_train_test
from .test_model_checkpoint import run_checkpoint_test
94 changes: 48 additions & 46 deletions tests/model/test_model_checkpoint.py
@@ -24,7 +24,7 @@

import pytest
from tests.common import (
DistributedTest,
distributed_test,
clear_test_dirs,
model_setup,
binary,
@@ -73,58 +73,60 @@ def test_train(param_dict):
d = tempfile.mkdtemp()
param_dict["save"] = d

t1 = test_run_checkpoint_test_class()
t1.run_checkpoint_test(param_dict=param_dict)
@distributed_test(world_size=2)
def wrapper():
run_checkpoint_test(param_dict=param_dict)

wrapper()

class test_run_checkpoint_test_class(DistributedTest):
def run_checkpoint_test(yaml_list=None, param_dict=None):

from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
def run_checkpoint_test(yaml_list=None, param_dict=None):

model, optimizer, lr_scheduler, args_loaded = model_setup(
yaml_list, param_dict, clear_data=True
)
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint

# save model checkpoint
save_checkpoint(
neox_args=args_loaded,
iteration=42,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
)
model, optimizer, lr_scheduler, args_loaded = model_setup(
yaml_list, param_dict, clear_data=True
)

# reload model from checkpoint
(
reloaded_model,
reloaded_optimizer,
reloaded_lr_scheduler,
args_reloaded,
) = model_setup(yaml_list, param_dict, clear_data=False)
iteration = load_checkpoint(
neox_args=args_reloaded,
model=reloaded_model,
optimizer=reloaded_optimizer,
lr_scheduler=reloaded_lr_scheduler,
)
# save model checkpoint
save_checkpoint(
neox_args=args_loaded,
iteration=42,
model=model,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
)

# ensure same checkpoint is loaded
assert (
iteration == 42
), "run_checkpoint_test() iteration loaded from checkpoint correct"

# check all weight groups are the same
for idx, ((n1, p1), (n2, p2)) in enumerate(
zip(
list(model.module.named_parameters()),
list(reloaded_model.module.named_parameters()),
)
):
assert n1 == n2
params_equal = (p1 == p2).all().item()
assert params_equal, "run_checkpoint_test() params equal: " + str(n1)
# reload model from checkpoint
(
reloaded_model,
reloaded_optimizer,
reloaded_lr_scheduler,
args_reloaded,
) = model_setup(yaml_list, param_dict, clear_data=False)
iteration = load_checkpoint(
neox_args=args_reloaded,
model=reloaded_model,
optimizer=reloaded_optimizer,
lr_scheduler=reloaded_lr_scheduler,
)

# ensure same checkpoint is loaded
assert (
iteration == 42
), "run_checkpoint_test() iteration loaded from checkpoint correct"

# check all weight groups are the same
for idx, ((n1, p1), (n2, p2)) in enumerate(
zip(
list(model.module.named_parameters()),
list(reloaded_model.module.named_parameters()),
)
):
assert n1 == n2
params_equal = (p1 == p2).all().item()
assert params_equal, "run_checkpoint_test() params equal: " + str(n1)


if __name__ == "__main__":
76 changes: 38 additions & 38 deletions tests/model/test_model_generation.py
@@ -22,7 +22,7 @@

import os
import pytest
from tests.common import DistributedTest, model_setup, parametrize
from tests.common import distributed_test, model_setup, parametrize

PARAMS_TO_TEST = {
"pipe_parallel_size,model_parallel_size,world_size": [
@@ -67,47 +67,47 @@
@pytest.mark.skip
@pytest.mark.parametrize("param_dict", parameters, ids=names)
def test_train(param_dict):
t1 = run_generate_test_class()
t1.run_generate_test(param_dict, param_dict.pop("prompt"))
@distributed_test(world_size=param_dict.pop("world_size", 2))
def wrapper():
run_generate_test(param_dict=param_dict, prompt=param_dict.pop("prompt"))

wrapper()

class run_generate_test_class(DistributedTest):
world_size = 2

def run_generate_test(param_dict, prompt):
from megatron.text_generation_utils import generate_samples_from_prompt
from megatron.utils import is_mp_rank_0
def run_generate_test(param_dict, prompt):
from megatron.text_generation_utils import generate_samples_from_prompt
from megatron.utils import is_mp_rank_0

fixed_params = {
"num_samples": 3,
"maximum_tokens": 50,
"make_vocab_size_divisible_by": 2,
"sample_output_file": "test_sample_output.txt",
"checkpoint_activations": False,
"partition_activations": False,
"no_load_optim": True,
}
fixed_params = {
"num_samples": 3,
"maximum_tokens": 50,
"make_vocab_size_divisible_by": 2,
"sample_output_file": "test_sample_output.txt",
"checkpoint_activations": False,
"partition_activations": False,
"no_load_optim": True,
}

param_dict.update(fixed_params)
# TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this
model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
model.eval()
param_dict.update(fixed_params)
# TODO: we don't need to reinstantiate the model every time if we're only changing sampling settings - should be a workaround for this
model, _, _, args_loaded = model_setup(None, param_dict, clear_data=True)
model.eval()

prompts = [prompt for _ in range(args_loaded.num_samples)]
output = generate_samples_from_prompt(
neox_args=args_loaded,
model=model,
text=prompts,
maximum_tokens=args_loaded.maximum_tokens,
recompute=False,
temperature=args_loaded.temperature,
top_k=args_loaded.top_k,
top_p=args_loaded.top_p,
)
prompts = [prompt for _ in range(args_loaded.num_samples)]
output = generate_samples_from_prompt(
neox_args=args_loaded,
model=model,
text=prompts,
maximum_tokens=args_loaded.maximum_tokens,
recompute=False,
temperature=args_loaded.temperature,
top_k=args_loaded.top_k,
top_p=args_loaded.top_p,
)

# outputs only get generated on mp rank 0
if is_mp_rank_0():
assert len(output) == len(prompts)
for prompt, out in zip(prompts, output):
assert prompt == out["context"]
assert len(out["text"]) > 0
# outputs only get generated on mp rank 0
if is_mp_rank_0():
assert len(output) == len(prompts)
for prompt, out in zip(prompts, output):
assert prompt == out["context"]
assert len(out["text"]) > 0
51 changes: 27 additions & 24 deletions tests/model/test_model_instantiation.py
@@ -21,7 +21,7 @@
import torch
import os
from tests.common import (
DistributedTest,
distributed_test,
model_setup,
clear_test_dirs,
parametrize,
@@ -80,8 +80,11 @@
)
@pytest.mark.parametrize("param_dict", parameters, ids=names)
def test_instantiate(param_dict):
t1 = test_instantiate_optimizers_class()
t1.run_test_model_instantiation(param_dict)
@distributed_test(world_size=param_dict.pop("world_size", 2))
def wrapper():
run_test_model_instantiation(param_dict=param_dict)

wrapper()


OPTIMIZER_PARAMS = {
@@ -105,24 +108,24 @@ def test_instantiate(param_dict):
)
@pytest.mark.parametrize("param_dict", opt_params, ids=opt_name)
def test_instantiate_optimizers(param_dict):
t1 = test_instantiate_optimizers_class()
t1.run_test_model_instantiation(param_dict)


class test_instantiate_optimizers_class(DistributedTest):
world_size = 2

def run_test_model_instantiation(yaml_list=None, param_dict=None):
from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
if args_loaded.pipe_parallel_size < 2:
assert isinstance(
model, DeepSpeedEngine
), "test model instantiation " + str(yaml_list)
else:
assert isinstance(model, PipelineEngine), "test model instantiation " + str(
yaml_list
)
if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
clear_test_dirs()
@distributed_test(world_size=2)
def wrapper():
run_test_model_instantiation(param_dict=param_dict)

wrapper()


def run_test_model_instantiation(yaml_list=None, param_dict=None):
from deepspeed.runtime.pipe.engine import PipelineEngine, DeepSpeedEngine

model, optimizer, lr_scheduler, args_loaded = model_setup(yaml_list, param_dict)
if args_loaded.pipe_parallel_size < 2:
assert isinstance(model, DeepSpeedEngine), "test model instantiation " + str(
yaml_list
)
else:
assert isinstance(model, PipelineEngine), "test model instantiation " + str(
yaml_list
)
if torch.distributed.get_world_size() == 1 or torch.distributed.get_rank() == 0:
clear_test_dirs()
6 changes: 2 additions & 4 deletions tests/model/test_model_train.py
@@ -38,6 +38,7 @@
"bigbird",
"bslongformer",
"gmlp",
"amlp",
"flash",
],
"hidden_dropout": [0, 0.1],
@@ -49,10 +50,7 @@

keys_to_test = PARAMS_TO_TEST.keys()

# TODO: fix model training tests
@pytest.mark.skip(
reason="All model tests are skipped until we fix the CUDA + torch multiprocessing issue."
)

@pytest.mark.parametrize(
"key, value",
[(key, value) for key in keys_to_test for value in PARAMS_TO_TEST[key]],