Deepspeed benchmarking #878

Draft: wants to merge 24 commits into base: main

Commits (24)
f4706e0 add flash_attn_kvpacked (satpalsr, Mar 29, 2023)
f4a9106 Changed is_pipe_parallel setting to fix pipeline-parallel inference (curt-tigges, Mar 31, 2023)
83a7b9a Update NeoXArgs docs automatically (invalid-email-address, Mar 31, 2023)
45d7052 fix formatting (satpalsr, Apr 11, 2023)
857c556 gpt benchmark script (cr458, Apr 3, 2023)
1ab5bf3 remove duplicate argparse (cr458, Apr 4, 2023)
afb6b29 HF inference (cr458, Apr 4, 2023)
3f7d605 benchmarking configs + script changes (cr458, Apr 11, 2023)
d99d2ce plot directly, runs deepspeed and hf for single benchmark (cr458, Apr 12, 2023)
b0e9745 remove plotting comments (cr458, Apr 12, 2023)
9c645dd accept changes from main & resolve conflicts (satpalsr, Apr 15, 2023)
ee99945 Merge branch 'main' into flash_attn_infer (satpalsr, Apr 15, 2023)
9b1733e tmp changes (cr458, Apr 17, 2023)
22cac56 Merge remote-tracking branch 'satpalsr/flash_attn_infer' into deepspe… (cr458, Apr 17, 2023)
466749b merge conflict git hash (cr458, Apr 17, 2023)
b10739f separate scripts for Deepspeed/HF and neox (cr458, Apr 18, 2023)
4990f9b debugging: works when world size > 1 but not otherwise (cr458, Apr 18, 2023)
88981b2 working (but not serially) (cr458, Apr 19, 2023)
5e3ca7f working ish gpt-neox just need to figure out how to get dataframe back (cr458, Apr 20, 2023)
3ee9d3b get dataframe output from stdout (cr458, Apr 20, 2023)
2a6e8cd remove gpt neox inference from script (cr458, May 21, 2023)
7ea22d9 remove lines (cr458, May 21, 2023)
ef4fdd4 device error (cr458, May 21, 2023)
d8184f3 Add DS inference (satpalsr, May 22, 2023)
11 changes: 11 additions & 0 deletions benchmarking/configs/inference_test.yml
@@ -0,0 +1,11 @@
# GPT inference testing setup
models:
- EleutherAI/pythia-70m
- EleutherAI/pythia-160m
- EleutherAI/pythia-410m
- EleutherAI/pythia-1b
- EleutherAI/pythia-1.4b

world_size: 1
trials: 10
max_tokens: 4
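
For reference, hf_ds_benchmark.py below consumes this file with a plain yaml.safe_load call; a minimal sketch of the loading step (path assumed relative to the benchmarking directory):

import yaml

with open("configs/inference_test.yml") as f:
    config = yaml.safe_load(f)

models = config["models"]          # HF model ids to benchmark
world_size = config["world_size"]  # model-parallel degree passed to DeepSpeed
trials = config["trials"]          # timed generations per model
max_tokens = config["max_tokens"]  # max_new_tokens per generation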
179 changes: 179 additions & 0 deletions benchmarking/hf_ds_benchmark.py
@@ -0,0 +1,179 @@
'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py'''

import argparse
import os
import time

import deepspeed
from deepspeed.accelerator import get_accelerator
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from transformers import pipeline
import torch
import yaml

# Mapping from Pythia model-size suffixes to the old GPT-NeoX naming scheme
# (kept for reference; not currently used in this script).
PYTHIA_TO_OLD_SUFFIXES = {
    "70m": "19M",
    "160m": "125M",
    "410m": "350M",
    "1b": "800M",
    "1.4b": "1-3B",
    "2.8b": "2.7B",
    "6.9b": "6-7B",
    "12b": "13B",
    "20b": "20B",
}

def benchmark_model(
    model, output_dir, use_deepspeed, dtype, graphs, kernel_inject,
    max_tokens, local_rank, world_size, trials,
):
    deepspeed.init_distributed()
    if local_rank == 0:
        print("BENCHMARK SETTINGS:")
        print(f"\tMODEL: {model}")
        print(f"\tMAX_TOKENS: {max_tokens}")
        print(f"\tDTYPE: {dtype}")
        print(f"\tCUDA_GRAPHS: {graphs}")
        print(f"\tKERNEL_INJECT: {kernel_inject}")
        print(f"\tWORLD_SIZE: {world_size}")

    if dtype == "int8":
        dtype = torch.int8
    elif dtype == "fp16":
        dtype = torch.float16
    else:
        dtype = torch.float32

    pipe = pipeline("text-generation", model=model, framework="pt")

    if dtype == torch.float16:
        pipe.model.half()

    if use_deepspeed:
        pipe.model = deepspeed.init_inference(
            pipe.model,
            dtype=dtype,
            mp_size=world_size,
            replace_with_kernel_inject=kernel_inject,
            enable_cuda_graph=graphs,
        )
        pipe.model.profile_model_time()

    responses = []
    times = []
    mtimes = []
    for i in range(trials):
        get_accelerator().synchronize()
        start = time.time()
        r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=max_tokens)
        get_accelerator().synchronize()
        end = time.time()
        responses.append(r)
        times.append(end - start)  # end-to-end latency for the whole trial
        if use_deepspeed:
            mtimes.append(sum(pipe.model.model_times()))

    # Per-token latency divides by (max_tokens - 3), following the upstream
    # DeepSpeed gpt-bench script this file is adapted from.
    if use_deepspeed:
        for_dataframe = np.vstack(
            (times, mtimes, list(map(lambda t: t / (max_tokens - 3), times)))
        ).T
        columns = ["(e2e) latency", "(model-only) latency", "(e2e) per token latency"]
    else:
        for_dataframe = np.vstack(
            (times, list(map(lambda t: t / (max_tokens - 3), times)))
        ).T
        columns = ["(e2e) latency", "(e2e) per token latency"]

    df = pd.DataFrame(for_dataframe, columns=columns)

    if local_rank == 0:
        # Save one CSV per (backend, max_tokens, world_size, model, dtype) combination.
        deepspeed_str = "deepspeed" if use_deepspeed else "hf"
        deepspeed_dir = os.path.join(output_dir, deepspeed_str)
        max_tokens_dir = os.path.join(deepspeed_dir, "max_tokens_{}".format(max_tokens))
        world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size))

        os.makedirs(world_size_dir, exist_ok=True)

        fname = os.path.join(
            world_size_dir,
            "{}_{}_benchmark.csv".format(model.split('/')[-1], str(dtype).split('.')[1]),
        )

        print("saving benchmark to {}".format(fname))
        df.to_csv(fname, index=False)
    return df
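
# Example (hypothetical values): a single-GPU fp16 DeepSpeed run of the
# smallest Pythia model, mirroring how main() below calls benchmark_model;
# shown only as a usage sketch:
#
#   df = benchmark_model(
#       model="EleutherAI/pythia-70m", output_dir="./benchmark_output",
#       use_deepspeed=True, dtype="fp16", graphs=False, kernel_inject=True,
#       max_tokens=4, local_rank=0, world_size=1, trials=10)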


def main(models, output_dir, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials):
    deepspeed_dfs = []
    hf_dfs = []
    print("Models to benchmark: {}".format(models))
    for model in models:
        print("Benchmarking model: {}".format(model))
        # run using deepspeed
        print("Running with deepspeed")
        deepspeed_dfs.append(benchmark_model(
            model, output_dir, True, dtype, graphs, kernel_inject,
            max_tokens, local_rank, world_size, trials))

        # run using huggingface
        print("Running with huggingface")
        hf_dfs.append(benchmark_model(
            model, output_dir, False, dtype, graphs, kernel_inject,
            max_tokens, local_rank, world_size, trials))

    print("plotting results")
    # drop first 3 rows (warmup)
    ds_means = [x["(e2e) latency"].iloc[3:].mean() for x in deepspeed_dfs]
    ds_std = [x["(e2e) latency"].iloc[3:].std() for x in deepspeed_dfs]
    hf_means = [x["(e2e) latency"].iloc[3:].mean() for x in hf_dfs]
    hf_std = [x["(e2e) latency"].iloc[3:].std() for x in hf_dfs]

    # plot results as side-by-side bars with error bars
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.bar(
        np.arange(len(ds_means)) - 0.24,
        ds_means, yerr=ds_std, align='center', alpha=0.5, ecolor='black',
        capsize=10, width=0.4, label='Deepspeed')
    ax.bar(
        np.arange(len(hf_means)) + 0.24,
        hf_means, yerr=hf_std, align='center', alpha=0.5, ecolor='black',
        capsize=10, width=0.4, label='Huggingface')
    ax.set_xticks(np.arange(len(models)))
    ax.set_xticklabels(models)
    ax.set_xlabel('Model')
    ax.set_ylabel('Time (s)')
    plt.legend()
    plt.tight_layout()
    plt.title("e2e latency (s), {} tokens, {} world size, {} trials".format(max_tokens, world_size, trials))
    plt.savefig(os.path.join(output_dir, "benchmark.png"))
    print("plot saved to {}".format(os.path.join(output_dir, "benchmark.png")))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default='/home/mchorse/benchmarking/output', help="output directory")
    parser.add_argument("--config", type=str, default='configs/inference_test.yml', help="path to the benchmark config YAML")
    parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "fp32", "int8"], help="int8, fp16, or fp32")
    parser.add_argument("--graphs", action="store_true", help="enable CUDA graphs")
    parser.add_argument("--kernel-inject", action="store_true", help="enable DeepSpeed kernel injection")
    parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
    args = parser.parse_args()

    with open(args.config, "r") as f:
        config = yaml.safe_load(f)

    models = config["models"]
    world_size = config["world_size"]
    trials = config["trials"]
    max_tokens = config["max_tokens"]

    main(models=models,
         output_dir=args.output_dir,
         dtype=args.dtype,
         graphs=args.graphs,
         kernel_inject=args.kernel_inject,
         max_tokens=max_tokens,
         local_rank=args.local_rank,
         world_size=world_size,
         trials=trials)
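
The script calls deepspeed.init_distributed() and reads LOCAL_RANK from the environment, so it is meant to be started through the DeepSpeed launcher (for example `deepspeed --num_gpus 1 benchmarking/hf_ds_benchmark.py --config benchmarking/configs/inference_test.yml`). Once the per-model CSVs exist, the two backends can also be compared offline; a minimal sketch, assuming the directory scheme and column names from benchmark_model above (the concrete paths here are hypothetical):

import pandas as pd

ds = pd.read_csv("output/deepspeed/max_tokens_4/world_size_1/pythia-70m_float16_benchmark.csv")
hf = pd.read_csv("output/hf/max_tokens_4/world_size_1/pythia-70m_float16_benchmark.csv")

# drop the first 3 trials as warmup, mirroring main()
speedup = hf["(e2e) latency"].iloc[3:].mean() / ds["(e2e) latency"].iloc[3:].mean()
print(f"DeepSpeed speedup over HF: {speedup:.2f}x")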

1 change: 1 addition & 0 deletions benchmarking/megatron_config.json
@@ -0,0 +1 @@
{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 32, "optimizer": {"type": "Adam", "params": {"lr": 0.0008, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 10, "hidden_size": 640, "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "rotary_pct": 0.25, "init_method": "small_init", "output_layer_init_method": "wang_init", "gpt_j_residual": true, "output_layer_parallelism": "column", "lr_decay_style": "cosine", "lr_decay_iters": 143000, "min_lr": 8e-05, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.0008, "padded_vocab_size": 50304, "data_path": "../data/enwik8/enwik8_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"49M.yml": "{\n # parallelism settings\n \"pipe-parallel-size\": 2,\n \"model-parallel-size\": 1,\n\n # model settings\n \"num-layers\": 10,\n \"hidden-size\": 640,\n \"num-attention-heads\": 10,\n \"seq-length\": 2048,\n \"max-position-embeddings\": 2048,\n \"pos-emb\": \"rotary\",\n \"rotary-pct\": 0.25,\n \"no-weight-tying\": true,\n \"gpt-j-residual\": true,\n \"output-layer-parallelism\": \"column\",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n \"scaled-upper-triang-masked-softmax-fusion\": false,\n \"bias-gelu-fusion\": false,\n\n # init methods\n \"init_method\": \"small_init\",\n \"output_layer_init_method\": \"wang_init\",\n\n # optimizer settings\n \"optimizer\": {\n \"type\": \"Adam\",\n \"params\": {\n \"lr\": 0.0008,\n \"betas\": [0.9, 0.95],\n \"eps\": 1.0e-8,\n }\n },\n \"min_lr\": 0.00008,\n\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n \"zero_optimization\": {\n \"stage\": 1,\n \"allgather_partitions\": True,\n \"allgather_bucket_size\": 500000000,\n \"overlap_comm\": True,\n \"reduce_scatter\": True,\n \"reduce_bucket_size\": 500000000,\n \"contiguous_gradients\": True,\n },\n\n # batch / data settings\n \"train_micro_batch_size_per_gpu\": 32,\n \"gas\": 1,\n \"data-impl\": \"mmap\",\n \"num_workers\": 1,\n\n # activation checkpointing\n \"checkpoint-activations\": true,\n \"checkpoint-num-layers\": 1,\n \"partition-activations\": true,\n \"synchronize-each-layer\": true,\n\n # regularization\n \"gradient_clipping\": 1.0,\n \"weight-decay\": 0.1,\n \"hidden-dropout\": 0,\n \"attention-dropout\": 0,\n\n # precision settings\n \"fp16\": {\n \"fp16\": true,\n \"enabled\": true,\n \"loss_scale\": 0,\n \"loss_scale_window\": 1000,\n \"initial_scale_power\": 12,\n \"hysteresis\": 2,\n \"min_loss_scale\": 1,\n },\n\n # misc. 
training settings\n \"train-iters\": 143000,\n \"lr-decay-iters\": 143000,\n \"distributed-backend\": \"nccl\",\n \"lr-decay-style\": \"cosine\",\n \"warmup\": 0.01,\n \"checkpoint-factor\": 1000,\n \"eval-interval\": 100000,\n \"eval-iters\": 10,\n\n # logging\n \"log-interval\": 10,\n \"steps_per_print\": 10,\n \"wall_clock_breakdown\": true,\n}\n", "benchmark_setup.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n \"data-path\": \"../data/enwik8/enwik8_text_document\",\n\n # or for weighted datasets:\n # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"train-data-weights\": [1., 2.],\n # \"test-data-weights\": [2., 1.],\n # \"valid-data-weights\": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n # WARNING: setting this to True will override any user provided weights\n # \"weight_by_num_documents\": false,\n # \"weighted_sampler_alpha\": 0.3,\n\n \"vocab-file\": \"../data/gpt2-vocab.json\",\n \"merge-file\": \"../data/gpt2-merges.txt\",\n\n \"save\": \"checkpoints\",\n \"load\": \"checkpoints\",\n \"checkpoint_validation_with_forward_pass\": False,\n\n \"tensorboard-dir\": \"tensorboard\",\n \"log-dir\": \"logs\",\n \"use_wandb\": True,\n \"wandb_host\": \"https://api.wandb.ai\",\n \"wandb_project\": \"neox\"\n}\n", "benchmarking.yml": "# Parameters used for text generation\n# Make sure `load` is specified somewhere else\n{\n # Text gen type: `input-file`, `unconditional` or `interactive`\n \"text-gen-type\": \"from_prompt\",\n\n # Params for all\n \"maximum_tokens\": 128,\n \"prompt_end\": \"\\n\",\n \"temperature\": 1.0,\n \"top_p\": 0.0,\n \"top_k\": 0,\n \"recompute\": false,\n\n # `unconditional`: samples\n \"num-samples\": 10,\n}\n"}, "load": "checkpoints", "checkpoint_factor": 1000, "batch_size": 32, "train_iters": 143000, "eval_iters": 10, "eval_interval": 100000, "vocab_file": "../data/gpt2-vocab.json", "merge_file": "../data/gpt2-merges.txt", "num_workers": 1, "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0.1, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 1, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 2, "world_size": 2, "is_pipe_parallel": true, "use_wandb": true, "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 10, "text_gen_type": "from_prompt", "temperature": 1.0, "maximum_tokens": 128, "num_samples": 10, "local_rank": 0, "rank": 0, "save_iters": [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 
105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000], "global_num_gpus": 8}
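
This dumped NeoX argument set can be inspected programmatically; a small sketch (keys taken from the JSON above):

import json

with open("benchmarking/megatron_config.json") as f:
    cfg = json.load(f)

# model geometry and ZeRO/batch settings used for the NeoX benchmark run
print(cfg["num_layers"], cfg["hidden_size"], cfg["num_attention_heads"])
print(cfg["zero_optimization"]["stage"], cfg["train_micro_batch_size_per_gpu"])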
87 changes: 87 additions & 0 deletions benchmarking/neox_benchmark.py
@@ -0,0 +1,87 @@
'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py'''

import os
import sys

# make the repository root importable so `megatron` resolves when this
# script is run from the repo root
sys.path.insert(0, os.path.abspath(os.getcwd()))

import numpy as np
import pandas as pd

from megatron.text_generation_utils import generate_samples_from_prompt
from megatron.utils import print_rank_0, setup_for_inference_or_eval


# Mapping from Pythia model-size suffixes to the old GPT-NeoX naming scheme
# (kept for reference; not currently used in this script).
PYTHIA_TO_OLD_SUFFIXES = {
    "70m": "19M",
    "160m": "125M",
    "410m": "350M",
    "1b": "800M",
    "1.4b": "1-3B",
    "2.8b": "2.7B",
    "6.9b": "6-7B",
    "12b": "13B",
    "20b": "20B",
}


def main():
    model, neox_args = setup_for_inference_or_eval(use_cache=True)
    max_tokens = 10
    print_rank_0("Finished loading model")

    prompts = ["DeepSpeed is" for x in range(100)]

    generated_texts = generate_samples_from_prompt(
        neox_args=neox_args,
        model=model,
        text=prompts,
        eos_token_id=0,
        maximum_tokens=max_tokens,
        recompute=neox_args.recompute,
        temperature=neox_args.temperature,
        top_k=neox_args.top_k,
        top_p=neox_args.top_p,
    )

    times = [x["duration_seconds"] for x in generated_texts]

    # Per-token latency divides by (max_tokens - 3), matching hf_ds_benchmark.py.
    for_dataframe = np.vstack(
        (times, list(map(lambda t: t / (max_tokens - 3), times)))
    ).T
    columns = ["(e2e) latency", "(e2e) per token latency"]

    df = pd.DataFrame(for_dataframe, columns=columns)

    # Saving the dataframe to a CSV inside a per-world-size directory is left
    # disabled for now:
    # if local_rank == 0:
    #     neox_dir = os.path.join(output_dir, "neox")
    #     max_tokens_dir = os.path.join(neox_dir, "max_tokens_{}".format(max_tokens))
    #     world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size))
    #     os.makedirs(world_size_dir, exist_ok=True)
    #     fname = os.path.join(world_size_dir,
    #                          "{}_fp16_benchmark.csv".format(model.split('/')[-1]))
    #     print("saving benchmark to {}".format(fname))
    #     df.to_csv(fname, index=False)

    # Instead, write the CSV to stdout between two marker lines so a calling
    # process can capture and parse it.
    print("Starting data generation...")
    df.to_csv(sys.stdout, index=False)
    print("Data generation complete!")


if __name__ == "__main__":
    main()
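
Since the script prints its results between the two marker lines above (see the "get dataframe output from stdout" commit), a calling process can recover the dataframe from captured output; a minimal sketch with a hypothetical helper, not part of this PR:

import io

import pandas as pd

def parse_benchmark_stdout(stdout: str) -> pd.DataFrame:
    """Extract the CSV printed between the two marker lines."""
    lines = stdout.splitlines()
    start = lines.index("Starting data generation...") + 1
    end = lines.index("Data generation complete!")
    return pd.read_csv(io.StringIO("\n".join(lines[start:end])))

# e.g. after: result = subprocess.run(launch_cmd, capture_output=True, text=True)
# df = parse_benchmark_stdout(result.stdout)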

100 changes: 100 additions & 0 deletions benchmarking/neox_benchmark_input.txt
@@ -0,0 +1,100 @@
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
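
The file repeats the same prompt 100 times, with a literal "\n" marking the prompt end (matching the "prompt_end": "\n" text-generation setting embedded in megatron_config.json). A minimal sketch of reading it back (illustrative only, not part of the PR):

# each line looks like: 'Deepspeed is \n' (a literal backslash-n marks prompt end)
with open("benchmarking/neox_benchmark_input.txt") as f:
    prompts = [line.split(" \\n")[0] for line in f if line.strip()]

print(len(prompts), prompts[0])  # 100 'Deepspeed is'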