Deepspeed benchmarking #878

Draft: wants to merge 24 commits into base: main

Commits (24)
f4706e0 add flash_attn_kvpacked (satpalsr, Mar 29, 2023)
f4a9106 Changed is_pipe_parallel setting to fix pipeline-parallel inference (curt-tigges, Mar 31, 2023)
83a7b9a Update NeoXArgs docs automatically (invalid-email-address, Mar 31, 2023)
45d7052 fix formatting (satpalsr, Apr 11, 2023)
857c556 gpt benchmark script (cr458, Apr 3, 2023)
1ab5bf3 remove duplicate argparse (cr458, Apr 4, 2023)
afb6b29 HF inference (cr458, Apr 4, 2023)
3f7d605 benchmarking configs + script changes (cr458, Apr 11, 2023)
d99d2ce plot directly, runs deepspeed and hf for single benchmark (cr458, Apr 12, 2023)
b0e9745 remove plotting comments (cr458, Apr 12, 2023)
9c645dd accept changes from main & resolve conflicts (satpalsr, Apr 15, 2023)
ee99945 Merge branch 'main' into flash_attn_infer (satpalsr, Apr 15, 2023)
9b1733e tmp changes (cr458, Apr 17, 2023)
22cac56 Merge remote-tracking branch 'satpalsr/flash_attn_infer' into deepspe… (cr458, Apr 17, 2023)
466749b merge conflict git hash (cr458, Apr 17, 2023)
b10739f separate scripts for Deepspeed/HF and neox (cr458, Apr 18, 2023)
4990f9b debugging: works when world size > 1 but not otherwise (cr458, Apr 18, 2023)
88981b2 working (but not serially) (cr458, Apr 19, 2023)
5e3ca7f working ish gpt-neox just need to figure out how to get dataframe back (cr458, Apr 20, 2023)
3ee9d3b get dataframe output from stdout (cr458, Apr 20, 2023)
2a6e8cd remove gpt neox inference from script (cr458, May 21, 2023)
7ea22d9 remove lines (cr458, May 21, 2023)
ef4fdd4 device error (cr458, May 21, 2023)
d8184f3 Add DS inference (satpalsr, May 22, 2023)
11 changes: 11 additions & 0 deletions benchmarking/configs/inference_test.yml
@@ -0,0 +1,11 @@
# GPT inference testing setup
models:
- EleutherAI/pythia-70m
- EleutherAI/pythia-160m
- EleutherAI/pythia-410m
- EleutherAI/pythia-1b
- EleutherAI/pythia-1.4b

world_size: 1
trials: 10
max_tokens: 4
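
For reference, hf_ds_benchmark.py below consumes this file with a plain yaml.safe_load call; a minimal sketch of the loading step (path assumed relative to the benchmarking directory):

import yaml

with open("configs/inference_test.yml") as f:
    config = yaml.safe_load(f)

models = config["models"]          # HF model ids to benchmark
world_size = config["world_size"]  # model-parallel degree passed to DeepSpeed
trials = config["trials"]          # timed generations per model
max_tokens = config["max_tokens"]  # max_new_tokens per generation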
179 changes: 179 additions & 0 deletions benchmarking/hf_ds_benchmark.py
@@ -0,0 +1,179 @@
'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py'''

import argparse
import os
import time

import deepspeed
from deepspeed.accelerator import get_accelerator
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from transformers import pipeline
import torch
import yaml

# Mapping from Pythia model-size suffixes to the old GPT-NeoX naming scheme
# (kept for reference; not currently used in this script).
PYTHIA_TO_OLD_SUFFIXES = {
    "70m": "19M",
    "160m": "125M",
    "410m": "350M",
    "1b": "800M",
    "1.4b": "1-3B",
    "2.8b": "2.7B",
    "6.9b": "6-7B",
    "12b": "13B",
    "20b": "20B",
}

def benchmark_model(
    model, output_dir, use_deepspeed, dtype, graphs, kernel_inject,
    max_tokens, local_rank, world_size, trials,
):
    deepspeed.init_distributed()
    if local_rank == 0:
        print("BENCHMARK SETTINGS:")
        print(f"\tMODEL: {model}")
        print(f"\tMAX_TOKENS: {max_tokens}")
        print(f"\tDTYPE: {dtype}")
        print(f"\tCUDA_GRAPHS: {graphs}")
        print(f"\tKERNEL_INJECT: {kernel_inject}")
        print(f"\tWORLD_SIZE: {world_size}")

    if dtype == "int8":
        dtype = torch.int8
    elif dtype == "fp16":
        dtype = torch.float16
    else:
        dtype = torch.float32

    pipe = pipeline("text-generation", model=model, framework="pt")

    if dtype == torch.float16:
        pipe.model.half()

    if use_deepspeed:
        pipe.model = deepspeed.init_inference(
            pipe.model,
            dtype=dtype,
            mp_size=world_size,
            replace_with_kernel_inject=kernel_inject,
            enable_cuda_graph=graphs,
        )
        pipe.model.profile_model_time()

    responses = []
    times = []
    mtimes = []
    for i in range(trials):
        get_accelerator().synchronize()
        start = time.time()
        r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=max_tokens)
        get_accelerator().synchronize()
        end = time.time()
        responses.append(r)
        times.append(end - start)  # end-to-end latency for the whole trial
        if use_deepspeed:
            mtimes.append(sum(pipe.model.model_times()))

    # Per-token latency divides by (max_tokens - 3), following the upstream
    # DeepSpeed gpt-bench script this file is adapted from.
    if use_deepspeed:
        for_dataframe = np.vstack(
            (times, mtimes, list(map(lambda t: t / (max_tokens - 3), times)))
        ).T
        columns = ["(e2e) latency", "(model-only) latency", "(e2e) per token latency"]
    else:
        for_dataframe = np.vstack(
            (times, list(map(lambda t: t / (max_tokens - 3), times)))
        ).T
        columns = ["(e2e) latency", "(e2e) per token latency"]

    df = pd.DataFrame(for_dataframe, columns=columns)

    if local_rank == 0:
        # Save one CSV per (backend, max_tokens, world_size, model, dtype) combination.
        deepspeed_str = "deepspeed" if use_deepspeed else "hf"
        deepspeed_dir = os.path.join(output_dir, deepspeed_str)
        max_tokens_dir = os.path.join(deepspeed_dir, "max_tokens_{}".format(max_tokens))
        world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size))

        os.makedirs(world_size_dir, exist_ok=True)

        fname = os.path.join(
            world_size_dir,
            "{}_{}_benchmark.csv".format(model.split('/')[-1], str(dtype).split('.')[1]),
        )

        print("saving benchmark to {}".format(fname))
        df.to_csv(fname, index=False)
    return df
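
# Example (hypothetical values): a single-GPU fp16 DeepSpeed run of the
# smallest Pythia model, mirroring how main() below calls benchmark_model;
# shown only as a usage sketch:
#
#   df = benchmark_model(
#       model="EleutherAI/pythia-70m", output_dir="./benchmark_output",
#       use_deepspeed=True, dtype="fp16", graphs=False, kernel_inject=True,
#       max_tokens=4, local_rank=0, world_size=1, trials=10)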


def main(models, output_dir, dtype, graphs, kernel_inject, max_tokens, local_rank, world_size, trials):
    deepspeed_dfs = []
    hf_dfs = []
    print("Models to benchmark: {}".format(models))
    for model in models:
        print("Benchmarking model: {}".format(model))
        # run using deepspeed
        print("Running with deepspeed")
        deepspeed_dfs.append(benchmark_model(
            model, output_dir, True, dtype, graphs, kernel_inject,
            max_tokens, local_rank, world_size, trials))

        # run using huggingface
        print("Running with huggingface")
        hf_dfs.append(benchmark_model(
            model, output_dir, False, dtype, graphs, kernel_inject,
            max_tokens, local_rank, world_size, trials))

    print("plotting results")
    # drop first 3 rows (warmup)
    ds_means = [x["(e2e) latency"].iloc[3:].mean() for x in deepspeed_dfs]
    ds_std = [x["(e2e) latency"].iloc[3:].std() for x in deepspeed_dfs]
    hf_means = [x["(e2e) latency"].iloc[3:].mean() for x in hf_dfs]
    hf_std = [x["(e2e) latency"].iloc[3:].std() for x in hf_dfs]

    # plot results as side-by-side bars with error bars
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.bar(
        np.arange(len(ds_means)) - 0.24,
        ds_means, yerr=ds_std, align='center', alpha=0.5, ecolor='black',
        capsize=10, width=0.4, label='Deepspeed')
    ax.bar(
        np.arange(len(hf_means)) + 0.24,
        hf_means, yerr=hf_std, align='center', alpha=0.5, ecolor='black',
        capsize=10, width=0.4, label='Huggingface')
    ax.set_xticks(np.arange(len(models)))
    ax.set_xticklabels(models)
    ax.set_xlabel('Model')
    ax.set_ylabel('Time (s)')
    plt.legend()
    plt.tight_layout()
    plt.title("e2e latency (s), {} tokens, {} world size, {} trials".format(max_tokens, world_size, trials))
    plt.savefig(os.path.join(output_dir, "benchmark.png"))
    print("plot saved to {}".format(os.path.join(output_dir, "benchmark.png")))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, default='/home/mchorse/benchmarking/output', help="output directory")
    parser.add_argument("--config", type=str, default='configs/inference_test.yml', help="path to the benchmark config YAML")
    parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "fp32", "int8"], help="int8, fp16, or fp32")
    parser.add_argument("--graphs", action="store_true", help="enable CUDA graphs")
    parser.add_argument("--kernel-inject", action="store_true", help="enable DeepSpeed kernel injection")
    parser.add_argument("--local_rank", type=int, default=int(os.getenv("LOCAL_RANK", "0")), help="local rank")
    args = parser.parse_args()

    with open(args.config, "r") as f:
        config = yaml.safe_load(f)

    models = config["models"]
    world_size = config["world_size"]
    trials = config["trials"]
    max_tokens = config["max_tokens"]

    main(models=models,
         output_dir=args.output_dir,
         dtype=args.dtype,
         graphs=args.graphs,
         kernel_inject=args.kernel_inject,
         max_tokens=max_tokens,
         local_rank=args.local_rank,
         world_size=world_size,
         trials=trials)
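
The script calls deepspeed.init_distributed() and reads LOCAL_RANK from the environment, so it is meant to be started through the DeepSpeed launcher (for example `deepspeed --num_gpus 1 benchmarking/hf_ds_benchmark.py --config benchmarking/configs/inference_test.yml`). Once the per-model CSVs exist, the two backends can also be compared offline; a minimal sketch, assuming the directory scheme and column names from benchmark_model above (the concrete paths here are hypothetical):

import pandas as pd

ds = pd.read_csv("output/deepspeed/max_tokens_4/world_size_1/pythia-70m_float16_benchmark.csv")
hf = pd.read_csv("output/hf/max_tokens_4/world_size_1/pythia-70m_float16_benchmark.csv")

# drop the first 3 trials as warmup, mirroring main()
speedup = hf["(e2e) latency"].iloc[3:].mean() / ds["(e2e) latency"].iloc[3:].mean()
print(f"DeepSpeed speedup over HF: {speedup:.2f}x")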

1 change: 1 addition & 0 deletions benchmarking/megatron_config.json
@@ -0,0 +1 @@
{"train_batch_size": 128, "train_micro_batch_size_per_gpu": 32, "optimizer": {"type": "Adam", "params": {"lr": 0.0008, "betas": [0.9, 0.95], "eps": 1e-08}}, "fp16": {"fp16": true, "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 12, "hysteresis": 2, "min_loss_scale": 1}, "zero_optimization": {"stage": 1, "allgather_partitions": true, "allgather_bucket_size": 500000000, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 500000000, "contiguous_gradients": true}, "wall_clock_breakdown": true, "precision": "fp16", "num_layers": 10, "hidden_size": 640, "num_attention_heads": 10, "seq_length": 2048, "max_position_embeddings": 2048, "pos_emb": "rotary", "no_weight_tying": true, "attention_config": ["global", "global", "global", "global", "global", "global", "global", "global", "global", "global"], "sparsity_config": {}, "rotary_pct": 0.25, "init_method": "small_init", "output_layer_init_method": "wang_init", "gpt_j_residual": true, "output_layer_parallelism": "column", "lr_decay_style": "cosine", "lr_decay_iters": 143000, "min_lr": 8e-05, "optimizer_type": "Adam", "zero_stage": 1, "zero_reduce_scatter": true, "zero_contiguous_gradients": true, "zero_reduce_bucket_size": 500000000, "zero_allgather_bucket_size": 500000000, "lr": 0.0008, "padded_vocab_size": 50304, "data_path": "../data/enwik8/enwik8_text_document", "data_impl": "mmap", "save": "checkpoints", "config_files": {"49M.yml": "{\n # parallelism settings\n \"pipe-parallel-size\": 2,\n \"model-parallel-size\": 1,\n\n # model settings\n \"num-layers\": 10,\n \"hidden-size\": 640,\n \"num-attention-heads\": 10,\n \"seq-length\": 2048,\n \"max-position-embeddings\": 2048,\n \"pos-emb\": \"rotary\",\n \"rotary-pct\": 0.25,\n \"no-weight-tying\": true,\n \"gpt-j-residual\": true,\n \"output-layer-parallelism\": \"column\",\n\n # these should provide some speedup but takes a while to build, set to true if desired\n \"scaled-upper-triang-masked-softmax-fusion\": false,\n \"bias-gelu-fusion\": false,\n\n # init methods\n \"init_method\": \"small_init\",\n \"output_layer_init_method\": \"wang_init\",\n\n # optimizer settings\n \"optimizer\": {\n \"type\": \"Adam\",\n \"params\": {\n \"lr\": 0.0008,\n \"betas\": [0.9, 0.95],\n \"eps\": 1.0e-8,\n }\n },\n \"min_lr\": 0.00008,\n\n # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training\n \"zero_optimization\": {\n \"stage\": 1,\n \"allgather_partitions\": True,\n \"allgather_bucket_size\": 500000000,\n \"overlap_comm\": True,\n \"reduce_scatter\": True,\n \"reduce_bucket_size\": 500000000,\n \"contiguous_gradients\": True,\n },\n\n # batch / data settings\n \"train_micro_batch_size_per_gpu\": 32,\n \"gas\": 1,\n \"data-impl\": \"mmap\",\n \"num_workers\": 1,\n\n # activation checkpointing\n \"checkpoint-activations\": true,\n \"checkpoint-num-layers\": 1,\n \"partition-activations\": true,\n \"synchronize-each-layer\": true,\n\n # regularization\n \"gradient_clipping\": 1.0,\n \"weight-decay\": 0.1,\n \"hidden-dropout\": 0,\n \"attention-dropout\": 0,\n\n # precision settings\n \"fp16\": {\n \"fp16\": true,\n \"enabled\": true,\n \"loss_scale\": 0,\n \"loss_scale_window\": 1000,\n \"initial_scale_power\": 12,\n \"hysteresis\": 2,\n \"min_loss_scale\": 1,\n },\n\n # misc. 
training settings\n \"train-iters\": 143000,\n \"lr-decay-iters\": 143000,\n \"distributed-backend\": \"nccl\",\n \"lr-decay-style\": \"cosine\",\n \"warmup\": 0.01,\n \"checkpoint-factor\": 1000,\n \"eval-interval\": 100000,\n \"eval-iters\": 10,\n\n # logging\n \"log-interval\": 10,\n \"steps_per_print\": 10,\n \"wall_clock_breakdown\": true,\n}\n", "benchmark_setup.yml": "# Suggested data paths when using GPT-NeoX locally\n{\n \"data-path\": \"../data/enwik8/enwik8_text_document\",\n\n # or for weighted datasets:\n # \"train-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"test-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"valid-data-paths\": [\"data/enwik8/enwik8_text_document\", \"data/enwik8/enwik8_text_document\"],\n # \"train-data-weights\": [1., 2.],\n # \"test-data-weights\": [2., 1.],\n # \"valid-data-weights\": [0.5, 0.4],\n\n # If weight_by_num_documents is True, Builds dataset weights from a multinomial distribution over groups of data according to the number of documents in each group.\n # WARNING: setting this to True will override any user provided weights\n # \"weight_by_num_documents\": false,\n # \"weighted_sampler_alpha\": 0.3,\n\n \"vocab-file\": \"../data/gpt2-vocab.json\",\n \"merge-file\": \"../data/gpt2-merges.txt\",\n\n \"save\": \"checkpoints\",\n \"load\": \"checkpoints\",\n \"checkpoint_validation_with_forward_pass\": False,\n\n \"tensorboard-dir\": \"tensorboard\",\n \"log-dir\": \"logs\",\n \"use_wandb\": True,\n \"wandb_host\": \"https://api.wandb.ai\",\n \"wandb_project\": \"neox\"\n}\n", "benchmarking.yml": "# Parameters used for text generation\n# Make sure `load` is specified somewhere else\n{\n # Text gen type: `input-file`, `unconditional` or `interactive`\n \"text-gen-type\": \"from_prompt\",\n\n # Params for all\n \"maximum_tokens\": 128,\n \"prompt_end\": \"\\n\",\n \"temperature\": 1.0,\n \"top_p\": 0.0,\n \"top_k\": 0,\n \"recompute\": false,\n\n # `unconditional`: samples\n \"num-samples\": 10,\n}\n"}, "load": "checkpoints", "checkpoint_factor": 1000, "batch_size": 32, "train_iters": 143000, "eval_iters": 10, "eval_interval": 100000, "vocab_file": "../data/gpt2-vocab.json", "merge_file": "../data/gpt2-merges.txt", "num_workers": 1, "attention_dropout": 0, "hidden_dropout": 0, "weight_decay": 0.1, "checkpoint_activations": true, "synchronize_each_layer": true, "partition_activations": true, "gas": 1, "clip_grad": 1.0, "dynamic_loss_scale": true, "pipe_parallel_size": 2, "world_size": 2, "is_pipe_parallel": true, "use_wandb": true, "log_dir": "logs", "tensorboard_dir": "tensorboard", "log_interval": 10, "text_gen_type": "from_prompt", "temperature": 1.0, "maximum_tokens": 128, "num_samples": 10, "local_rank": 0, "rank": 0, "save_iters": [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000, 31000, 32000, 33000, 34000, 35000, 36000, 37000, 38000, 39000, 40000, 41000, 42000, 43000, 44000, 45000, 46000, 47000, 48000, 49000, 50000, 51000, 52000, 53000, 54000, 55000, 56000, 57000, 58000, 59000, 60000, 61000, 62000, 63000, 64000, 65000, 66000, 67000, 68000, 69000, 70000, 71000, 72000, 73000, 74000, 75000, 76000, 77000, 78000, 79000, 80000, 81000, 82000, 83000, 84000, 85000, 86000, 87000, 88000, 89000, 90000, 91000, 92000, 93000, 94000, 95000, 96000, 97000, 98000, 99000, 100000, 101000, 102000, 103000, 104000, 
105000, 106000, 107000, 108000, 109000, 110000, 111000, 112000, 113000, 114000, 115000, 116000, 117000, 118000, 119000, 120000, 121000, 122000, 123000, 124000, 125000, 126000, 127000, 128000, 129000, 130000, 131000, 132000, 133000, 134000, 135000, 136000, 137000, 138000, 139000, 140000, 141000, 142000], "global_num_gpus": 8}
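
This dumped NeoX argument set can be inspected programmatically; a small sketch (keys taken from the JSON above):

import json

with open("benchmarking/megatron_config.json") as f:
    cfg = json.load(f)

# model geometry and ZeRO/batch settings used for the NeoX benchmark run
print(cfg["num_layers"], cfg["hidden_size"], cfg["num_attention_heads"])
print(cfg["zero_optimization"]["stage"], cfg["train_micro_batch_size_per_gpu"])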
87 changes: 87 additions & 0 deletions benchmarking/neox_benchmark.py
@@ -0,0 +1,87 @@
'''Adapted from https://github.com/microsoft/DeepSpeed/blob/master/benchmarks/inference/gpt-bench.py'''

import os
import sys

# make the repository root importable so `megatron` resolves when this
# script is run from the repo root
sys.path.insert(0, os.path.abspath(os.getcwd()))

import numpy as np
import pandas as pd

from megatron.text_generation_utils import generate_samples_from_prompt
from megatron.utils import print_rank_0, setup_for_inference_or_eval


# Mapping from Pythia model-size suffixes to the old GPT-NeoX naming scheme
# (kept for reference; not currently used in this script).
PYTHIA_TO_OLD_SUFFIXES = {
    "70m": "19M",
    "160m": "125M",
    "410m": "350M",
    "1b": "800M",
    "1.4b": "1-3B",
    "2.8b": "2.7B",
    "6.9b": "6-7B",
    "12b": "13B",
    "20b": "20B",
}


def main():
    model, neox_args = setup_for_inference_or_eval(use_cache=True)
    max_tokens = 10
    print_rank_0("Finished loading model")

    prompts = ["DeepSpeed is" for x in range(100)]

    generated_texts = generate_samples_from_prompt(
        neox_args=neox_args,
        model=model,
        text=prompts,
        eos_token_id=0,
        maximum_tokens=max_tokens,
        recompute=neox_args.recompute,
        temperature=neox_args.temperature,
        top_k=neox_args.top_k,
        top_p=neox_args.top_p,
    )

    times = [x["duration_seconds"] for x in generated_texts]

    # Per-token latency divides by (max_tokens - 3), matching hf_ds_benchmark.py.
    for_dataframe = np.vstack(
        (times, list(map(lambda t: t / (max_tokens - 3), times)))
    ).T
    columns = ["(e2e) latency", "(e2e) per token latency"]

    df = pd.DataFrame(for_dataframe, columns=columns)

    # Saving the dataframe to a CSV inside a per-world-size directory is left
    # disabled for now:
    # if local_rank == 0:
    #     neox_dir = os.path.join(output_dir, "neox")
    #     max_tokens_dir = os.path.join(neox_dir, "max_tokens_{}".format(max_tokens))
    #     world_size_dir = os.path.join(max_tokens_dir, "world_size_{}".format(world_size))
    #     os.makedirs(world_size_dir, exist_ok=True)
    #     fname = os.path.join(world_size_dir,
    #                          "{}_fp16_benchmark.csv".format(model.split('/')[-1]))
    #     print("saving benchmark to {}".format(fname))
    #     df.to_csv(fname, index=False)

    # Instead, write the CSV to stdout between two marker lines so a calling
    # process can capture and parse it.
    print("Starting data generation...")
    df.to_csv(sys.stdout, index=False)
    print("Data generation complete!")


if __name__ == "__main__":
    main()
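
Since the script prints its results between the two marker lines above (see the "get dataframe output from stdout" commit), a calling process can recover the dataframe from captured output; a minimal sketch with a hypothetical helper, not part of this PR:

import io

import pandas as pd

def parse_benchmark_stdout(stdout: str) -> pd.DataFrame:
    """Extract the CSV printed between the two marker lines."""
    lines = stdout.splitlines()
    start = lines.index("Starting data generation...") + 1
    end = lines.index("Data generation complete!")
    return pd.read_csv(io.StringIO("\n".join(lines[start:end])))

# e.g. after: result = subprocess.run(launch_cmd, capture_output=True, text=True)
# df = parse_benchmark_stdout(result.stdout)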

100 changes: 100 additions & 0 deletions benchmarking/neox_benchmark_input.txt
@@ -0,0 +1,100 @@
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
Deepspeed is \n
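
The file repeats the same prompt 100 times, with a literal "\n" marking the prompt end (matching the "prompt_end": "\n" text-generation setting embedded in megatron_config.json). A minimal sketch of reading it back (illustrative only, not part of the PR):

# each line looks like: 'Deepspeed is \n' (a literal backslash-n marks prompt end)
with open("benchmarking/neox_benchmark_input.txt") as f:
    prompts = [line.split(" \\n")[0] for line in f if line.strip()]

print(len(prompts), prompts[0])  # 100 'Deepspeed is'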