Deepspeed benchmarking #878

Draft. Wants to merge 24 commits into base: main.

Changes from 1 commit.

Commits (24):
f4706e0 add flash_attn_kvpacked (satpalsr, Mar 29, 2023)
f4a9106 Changed is_pipe_parallel setting to fix pipeline-parallel inference (curt-tigges, Mar 31, 2023)
83a7b9a Update NeoXArgs docs automatically (invalid-email-address, Mar 31, 2023)
45d7052 fix formatting (satpalsr, Apr 11, 2023)
857c556 gpt benchmark script (cr458, Apr 3, 2023)
1ab5bf3 remove duplicate argparse (cr458, Apr 4, 2023)
afb6b29 HF inference (cr458, Apr 4, 2023)
3f7d605 benchmarking configs + script changes (cr458, Apr 11, 2023)
d99d2ce plot directly, runs deepspeed and hf for single benchmark (cr458, Apr 12, 2023)
b0e9745 remove plotting comments (cr458, Apr 12, 2023)
9c645dd accept changes from main & resolve conflicts (satpalsr, Apr 15, 2023)
ee99945 Merge branch 'main' into flash_attn_infer (satpalsr, Apr 15, 2023)
9b1733e tmp changes (cr458, Apr 17, 2023)
22cac56 Merge remote-tracking branch 'satpalsr/flash_attn_infer' into deepspe… (cr458, Apr 17, 2023)
466749b merge conflict git hash (cr458, Apr 17, 2023)
b10739f separate scripts for Deepspeed/HF and neox (cr458, Apr 18, 2023)
4990f9b debugging: works when world size > 1 but not otherwise (cr458, Apr 18, 2023)
88981b2 working (but not serially) (cr458, Apr 19, 2023)
5e3ca7f working-ish gpt-neox, just need to figure out how to get dataframe back (cr458, Apr 20, 2023)
3ee9d3b get dataframe output from stdout (cr458, Apr 20, 2023)
2a6e8cd remove gpt neox inference from script (cr458, May 21, 2023)
7ea22d9 remove lines (cr458, May 21, 2023)
ef4fdd4 device error (cr458, May 21, 2023)
d8184f3 Add DS inference (satpalsr, May 22, 2023)
Add DS inference
satpalsr committed May 22, 2023
commit d8184f3ca76e218eea0f2e1a7fa0cf0620ce3cea
Binary file added inference/HFvsDS_comparision.png
53 changes: 53 additions & 0 deletions inference/README.md
@@ -0,0 +1,53 @@
# NeoX Inference with DeepSpeed

For inference of NeoX models we use the DeepSpeed-MII library. The installation and usage instructions are the same as in [DeepSpeed-MII](https://github.com/microsoft/DeepSpeed-MII#getting-started-with-mii).

# Installation
`pip install deepspeed-mii`

# Inference Usage
DeepSpeed-MII incorporates both DS inference and Zero inference into one framework. The two serve different purposes and cannot be used together.
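
In MII, the choice between the two modes is made at deployment time through the `enable_deepspeed` and `enable_zero` arguments of `mii.deploy` (used in `zero_inference_example.py` below). A minimal sketch; the model name here is illustrative and the flag defaults are an assumption:

```
import mii

# DS inference keeps the weights on GPU; Zero inference would instead set
# enable_deepspeed=False, enable_zero=True and pass a ds_config.
mii.deploy(task="text-generation",
           model="EleutherAI/pythia-160m",
           deployment_name="pythia-160m_deploy",
           mii_config={"dtype": "fp16"},
           enable_deepspeed=True,
           enable_zero=False)
```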

## 1. DS Inference:
This fits the entire model into GPU memory and is more suitable for inference applications that are latency-sensitive or use small batch sizes.

```
# Deployment
import mii
mii_configs = {"tensor_parallel": 2, "dtype": "fp16", "load_with_sys_mem": True}
mii.deploy(task="text-generation",
           model="EleutherAI/gpt-neox-20b",
           deployment_name="gpt-neox-20b-deploy",
           mii_config=mii_configs)

# Generation
generator = mii.mii_query_handle("gpt-neox-20b-deploy")
result = generator.query({"query": ["DeepSpeed is", "Seattle is"]})

# Terminate (if you no longer want to infer)
mii.terminate("gpt-neox-20b-deploy")
```

The NeoX-20B fp16 model requires more than 40GB of memory and cannot fit on a single A100 40GB GPU, so we keep `tensor_parallel: 2` to use two GPUs. If you have an 80GB GPU, you can set `tensor_parallel: 1` for NeoX-20B to use a single GPU.
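
For example, a single-GPU deployment of NeoX-20B would only change the config above (a sketch, assuming one A100 80GB is available):

```
mii_configs = {"tensor_parallel": 1, "dtype": "fp16", "load_with_sys_mem": True}
```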

## 2. Zero Inference:
It adapts and optimizes ZeRO-Infinity techniques for model inference on GPUs by hosting the model weights in CPU or NVMe memory, so that no weights (zero) are hosted on the GPU. It is designed for inference applications that require GPU acceleration but lack sufficient GPU memory to host the model. It therefore has higher latency than DS inference.

Example usage:
```
# Deployment
python zero_inference_example.py

# Generation
generator = mii.mii_query_handle("EleutherAI/pythia-160m_deploy")

# Terminate (if you no longer want to infer)
mii.terminate("EleutherAI/pythia-160m_deploy")
```

# Batch size
Batch size at inference is not directly supported by DeepSpeed-MII. You can make it work with a few changes and caveats, but note that a higher batch size does not necessarily decrease inference time. Follow the [issue](https://github.com/microsoft/DeepSpeed-MII/issues/133#issuecomment-1509534568) for more details.
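
The closest pattern that works out of the box is passing several prompts in a single query, as `benchmark.py` below does; whether these are actually batched internally is what the linked issue discusses. A minimal sketch (the deployment name assumes the Zero inference example above):

```
generator = mii.mii_query_handle("EleutherAI/pythia-160m_deploy")
# Several prompts in one request.
result = generator.query({"query": ["DeepSpeed is", "Seattle is", "EleutherAI is"]})
```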

# HF vs DS Inference Comparison
![HF vs DS comparison plot](HFvsDS_comparision.png)

Using `benchmark.py`, we benchmark several Pythia models and the NeoX-20B model to compare HF and DeepSpeed inference. All runs use fp16 models, with a single A100 40GB GPU for the Pythia models and two A100 40GB GPUs for NeoX-20B. The relative comparison between HF and DeepSpeed matters more than the absolute latency values in the plot.
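
A sketch of how the benchmark can be invoked, using the flags defined in `benchmark.py` (the model choice and trial count here are illustrative):

```
# Compare HF and DeepSpeed-MII latency for one model
python benchmark.py --model EleutherAI/pythia-410m --trials 50 --dtype fp16 --tensor_parallel 1
```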
78 changes: 78 additions & 0 deletions inference/benchmark.py
@@ -0,0 +1,78 @@
import torch
import mii
from transformers import pipeline
import time
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model', '-m', type=str, default='EleutherAI/pythia-160m', help='hf model name')
parser.add_argument('--trials', type=int, default=50, help='number of trials')
parser.add_argument('--dtype', type=str, default='fp16', help='Data type for model')
parser.add_argument('--tensor_parallel', type=int, default=1, help='Tensor parallelism degree')
parser.add_argument('--load_with_sys_mem', action='store_true', help='Load model with system memory')
args = parser.parse_args()


def hf_infer(model, torch_dtype, query=['Deepspeed is', 'Seattle is'], trials=1):
    # Benchmark the HuggingFace text-generation pipeline: run `trials` generations
    # and return the average latency per call.
    generator = pipeline('text-generation', model=model, device=0, torch_dtype=torch_dtype)
    eos_token = generator.tokenizer.eos_token_id

    start_time = time.time()
    for i in range(trials):
        hf_result = generator(query, max_new_tokens=100, pad_token_id=eos_token)
    end_time = time.time()

    hf_time = (end_time - start_time) / trials

    # Release the HF pipeline so the MII deployment has GPU memory available.
    generator = None
    torch.cuda.empty_cache()

    return eos_token, hf_result, hf_time


def mii_infer(model, eos_token, query=['Deepspeed is', 'Seattle is'], trials=1):
    # Benchmark the deployed DeepSpeed-MII endpoint with the same prompts.
    generator = mii.mii_query_handle(model + '_deploy')
    start_time = time.time()
    for i in range(trials):
        mii_result = generator.query({'query': query}, pad_token_id=eos_token, max_new_tokens=100)
    end_time = time.time()
    mii_time = (end_time - start_time) / trials

    return mii_result, mii_time


def main():
    dtype_mapping = {
        'fp16': torch.float16,
        'fp32': torch.float32,
        'fp64': torch.float64,
        'int8': torch.int8,
        'int16': torch.int16,
        'int32': torch.int32,
        'int64': torch.int64
    }

    torch_dtype = dtype_mapping[args.dtype]
    load_with_sys_mem = args.load_with_sys_mem
    tensor_parallel = args.tensor_parallel
    trials = args.trials
    model = args.model

    # HF baseline first, so the GPU is free before the MII deployment starts.
    eos_token, hf_result, hf_time = hf_infer(model, torch_dtype, trials=trials)

    mii_configs = {'tensor_parallel': tensor_parallel, 'dtype': torch_dtype, 'load_with_sys_mem': load_with_sys_mem}
    mii.deploy(task='text-generation',
               model=model,
               deployment_name=model + '_deploy',
               mii_config=mii_configs)
    mii_result, mii_time = mii_infer(model, eos_token, trials=trials)

    print('HF sample output', hf_result)
    print('HF Average Inference time: ', hf_time)

    print('MII sample output', mii_result)
    print('MII Average Inference time: ', mii_time)

    mii.terminate(model + '_deploy')


if __name__ == '__main__':
    main()
47 changes: 47 additions & 0 deletions inference/zero_inference_example.py
@@ -0,0 +1,47 @@
import mii
from transformers import AutoConfig

mii_config = {"dtype": "fp16"}

name = "EleutherAI/pythia-160m"

config = AutoConfig.from_pretrained(name)
model_hidden_size = config.hidden_size

ds_config = {
    "fp16": {
        "enabled": True
    },
    "bf16": {
        "enabled": False
    },
    "aio": {
        "block_size": 262144,
        "queue_depth": 32,
        "thread_count": 1,
        "single_submit": False,
        "overlap_events": True
    },
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "cpu",
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": model_hidden_size * model_hidden_size,
        "stage3_prefetch_bucket_size": 0.1 * model_hidden_size * model_hidden_size,
        "stage3_max_live_parameters": 1e8,
        "stage3_max_reuse_distance": 1e8,
        "stage3_param_persistence_threshold": 10 * model_hidden_size
    },
    "train_micro_batch_size_per_gpu": 1,
}

mii.deploy(task='text-generation',
           model=name,
           deployment_name=name + "_deploy",
           mii_config=mii_config,
           enable_deepspeed=False,
           enable_zero=True,
           ds_config=ds_config)