Load LLMs in FP16 for faster inference

EleutherAI · Kyle1668 · Jul 5, 2023 · Jul 15, 2023 · Aug 7, 2023 · Aug 9, 2023
commit acbb67fc8044d496062127261ad69def63635c2a
diff --git a/inference.py b/inference.py
@@ -53,7 +53,7 @@ def load_model(split_name):
  isDeduped = split_name.startswith("deduped")
  model = split_name.split("duped.")[-1]
  corresponding_model = f"EleutherAI/pythia-{model}{'-deduped' if isDeduped else ''}"
- return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto")
+ return GPTNeoXForCausalLM.from_pretrained(corresponding_model, device_map="auto", torch_dtype=torch.float16)
 
 
 def calculate_perplexity(logits: torch.Tensor, labels: torch.Tensor) -> torch.float64:
@@ -293,7 +293,6 @@ def parse_cli_args():
  "--models",
  type=str,
  help=models_arg_help,
- choices=models_args_default,
  default=models_args_default,
  )