Skip to content

Commit

Permalink
Support multi-GPU inference for the server.
Browse files Browse the repository at this point in the history
  • Loading branch information
fengyh3 committed Apr 21, 2023
1 parent dd5193c commit 47107a2
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion llama_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
app = Flask(__name__)
args = None
lm_generation = None
torch.cuda.set_device(0)


def init_model():
Expand Down Expand Up @@ -65,6 +64,7 @@ def init_model():
gpus = ["cuda:" + str(i) for i in range(args.world_size)]
model = tp.tensor_parallel(model, gpus)
else:
torch.cuda.set_device(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Expand Down

0 comments on commit 47107a2

Please sign in to comment.