Skip to content

Commit

Permalink
Set a previously missing VLLM_ARG, `--tensor-parallel-size`, to accelerator_count.
Browse files Browse the repository at this point in the history
The main Llama2 deployment instructions are pulled from this notebook:
https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_deployment.ipynb

That notebook specifies that `tensor-parallel-size` be set to the accelerator count. It was initially hardcoded to 1; it now needs to be derived from accelerator_count, since accelerator_count can be 8 for Llama2 70b.

PiperOrigin-RevId: 678756743
  • Loading branch information
Zach Howell authored and copybara-github committed Sep 25, 2024
1 parent b74b317 commit 8e2bf7d
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions perfkitbenchmarker/providers/gcp/vertex_ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
VLLM_ARGS = [
'--host=0.0.0.0',
'--port=7080',
'--tensor-parallel-size=1',
'--swap-space=16',
'--gpu-memory-utilization=0.95',
'--max-model-len=2048',
Expand Down Expand Up @@ -358,7 +357,6 @@ def __init__(self, component_full_name, flag_values=None, **kwargs):
self.model_bucket_suffix = os.path.join(
'llama2', f'llama2-{self.model_size}-hf'
)
self.serving_container_args = VLLM_ARGS
self.serving_container_ports = [7080]
self.serving_container_predict_route = '/generate'
self.serving_container_health_route = '/ping'
Expand All @@ -371,6 +369,10 @@ def __init__(self, component_full_name, flag_values=None, **kwargs):
self.machine_type = 'g2-standard-96'
self.accelerator_count = 8
self.accelerator_type = 'NVIDIA_L4'
self.serving_container_args = VLLM_ARGS
self.serving_container_args.append(
f'--tensor-parallel-size={self.accelerator_count}'
)

def GetEnvironmentVariables(self, **kwargs) -> dict[str, str]:
"""Returns container's environment variables needed by Llama2."""
Expand Down

0 comments on commit 8e2bf7d

Please sign in to comment.