Skip to content

Commit

Permalink
Set a previously missing VLLM_ARG, `--tensor-parallel-size`, to accelerator_count.
Browse files Browse the repository at this point in the history
The main Llama2 deployment instructions are pulled from this notebook:
https://github.com/GoogleCloudPlatform/vertex-ai-samples/blob/main/notebooks/community/model_garden/model_garden_pytorch_llama2_deployment.ipynb

That notebook specifies that `tensor-parallel-size` be set to the accelerator count. It was initially hardcoded to 1; it now needs to be derived from accelerator_count, since accelerator_count can be 8 for Llama2 70b.

PiperOrigin-RevId: 678756743
  • Loading branch information
Zach Howell authored and copybara-github committed Sep 25, 2024
1 parent b74b317 commit 8e2bf7d
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions perfkitbenchmarker/providers/gcp/vertex_ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
VLLM_ARGS = [
'--host=0.0.0.0',
'--port=7080',
'--tensor-parallel-size=1',
'--swap-space=16',
'--gpu-memory-utilization=0.95',
'--max-model-len=2048',
Expand Down Expand Up @@ -358,7 +357,6 @@ def __init__(self, component_full_name, flag_values=None, **kwargs):
self.model_bucket_suffix = os.path.join(
'llama2', f'llama2-{self.model_size}-hf'
)
self.serving_container_args = VLLM_ARGS
self.serving_container_ports = [7080]
self.serving_container_predict_route = '/generate'
self.serving_container_health_route = '/ping'
Expand All @@ -371,6 +369,10 @@ def __init__(self, component_full_name, flag_values=None, **kwargs):
self.machine_type = 'g2-standard-96'
self.accelerator_count = 8
self.accelerator_type = 'NVIDIA_L4'
self.serving_container_args = VLLM_ARGS
self.serving_container_args.append(
f'--tensor-parallel-size={self.accelerator_count}'
)

def GetEnvironmentVariables(self, **kwargs) -> dict[str, str]:
"""Returns container's environment variables needed by Llama2."""
Expand Down

0 comments on commit 8e2bf7d

Please sign in to comment.