From 9bb9437ce3de4a9d6258e1e1bd5f0b9e67e17e74 Mon Sep 17 00:00:00 2001
From: Artur Fierka
Date: Mon, 12 Aug 2024 21:10:00 +0300
Subject: [PATCH 1/3] add hpu synchronization after each layer in llama model

---
 vllm/model_executor/models/llama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 306d22e42ed1d..f35f5882d5cd8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -49,7 +49,7 @@
     default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
-from vllm.utils import is_hip
+from vllm.utils import is_hip, is_hpu
 
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
@@ -511,6 +511,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
+            if is_hpu:
+                torch.hpu.synchronize()
 
     # If this function is called, it should always initialize KV cache scale
    # factors (or else raise an exception). Thus, handled exceptions should

From d85f6dff794f9949138bee84a055adaebf2655da Mon Sep 17 00:00:00 2001
From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com>
Date: Wed, 14 Aug 2024 12:45:48 +0200
Subject: [PATCH 2/3] fix missing parenthesis in is_hpu function

---
 vllm/model_executor/models/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f35f5882d5cd8..2680230163569 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -511,7 +511,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
-            if is_hpu:
+            if is_hpu():
                 torch.hpu.synchronize()
 
     # If this function is called, it should always initialize KV cache scale

From 955b9416154eeec6a7cb0b476a4337ed3310849e Mon Sep 17 00:00:00 2001
From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com>
Date: Tue, 3 Sep 2024 12:47:18 +0200
Subject: [PATCH 3/3] Change is_hpu -> current_platform.is_hpu

---
 vllm/model_executor/models/llama.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 2680230163569..cf604471e39dd 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -48,8 +48,9 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors, SamplerOutput
-from vllm.utils import is_hip, is_hpu
+from vllm.utils import is_hip
 
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
@@ -511,7 +512,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
-            if is_hpu():
+            if current_platform.is_hpu():
                 torch.hpu.synchronize()
 
     # If this function is called, it should always initialize KV cache scale
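
Reviewer note: a minimal, self-contained sketch of the pattern this series converges on, not the vLLM source itself. It assumes an installation that provides vllm.platforms.current_platform and an Intel Gaudi PyTorch build that exposes torch.hpu; copy_weights is a hypothetical stand-in for the load_weights loop touched by these patches.

    import torch

    from vllm.platforms import current_platform


    def copy_weights(named_tensors):
        """Copy each (name, tensor) pair, syncing the HPU after every copy."""
        loaded = {}
        for name, tensor in named_tensors:
            # Stand-in for weight_loader(param, loaded_weight) in load_weights.
            loaded[name] = tensor.clone()
            if current_platform.is_hpu():  # False on CUDA/ROCm/CPU builds
                torch.hpu.synchronize()    # wait for queued HPU work before the next copy
        return loaded

Gating on current_platform.is_hpu() (PATCH 3/3) keeps the synchronize out of non-Gaudi code paths, and the parenthesis fix in PATCH 2/3 matters because a bare is_hpu function reference is always truthy, so the original check would have run on every platform.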