From 9bb9437ce3de4a9d6258e1e1bd5f0b9e67e17e74 Mon Sep 17 00:00:00 2001
From: Artur Fierka
Date: Mon, 12 Aug 2024 21:10:00 +0300
Subject: [PATCH 1/3] add hpu synchronization after each layer in llama model

---
 vllm/model_executor/models/llama.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 306d22e42ed1d..f35f5882d5cd8 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -49,7 +49,7 @@
     default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
-from vllm.utils import is_hip
+from vllm.utils import is_hip, is_hpu
 
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
@@ -511,6 +511,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
+            if is_hpu:
+                torch.hpu.synchronize()
 
     # If this function is called, it should always initialize KV cache scale
    # factors (or else raise an exception). Thus, handled exceptions should

From d85f6dff794f9949138bee84a055adaebf2655da Mon Sep 17 00:00:00 2001
From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com>
Date: Wed, 14 Aug 2024 12:45:48 +0200
Subject: [PATCH 2/3] fix missing parenthesis in is_hpu function

---
 vllm/model_executor/models/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index f35f5882d5cd8..2680230163569 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -511,7 +511,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
-            if is_hpu:
+            if is_hpu():
                 torch.hpu.synchronize()
 
     # If this function is called, it should always initialize KV cache scale

From 955b9416154eeec6a7cb0b476a4337ed3310849e Mon Sep 17 00:00:00 2001
From: Artur Fierka <160735857+afierka-intel@users.noreply.github.com>
Date: Tue, 3 Sep 2024 12:47:18 +0200
Subject: [PATCH 3/3] Change is_hpu -> current_platform.is_hpu

---
 vllm/model_executor/models/llama.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 2680230163569..cf604471e39dd 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -48,8 +48,9 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors, SamplerOutput
-from vllm.utils import is_hip, is_hpu
+from vllm.utils import is_hip
 
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
@@ -511,7 +512,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
-            if is_hpu():
+            if current_platform.is_hpu():
                 torch.hpu.synchronize()
 
     # If this function is called, it should always initialize KV cache scale
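
Reviewer note: a minimal, self-contained sketch of the pattern this series converges on, not the vLLM source itself. It assumes an installation that provides vllm.platforms.current_platform and an Intel Gaudi PyTorch build that exposes torch.hpu; copy_weights is a hypothetical stand-in for the load_weights loop touched by these patches.

    import torch

    from vllm.platforms import current_platform


    def copy_weights(named_tensors):
        """Copy each (name, tensor) pair, syncing the HPU after every copy."""
        loaded = {}
        for name, tensor in named_tensors:
            # Stand-in for weight_loader(param, loaded_weight) in load_weights.
            loaded[name] = tensor.clone()
            if current_platform.is_hpu():  # False on CUDA/ROCm/CPU builds
                torch.hpu.synchronize()    # wait for queued HPU work before the next copy
        return loaded

Gating on current_platform.is_hpu() (PATCH 3/3) keeps the synchronize out of non-Gaudi code paths, and the parenthesis fix in PATCH 2/3 matters because a bare is_hpu function reference is always truthy, so the original check would have run on every platform.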