diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 10352a030258e..5d16efe4dbefd 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -274,14 +274,24 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt):
 
 def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"):
+    """Call ``htorch.core.mark_step()`` after every matching layer's forward.
+
+    Walks ``module`` and all of its descendants; on each module whose class
+    name ends with ``suffix`` a forward hook is registered that calls
+    ``htorch.core.mark_step()`` and returns the layer output unchanged.
+
+    Args:
+        module: Root of the module tree to instrument (modified in place).
+        suffix: Class-name suffix selecting the layers to hook.
+    """
     if module.__class__.__name__.endswith(suffix):
-        module.original_forward = module.forward
 
-        def new_forward(self, *args, **kwargs):
-            ret = self.original_forward(*args, **kwargs)
+        def forward_hook(module, args, output):
             htorch.core.mark_step()
-            return ret
+            return output
+
+        module.register_forward_hook(forward_hook)
 
-        module.forward = new_forward.__get__(module)
-    for child_name, child_module in module.named_children():
-        modify_decoder_layer(child_module)
+    for child_module in module.children():
+        # Forward ``suffix`` so a custom value applies to the whole tree.
+        modify_decoder_layer(child_module, suffix)