diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 10352a030258e..5d16efe4dbefd 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -274,14 +274,13 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt):
 
 def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"):
     if module.__class__.__name__.endswith(suffix):
-        module.original_forward = module.forward
 
-        def new_forward(self, *args, **kwargs):
-            ret = self.original_forward(*args, **kwargs)
+        def forward_hook(module, args, output):
             htorch.core.mark_step()
-            return ret
+            return output
+
+        module.register_forward_hook(forward_hook)
 
-        module.forward = new_forward.__get__(module)
 
     for child_name, child_module in module.named_children():
         modify_decoder_layer(child_module)