Do not pass warmup_mode to execute_model_kwargs (#229)

This fixes a very silly issue where mismatching values of the `warmup_mode` flag could cause graph recompilations and, eventually, memory leaks.
HabanaAI · Sep 6, 2024 · 17447ed · 17447ed
1 parent 7488c58
commit 17447ed
Showing 1 changed file with 1 addition and 4 deletions.
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
@@ -1704,10 +1704,7 @@ def execute_model(
         if multi_modal_input is not None:
             execute_model_kwargs.update(multi_modal_input)
         if htorch.utils.internal.is_lazy():
-            execute_model_kwargs.update({
-                "bypass_hpu_graphs": not use_graphs,
-                "warmup_mode": warmup_mode
-            })
+            execute_model_kwargs.update({"bypass_hpu_graphs": not use_graphs})
 
         htorch.core.mark_step()
         if self.is_driver_worker: