Add error handling for PT_COMPILE_ONLY_MODE (HabanaAI#251)

This PR fixes crashes on older Synapse builds that were introduced with HabanaAI#227. Setting PT_COMPILE_ONLY_MODE is not supported in current or older public Synapse builds, but we should not crash because of it; rather, we should advise the user to use the latest build.

Previous behavior:
```
...
INFO 09-06 17:08:37 habana_executor.py:85] # HPU blocks: 10761, # CPU blocks: 910
INFO 09-06 17:08:37 habana_worker.py:201] Initializing cache engine took 47.29 GiB of device memory (54.34 GiB/94.62 GiB used) and -159.6 MiB of host memory (414.9 GiB/1007 GiB used)
[rank0]: Traceback (most recent call last):
[rank0]:   File "/software/users/kzawora/vllm-utils/vllm_hpu_simple_test.py", line 9, in <module>
[rank0]:     llm = LLM(model="facebook/opt-125m")
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/entrypoints/llm.py", line 155, in __init__
[rank0]:     self.llm_engine = LLMEngine.from_engine_args(
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 456, in from_engine_args
[rank0]:     engine = cls(
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 266, in __init__
[rank0]:     self._initialize_kv_caches()
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/engine/llm_engine.py", line 378, in _initialize_kv_caches
[rank0]:     self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks)
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/executor/habana_executor.py", line 89, in initialize_cache
[rank0]:     self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/worker/habana_worker.py", line 202, in initialize_cache
[rank0]:     self._warm_up_model()
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/worker/habana_worker.py", line 220, in _warm_up_model
[rank0]:     self.model_runner.warmup_model(self.hpu_cache[0])
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
[rank0]:     return func(*args, **kwargs)
[rank0]:   File "/software/users/kzawora/vllm-fork/vllm/worker/habana_model_runner.py", line 1412, in warmup_model
[rank0]:     with compile_only_mode_context():
[rank0]:   File "/usr/lib/python3.10/contextlib.py", line 135, in __enter__
[rank0]:     return next(self.gen)
[rank0]:   File "/usr/local/lib/python3.10/dist-packages/habana_frameworks/torch/internal/bridge_config.py", line 20, in env_setting
[rank0]:     get_func = globals()['get_' + var.lower()]
[rank0]: KeyError: 'get_pt_compile_only_mode'
inc shutdown
inc shutdown
inc shutdown
inc shutdown
```

Current behavior:
```
...
INFO 09-06 17:06:42 habana_executor.py:85] # HPU blocks: 10761, # CPU blocks: 910
INFO 09-06 17:06:43 habana_worker.py:201] Initializing cache engine took 47.29 GiB of device memory (54.34 GiB/94.62 GiB used) and -143.7 MiB of host memory (415 GiB/1007 GiB used)
WARNING 09-06 17:06:43 habana_model_runner.py:1419] Cannot use PT_COMPILE_ONLY_MODE. Warmup time will be negatively impacted. Please update Gaudi Software Suite.
INFO 09-06 17:06:43 habana_model_runner.py:1336] [Warmup][Prompt][1/23] batch_size:2 seq_len:1024 free_mem:40.28 GiB
...
```
zhouyu5 · Sep 20, 2024 · f7c88cb · f7c88cb
1 parent 5d15efc
commit f7c88cb
Showing 1 changed file with 17 additions and 1 deletion.
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
@@ -3,7 +3,9 @@
 ###############################################################################
 
 import collections
+import contextlib
 import dataclasses
+import functools
 import gc
 import itertools
 import math
@@ -1404,7 +1406,21 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
         start_mem = HabanaMemoryProfiler.current_device_memory_usage()
         start_time = time.perf_counter()
 
-        with bc.env_setting("PT_COMPILE_ONLY_MODE", True):
+        compile_only_mode_context = functools.partial(bc.env_setting,
+                                                      "PT_COMPILE_ONLY_MODE",
+                                                      True)
+        can_use_compile_only_mode = True
+        try:
+            with compile_only_mode_context():
+                pass
+            logger.debug("Using PT_COMPILE_ONLY_MODE.")
+        except KeyError:
+            can_use_compile_only_mode = False
+            logger.warning('Cannot use PT_COMPILE_ONLY_MODE. '
+                           'Warmup time will be negatively impacted. '
+                           'Please update Gaudi Software Suite.')
+        with compile_only_mode_context(
+        ) if can_use_compile_only_mode else contextlib.nullcontext():
             self.warmup_all_buckets(self.prompt_buckets, True, kv_caches)
             self.warmup_all_buckets(self.decode_buckets, False, kv_caches)