[Core] Support Torch profiler in Habana Worker (#357)
This PR adds support for profiling execution on HPU via the
VLLM_TORCH_PROFILER_DIR flag, mirroring the existing GPU support.
Profiling can be controlled in two ways:
1. Asynchronously, by posting requests to the running server:
a) to start collecting a profile:
```
curl -X POST http://localhost:8080/start_profile
```
b) to stop collecting a profile:
```
curl -X POST http://localhost:8080/stop_profile
```
2. In a script, by instructing the LLM object to start and stop profiling:
```python
from vllm import LLM, SamplingParams
llm = LLM(...)
llm.start_profile()
llm.stop_profile()
```
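
In both cases the worker only creates a profiler when VLLM_TORCH_PROFILER_DIR is set, so the variable must be in the environment before the LLM object (or the API server behind the curl endpoints above) is constructed. A minimal in-script sketch follows; the output directory and model name are placeholders for illustration, not part of this PR:
```python
import os

# Assumption for illustration: set the trace directory before constructing
# the engine, since the worker reads VLLM_TORCH_PROFILER_DIR at init time.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/vllm_hpu_profile"

from vllm import LLM, SamplingParams

# Placeholder model; any model runnable on the target device works.
llm = LLM(model="facebook/opt-125m")

llm.start_profile()
llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
llm.stop_profile()
```
Traces are written via torch.profiler.tensorboard_trace_handler with use_gzip=True, so they can be inspected with TensorBoard's PyTorch profiler plugin.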
mswiniarsk authored Oct 4, 2024
1 parent f848d27 commit d8ba780
Showing 5 changed files with 48 additions and 6 deletions.
7 changes: 5 additions & 2 deletions vllm/engine/async_llm_engine.py
@@ -16,6 +16,7 @@
from vllm.engine.metrics_types import StatLoggerBase
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutorAsync
from vllm.executor.habana_executor import HabanaExecutorAsync
from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.inputs import PromptType
from vllm.logger import init_logger
@@ -1204,15 +1205,17 @@ def remove_logger(self, logger_name: str) -> None:
async def start_profile(self) -> None:
# using type instead of isinstance to check to avoid capturing
# inherited classes
if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721
if type(self.engine.model_executor) == GPUExecutorAsync or \
type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721
self.engine.model_executor.start_profile()
else:
self.engine.model_executor._run_workers("start_profile")

async def stop_profile(self) -> None:
# using type instead of isinstance to check to avoid capturing
# inherited classes
if type(self.engine.model_executor) == GPUExecutorAsync: # noqa: E721
if type(self.engine.model_executor) == GPUExecutorAsync or \
type(self.engine.model_executor) == HabanaExecutorAsync: # noqa: E721
self.engine.model_executor.stop_profile()
else:
self.engine.model_executor._run_workers("stop_profile")
7 changes: 5 additions & 2 deletions vllm/engine/llm_engine.py
@@ -28,6 +28,7 @@
from vllm.entrypoints.openai.logits_processors import get_logits_processors
from vllm.executor.executor_base import ExecutorBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.executor.habana_executor import HabanaExecutor
from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderLLMInputs,
InputRegistry, LLMInputs, PromptType)
@@ -1794,15 +1795,17 @@ def check_health(self) -> None:
def start_profile(self) -> None:
# using type instead of isinstance to check to avoid capturing
# inherited classes (MultiprocessingGPUExecutor)
if type(self.model_executor) == GPUExecutor: # noqa: E721
if type(self.model_executor) == GPUExecutor or \
type(self.model_executor) == HabanaExecutor: # noqa: E721
self.model_executor.start_profile()
else:
self.model_executor._run_workers("start_profile")

def stop_profile(self) -> None:
# using type instead of isinstance to check to avoid capturing
# inherited classes (MultiprocessingGPUExecutor)
if type(self.model_executor) == GPUExecutor: # noqa: E721
if type(self.model_executor) == GPUExecutor or \
type(self.model_executor) == HabanaExecutor: # noqa: E721
self.model_executor.stop_profile()
else:
self.model_executor._run_workers("stop_profile")
7 changes: 5 additions & 2 deletions vllm/engine/multiprocessing/engine.py
@@ -23,6 +23,7 @@
# yapf: enable
from vllm.envs import VLLM_RPC_TIMEOUT
from vllm.executor.gpu_executor import GPUExecutor
from vllm.executor.habana_executor import HabanaExecutor
from vllm.logger import init_logger
from vllm.outputs import RequestOutput
from vllm.usage.usage_lib import UsageContext
@@ -364,13 +365,15 @@ def _alive(self):
self._last_alive_time = time.time()

def start_profile(self) -> None:
if type(self.engine.model_executor) is GPUExecutor:
if type(self.engine.model_executor) is GPUExecutor or \
type(self.engine.model_executor) is HabanaExecutor:
self.engine.model_executor.start_profile()
else:
self.engine.model_executor._run_workers("start_profile")

def stop_profile(self) -> None:
if type(self.engine.model_executor) is GPUExecutor:
if type(self.engine.model_executor) is GPUExecutor or \
type(self.engine.model_executor) is HabanaExecutor:
self.engine.model_executor.stop_profile()
else:
self.engine.model_executor._run_workers("stop_profile")
6 changes: 6 additions & 0 deletions vllm/executor/habana_executor.py
@@ -192,6 +192,12 @@ def check_health(self) -> None:
# it's running.
return

def start_profile(self) -> None:
self.driver_worker.start_profile()

def stop_profile(self) -> None:
self.driver_worker.stop_profile()

def shutdown(self) -> None:
self.driver_worker.shutdown_inc()

27 changes: 27 additions & 0 deletions vllm/worker/habana_worker.py
@@ -11,6 +11,7 @@
import torch.distributed
from vllm_hpu_extension.profiler import HabanaMemoryProfiler, format_bytes

import vllm.envs as envs
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
@@ -95,6 +96,32 @@ def __init__(
self.cache_engine: List[CacheEngine]
# Initialize gpu_cache as embedding models don't initialize kv_caches
self.hpu_cache: Optional[List[List[torch.tensor]]] = None
# Torch profiler. Enabled and configured through env vars:
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
if envs.VLLM_TORCH_PROFILER_DIR:
torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
logger.info("Profiling enabled. Traces will be saved to: %s",
torch_profiler_trace_dir)
self.profiler = torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.HPU,
],
with_stack=True,
on_trace_ready=torch.profiler.tensorboard_trace_handler(
torch_profiler_trace_dir, use_gzip=True))
else:
self.profiler = None

def start_profile(self):
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
self.profiler.start()

def stop_profile(self):
if self.profiler is None:
raise RuntimeError("Profiler is not enabled.")
self.profiler.stop()

def _set_env_vars(self):
local_rank = self.local_rank
