remove leftovers from habana_main
kzawora-intel committed Oct 16, 2024
1 parent d2ce468 · commit b6428cd
Showing 2 changed files with 5 additions and 7 deletions.
vllm/engine/multiprocessing/engine.py (4 additions, 6 deletions)
@@ -23,7 +23,6 @@
 # yapf: enable
 from vllm.envs import VLLM_RPC_TIMEOUT
 from vllm.executor.gpu_executor import GPUExecutor
-from vllm.executor.hpu_executor import HPUExecutor
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.usage.usage_lib import UsageContext
@@ -33,6 +32,7 @@

 logger = init_logger(__name__)

+POLLING_TIMEOUT_MS = 10000
 HEALTHY_RESPONSE = (pickle.dumps(VLLM_RPC_SUCCESS_STR), )


@@ -209,7 +209,7 @@ def run_engine_loop(self):
         self._alive()
         if not self.engine.has_unfinished_requests():
             # Poll until there is work to do.
-            while self.input_socket.poll(timeout=VLLM_RPC_TIMEOUT) == 0:
+            while self.input_socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
                 self._alive()
                 self.engine.do_log_stats()
                 logger.debug("Waiting for new requests in engine loop.")
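
Note: this hunk decouples the engine's idle-poll interval from the VLLM_RPC_TIMEOUT environment value by introducing the fixed POLLING_TIMEOUT_MS constant. A minimal sketch of the same pattern using plain pyzmq; the socket type, address, and print statement are illustrative, not vLLM's:

import zmq

POLLING_TIMEOUT_MS = 10000  # fixed idle-poll interval, in milliseconds

ctx = zmq.Context()
socket = ctx.socket(zmq.PULL)
socket.bind("tcp://127.0.0.1:5555")

# poll() returns 0 when no message arrives within the timeout window,
# so the loop wakes up periodically to do upkeep while staying idle.
while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0:
    print("Waiting for new requests...")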
@@ -368,15 +368,13 @@ def _alive(self):
         self._last_alive_time = time.time()

     def start_profile(self) -> None:
-        if type(self.engine.model_executor) is GPUExecutor or \
-           type(self.engine.model_executor) is HPUExecutor:
+        if type(self.engine.model_executor) is GPUExecutor:
             self.engine.model_executor.start_profile()
         else:
             self.engine.model_executor._run_workers("start_profile")

     def stop_profile(self) -> None:
-        if type(self.engine.model_executor) is GPUExecutor or \
-           type(self.engine.model_executor) is HPUExecutor:
+        if type(self.engine.model_executor) is GPUExecutor:
             self.engine.model_executor.stop_profile()
         else:
             self.engine.model_executor._run_workers("stop_profile")
Second changed file (1 addition, 1 deletion)
@@ -121,7 +121,7 @@ def apply_fp8_linear(
         qinput, x_scale = ops.scaled_fp8_quant(
             input,
             input_scale,
-            batch_dim_padding=17,
+            num_token_padding=17,
             use_per_token_if_dynamic=use_per_token_if_dynamic)

         per_tensor_weights = (weight_scale.numel() == 1)
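
Note: this hunk only renames the keyword argument batch_dim_padding to num_token_padding to match the updated ops.scaled_fp8_quant signature; the value 17 is unchanged. A hedged sketch of what padding the token dimension means, assuming a 2-D activation tensor of shape [num_tokens, hidden] (the actual quantization and returned scale are elided):

import torch

def pad_token_dim(x: torch.Tensor, num_token_padding: int) -> torch.Tensor:
    # Grow the token (first) dimension to at least num_token_padding
    # rows; the extra rows are zeros and can be sliced off after the GEMM.
    pad = max(num_token_padding - x.shape[0], 0)
    return torch.nn.functional.pad(x, (0, 0, 0, pad))

x = torch.randn(5, 64)
print(pad_token_dim(x, 17).shape)  # torch.Size([17, 64])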
