Fix INC FP8 inference after rebase (HabanaAI#333)
This PR fixes the "RuntimeError: HPU does not have device capability."
error introduced after the rebase, and fixes loading weights on CPU for
quantization.
kzawora-intel authored Sep 24, 2024
1 parent e16918d · commit 73f4b48
Showing 2 changed files with 2 additions and 9 deletions.
2 changes: 1 addition & 1 deletion vllm/model_executor/model_loader/loader.py
@@ -59,7 +59,7 @@ def device_loading_context(module: torch.nn.Module,
 
     # Store original device states and move parameters to GPU if they're on CPU
     for name, p in module.named_parameters():
-        if p.device.type == "cpu":
+        if p.device.type == "cpu" and target_device.type != 'hpu':
             original_device_states[name] = p.device
             p.data = p.data.to(target_device)
         # Parameters already on target device are not touched
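With an HPU target, weights that load on CPU now stay on CPU so that INC FP8
quantization can process them before any device transfer. Below is a minimal,
self-contained sketch of the guarded context manager, assuming a simplified
form of device_loading_context; the restore-on-exit behavior is inferred from
the function's name and the original_device_states bookkeeping visible in the
diff, and is not part of this change.

from contextlib import contextmanager
from typing import Dict

import torch


@contextmanager
def device_loading_context(module: torch.nn.Module,
                           target_device: torch.device):
    """Temporarily move CPU parameters to target_device (sketch)."""
    original_device_states: Dict[str, torch.device] = {}
    for name, p in module.named_parameters():
        # Skip the move on HPU so quantization sees the CPU weights.
        if p.device.type == "cpu" and target_device.type != 'hpu':
            original_device_states[name] = p.device
            p.data = p.data.to(target_device)
    try:
        yield module
    finally:
        # Restore any parameters that were temporarily moved.
        for name, p in module.named_parameters():
            if name in original_device_states:
                p.data = p.data.to(original_device_states[name])

On CUDA the context still moves CPU weights to the GPU as before; only the
HPU path changes.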
9 changes: 1 addition & 8 deletions vllm/platforms/hpu.py
@@ -1,18 +1,11 @@
-from typing import Optional
-
 import torch
 
-from .interface import DeviceCapability, Platform, PlatformEnum
+from .interface import Platform, PlatformEnum
 
 
 class HpuPlatform(Platform):
     _enum = PlatformEnum.HPU
 
-    @staticmethod
-    def get_device_capability(
-            device_id: int = 0) -> Optional[DeviceCapability]:
-        raise RuntimeError("HPU does not have device capability.")
-
     @staticmethod
     def inference_mode():
         return torch.no_grad()
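With the override gone, HpuPlatform inherits get_device_capability from the
base Platform class, so capability probes during quantized-model loading no
longer hit the RuntimeError. The sketch below illustrates the fallback with
simplified stand-ins for vllm's interface module; the DeviceCapability shape
and the None-returning default are assumptions about that module, which this
diff does not show.

from typing import NamedTuple, Optional


class DeviceCapability(NamedTuple):
    major: int
    minor: int


class Platform:

    @staticmethod
    def get_device_capability(
            device_id: int = 0) -> Optional[DeviceCapability]:
        # Unknown capability: report None instead of raising.
        return None


class HpuPlatform(Platform):
    # With the override removed, the inherited default applies.
    pass


# Callers that previously crashed now get None and can branch on it:
capability = HpuPlatform.get_device_capability()
assert capability is None

Code that gates FP8 paths on device capability can then treat None as "not
applicable" instead of propagating an exception.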