diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index 1ab81898b5f7..c7376a7c504f 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -6,4 +6,4 @@ ray == 2.32.0
 triton
 pandas
 tabulate
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0e05e25
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@0a7adab
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index dd99dd94e4ad..422c1f4bd8f8 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -4,7 +4,8 @@
                                         DeviceAwareBlockAllocator)
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
 from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
-from vllm.utils import Device, is_hpu
+from vllm.platforms import current_platform
+from vllm.utils import Device
 
 
 class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
@@ -53,7 +54,7 @@ def create(
             before CPU block IDs.
         """
         # For HPU, block id 0 is used only for padding
-        reserved_blocks = 1 if is_hpu() else 0
+        reserved_blocks = 1 if current_platform.is_hpu() else 0
         block_ids = list(
             range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
         num_gpu_blocks -= reserved_blocks
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 8e7335a4016e..b1160e8d2f16 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -13,8 +13,9 @@
 from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
-from vllm.utils import Device, is_hpu
+from vllm.utils import Device
 
 logger = init_logger(__name__)
@@ -185,7 +186,7 @@ def __init__(
         # Initialize the free blocks.
         self.free_blocks: List[PhysicalTokenBlock] = []
         # For HPU, block id 0 is used only for padding
-        reserved_blocks = 1 if is_hpu() else 0
+        reserved_blocks = 1 if current_platform.is_hpu() else 0
         for i in range(reserved_blocks, num_blocks):
             block = PhysicalTokenBlock(device=device,
                                        block_number=i,
diff --git a/vllm/utils.py b/vllm/utils.py
index e5cef9b4419c..ca36ad8cd959 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -337,11 +337,6 @@ def is_neuron() -> bool:
     return transformers_neuronx is not None
 
 
-@lru_cache(maxsize=None)
-def is_hpu() -> bool:
-    return _is_habana_frameworks_installed() or _is_built_for_hpu()
-
-
 @lru_cache(maxsize=None)
 def is_fake_hpu() -> bool:
     return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'
@@ -359,21 +354,6 @@ def hpu_backend_string():
     return backend_string
 
 
-@lru_cache(maxsize=None)
-def _is_habana_frameworks_installed() -> bool:
-    from importlib import util
-    return util.find_spec('habana_frameworks') is not None
-
-
-@lru_cache(maxsize=None)
-def _is_built_for_hpu() -> bool:
-    from importlib.metadata import PackageNotFoundError, version
-    try:
-        return "gaudi" in version("vllm")
-    except PackageNotFoundError:
-        return False
-
-
 @lru_cache(maxsize=None)
 def is_xpu() -> bool:
     from importlib.metadata import PackageNotFoundError, version
@@ -777,7 +757,7 @@ def print_warning_once(msg: str) -> None:
 
 
 def get_device() -> str:
-    if is_hpu():
+    if current_platform.is_hpu():
         return "hpu"
     return "cuda"
@@ -797,7 +777,7 @@ def is_pin_memory_available() -> bool:
     elif is_neuron():
         print_warning_once("Pin memory is not supported on Neuron.")
         return False
-    elif is_hpu():
+    elif current_platform.is_hpu():
         print_warning_once("Pin memory is not supported on HPU.")
         return False
     elif is_cpu() or is_openvino():
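
Note: this diff removes the ad-hoc is_hpu() helper from vllm/utils.py (which probed
for the habana_frameworks package and the "gaudi" build tag) and routes every call
site through the centralized check in vllm.platforms. A minimal sketch of the
call-site migration, assuming only what the hunks above show (current_platform
exposing an is_hpu() method); the comments label the old pattern for contrast:

    # Before: package-level probing via the utility removed above
    # from vllm.utils import is_hpu
    # reserved_blocks = 1 if is_hpu() else 0

    # After: query the resolved platform object instead
    from vllm.platforms import current_platform

    # Block id 0 is reserved for padding only on HPU (see the allocator hunks above)
    reserved_blocks = 1 if current_platform.is_hpu() else 0

Keeping the detection logic behind current_platform means callers no longer need
per-device lru_cache'd helpers scattered through vllm/utils.py.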