diff --git a/vllm/distributed/device_communicators/hpu_communicator.py b/vllm/distributed/device_communicators/hpu_communicator.py
index cc9b19ce022b..840f26b31797 100644
--- a/vllm/distributed/device_communicators/hpu_communicator.py
+++ b/vllm/distributed/device_communicators/hpu_communicator.py
@@ -3,9 +3,11 @@
 from torch.distributed import ProcessGroup
 
 from vllm.platforms import current_platform
+from vllm.utils import is_fake_hpu
 
 if current_platform.is_hpu():
-    import habana_frameworks.torch as htorch  # noqa: F401
+    if not is_fake_hpu():
+        import habana_frameworks.torch as htorch  # noqa: F401
 
 
 class HpuCommunicator:
@@ -22,7 +24,8 @@ def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
         # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
         # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
         # (which is required for tensor parallel HPUGraph inference)
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         dist.all_reduce(x, group=self.group)
         return x
 
@@ -37,7 +40,8 @@ def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
                                     dtype=x.dtype,
                                     device=x.device)
         # All-gather.
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         dist.all_gather_into_tensor(output_tensor, x, group=self.group)
         # Reshape
         output_tensor = output_tensor.movedim(0, dim)
diff --git a/vllm/executor/ray_habana_executor.py b/vllm/executor/ray_habana_executor.py
index 9e0a89cbeb8a..37498453cc23 100644
--- a/vllm/executor/ray_habana_executor.py
+++ b/vllm/executor/ray_habana_executor.py
@@ -13,7 +13,7 @@
 from vllm.utils import (_run_task_with_lock,
                         error_on_invalid_device_count_status,
                         get_distributed_init_method, get_ip, get_open_port,
-                        get_vllm_instance_id, make_async)
+                        get_vllm_instance_id, is_fake_hpu, make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -87,18 +87,20 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         driver_ip = get_ip()
         worker_wrapper_kwargs = self._get_worker_wrapper_args()
         for bundle_id, bundle in enumerate(placement_group.bundle_specs):
-            if not bundle.get("HPU", 0):
+            resource_name = "HPU" if not is_fake_hpu() else "CPU"
+            if not bundle.get(resource_name, 0):
                 continue
             scheduling_strategy = PlacementGroupSchedulingStrategy(
                 placement_group=placement_group,
                 placement_group_capture_child_tasks=True,
                 placement_group_bundle_index=bundle_id,
             )
-
+            resources = {'HPU': num_gpus} if not is_fake_hpu() else {}
+            num_cpus = 0 if not is_fake_hpu() else num_gpus
             worker = ray.remote(
-                num_cpus=0,
+                num_cpus=num_cpus,
                 num_gpus=0,
-                resources={'HPU': num_gpus},
+                resources=resources,
                 scheduling_strategy=scheduling_strategy,
                 **ray_remote_kwargs,
             )(RayWorkerWrapper).remote(**worker_wrapper_kwargs)
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 507dc04f4812..8259e2fc49a8 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -3,7 +3,7 @@
 from vllm.config import ParallelConfig
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest
-from vllm.utils import get_ip, is_hip, is_hpu, is_tpu, is_xpu
+from vllm.utils import get_ip, is_fake_hpu, is_hip, is_hpu, is_tpu, is_xpu
 from vllm.worker.worker_base import WorkerWrapperBase
 
 logger = init_logger(__name__)
@@ -97,7 +97,7 @@ def initialize_ray_cluster(
     if is_tpu():
         device_str = "TPU"
     elif is_hpu():
-        device_str = "HPU"
+        device_str = "HPU" if not is_fake_hpu() else 'CPU'
     # Create placement group for worker processes
     current_placement_group = ray.util.get_current_placement_group()
     if current_placement_group: