diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index aa01e9fb77af..015dbb8bd0d5 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1250,6 +1250,8 @@ def _get_logits(
                                nan=float("-inf"),
                                posinf=float("inf"),
                                neginf=float("-inf")))
+        if is_hpu():
+            lora_logits = lora_logits[:logits.shape[0], :]
         logits[:,
                self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
                lora_logits.shape[1]] = lora_logits
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index e8d39591cb17..f819e3816fd3 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -24,7 +24,7 @@
 from vllm.lora.utils import (from_layer, from_layer_logits_processor,
                              parse_fine_tuned_lora_name, replace_submodule)
 from vllm.model_executor.models.interfaces import SupportsLoRA
-from vllm.utils import get_device, is_pin_memory_available
+from vllm.utils import get_device, is_hpu, is_pin_memory_available

 logger = init_logger(__name__)

@@ -829,6 +829,8 @@ def create_lora_manager(
     """Create a LoRA adapter for a given model."""
     if not hasattr(model, "supported_lora_modules"):
         raise ValueError(f"Model {type(model)} is not supported for LoRA.")
+    if is_hpu():
+        max_num_batched_tokens = 3 * max_num_batched_tokens
     lora_manager = lora_manager_cls(
         model=model,
         max_num_seqs=max_num_seqs,
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 161364ce05f9..685e9c0140d8 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -1937,14 +1937,6 @@ def execute_model(
         )

         if self.lora_config:
-            from vllm.lora.layers import VocabParallelEmbeddingWithLoRA
-            modules = unwrap_model(self.model.model)
-            for module in modules:
-                if isinstance(module, VocabParallelEmbeddingWithLoRA):
-                    for i in range(0, len(module.indices_len)):
-                        module.indices_len[
-                            i] = sampling_metadata.selected_token_indices.numel(
-                            )
             lora_logits_mask: torch.Tensor = model_input.lora_logits_mask
             LoraMask.setLoraMask(
                 lora_logits_mask.index_select(