diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py
index 2af5634a8d1a..662c53486b4c 100644
--- a/vllm/hpu/ops.py
+++ b/vllm/hpu/ops.py
@@ -191,3 +191,78 @@ def prompt_attention(
                                                valid_seq_lengths, 'right')
         attn_weights = attn_weights.transpose(1, 2)
     return attn_weights
+
+
+def dispatch_bgmv_linear(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    wa_t_all: torch.Tensor,
+    wb_t_all: torch.Tensor,
+    indices: torch.LongTensor,
+    layer_idx: int,
+    scale: float,
+):
+    """
+    `wa_t_all` and `wb_t_all` contain all LoRA A and LoRA B weight matrices
+    stacked into single tensors, assuming same rank. HPU handles no-LoRA
+    requests using zero valued A and B tensors. These zero valued tensors are
+    appended at the end of `wa_t_all` and `wb_t_all` during initialization.
+    For custom BGMV, the corresponding `wa` and `wb` for each batch are
+    created based on the lora_index of each sample.
+
+    For example:
+    `wa_t_all` is a tensor of shape (num_loras, num_layers, lora_rank,
+    hidden_dim), where `wa_t_all[-1]` is the zero valued tensor which handles
+    the no-LoRA case. The `wa` tensor for a batch of size batch_size will
+    have a shape of (batch_size, hidden_dim, lora_rank).
+
+    This method avoids a for-loop as well as graph breaks.
+    """
+    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
+    max_loras = wa_t_all.size(0)
+    # Wrap-around for negative indices
+    indices = indices % max_loras
+    wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
+    wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
+
+    x = x.unsqueeze(1)
+    out = x @ wa
+    out = out @ wb
+    out = out.squeeze(1)
+    y += out * scale
+
+
+def dispatch_bgmv_embedding(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    wa_t_all: torch.Tensor,
+    indices: torch.LongTensor,
+    layer_idx: int,
+    scale: float,
+):
+    """
+    `wa_t_all` contains all LoRA A weight matrices stacked into a single
+    tensor, assuming same rank. HPU handles no-LoRA requests using a zero
+    valued A tensor. This zero valued tensor is appended at the end of
+    `wa_t_all` during initialization. For custom BGMV, the corresponding
+    `wa` for each batch is created based on the lora_index of the sample.
+
+    For example:
+    `wa_t_all` is a tensor of shape (num_loras, num_layers, lora_rank,
+    hidden_dim), where `wa_t_all[-1]` is the zero valued tensor which handles
+    the no-LoRA case. The `wa` tensor for a batch of size batch_size will
+    have a shape of (batch_size, hidden_dim, lora_rank).
+
+
+    This method avoids a for-loop as well as graph breaks.
+    """
+    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
+    max_loras = wa_t_all.size(0)
+    # Wrap-around for negative indices
+    indices = indices % max_loras
+    wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
+
+    x = x.unsqueeze(1)
+    out = x @ wa
+    out = out.squeeze(1)
+    y += out * scale
\ No newline at end of file
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 912cd0b47202..4a45f3fda88f 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -29,6 +29,9 @@
                                                VocabParallelEmbedding)
 from vllm.utils import is_hpu
 
+if is_hpu():
+    from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear
+
 if TYPE_CHECKING:
     pass
 
@@ -64,81 +67,6 @@ def dec(*args, **kwargs):
     return dec
 
 
-def custom_bgmv(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    wa_t_all: torch.Tensor,
-    wb_t_all: torch.Tensor,
-    indices: torch.LongTensor,
-    layer_idx: int,
-    scale: float,
-):
-    """
-    `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices
-    stacked into single tensors, assuming same rank. HPU handles no-LoRA
-    requests using zero valued A and B tensors. These zero valued tensors are
-    appended at the end of `wa_t_all` and `wb_t_all` during initialization. For
-    custom BGMV, the corresponding `wa` and `wb` for each batch is created
-    based on the lora_index of each sample.
-
-    For example:
-    `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank,
-    hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles
-    no-LoRA case. The `wa` tensor for a batch of size batch_Size will have
-    a shape of (batch_size, num_layers, hidden_dim, lora_rank)
-
-    This method avoids for-loop as well as graph breaks.
-    """
-    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
-    max_loras = wa_t_all.size(0)
-    # Wrap-around for negative indices
-    indices = indices % max_loras
-    wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
-    wb = torch.index_select(wb_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
-
-    x = x.unsqueeze(1)
-    out = x @ wa
-    out = out @ wb
-    out = out.squeeze(1)
-    y += out * scale
-
-
-def custom_bgmv_embed(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    wa_t_all: torch.Tensor,
-    indices: torch.LongTensor,
-    layer_idx: int,
-    scale: float,
-):
-    """
-    `wa_t_all` contains all LoRA A weight matrices stacked into a single tensor
-    assuming same rank. HPU handles no-LoRA requests using zero valued A
-    tensor. This zero valued tensor is appended at the end of `wa_t_all` during
-    initialization. For custom BGMV, the corresponding wa for each batch is
-    created based on the lora_index of the sample.
-
-    For example:
-    `wa_t_all` is tensor of shape (num_loras, num_layers, lora_rank,
-    hidden_dim), where `wa_t_all[-1]` is zero valued tensor which handles
-    no-LoRA case. The wa tensor for a batch of size batch_Size will have a
-    shape of (batch_size, num_layers, lora_rank, hidden_dim)
-
-
-    This method avoids for-loop as well as graph breaks.
-    """
-    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
-    max_loras = wa_t_all.size(0)
-    # Wrap-around for negative indices
-    indices = indices % max_loras
-    wa = torch.index_select(wa_t_all, 0, indices)[:, 0, :, :].transpose(-1, -2)
-
-    x = x.unsqueeze(1)
-    out = x @ wa
-    out = out.squeeze(1)
-    y += out * scale
-
-
 def _apply_lora(
     x: torch.Tensor,
     lora_a_stacked: torch.Tensor,
@@ -166,7 +94,8 @@ def _apply_lora(
     output = output.view(-1, output.shape[-1])
     indices = indices.view(-1)
     if is_hpu():
-        custom_bgmv(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0)
+        dispatch_bgmv_linear(output, x, lora_a_stacked, lora_b_stacked,
+                             indices, 0, 1.0)
     else:
         add_lora(output, x, lora_a_stacked, lora_b_stacked, indices, 0, 1.0)
     return output.view_as(org_output)
@@ -207,7 +136,7 @@ def _apply_lora_packed_nslice(
     offset_left = 0
     for slice_idx in range(len(output_slices)):
         if is_hpu():
-            custom_bgmv(
+            dispatch_bgmv_linear(
                 output[:, offset_left:offset_left + output_slices[slice_idx]],
                 x, lora_a_stacked[slice_idx], lora_b_stacked[slice_idx],
                 indices, 0, 1.0)
@@ -416,9 +345,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
                 full_lora_a_embeddings.shape[0] *
                 full_lora_a_embeddings.shape[1], -1)
         if is_hpu():
-            custom_bgmv_embed(full_output, full_lora_a_embeddings,
-                              self.lora_b_stacked,
-                              self.indices[:self.indices_len[0]], 0, 1.0)
+            dispatch_bgmv_embedding(full_output, full_lora_a_embeddings,
+                                    self.lora_b_stacked,
+                                    self.indices[:self.indices_len[0]], 0, 1.0)
         else:
             bgmv(full_output, full_lora_a_embeddings, self.lora_b_stacked,
                  self.indices[:self.indices_len[0]], 0, 1.0)
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index ce7a0ad8dd1f..d129bb5cbc0c 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -1556,16 +1556,18 @@ def execute_model(
                 selected_token_indices=sampling_metadata.selected_token_indices
             )
 
-        from vllm.lora.layers import VocabParallelEmbeddingWithLoRA
-        property = vars(self.model.model)
-        model = list(property['_modules'].values())[0]
-        property = vars(model)
-        modules = list(property['_modules'].values())
-        for module in modules:
-            if isinstance(module, VocabParallelEmbeddingWithLoRA):
-                for i in range(0, 4):
-                    module.indices_len[
-                        i] = sampling_metadata.selected_token_indices.numel()
+        if self.lora_config:
+            from vllm.lora.layers import VocabParallelEmbeddingWithLoRA
+            property = vars(self.model.model)
+            model = list(property['_modules'].values())[0]
+            property = vars(model)
+            modules = list(property['_modules'].values())
+            for module in modules:
+                if isinstance(module, VocabParallelEmbeddingWithLoRA):
+                    for i in range(0, 4):
+                        module.indices_len[
+                            i] = sampling_metadata.selected_token_indices.numel(
+                            )
 
         # Compute the logits.
         with self.profiler.record_event(