Remove limitation on max_num_batched_tokens when using LoRA
hlahkar committed Sep 2, 2024
1 parent a8f1d7d commit 8734f8f
Showing 2 changed files with 12 additions and 1 deletion.
vllm/config.py (2 changes: 1 addition & 1 deletion)
@@ -1326,7 +1326,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
+        if not is_hpu() and scheduler_config.max_num_batched_tokens > 65528:
             raise ValueError(
                 "Due to limitations of the custom LoRA CUDA kernel, "
                 "max_num_batched_tokens must be <= 65528 when "
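For context (not part of the commit): with this guard in place, an HPU deployment can combine LoRA with a token budget above the old 65528 cap, which the check still rejects on CUDA. A minimal sketch, assuming the standard vLLM LLM entrypoint and EngineArgs parameter names; the model name and token budget below are placeholders.

from vllm import LLM

# Illustrative only: placeholder model, arbitrary budget above the old cap.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    max_num_batched_tokens=131072,  # > 65528: rejected on CUDA, allowed on HPU
)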
vllm/lora/layers.py (11 changes: 11 additions & 0 deletions)
@@ -327,6 +327,17 @@ def set_mapping(
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         added_tokens_mask = x > self.base_layer.org_vocab_size - 1
         embedding_len = self.indices_len[3]
+        # NOTE(vgoel): These asserts can be skipped when upstreaming.
+        # Can be removed from vllm-fork also once lora functionality
+        # on Gaudi stabilizes.
+        if is_hpu():
+            emb_len = embedding_len
+            x_shape = x.shape
+            ind_shape = self.embeddings_indices[1].shape
+            assert embedding_len == x.shape[0] * x.shape[1], \
+                f"Extra Info: {emb_len}, {x_shape}, {ind_shape}"
+            assert embedding_len <= self.embeddings_indices[1].shape[0], \
+                f"Extra Info: {emb_len}, {x.shape}, {ind_shape}"
         indices = self.embeddings_indices[1][:embedding_len].view_as(x)
         full_lora_a_embeddings = F.embedding(
             x + indices,
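As a side note, a standalone sketch of the invariant those HPU asserts encode: the embedding index count held in indices_len[3] should equal the number of tokens in the (batch, seq_len) input and must fit inside the embeddings_indices row that forward() slices. All shapes and names below are made up for illustration; they are not taken from the diff.

import torch

batch_size, seq_len = 2, 8
x = torch.randint(0, 32000, (batch_size, seq_len))           # token ids, shape (B, S)
embeddings_indices = torch.zeros(2, 1024, dtype=torch.long)  # stand-in for the LoRA index buffer
embedding_len = batch_size * seq_len                         # what indices_len[3] is expected to hold

# Mirrors the first assert: the recorded length covers every token in x.
assert embedding_len == x.shape[0] * x.shape[1]
# Mirrors the second assert: the slice taken in forward() fits inside the buffer.
assert embedding_len <= embeddings_indices[1].shape[0]

indices = embeddings_indices[1][:embedding_len].view_as(x)   # shape (B, S), as in forward()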
