Remove limitation on max_num_batched_tokens when using LoRA
hlahkar committed Sep 2, 2024
1 parent a8f1d7d commit 8734f8f
Showing 2 changed files with 12 additions and 1 deletion.
vllm/config.py (2 changes: 1 addition & 1 deletion)
@@ -1326,7 +1326,7 @@ def verify_with_model_config(self, model_config: ModelConfig):
                            model_config.quantization)
 
     def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
-        if scheduler_config.max_num_batched_tokens > 65528:
+        if not is_hpu() and scheduler_config.max_num_batched_tokens > 65528:
             raise ValueError(
                 "Due to limitations of the custom LoRA CUDA kernel, "
                 "max_num_batched_tokens must be <= 65528 when "
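For context (not part of the commit): with this guard in place, an HPU deployment can combine LoRA with a token budget above the old 65528 cap, which the check still rejects on CUDA. A minimal sketch, assuming the standard vLLM LLM entrypoint and EngineArgs parameter names; the model name and token budget below are placeholders.

from vllm import LLM

# Illustrative only: placeholder model, arbitrary budget above the old cap.
llm = LLM(
    model="meta-llama/Llama-2-7b-hf",
    enable_lora=True,
    max_num_batched_tokens=131072,  # > 65528: rejected on CUDA, allowed on HPU
)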
vllm/lora/layers.py (11 changes: 11 additions & 0 deletions)
@@ -327,6 +327,17 @@ def set_mapping(
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         added_tokens_mask = x > self.base_layer.org_vocab_size - 1
         embedding_len = self.indices_len[3]
+        # NOTE(vgoel): These asserts can be skipped when upstreaming.
+        # Can be removed from vllm-fork also once lora functionality
+        # on Gaudi stabilizes.
+        if is_hpu():
+            emb_len = embedding_len
+            x_shape = x.shape
+            ind_shape = self.embeddings_indices[1].shape
+            assert embedding_len == x.shape[0] * x.shape[1], \
+                f"Extra Info: {emb_len}, {x_shape}, {ind_shape}"
+            assert embedding_len <= self.embeddings_indices[1].shape[0], \
+                f"Extra Info: {emb_len}, {x.shape}, {ind_shape}"
         indices = self.embeddings_indices[1][:embedding_len].view_as(x)
         full_lora_a_embeddings = F.embedding(
             x + indices,
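As a side note, a standalone sketch of the invariant those HPU asserts encode: the embedding index count held in indices_len[3] should equal the number of tokens in the (batch, seq_len) input and must fit inside the embeddings_indices row that forward() slices. All shapes and names below are made up for illustration; they are not taken from the diff.

import torch

batch_size, seq_len = 2, 8
x = torch.randint(0, 32000, (batch_size, seq_len))           # token ids, shape (B, S)
embeddings_indices = torch.zeros(2, 1024, dtype=torch.long)  # stand-in for the LoRA index buffer
embedding_len = batch_size * seq_len                         # what indices_len[3] is expected to hold

# Mirrors the first assert: the recorded length covers every token in x.
assert embedding_len == x.shape[0] * x.shape[1]
# Mirrors the second assert: the slice taken in forward() fits inside the buffer.
assert embedding_len <= embeddings_indices[1].shape[0]

indices = embeddings_indices[1][:embedding_len].view_as(x)   # shape (B, S), as in forward()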
