Support Roberta embedding models #9387

Merged
26 commits merged on Nov 14, 2024
Commits (26)
f7e23fb - support head size 32 (maxdebayser, Oct 22, 2024)
10ebc9e - add support for Roberta models (maxdebayser, Oct 15, 2024)
b457cc5 - fix after refactoring (maxdebayser, Nov 11, 2024)
3fe28f6 - Review suggestions (flaviabeo, Nov 12, 2024)
5b75f4a - Merge branch 'upstream_main' into roberta (flaviabeo, Nov 12, 2024)
971acea - Fixes conflicts with new upstream changes (flaviabeo, Nov 12, 2024)
18a2d58 - Merge changes fixes (flaviabeo, Nov 12, 2024)
40ac579 - More fixed related to the upstream merge (flaviabeo, Nov 12, 2024)
e171896 - Adds test for roberta model executor (flaviabeo, Nov 12, 2024)
55912f9 - Asserts for Roberta models instance (flaviabeo, Nov 12, 2024)
6f06a76 - Fix space for linting (flaviabeo, Nov 12, 2024)
d4c8849 - Fix space for linting (flaviabeo, Nov 12, 2024)
b9e64b1 - Modifies test for multilingual-e5-large (flaviabeo, Nov 12, 2024)
366a992 - Fix linting in test (flaviabeo, Nov 13, 2024)
aed1216 - Merge branch 'upstream_main' into roberta (flaviabeo, Nov 13, 2024)
aae474e - trigger ci (flaviabeo, Nov 13, 2024)
07c931c - finish generalizing the Bert classes (maxdebayser, Nov 13, 2024)
4495a50 - Skips test for ROCm unsupported platform (flaviabeo, Nov 13, 2024)
49e8381 - fix roberta position_ids (maxdebayser, Nov 14, 2024)
1267bba - add assert to verify assumption (maxdebayser, Nov 14, 2024)
49cc57b - improve assert (maxdebayser, Nov 14, 2024)
0f334ae - add model to embedding test (maxdebayser, Nov 14, 2024)
f27aae1 - Remove encoder embedding model for compile test (maxdebayser, Nov 14, 2024)
44a9d22 - trigger ci (maxdebayser, Nov 14, 2024)
9f31bd5 - trigger ci (maxdebayser, Nov 14, 2024)
80ead23 - trigger ci (maxdebayser, Nov 14, 2024)
Files changed (showing changes from 4 commits)
csrc/attention/attention_kernels.cu (6 additions, 0 deletions)

@@ -739,6 +739,9 @@ void paged_attention_v1_launcher(
     // NOTE(woosuk): To reduce the compilation time, we only compile for the
     // head sizes that we use in the model. However, we can easily extend this
     // to support any head size which is a multiple of 16.
+    case 32:
+      LAUNCH_PAGED_ATTENTION_V1(32);
+      break;
     case 64:
       LAUNCH_PAGED_ATTENTION_V1(64);
       break;
@@ -903,6 +906,9 @@ void paged_attention_v2_launcher(
     // NOTE(woosuk): To reduce the compilation time, we only compile for the
     // head sizes that we use in the model. However, we can easily extend this
     // to support any head size which is a multiple of 16.
+    case 32:
+      LAUNCH_PAGED_ATTENTION_V2(32);
+      break;
     case 64:
       LAUNCH_PAGED_ATTENTION_V2(64);
       break;
csrc/cpu/attention.cpp (6 additions, 0 deletions)

@@ -385,6 +385,9 @@ void paged_attention_v1_impl_launcher(
   int* seq_lens_ptr = seq_lens.data_ptr<int>();

   switch (head_size) {
+    case 32:
+      LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
+      break;
     case 64:
       LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
       break;
@@ -702,6 +705,9 @@ void paged_attention_v2_impl_launcher(
   int* seq_lens_ptr = seq_lens.data_ptr<int>();

   switch (head_size) {
+    case 32:
+      LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE);
+      break;
     case 64:
       LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
       break;
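Head size 32 is added to both the CUDA and CPU paged-attention dispatch tables above. An encoder's per-head dimension is hidden_size // num_attention_heads, so small BERT/Roberta-family checkpoints with a 384-wide hidden state and 12 heads land on 32, which the kernels did not previously compile for. A minimal sketch of the arithmetic (the checkpoint name is illustrative, not taken from this PR):

```python
from transformers import AutoConfig

# Illustrative: a small encoder config with hidden_size=384 and 12 attention
# heads has a per-head dimension of 32, the new case in the dispatch above.
cfg = AutoConfig.from_pretrained("BAAI/bge-small-en-v1.5")
head_size = cfg.hidden_size // cfg.num_attention_heads
print(head_size)  # 32 for this config
```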
tests/models/embedding/language/test_embedding.py (2 additions, 2 deletions)

@@ -10,11 +10,11 @@
 MODELS = [
     "intfloat/e5-mistral-7b-instruct",
     "BAAI/bge-base-en-v1.5",
-    "BAAI/bge-multilingual-gemma2",
+    "BAAI/bge-multilingual-gemma2"
 ]

 ENCODER_ONLY = [
-    "BAAI/bge-base-en-v1.5",
+    "BAAI/bge-base-en-v1.5"
 ]

vllm/attention/ops/ipex_attn.py (1 addition, 1 deletion)

@@ -10,7 +10,7 @@ class PagedAttention:

     @staticmethod
     def get_supported_head_sizes() -> List[int]:
-        return [64, 80, 96, 112, 128, 256]
+        return [32, 64, 80, 96, 112, 128, 256]

     @staticmethod
     def get_kv_cache_shape(
vllm/attention/ops/paged_attn.py (1 addition, 1 deletion)

@@ -34,7 +34,7 @@ class PagedAttention:

     @staticmethod
     def get_supported_head_sizes() -> List[int]:
-        return [64, 80, 96, 112, 120, 128, 192, 256]
+        return [32, 64, 80, 96, 112, 120, 128, 192, 256]

     @staticmethod
     def get_kv_cache_shape(
vllm/model_executor/models/bert.py (9 additions, 2 deletions)

@@ -313,9 +313,10 @@ def __init__(self,
                  config: BertConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None,
-                 prefix: str = ""):
+                 prefix: str = "",
+                 embedding_class: type = BertEmbedding):
         super().__init__()
-        self.embeddings = BertEmbedding(config)
+        self.embeddings = embedding_class(config)
         self.encoder = BertEncoder(config,
                                    cache_config,
                                    quant_config,
@@ -422,3 +423,9 @@ def pooler(

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         self.model.load_weights(weights)
+
+    def _build_model(self,
+                     config: BertConfig,
+                     cache_config: Optional[CacheConfig] = None,
+                     quant_config: Optional[QuantizationConfig] = None):
+        return BertModel(config, cache_config, quant_config, embedding_class=BertEmbedding)
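The new embedding_class parameter and the _build_model hook make the Bert stack reusable by subclasses that only need to swap the embedding layer (presumably BertEmbeddingModel constructs its inner model through this hook). A minimal sketch of the pattern; the My* class names are hypothetical, only the vLLM Bert classes come from this PR:

```python
from vllm.model_executor.models.bert import (BertEmbedding, BertEmbeddingModel,
                                             BertModel)


class MyEmbedding(BertEmbedding):
    """Hypothetical embedding variant; override only what differs from Bert."""


class MyEmbeddingModel(BertEmbeddingModel):

    def _build_model(self, config, cache_config=None, quant_config=None):
        # Reuse the Bert encoder stack, but construct it with the custom
        # embedding class instead of the default BertEmbedding.
        return BertModel(config, cache_config, quant_config,
                         embedding_class=MyEmbedding)
```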
vllm/model_executor/models/registry.py (2 additions, 0 deletions)

@@ -94,6 +94,8 @@
 _EMBEDDING_MODELS = {
     # [Text-only]
     "BertModel": ("bert", "BertEmbeddingModel"),
+    "RobertaModel": ("roberta", "RobertaEmbeddingModel"),
+    "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"),
     "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"),
     "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"),
     "LlamaModel": ("llama", "LlamaEmbeddingModel"),
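With these registry entries, Hugging Face checkpoints whose config lists RobertaModel or XLMRobertaModel as the architecture resolve to RobertaEmbeddingModel. A minimal offline usage sketch, assuming vLLM's LLM.encode() API for embedding models; the checkpoint is the multilingual-e5-large model exercised in this PR's tests:

```python
from vllm import LLM

# intfloat/multilingual-e5-large is an XLM-Roberta checkpoint; the registry
# entries above route it to RobertaEmbeddingModel.
llm = LLM(model="intfloat/multilingual-e5-large")
outputs = llm.encode(["query: vLLM now supports Roberta embedding models"])
print(len(outputs[0].outputs.embedding))  # embedding dimension (1024 for this model)
```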
vllm/model_executor/models/roberta.py (new file, 85 additions)

@@ -0,0 +1,85 @@
from typing import Optional

import torch
from torch import nn
from transformers import RobertaConfig

from vllm.config import CacheConfig, PoolerConfig
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization.base_config import (
    QuantizationConfig)
from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
from vllm.model_executor.models.bert import (BertEmbedding, BertEmbeddingModel,
                                             BertEncoder, BertModel)


class RobertaModel(BertModel):

    def __init__(
        self,
        config: RobertaConfig,
        cache_config: Optional[CacheConfig] = None,
        quant_config: Optional[QuantizationConfig] = None,
    ):
        # Skip BertModel.__init__()
        nn.Module.__init__(self)
        self.embeddings = RobertaEmbedding(config)
        self.encoder = BertEncoder(config, cache_config, quant_config)


class RobertaEmbedding(BertEmbedding):

    def __init__(self, config: RobertaConfig):
        # Skip BertEmbedding.__init__()
        nn.Module.__init__(self)
        self.size = config.hidden_size
        self.word_embeddings = VocabParallelEmbedding(config.vocab_size,
                                                      config.hidden_size)
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size,
                                                padding_idx=self.padding_idx)

        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size,
                                      eps=config.layer_norm_eps)
        self.position_ids = nn.Parameter(
            torch.empty((1, config.max_position_embeddings)), )

        self.position_embedding_type = config.position_embedding_type
        if self.position_embedding_type != "absolute":
            raise ValueError("Only 'absolute' position_embedding_type" +
                             " is supported")


class RobertaEmbeddingModel(BertEmbeddingModel):
    """A model that uses Roberta to provide embedding functionalities.

    This class encapsulates the RobertaModel and provides an interface for
    embedding operations and customized pooling functions.

    Attributes:
        model: An instance of RobertaModel used for forward operations.
        _pooler: An instance of Pooler used for pooling operations.
    """

    def __init__(self,
                 config: RobertaConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 pooler_config: Optional[PoolerConfig] = None) -> None:
        nn.Module.__init__(self)
        self.model = RobertaModel(config, cache_config, quant_config)
        self._pooler = Pooler.from_config_with_defaults(
            pooler_config,
            pooling_type=PoolingType.CLS,
            normalize=True,
            softmax=False)

    def _build_model(self,
                     config: RobertaConfig,
                     cache_config: Optional[CacheConfig] = None,
                     quant_config: Optional[QuantizationConfig] = None):
        return BertModel(config, cache_config, quant_config,
                         embedding_class=RobertaEmbedding)
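One Roberta-specific detail not yet visible in this 4-commit view (it arrives with the later "fix roberta position_ids" commit) is that Roberta's absolute position ids do not start at 0: non-padding tokens count from padding_idx + 1, and padding tokens keep position padding_idx. A sketch of that convention, following the Hugging Face Roberta implementation:

```python
import torch


def create_position_ids(input_ids: torch.Tensor,
                        padding_idx: int) -> torch.Tensor:
    # Non-padding tokens get consecutive positions starting at padding_idx + 1;
    # padding tokens are pinned to padding_idx, matching HF's
    # create_position_ids_from_input_ids.
    mask = input_ids.ne(padding_idx).int()
    return torch.cumsum(mask, dim=1) * mask + padding_idx


# Example: with padding_idx=1, token positions start at 2.
ids = torch.tensor([[0, 42, 77, 1, 1]])
print(create_position_ids(ids, padding_idx=1))  # tensor([[2, 3, 4, 1, 1]])
```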