From f7e23fb34c97bf288f11271c15717593f30780c2 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 22 Oct 2024 20:45:55 -0300 Subject: [PATCH 01/24] support head size 32 Signed-off-by: Max de Bayser --- csrc/attention/attention_kernels.cu | 6 ++++++ csrc/cpu/attention.cpp | 6 ++++++ vllm/attention/ops/ipex_attn.py | 2 +- vllm/attention/ops/paged_attn.py | 2 +- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu index bcd170411e7cb..c53cda16d4714 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cu @@ -739,6 +739,9 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V1(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V1(64); break; @@ -903,6 +906,9 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V2(64); break; diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e3953c7c45719..e73eca1b345fd 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -385,6 +385,9 @@ void paged_attention_v1_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; @@ -702,6 +705,9 @@ void paged_attention_v2_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 6b270ffd5bc00..8df6d4ced9dc6 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -10,7 +10,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] + return [32, 64, 80, 96, 112, 128, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 92023d5b75f5a..076f151ffcb61 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -34,7 +34,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 120, 128, 192, 256] + return [32, 64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( From 10ebc9e2907dd2bd353e33846c5c3724979c1dd8 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Tue, 15 Oct 2024 15:53:24 -0300 Subject: [PATCH 02/24] add support for Roberta models Signed-off-by: Max de Bayser --- vllm/model_executor/models/registry.py | 2 + vllm/model_executor/models/roberta.py | 74 ++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 vllm/model_executor/models/roberta.py diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 32750602b988c..ad6dae971593e 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,6 +94,8 @@ _EMBEDDING_MODELS = { # 
[Text-only] "BertModel": ("bert", "BertEmbeddingModel"), + "RobertaModel": ("roberta", "RobertaEmbeddingModel"), + "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py new file mode 100644 index 0000000000000..1fff4e354a343 --- /dev/null +++ b/vllm/model_executor/models/roberta.py @@ -0,0 +1,74 @@ +from typing import Optional + +from torch import nn +from transformers import RobertaConfig + +from vllm.config import CacheConfig +from vllm.model_executor.layers.pooler import Pooler, PoolingConfig +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.models.bert import (BertEmbedding, BertEmbeddingModel, + BertEncoder, BertModel) + + +class RobertaModel(BertModel): + + def __init__( + self, + config: RobertaConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + # Skip BertModel.__init__() + nn.Module.__init__(self) + self.embeddings = RobertaEmbedding(config) + self.encoder = BertEncoder(config, cache_config, quant_config) + + +class RobertaEmbedding(BertEmbedding): + + def __init__(self, config: RobertaConfig): + # Skip BertEmbedding.__init__() + nn.Module.__init__(self) + self.size = config.hidden_size + self.word_embeddings = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size, + padding_idx=self.padding_idx) + + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + + self.position_embedding_type = config.position_embedding_type + if self.position_embedding_type != "absolute": + raise ValueError("Only 'absolute' position_embedding_type" + + " is supported") + + +class RobertaEmbeddingModel(BertEmbeddingModel): + """A model that uses Roberta to provide embedding functionalities. + + This class encapsulates the RobertaModel and provides an interface for + embedding operations and customized pooling functions. + + Attributes: + model: An instance of RobertaModel used for forward operations. + _pooler: An instance of Pooler used for pooling operations. 
+ """ + + def __init__(self, + config: RobertaConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + pooling_config: Optional[PoolingConfig] = None) -> None: + # Skip BertEmbeddingModule.__init__() + nn.Module.__init__(self) + self.model = RobertaModel(config, cache_config, quant_config) + self._pooler = Pooler(pooling_config.pooling_type, + pooling_config.normalize) From b457cc55a30c34b3548bb9fe0798438d73ffabd8 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Mon, 11 Nov 2024 13:51:09 -0300 Subject: [PATCH 03/24] fix after refactoring Signed-off-by: Max de Bayser --- vllm/model_executor/models/roberta.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 1fff4e354a343..7b93431b73355 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,10 +1,11 @@ from typing import Optional +import torch from torch import nn from transformers import RobertaConfig -from vllm.config import CacheConfig -from vllm.model_executor.layers.pooler import Pooler, PoolingConfig +from vllm.config import CacheConfig, PoolerConfig +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -44,6 +45,8 @@ def __init__(self, config: RobertaConfig): config.hidden_size) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.position_ids = nn.Parameter( + torch.empty((1, config.max_position_embeddings)), ) self.position_embedding_type = config.position_embedding_type if self.position_embedding_type != "absolute": @@ -66,9 +69,12 @@ def __init__(self, config: RobertaConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - pooling_config: Optional[PoolingConfig] = None) -> None: + pooler_config: Optional[PoolerConfig] = None) -> None: # Skip BertEmbeddingModule.__init__() nn.Module.__init__(self) self.model = RobertaModel(config, cache_config, quant_config) - self._pooler = Pooler(pooling_config.pooling_type, - pooling_config.normalize) + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.CLS, + normalize=True, + softmax=False) From 3fe28f6e8a13aebfc792e458988116d6568553b7 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 08:29:05 -0300 Subject: [PATCH 04/24] Review suggestions Signed-off-by: Flavia Beo --- tests/models/embedding/language/test_embedding.py | 4 ++-- vllm/model_executor/models/bert.py | 11 +++++++++-- vllm/model_executor/models/roberta.py | 7 ++++++- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 39b6bbaf43180..70bed066cac9f 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -10,11 +10,11 @@ MODELS = [ "intfloat/e5-mistral-7b-instruct", "BAAI/bge-base-en-v1.5", - "BAAI/bge-multilingual-gemma2", + "BAAI/bge-multilingual-gemma2" ] ENCODER_ONLY = [ - "BAAI/bge-base-en-v1.5", + "BAAI/bge-base-en-v1.5" ] diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index bfed2929d57d2..4683d2aa605bf 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ 
-313,9 +313,10 @@ def __init__(self, config: BertConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + prefix: str = "", + embedding_class: type = BertEmbedding): super().__init__() - self.embeddings = BertEmbedding(config) + self.embeddings = embedding_class(config) self.encoder = BertEncoder(config, cache_config, quant_config, @@ -422,3 +423,9 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.model.load_weights(weights) + + def _build_model(self, + config: BertConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): + return BertModel(config, cache_config, quant_config, BertEmbedding) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 7b93431b73355..5ca3565cc31e8 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -70,7 +70,6 @@ def __init__(self, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, pooler_config: Optional[PoolerConfig] = None) -> None: - # Skip BertEmbeddingModule.__init__() nn.Module.__init__(self) self.model = RobertaModel(config, cache_config, quant_config) self._pooler = Pooler.from_config_with_defaults( @@ -78,3 +77,9 @@ def __init__(self, pooling_type=PoolingType.CLS, normalize=True, softmax=False) + + def _build_model(self, + config: RobertaConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None): + return BertModel(config, cache_config, quant_config, RobertaEmbedding) From 971acea616ca2ed409a0a230f75ab7518cd654ad Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 13:53:04 -0300 Subject: [PATCH 05/24] Fixes conflicts with new upstream changes Signed-off-by: Flavia Beo --- vllm/model_executor/models/bert.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index f838e7ad74285..0645790e24f31 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -305,17 +305,17 @@ def forward(self, hidden_states: torch.Tensor, class BertModel(nn.Module): - def __init__(self, *, - vllm_config: VllmConfig, + def __init__(self, + *, + config: BertConfig, + vllm_config: VllmConfig, prefix: str = "", embedding_class: type = BertEmbedding): super().__init__() - self.embeddings = embedding_class(config) config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - - self.embeddings = BertEmbedding(config) + self.embeddings = embedding_class(config) self.encoder = BertEncoder(config, cache_config, quant_config, From 18a2d581bdafd8c258718bee7c6f959efbb4fcaf Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 14:34:34 -0300 Subject: [PATCH 06/24] Merge changes fixes Signed-off-by: Flavia Beo --- csrc/attention/attention_kernels.cuh | 334 +-------------------------- csrc/attention/paged_attention_v1.cu | 3 + csrc/attention/paged_attention_v2.cu | 3 + vllm/model_executor/models/bert.py | 4 +- 4 files changed, 9 insertions(+), 335 deletions(-) diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index c53cda16d4714..1f1110b0497a9 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -670,339 +670,7 @@ __global__ void paged_attention_v2_reduce_kernel( } // 
namespace vllm -#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel), \ - shared_mem_size); \ - vllm::paged_attention_v1_kernel \ - <<>>( \ - out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ - scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -// TODO(woosuk): Tune NUM_THREADS. -template -void paged_attention_v1_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int padded_max_seq_len = - DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; - int logits_size = padded_max_seq_len * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len - // Keep that in sync with the logic here! - int shared_mem_size = std::max(logits_size, outputs_size); - - dim3 grid(num_heads, num_seqs, 1); - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. 
- case 32: - LAUNCH_PAGED_ATTENTION_V1(32); - break; - case 64: - LAUNCH_PAGED_ATTENTION_V1(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V1(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V1(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V1(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V1(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V1(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V1(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V1(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v1_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); - -#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. -#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v1( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V1_LAUNCHER_BLOCK_SIZE) -} - -#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel \ - <<>>( \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ - value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); \ - vllm::paged_attention_v2_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ - max_num_partitions); - -template -void paged_attention_v2_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& tmp_out, torch::Tensor& query, 
torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); - int logits_size = PARTITION_SIZE * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - - // For paged attention v2 kernel. - dim3 grid(num_heads, num_seqs, max_num_partitions); - int shared_mem_size = std::max(logits_size, outputs_size); - // For paged attention v2 reduce kernel. - dim3 reduce_grid(num_heads, num_seqs); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); - - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. 
- case 32: - LAUNCH_PAGED_ATTENTION_V2(32); - break; - case 64: - LAUNCH_PAGED_ATTENTION_V2(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V2(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V2(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V2(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V2(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V2(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V2(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V2(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v2_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. -#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v2( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& - tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V2_LAUNCHER_BLOCK_SIZE) -} - #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 8b99f0843aaf6..c0983b8c338d5 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -98,6 +98,9 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. 
However, we can easily extend this // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V1(64); break; diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index 3a7a9dee916aa..6de8d0bdd5b8d 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -104,6 +104,9 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V2(64); break; diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 0645790e24f31..c3308d690090b 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -305,8 +305,8 @@ def forward(self, hidden_states: torch.Tensor, class BertModel(nn.Module): - def __init__(self, - *, + def __init__(self, + *, config: BertConfig, vllm_config: VllmConfig, prefix: str = "", From 40ac579fd6f06d13880c25e4637c9f3869047853 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 15:37:20 -0300 Subject: [PATCH 07/24] More fixed related to the upstream merge Signed-off-by: Flavia Beo --- csrc/attention/attention_kernels.cuh | 2 +- csrc/attention/paged_attention_v1.cu | 2 +- tests/models/embedding/language/test_embedding.py | 4 ++-- vllm/model_executor/models/roberta.py | 3 +++ 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index 1f1110b0497a9..563e1438f0b01 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -673,4 +673,4 @@ __global__ void paged_attention_v2_reduce_kernel( #undef WARP_SIZE #undef MAX #undef MIN -#undef DIVIDE_ROUND_UP \ No newline at end of file +#undef DIVIDE_ROUND_UP diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index c0983b8c338d5..741cd0c82dc89 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -99,7 +99,7 @@ void paged_attention_v1_launcher( // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. 
case 32: - LAUNCH_PAGED_ATTENTION_V2(32); + LAUNCH_PAGED_ATTENTION_V1(32); break; case 64: LAUNCH_PAGED_ATTENTION_V1(64); diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index c2a2ac3088e96..cd920aec6502e 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -12,11 +12,11 @@ MODELS = [ "intfloat/e5-mistral-7b-instruct", "BAAI/bge-base-en-v1.5", - "BAAI/bge-multilingual-gemma2" + "BAAI/bge-multilingual-gemma2", ] ENCODER_ONLY = [ - "BAAI/bge-base-en-v1.5" + "BAAI/bge-base-en-v1.5", ] diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 5ca3565cc31e8..82b86a34c073d 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -23,6 +23,9 @@ def __init__( quant_config: Optional[QuantizationConfig] = None, ): # Skip BertModel.__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config nn.Module.__init__(self) self.embeddings = RobertaEmbedding(config) self.encoder = BertEncoder(config, cache_config, quant_config) From e1718966e4a5469ee135ab388ce8b5ef3f80aa77 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 15:48:55 -0300 Subject: [PATCH 08/24] Adds test for roberta model executor Signed-off-by: Flavia Beo --- .../test_model_load_with_params.py | 38 +++++++++++++++++++ vllm/model_executor/models/roberta.py | 6 +-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 7e5e2780d3916..a441303ba3891 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -10,6 +10,8 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "sdadas/polish-roberta-base-v2") +REVISION_ROBERTA = os.environ.get("REVISION", "main") @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") @@ -48,3 +50,39 @@ def test_model_loading_with_params(vllm_runner): assert model._pooler.normalize # assert output assert output + +def test_roberta_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. 
+ """ + with vllm_runner(model_name=MODEL_NAME_ROBERTA, + revision=REVISION_ROBERTA, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_config = model.model.llm_engine.model_config + + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.encoder_config["max_seq_length"] == 512 + assert not model_config.encoder_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooler_config.pooling_type == PoolingType.CLS.name + assert model_config.pooler_config.pooling_norm + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "sdadas/polish-roberta-base-v2" + assert model_tokenizer.tokenizer_config["do_lower_case"] + assert model_tokenizer.tokenizer.model_max_length == 768 + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + # assert output + assert output \ No newline at end of file diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 82b86a34c073d..9a0b77664169f 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -19,14 +19,12 @@ class RobertaModel(BertModel): def __init__( self, config: RobertaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig ): - # Skip BertModel.__init__() + nn.Module.__init__(self) config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - nn.Module.__init__(self) self.embeddings = RobertaEmbedding(config) self.encoder = BertEncoder(config, cache_config, quant_config) From 55912f92b462959f1ff8377ba25c27f407e138f8 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 16:14:07 -0300 Subject: [PATCH 09/24] Asserts for Roberta models instance Signed-off-by: Flavia Beo --- .../test_model_load_with_params.py | 16 +++++----- vllm/model_executor/models/bert.py | 9 ++---- vllm/model_executor/models/roberta.py | 29 +++++-------------- 3 files changed, 19 insertions(+), 35 deletions(-) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index a441303ba3891..934300a2f40e9 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -4,13 +4,14 @@ from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel +from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform MAX_MODEL_LEN = 128 MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") -MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "sdadas/polish-roberta-base-v2") +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "sdadas/mmlw-roberta-base") REVISION_ROBERTA = os.environ.get("REVISION", "main") @pytest.mark.skipif(current_platform.is_rocm(), @@ -72,17 +73,16 @@ def test_roberta_model_loading_with_params(vllm_runner): # asserts on the pooling config files assert model_config.pooler_config.pooling_type == PoolingType.CLS.name - assert model_config.pooler_config.pooling_norm + assert not 
model_config.pooler_config.pooling_norm # asserts on the tokenizer loaded - assert model_tokenizer.tokenizer_id == "sdadas/polish-roberta-base-v2" - assert model_tokenizer.tokenizer_config["do_lower_case"] - assert model_tokenizer.tokenizer.model_max_length == 768 + assert model_tokenizer.tokenizer_id == "sdadas/mmlw-roberta-base" + assert not model_tokenizer.tokenizer_config["do_lower_case"] model = model.model.llm_engine.model_executor\ .driver_worker.model_runner.model - assert isinstance(model, BertEmbeddingModel) + assert isinstance(model, RobertaEmbeddingModel) assert model._pooler.pooling_type == PoolingType.CLS - assert model._pooler.normalize + assert not model._pooler.normalize # assert output - assert output \ No newline at end of file + assert output diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index c3308d690090b..13e2f508c754a 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -307,7 +307,6 @@ class BertModel(nn.Module): def __init__(self, *, - config: BertConfig, vllm_config: VllmConfig, prefix: str = "", embedding_class: type = BertEmbedding): @@ -419,8 +418,6 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.model.load_weights(weights) - def _build_model(self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): - return BertModel(config, cache_config, quant_config, BertEmbedding) + def _build_model(self, vllm_config: VllmConfig): + return BertModel(vllm_config=vllm_config, + embedding_class=BertEmbedding) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 9a0b77664169f..b0bd58548bad7 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,13 +1,9 @@ -from typing import Optional - import torch from torch import nn from transformers import RobertaConfig -from vllm.config import CacheConfig, PoolerConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.bert import (BertEmbedding, BertEmbeddingModel, @@ -16,11 +12,7 @@ class RobertaModel(BertModel): - def __init__( - self, - config: RobertaConfig, - vllm_config: VllmConfig - ): + def __init__(self, vllm_config: VllmConfig): nn.Module.__init__(self) config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -66,21 +58,16 @@ class RobertaEmbeddingModel(BertEmbeddingModel): _pooler: An instance of Pooler used for pooling operations. 
""" - def __init__(self, - config: RobertaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooler_config: Optional[PoolerConfig] = None) -> None: + def __init__(self, *, vllm_config: VllmConfig) -> None: nn.Module.__init__(self) - self.model = RobertaModel(config, cache_config, quant_config) + pooler_config = vllm_config.model_config.pooler_config + self.model = RobertaModel(vllm_config=vllm_config) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.CLS, normalize=True, softmax=False) - def _build_model(self, - config: RobertaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): - return BertModel(config, cache_config, quant_config, RobertaEmbedding) + def _build_model(self, vllm_config: VllmConfig): + return BertModel(vllm_config=vllm_config, + embedding_class=RobertaEmbedding) From 6f06a76a48ee27c4a6c85ca08429d03f4c22696a Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 16:49:55 -0300 Subject: [PATCH 10/24] Fix space for linting Signed-off-by: Flavia Beo --- tests/model_executor/test_model_load_with_params.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 934300a2f40e9..47d8f07abd764 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -14,6 +14,7 @@ MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "sdadas/mmlw-roberta-base") REVISION_ROBERTA = os.environ.get("REVISION", "main") + @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_model_loading_with_params(vllm_runner): From d4c8849f00deb5df236a0baf141d6450f353d2f9 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 16:51:33 -0300 Subject: [PATCH 11/24] Fix space for linting Signed-off-by: Flavia Beo --- tests/model_executor/test_model_load_with_params.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 47d8f07abd764..1d1d5164024cc 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -53,6 +53,7 @@ def test_model_loading_with_params(vllm_runner): # assert output assert output + def test_roberta_model_loading_with_params(vllm_runner): """ Test parameter weight loading with tp>1. 
From b9e64b1b34c8c49f45fd56ae41e2be68955abd94 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Tue, 12 Nov 2024 20:18:16 -0300 Subject: [PATCH 12/24] Modifies test for multilingual-e5-large Signed-off-by: Flavia Beo --- tests/compile/test_basic_correctness.py | 9 +++++++++ .../model_executor/test_model_load_with_params.py | 14 ++++++++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 833589ba5dc9f..b91ee032a1c53 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -62,6 +62,15 @@ class TestSetting: method="encode", fullgraph=True, ), + TestSetting( + model="intfloat/multilingual-e5-large", + model_args=["--task", "embedding"], + pp_size=1, + tp_size=1, + attn_backend="FLASHINFER", + method="encode", + fullgraph=True, + ), # vision language model TestSetting( model="microsoft/Phi-3.5-vision-instruct", diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 1d1d5164024cc..1bd29edded253 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -11,7 +11,8 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") -MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "sdadas/mmlw-roberta-base") +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", + "intfloat/multilingual-e5-large") REVISION_ROBERTA = os.environ.get("REVISION", "main") @@ -74,17 +75,18 @@ def test_roberta_model_loading_with_params(vllm_runner): assert not model_config.encoder_config["do_lower_case"] # asserts on the pooling config files - assert model_config.pooler_config.pooling_type == PoolingType.CLS.name - assert not model_config.pooler_config.pooling_norm + assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name + assert model_config.pooler_config.pooling_norm # asserts on the tokenizer loaded - assert model_tokenizer.tokenizer_id == "sdadas/mmlw-roberta-base" + assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" assert not model_tokenizer.tokenizer_config["do_lower_case"] model = model.model.llm_engine.model_executor\ .driver_worker.model_runner.model assert isinstance(model, RobertaEmbeddingModel) - assert model._pooler.pooling_type == PoolingType.CLS - assert not model._pooler.normalize + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + # assert output assert output From 366a992a5bb2dde723e27044e8324c84279bdb84 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Wed, 13 Nov 2024 11:11:42 -0300 Subject: [PATCH 13/24] Fix linting in test Signed-off-by: Flavia Beo --- tests/model_executor/test_model_load_with_params.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 1bd29edded253..a2042aa00787f 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -11,7 +11,7 @@ MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") -MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", "intfloat/multilingual-e5-large") REVISION_ROBERTA = os.environ.get("REVISION", "main") @@ -87,6 +87,6 @@ def 
test_roberta_model_loading_with_params(vllm_runner): assert isinstance(model, RobertaEmbeddingModel) assert model._pooler.pooling_type == PoolingType.MEAN assert model._pooler.normalize - + # assert output assert output From aae474e1f272d58a5d3b110892c536400975c62a Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Wed, 13 Nov 2024 13:19:10 -0300 Subject: [PATCH 14/24] trigger ci Signed-off-by: Flavia Beo From 07c931c99c8a46b170457b0eeb376ec5757f4874 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 13 Nov 2024 14:04:54 -0300 Subject: [PATCH 15/24] finish generalizing the Bert classes Signed-off-by: Max de Bayser --- vllm/model_executor/models/bert.py | 23 +++++++++++-------- vllm/model_executor/models/roberta.py | 33 ++++++--------------------- 2 files changed, 21 insertions(+), 35 deletions(-) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 13e2f508c754a..42dd6119e76f1 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -5,7 +5,7 @@ from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, PoolerConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -384,13 +384,9 @@ class BertEmbeddingModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() pooler_config = vllm_config.model_config.pooler_config - self.model = BertModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.CLS, - normalize=True, - softmax=False) + self.model = self._build_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self._pooler = self._build_pooler(pooler_config) def forward( self, @@ -418,6 +414,15 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.model.load_weights(weights) - def _build_model(self, vllm_config: VllmConfig): + def _build_model(self, + vllm_config: VllmConfig, + prefix: str = "") -> BertModel: return BertModel(vllm_config=vllm_config, + prefix=prefix, embedding_class=BertEmbedding) + + def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: + return Pooler.from_config_with_defaults(pooler_config, + pooling_type=PoolingType.CLS, + normalize=True, + softmax=False) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index b0bd58548bad7..5b36c91b584ac 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -3,22 +3,10 @@ from transformers import RobertaConfig from vllm.config import VllmConfig -from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.bert import (BertEmbedding, BertEmbeddingModel, - BertEncoder, BertModel) - - -class RobertaModel(BertModel): - - def __init__(self, vllm_config: VllmConfig): - nn.Module.__init__(self) - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - self.embeddings = RobertaEmbedding(config) - self.encoder = BertEncoder(config, cache_config, quant_config) + BertModel) class 
RobertaEmbedding(BertEmbedding): @@ -50,24 +38,17 @@ def __init__(self, config: RobertaConfig): class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. - This class encapsulates the RobertaModel and provides an interface for + This class encapsulates the BertModel and provides an interface for embedding operations and customized pooling functions. Attributes: - model: An instance of RobertaModel used for forward operations. + model: An instance of BertModel used for forward operations. _pooler: An instance of Pooler used for pooling operations. """ - def __init__(self, *, vllm_config: VllmConfig) -> None: - nn.Module.__init__(self) - pooler_config = vllm_config.model_config.pooler_config - self.model = RobertaModel(vllm_config=vllm_config) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.CLS, - normalize=True, - softmax=False) - - def _build_model(self, vllm_config: VllmConfig): + def _build_model(self, + vllm_config: VllmConfig, + prefix: str = "") -> BertModel: return BertModel(vllm_config=vllm_config, + prefix=prefix, embedding_class=RobertaEmbedding) From 4495a505bd24015674f183d54a366ae4b4294abb Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Wed, 13 Nov 2024 14:49:44 -0300 Subject: [PATCH 16/24] Skips test for ROCm unsupported platform Signed-off-by: Flavia Beo --- tests/model_executor/test_model_load_with_params.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index a2042aa00787f..ed321ba9f00c1 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -55,6 +55,8 @@ def test_model_loading_with_params(vllm_runner): assert output +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") def test_roberta_model_loading_with_params(vllm_runner): """ Test parameter weight loading with tp>1. 
From 49e8381204bf7c5a0ab6e4f8cc5d56377ae6717e Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Wed, 13 Nov 2024 22:57:16 -0300 Subject: [PATCH 17/24] fix roberta position_ids Signed-off-by: Max de Bayser --- vllm/model_executor/models/roberta.py | 40 +++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 5b36c91b584ac..7ff408c6c1cdb 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,3 +1,5 @@ +from typing import Optional + import torch from torch import nn from transformers import RobertaConfig @@ -5,15 +7,13 @@ from vllm.config import VllmConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) -from vllm.model_executor.models.bert import (BertEmbedding, BertEmbeddingModel, - BertModel) +from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel -class RobertaEmbedding(BertEmbedding): +class RobertaEmbedding(nn.Module): def __init__(self, config: RobertaConfig): - # Skip BertEmbedding.__init__() - nn.Module.__init__(self) + super().__init__() self.size = config.hidden_size self.word_embeddings = VocabParallelEmbedding(config.vocab_size, config.hidden_size) @@ -34,6 +34,36 @@ def __init__(self, config: RobertaConfig): raise ValueError("Only 'absolute' position_embedding_type" + " is supported") + def forward( + self, + input_ids: torch.Tensor, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + input_shape = input_ids.size() + + # Input embeddings. + inputs_embeds = self.word_embeddings(input_ids) + + # TODO: figure out if there is a better way + # to make to make position ids start at padding_idx + 1 + # References: + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 + position_ids += self.padding_idx + 1 + + # Position embeddings. + position_embeddings = self.position_embeddings(position_ids) + + # Token type embeddings. (TODO: move off hotpath?) + token_type_embeddings = self.token_type_embeddings( + torch.zeros(input_shape, + dtype=torch.long, + device=inputs_embeds.device)) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + return embeddings + class RobertaEmbeddingModel(BertEmbeddingModel): """A model that uses Roberta to provide embedding functionalities. 
From 1267bbab136e0fb1aa70f6f59881da05efd1213d Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 14 Nov 2024 11:18:08 -0300 Subject: [PATCH 18/24] add assert to verify assumption Signed-off-by: Max de Bayser --- vllm/model_executor/models/roberta.py | 36 ++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index 7ff408c6c1cdb..fad8a51cf8df5 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,13 +1,16 @@ -from typing import Optional +from typing import List, Optional import torch from torch import nn from transformers import RobertaConfig +from vllm.attention import AttentionMetadata +from vllm.attention.backends.xformers import XFormersMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel +from vllm.sequence import IntermediateTensors class RobertaEmbedding(nn.Module): @@ -82,3 +85,34 @@ def _build_model(self, return BertModel(vllm_config=vllm_config, prefix=prefix, embedding_class=RobertaEmbedding) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + # Verify assumption that position are always a sequence from + # 0 to N. (Actually here we just check 0 and N to simplify). + # This is important to fix the position which are assumed to + # start from padding_idx + 1 instead of 0 in the Roberta models. + assert isinstance(attn_metadata, XFormersMetadata) + cumulative = attn_metadata.seq_lens_tensor.cumsum(dim=0) + start_pos = torch.cat( + (torch.tensor([0], device=attn_metadata.seq_lens_tensor.device), + cumulative[:-1])) + assert len(torch.nonzero(positions[start_pos])) == 0 + end_pos = cumulative - 1 + last_tokens = attn_metadata.seq_lens_tensor - 1 + assert len(torch.nonzero(positions[end_pos] - last_tokens)) == 0 + + return super().forward(input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds) From 49cc57b7b5ab15b84f6b95911b86a6d8646a5658 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 14 Nov 2024 12:45:29 -0300 Subject: [PATCH 19/24] improve assert Signed-off-by: Max de Bayser --- vllm/model_executor/models/roberta.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index fad8a51cf8df5..c1dcdd36ec3de 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -5,7 +5,6 @@ from transformers import RobertaConfig from vllm.attention import AttentionMetadata -from vllm.attention.backends.xformers import XFormersMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) @@ -100,7 +99,7 @@ def forward( # 0 to N. (Actually here we just check 0 and N to simplify). # This is important to fix the position which are assumed to # start from padding_idx + 1 instead of 0 in the Roberta models. 
- assert isinstance(attn_metadata, XFormersMetadata) + assert hasattr(attn_metadata, "seq_lens_tensor") cumulative = attn_metadata.seq_lens_tensor.cumsum(dim=0) start_pos = torch.cat( (torch.tensor([0], device=attn_metadata.seq_lens_tensor.device), From 0f334aec018b92aa4c3ac7bec709f19dcbc7774b Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 14 Nov 2024 13:33:01 -0300 Subject: [PATCH 20/24] add model to embedding test Signed-off-by: Max de Bayser --- tests/compile/test_basic_correctness.py | 2 +- tests/models/embedding/language/test_embedding.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index b91ee032a1c53..496e07772dbc5 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -67,7 +67,7 @@ class TestSetting: model_args=["--task", "embedding"], pp_size=1, tp_size=1, - attn_backend="FLASHINFER", + attn_backend="FLASH_ATTN", method="encode", fullgraph=True, ), diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index cd920aec6502e..fcdd684168d04 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -13,10 +13,12 @@ "intfloat/e5-mistral-7b-instruct", "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2", + "intfloat/multilingual-e5-large", ] ENCODER_ONLY = [ "BAAI/bge-base-en-v1.5", + "intfloat/multilingual-e5-large", ] From f27aae1d9eda86f80d62f44953445948ad0f80df Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 14 Nov 2024 14:02:42 -0300 Subject: [PATCH 21/24] Remove encoder embedding model for compile test The test is failing with Unsupported('dynamic shape operator: aten.nonzero.default; to enable, set torch._dynamo.config.capture_dynamic_output_shape_ops = True\n\nfrom user code:\n File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/roberta.py", line 107, in forward\n assert len(torch.nonzero(positions[start_pos])) == 0\n\nSet TORCH_LOGS="+dynamo" and TORCHDYNAMO_VERBOSE=1 for more information\n\n\nYou can suppress this exception and fall back to eager by setting:\n import torch._dynamo\n torch._dynamo.config.suppress_errors = True\n') Signed-off-by: Max de Bayser --- tests/compile/test_basic_correctness.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 496e07772dbc5..833589ba5dc9f 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -62,15 +62,6 @@ class TestSetting: method="encode", fullgraph=True, ), - TestSetting( - model="intfloat/multilingual-e5-large", - model_args=["--task", "embedding"], - pp_size=1, - tp_size=1, - attn_backend="FLASH_ATTN", - method="encode", - fullgraph=True, - ), # vision language model TestSetting( model="microsoft/Phi-3.5-vision-instruct", From 44a9d22984ee770642dd0c741e5469cdf5d4077e Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 14 Nov 2024 15:14:23 -0300 Subject: [PATCH 22/24] trigger ci Signed-off-by: Max de Bayser From 9f31bd5187a6869f4221969a252916ea999a7c9b Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 14 Nov 2024 16:03:45 -0300 Subject: [PATCH 23/24] trigger ci Signed-off-by: Max de Bayser From 80ead23af18a3ce0e36cc6e8b60def383a391f65 Mon Sep 17 00:00:00 2001 From: Max de Bayser Date: Thu, 14 Nov 2024 16:49:55 -0300 Subject: [PATCH 24/24] trigger ci Signed-off-by: Max 
de Bayser
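
For context on patches 17 and 18: RoBERTa, unlike BERT, numbers absolute positions starting at padding_idx + 1 rather than 0, so RobertaEmbedding.forward() shifts the 0-based positions that vLLM supplies, and the asserts added in patch 18 verify that those incoming positions really do run 0..len-1 for each sequence before the shift. A minimal sketch of the resulting numbering follows; the helper name is illustrative only and is not part of this series.

import torch

def roberta_position_ids(seq_lens: torch.Tensor, padding_idx: int) -> torch.Tensor:
    # vLLM passes 0-based positions per sequence (what patch 18 asserts);
    # RobertaEmbedding.forward() then adds padding_idx + 1 so the first real
    # token is numbered padding_idx + 1, matching HF RoBERTa's convention.
    positions = torch.cat(
        [torch.arange(int(n), dtype=torch.long) for n in seq_lens])
    return positions + padding_idx + 1

With RoBERTa's pad_token_id of 1, a 3-token sequence is numbered [2, 3, 4], the same result HuggingFace's create_position_ids_from_input_ids produces for unpadded input.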