diff --git a/setup.py b/setup.py
index 0470e172eb3f6..f7bec65f4cf4e 100644
--- a/setup.py
+++ b/setup.py
@@ -238,7 +238,7 @@ def _is_hpu() -> bool:
     is_hpu_available = True
     try:
         subprocess.run(["hl-smi"], capture_output=True, check=True)
-    except (FileNotFoundError, NotADirectoryError, PermissionError, subprocess.CalledProcessError):
+    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
                 '/dev/accel/accel_controlD0'):
             # last resort...
@@ -267,7 +267,7 @@ def _is_neuron() -> bool:
     torch_neuronx_installed = True
     try:
         subprocess.run(["neuron-ls"], capture_output=True, check=True)
-    except (FileNotFoundError, NotADirectoryError, PermissionError, subprocess.CalledProcessError):
+    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
         torch_neuronx_installed = False
     return torch_neuronx_installed or VLLM_TARGET_DEVICE == "neuron"
 
diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py
index 6b1082325e026..7a867e79b203d 100644
--- a/vllm/attention/backends/habana_attn.py
+++ b/vllm/attention/backends/habana_attn.py
@@ -12,8 +12,8 @@
                                               AttentionMetadata, AttentionType)
 from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention,
                                                   HabanaPagedAttentionMetadata)
-from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache
 from vllm.hpu import cache_ops
+from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
@@ -144,11 +144,11 @@ def __init__(
         self.num_heads = num_heads
         self.head_size = head_size
         self.scale = float(scale)
-        self.qk_matmul = Matmul()
+        self.matmul_qk = Matmul()
         self.softmax = Softmax()
-        self.av_matmul = Matmul()
-        self.key_cache = VLLMKVCache()
-        self.value_cache = VLLMKVCache()
+        self.matmul_av = Matmul()
+        self.k_cache = VLLMKVCache()
+        self.v_cache = VLLMKVCache()
         self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
         self.sliding_window = sliding_window
         self.position_bias = None
@@ -212,9 +212,13 @@ def forward(
         # Reshape the input keys and values and store them in the cache.
         # If kv_cache is not provided, the new key and value tensors are
         # not cached. This happens during the initial memory profiling run.
-        num_kv_cache_passes, num_slots_available, indices, offsets = cache_ops.prepare_to_cache(key_cache, attn_metadata.slot_mapping)
-        key_cache = self.key_cache(key, key_cache, num_kv_cache_passes, num_slots_available, indices, offsets)
-        value_cache = self.value_cache(value, value_cache, num_kv_cache_passes, num_slots_available, indices, offsets)
+        num_kv_cache_passes, num_slots_available, indices, offsets = \
+            cache_ops.prepare_to_cache(key_cache,
+                                       attn_metadata.slot_mapping)
+        key_cache = self.k_cache(key, key_cache, num_kv_cache_passes,
+                                 num_slots_available, indices, offsets)
+        value_cache = self.v_cache(value, value_cache, num_kv_cache_passes,
+                                   num_slots_available, indices, offsets)
 
         if attn_metadata.is_prompt:
             # Prompt run.
@@ -240,9 +244,9 @@ def forward(
                 attn_bias=attn_bias,
                 p=0.0,
                 scale=self.scale,
-                qk_matmul_op=self.qk_matmul,
+                matmul_qk_op=self.matmul_qk,
                 softmax_op=self.softmax,
-                av_matmul_op=self.av_matmul,
+                matmul_av_op=self.matmul_av,
             )
             output = out.reshape(batch_size, seq_len, hidden_size)
         else:
@@ -266,8 +270,8 @@ def forward(
                 query, key_cache, value_cache, attn_metadata.block_tables,
                 attn_metadata.seq_lens_tensor, self.kv_cache_dtype,
                 self.num_kv_heads, self.scale, self.position_bias, k_scale,
-                v_scale, self.qk_matmul, self.softmax, self.av_matmul,
-                self.key_cache, self.value_cache)
+                v_scale, self.matmul_qk, self.softmax, self.matmul_av,
+                self.k_cache, self.v_cache)
 
         # Reshape the output tensor.
         return output.view(batch_size, seq_len, hidden_size)
diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py
index 286fda8cd500c..9602886299c47 100644
--- a/vllm/attention/ops/habana_paged_attn.py
+++ b/vllm/attention/ops/habana_paged_attn.py
@@ -75,9 +75,9 @@ def forward_decode(
         alibi_slopes: Optional[torch.Tensor],
         k_scale: float,
         v_scale: float,
-        qk_matmul_op,
+        matmul_qk_op,
         softmax_op,
-        av_matmul_op,
+        matmul_av_op,
         k_cache_cls,
         v_cache_cls,
     ) -> torch.Tensor:
@@ -93,9 +93,9 @@ def forward_decode(
             block_size,
             alibi_slopes,
             kv_cache_dtype,
-            qk_matmul_op,
+            matmul_qk_op,
             softmax_op,
-            av_matmul_op,
+            matmul_av_op,
             k_cache_cls,
             v_cache_cls,
         )
diff --git a/vllm/config.py b/vllm/config.py
index ec7e8fed30fdb..6acb70ad047b2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -474,13 +474,13 @@ def _verify_args(self) -> None:
 
     def _verify_cache_dtype(self) -> None:
         if self.cache_dtype == "auto":
             pass
-        elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2", "hf8"):
+        elif self.cache_dtype in ("fp8", "fp8_e4m3", "fp8_e5m2", "fp8_inc"):
             logger.info(
                 "Using fp8 data type to store kv cache. It reduces the GPU "
                 "memory footprint and boosts the performance. "
                 "Meanwhile, it may cause accuracy drop without a proper "
                 "scaling factor. "
-                "FP8_E4M3 is also supported on hpu (hf8).")
+                "Intel Gaudi (HPU) supports fp8 (using fp8_inc).")
         else:
             raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 29160143ef469..d6c544750afea 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -229,12 +229,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
         parser.add_argument(
             '--kv-cache-dtype',
             type=str,
-            choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'hf8'],
+            choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3', 'fp8_inc'],
            default=EngineArgs.kv_cache_dtype,
             help='Data type for kv cache storage. If "auto", will use model '
             'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
             'ROCm (AMD GPU) supports fp8 (=fp8_e4m3). '
-            'FP8_E4M3 is also supported on hpu (hf8).')
+            'Intel Gaudi (HPU) supports fp8 (using fp8_inc).')
         parser.add_argument(
             '--quantization-param-path',
             type=nullable_str,
@@ -842,7 +842,8 @@ def create_engine_config(self, ) -> EngineConfig:
             self.model_loader_extra_config[
                 "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
 
-        device = device_config.device if self.weights_load_device is None else self.weights_load_device
+        device = device_config.device if self.weights_load_device is None else \
+            self.weights_load_device
         load_config = LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
index 9527354719aba..98f109accea06 100644
--- a/vllm/hpu/cache_ops.py
+++ b/vllm/hpu/cache_ops.py
@@ -64,13 +64,14 @@ def prepare_to_cache(cache, slot_mapping):
     return num_kv_cache_passes, num_slots_available, indices, offsets
 
 
-def insert_or_update_cache(input, cache, num_kv_cache_passes, num_slots_available, block_indices, block_offsets):
+def insert_or_update_cache(input, cache, num_kv_cache_passes,
+                           num_slots_available, block_indices, block_offsets):
     for i in range(num_kv_cache_passes):
         start_idx = i * num_slots_available
         end_idx = (i + 1) * num_slots_available
-        cache.index_put_(
-            (block_indices[start_idx:end_idx], block_offsets[start_idx:end_idx]),
-            input[start_idx:end_idx])
+        cache.index_put_((block_indices[start_idx:end_idx],
+                          block_offsets[start_idx:end_idx]),
+                         input[start_idx:end_idx])
 
 
 def swap_blocks(src, dst, block_mapping):
diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py
index 3384729a1e479..f9e560de5b6b9 100644
--- a/vllm/hpu/ops.py
+++ b/vllm/hpu/ops.py
@@ -11,7 +11,6 @@
 import torch
 import torch.nn.functional as F
 
-import vllm.hpu.utils as hpu_utils
 from vllm.logger import init_logger
 
 logger = init_logger()
@@ -43,9 +42,9 @@ def paged_attention_v1(query,
                        block_size,
                        alibi_slopes=None,
                        kv_cache_dtype=None,
-                       qk_matmul_op=torch.matmul,
+                       matmul_qk_op=torch.matmul,
                        softmax_op=torch.softmax,
-                       av_matmul_op=torch.matmul,
+                       matmul_av_op=torch.matmul,
                        k_cache_cls=None,
                        v_cache_cls=None) -> None:
     seq_len = block_tables.size(1)
@@ -60,20 +59,22 @@ def paged_attention_v1(query,
                                             batch_size, 1, 1, -1))
     query.mul_(scale)
     query = query.unsqueeze(-2)
-    fetch_keys = fetch_from_cache if k_cache_cls is None else k_cache_cls.fetch_from_cache
+    fetch_keys = fetch_from_cache if k_cache_cls is None else \
+        k_cache_cls.fetch_from_cache
     keys = fetch_keys(key_cache, block_tables, (0, 2, 3, 1))
     if query_heads != kv_heads:
         query = query.unflatten(1, (kv_heads, -1))
         keys = [k.unflatten(1, (kv_heads, 1)) for k in keys]
         mask = mask.unsqueeze(2)
-    attn_weights = torch.cat([qk_matmul_op(query, k) for k in keys], dim=-1)
+    attn_weights = torch.cat([matmul_qk_op(query, k) for k in keys], dim=-1)
     if alibi_slopes is not None:
         attn_weights.add_(alibi_slopes[:, :, -attn_weights.size(2):,
                                        -attn_weights.size(3):])
     attn_weights = softmax_op(attn_weights.masked_fill(mask, min_inf), dim=-1)
-    fetch_values = fetch_from_cache if v_cache_cls is None else k_cache_cls.fetch_from_cache
+    fetch_values = fetch_from_cache if v_cache_cls is None else \
+        v_cache_cls.fetch_from_cache
     values = fetch_values(value_cache, block_tables, (0, 2, 1, 3))
     if PA_SPLIT_VALUE:
         attn_weights = attn_weights.split(block_size, dim=-1)
@@ -82,7 +83,7 @@ def paged_attention_v1(query,
         attn_weights = [attn_weights]
     if query_heads != kv_heads:
         values = [v.unflatten(1, (kv_heads, 1)) for v in values]
-    attn_weights = [av_matmul_op(a, v) for a, v in zip(attn_weights, values)]
+    attn_weights = [matmul_av_op(a, v) for a, v in zip(attn_weights, values)]
     if query_heads != kv_heads:
         attn_weights = [a.flatten(1, 2) for a in attn_weights]
     attn_weights = sum(attn_weights)
@@ -132,9 +133,9 @@ def prompt_attention(
     attn_bias: Optional[torch.Tensor] = None,
     p: float = 0.0,
     scale: Optional[float] = None,
-    qk_matmul_op = torch.matmul,
-    softmax_op = torch.softmax,
-    av_matmul_op = torch.matmul,
+    matmul_qk_op=torch.matmul,
+    softmax_op=torch.softmax,
+    matmul_av_op=torch.matmul,
 ) -> torch.Tensor:
     query = query.transpose(1, 2)
     key = key.transpose(1, 2)
@@ -147,11 +148,11 @@ def prompt_attention(
         value = value.unflatten(1, (kv_heads, 1))
         if attn_bias is not None:
             attn_bias = attn_bias.unsqueeze(2)
-    attn_weights = qk_matmul_op(query * scale, key.transpose(-1, -2))
+    attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2))
     if attn_bias is not None:
         attn_weights.add_(attn_bias)
     attn_weights = softmax_op(attn_weights, dim=-1)
-    attn_weights = av_matmul_op(attn_weights, value)
+    attn_weights = matmul_av_op(attn_weights, value)
     if query_heads != kv_heads:
         attn_weights = attn_weights.flatten(1, 2)
     attn_weights = attn_weights.transpose(1, 2)
diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py
index 8013f014ebd94..3d9c7cb1c4c22 100644
--- a/vllm/hpu/utils.py
+++ b/vllm/hpu/utils.py
@@ -5,12 +5,14 @@
 # LICENSE file in the root directory of this source tree.
 ###############################################################################
 
-import torch
 from functools import wraps
 
 import habana_frameworks.torch as htorch
+import torch
+
 from vllm.hpu.cache_ops import insert_or_update_cache
+
 
 def with_mark_steps(fn):
@@ -24,7 +26,9 @@ def wrapped(*args, **kwargs):
 
     return wrapped
 
+
 class Matmul(torch.nn.Module):
+
     def __init__(self):
         super(Matmul, self).__init__()
 
@@ -33,19 +37,28 @@ def forward(self, x, y):
 
 
 class Softmax(torch.nn.Module):
-    def __init__(self):
+
+    def __init__(self):
         super().__init__()
 
-    def forward(self, x, dim = None, inv_head = None):
+    def forward(self, x, dim=None, inv_head=None):
         return torch.softmax(x, dim)
 
+
 class VLLMKVCache(torch.nn.Module):
+
     def __init__(self):
         super(VLLMKVCache, self).__init__()
 
-    def forward(self, input, cache, num_kv_cache_passes, num_slots_available, block_indices, block_offset):
-        insert_or_update_cache(input, cache, num_kv_cache_passes, num_slots_available, block_indices, block_offset)
+    def forward(self, input, cache, num_kv_cache_passes, num_slots_available,
+                block_indices, block_offset):
+        insert_or_update_cache(input, cache, num_kv_cache_passes,
+                               num_slots_available, block_indices,
+                               block_offset)
         return cache
 
     def fetch_from_cache(self, cache, blocks, permutations):
-        return [cache.index_select(0, blocks[:, i]).permute(permutations) for i in range(blocks.size(1))]
+        return [
+            cache.index_select(0, blocks[:, i]).permute(permutations)
+            for i in range(blocks.size(1))
+        ]
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 7434d02b60ada..c12668c14887d 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -86,8 +86,7 @@ def forward_hpu(
                 self.variance_epsilon)
             return x.view(orig_shape), residual
 
-        x = HPUFusedRMSNorm.apply(x, self.weight,
-                                  self.variance_epsilon)
+        x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon)
         return x
 
     def forward_xpu(
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 746fa726354ba..7590d3e980275 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -18,9 +18,9 @@
     GPTQMarlinConfig)
 from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
     GPTQMarlin24Config)
+from vllm.model_executor.layers.quantization.inc import INCConfig
 from vllm.model_executor.layers.quantization.marlin import MarlinConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
-from vllm.model_executor.layers.quantization.inc import INCConfig
 
 QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "aqlm": AQLMConfig,
diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py
index 931c9eefe741f..f6718ec2ac9e7 100644
--- a/vllm/model_executor/layers/quantization/inc.py
+++ b/vllm/model_executor/layers/quantization/inc.py
@@ -1,11 +1,9 @@
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional
 
 import torch
-from torch.nn import Module
-from torch.nn.parameter import Parameter
 import torch.nn.functional as F
+from torch.nn.parameter import Parameter
 
-from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (
@@ -59,7 +57,8 @@ def get_quant_method(self, layer: torch.nn.Module,
     def get_scaled_act_names(self) -> List[str]:
         return []
 
-    def get_min_capability(self) -> int:
+    @classmethod
+    def get_min_capability(cls) -> int:
         # The AWQ kernel only supports Turing or newer GPUs.
         return 75
 
@@ -67,6 +66,7 @@ def get_min_capability(self) -> int:
     def get_config_filenames() -> List[str]:
         return []
 
+
 class INCLinearMethod(LinearMethodBase):
     """Linear method for FP8.
     Supports loading FP8 checkpoints with static weight scale and
@@ -83,7 +83,9 @@ class INCLinearMethod(LinearMethodBase):
         quant_config: The quantization config.
     """
 
-    def __init__(self, quant_config: INCConfig, separate_bias_add: bool = False):
+    def __init__(self,
+                 quant_config: INCConfig,
+                 separate_bias_add: bool = False):
         self.separate_bias_add = separate_bias_add
         self.quant_config = quant_config
 
@@ -110,4 +112,4 @@ def apply(self,
         if bias is not None:
             return F.linear(x, weight) + bias
         return F.linear(x, weight)
-        return F.linear(x, weight, bias)
\ No newline at end of file
+        return F.linear(x, weight, bias)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index bb40a5835c3c8..06048d97088e1 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -37,7 +37,7 @@
                                                supports_vision)
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.platforms import current_platform
-from vllm.utils import is_tpu, is_hpu
+from vllm.utils import is_hpu, is_tpu
 
 logger = init_logger(__name__)
 
@@ -53,8 +53,8 @@ def _get_quantization_config(
         capability = capability[0] * 10 + capability[1]
         if capability < quant_config.get_min_capability():
             raise ValueError(
-                f"The quantization method {model_config.quantization} is not "
-                "supported for the current GPU. "
+                f"The quantization method {model_config.quantization} "
+                "is not supported for the current GPU. "
                 f"Minimum capability: {quant_config.get_min_capability()}. "
                 f"Current capability: {capability}.")
         supported_dtypes = quant_config.get_supported_act_dtypes()
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index b71a4ee7e3b9d..676a51ce67f96 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -48,8 +48,9 @@
 from vllm.model_executor.model_loader.weight_utils import (
     default_weight_loader, kv_cache_scales_loader, maybe_remap_kv_scale_name)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.platforms import current_platform
 from vllm.sequence import IntermediateTensors, SamplerOutput
-from vllm.utils import is_hip, is_hpu
+from vllm.utils import is_hip
 
 from .interfaces import SupportsLoRA
 from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers
@@ -317,7 +318,7 @@ def forward(
             hidden_states = intermediate_tensors["hidden_states"]
             residual = intermediate_tensors["residual"]
 
-        if is_hpu():
+        if current_platform.is_hpu():
             import habana_frameworks.torch as htorch
             htorch.core.mark_step()
         for i in range(self.start_layer, self.end_layer):
@@ -329,7 +330,7 @@ def forward(
                 attn_metadata,
                 residual,
             )
-        if is_hpu():
+        if current_platform.is_hpu():
             htorch.core.mark_step()
 
         if not get_pp_group().is_last_rank:
diff --git a/vllm/utils.py b/vllm/utils.py
index af8ddc294aa95..fe84253feb172 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -39,7 +39,7 @@
     "fp8": torch.uint8,
     "fp8_e4m3": torch.uint8,
     "fp8_e5m2": torch.uint8,
-    "hf8": torch.float8_e4m3fn,
+    "fp8_inc": torch.float8_e4m3fn,
 }
 
 TORCH_DTYPE_TO_NUMPY_DTYPE = {
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 8e41cbfd511ff..ec0b8c2369210 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -91,7 +91,8 @@ def _allocate_kv_cache(
             # null block in CpuGpuBlockAllocator requires at least that
             # block to be zeroed-out.
             # We zero-out everything for simplicity.
-            dtype = torch.int8 if self.dtype == torch.float8_e4m3fn else self.dtype
+            dtype = torch.uint8 if self.dtype == torch.float8_e4m3fn else \
+                self.dtype
             kv_cache.append(
                 torch.zeros(kv_cache_shape,
                             dtype=dtype,
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 2588da84a3d6c..72aba42ae8553 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -413,6 +413,9 @@ def __init__(
         self._setup_buckets()
 
     def load_model(self) -> None:
+        import habana_frameworks.torch.core as htcore
+        if self.model_config.quantization == 'inc':
+            htcore.hpu_set_env()
         with HabanaMemoryProfiler() as m:
             with HabanaMemoryProfiler() as m_getmodel:
                 self.model = get_model(
@@ -429,18 +432,21 @@ def load_model(self) -> None:
                        f"took {m_getmodel.get_summary_string()}")
                 logger.info(msg)
 
-            import habana_frameworks.torch.core as htcore
             if self.model_config.quantization == 'inc':
                 logger.info("Preparing model with INC..")
                 with HabanaMemoryProfiler() as m_inc:
-                    from neural_compressor.torch.quantization import FP8Config, convert, prepare
-                    config = FP8Config.from_json_file(os.getenv("QUANT_CONFIG", ""))
+                    from neural_compressor.torch.quantization import (
+                        FP8Config, convert, prepare)
+                    config = FP8Config.from_json_file(
+                        os.getenv("QUANT_CONFIG", ""))
                     if config.measure:
                         self.model = prepare(self.model, config)
                     elif config.quantize:
                         self.model = convert(self.model, config)
-                    htcore.hpu_initialize(self.model, mark_only_scales_as_const=True)
-                logger.info(f"Preparing model with INC took {m_inc.get_summary_string()}")
+                    htcore.hpu_initialize(self.model,
+                                          mark_only_scales_as_const=True)
+                logger.info("Preparing model with INC took %s",
+                            m_inc.get_summary_string())
             else:
                 self.model = self.model.to("hpu")
             htcore.mark_step()
@@ -1425,7 +1431,10 @@ def execute_model(
         if multi_modal_input is not None:
             execute_model_kwargs.update(multi_modal_input)
         if htorch.utils.internal.is_lazy():
-            execute_model_kwargs.update({"bypass_hpu_graphs":not use_graphs, "warmup_mode":warmup_mode})
+            execute_model_kwargs.update({
+                "bypass_hpu_graphs": not use_graphs,
+                "warmup_mode": warmup_mode
+            })
 
         htorch.core.mark_step()
         if self.is_driver_worker:
@@ -1439,8 +1448,8 @@ def execute_model(
         with self.profiler.record_event('internal', model_event_name):
             hidden_states = self.model.forward(
                 **execute_model_kwargs,
-                selected_token_indices=sampling_metadata.
-                selected_token_indices)
+                selected_token_indices=sampling_metadata.selected_token_indices
+            )
 
         # Compute the logits.
         with self.profiler.record_event(
@@ -1485,12 +1494,13 @@ def execute_model(
 
     def shutdown_inc(self):
         print('inc shutdown')
-        if model_config := getattr(self, "model_config", None):
-            if getattr(model_config, "quantization", None) == 'inc':
-                print('inc shutdown start')
-                from neural_compressor.torch.quantization import finalize_calibration
-                finalize_calibration(self.model.model)
-                print('inc shutdown')
+        if (model_config := getattr(self, "model_config", None)) and \
+                getattr(model_config, "quantization", None) == 'inc':
+            print('inc shutdown start')
+            from neural_compressor.torch.quantization import (
+                finalize_calibration)
+            finalize_calibration(self.model.model)
+            print('inc shutdown')
 
     def __del__(self):
         self.shutdown_inc()
diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py
index 73278162dce03..87122c03d3c8f 100644
--- a/vllm/worker/habana_worker.py
+++ b/vllm/worker/habana_worker.py
@@ -118,9 +118,6 @@ def init_device(self) -> None:
         set_random_seed(self.model_config.seed)
 
     def load_model(self):
-        if self.model_config.quantization == 'inc':
-            import habana_frameworks.torch.core as htcore
-            htcore.hpu_set_env()
         self.model_runner.load_model()
 
     @torch.inference_mode()