From ceca996f7734381b9eafa098af705105d8639e47 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Tue, 13 Aug 2024 16:30:02 +0300
Subject: [PATCH] format.sh

---
 vllm/hpu/utils.py                  | 6 ++++--
 vllm/model_executor/models/opt.py  | 2 +-
 vllm/worker/cache_engine.py        | 4 ++--
 vllm/worker/habana_model_runner.py | 6 ++++--
 vllm/worker/habana_worker.py       | 5 ++++-
 5 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py
index 2092eb3b99ad8..0d7e92351714a 100644
--- a/vllm/hpu/utils.py
+++ b/vllm/hpu/utils.py
@@ -17,11 +17,13 @@ def with_mark_steps(fn):
 
     @wraps(fn)
     def wrapped(*args, **kwargs):
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         result = fn(*args, **kwargs)
         del args
         del kwargs
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         return result
 
     return wrapped
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index a05090cd46648..aa65bb2625fc0 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -100,6 +100,7 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
+        # import pdb; pdb.set_trace()
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
@@ -254,7 +255,6 @@ def forward(
         if self.project_in is not None:
             inputs_embeds, _ = self.project_in(inputs_embeds)
         hidden_states = inputs_embeds + pos_embeds
-
         for i in range(len(self.layers)):
             layer = self.layers[i]
             hidden_states = layer(hidden_states, kv_caches[i], attn_metadata)
diff --git a/vllm/worker/cache_engine.py b/vllm/worker/cache_engine.py
index 93be2f4c321fe..950b896c3b1b6 100644
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@@ -6,7 +6,7 @@
 from vllm.attention import get_attn_backend
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size,
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu,
                         is_pin_memory_available)
 
 logger = init_logger(__name__)
@@ -78,7 +78,7 @@ def _allocate_kv_cache(
         pin_memory = is_pin_memory_available() if device == "cpu" else False
         kv_cache: List[torch.Tensor] = []
         for _ in range(self.num_attention_layers):
-            if device == 'hpu':
+            if device == 'hpu' or is_fake_hpu():
                 key_cache = torch.zeros(kv_cache_shape,
                                         dtype=self.dtype,
                                         device=device)
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 6d06ffbc00ba4..0527310ff32c9 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -1059,7 +1059,8 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt,
         for _ in range(times):
             inputs = self.prepare_model_input(seqs)
             self.execute_model(inputs, kv_caches)
-            torch.hpu.synchronize()
+            if not is_fake_hpu():
+                torch.hpu.synchronize()
         self.profiler.end()
         gc.collect()
 
@@ -1145,7 +1146,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
         self.warmup_all_buckets(self.prompt_buckets, True, kv_caches)
         self.warmup_all_buckets(self.decode_buckets, False, kv_caches)
 
-        if not self.enforce_eager and htorch.utils.internal.is_lazy():
+        if not is_fake_hpu(
+        ) and not self.enforce_eager and htorch.utils.internal.is_lazy():
             assert self.mem_margin is not None, \
                 ("HabanaWorker.determine_num_available_blocks needs "
                  "to be called before warming up the model.")
diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py
index d3df7c026a8d0..5e3b48dc70356 100644
--- a/vllm/worker/habana_worker.py
+++ b/vllm/worker/habana_worker.py
@@ -132,7 +132,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         if is_fake_hpu():
-            return 128, 0
+            # self.model_runner.profile_run()
+            cache_block_size = self.get_cache_block_size_bytes()
+            fake_hpu_cache_alloc = 4 * 2**30  # take 4 GiB flat on fake hpu
+            return fake_hpu_cache_alloc // cache_block_size, 0
         with HabanaMemoryProfiler() as m:
             self.model_runner.profile_run()
             torch.hpu.synchronize()