Commit

format.sh
kzawora-intel committed Aug 13, 2024
1 parent ed414dc commit ceca996
Showing 5 changed files with 15 additions and 8 deletions.
vllm/hpu/utils.py (6 changes: 4 additions & 2 deletions)

@@ -17,11 +17,13 @@ def with_mark_steps(fn):
 
     @wraps(fn)
     def wrapped(*args, **kwargs):
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         result = fn(*args, **kwargs)
         del args
         del kwargs
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         return result
 
     return wrapped
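
For readers without Gaudi hardware, here is a minimal standalone sketch of the pattern this hunk introduces: the graph-break calls (htorch.core.mark_step()) are skipped when running against a CPU-only "fake HPU". The is_fake_hpu() helper and the VLLM_USE_FAKE_HPU environment variable below are illustrative assumptions, not the exact definitions from vllm.utils.

import os
from functools import wraps


def is_fake_hpu() -> bool:
    # Assumed helper: treat the device as a fake HPU when this env var is set.
    return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'


def with_mark_steps(fn):
    """Insert htorch.core.mark_step() around `fn`, skipping it on fake HPU."""

    @wraps(fn)
    def wrapped(*args, **kwargs):
        if not is_fake_hpu():
            # Only touch the Habana runtime when a real HPU is present;
            # on a plain CPU machine this import would fail.
            import habana_frameworks.torch as htorch
            htorch.core.mark_step()
        result = fn(*args, **kwargs)
        del args, kwargs
        if not is_fake_hpu():
            import habana_frameworks.torch as htorch
            htorch.core.mark_step()
        return result

    return wrapped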
vllm/model_executor/models/opt.py (2 changes: 1 addition & 1 deletion)

@@ -100,6 +100,7 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
+        # import pdb; pdb.set_trace()
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)

@@ -254,7 +255,6 @@ def forward(
         if self.project_in is not None:
             inputs_embeds, _ = self.project_in(inputs_embeds)
         hidden_states = inputs_embeds + pos_embeds
-
         for i in range(len(self.layers)):
             layer = self.layers[i]
             hidden_states = layer(hidden_states, kv_caches[i], attn_metadata)

vllm/worker/cache_engine.py (4 changes: 2 additions & 2 deletions)

@@ -6,7 +6,7 @@
 from vllm.attention import get_attn_backend
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size,
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu,
                         is_pin_memory_available)
 
 logger = init_logger(__name__)

@@ -78,7 +78,7 @@ def _allocate_kv_cache(
         pin_memory = is_pin_memory_available() if device == "cpu" else False
         kv_cache: List[torch.Tensor] = []
         for _ in range(self.num_attention_layers):
-            if device == 'hpu':
+            if device == 'hpu' or is_fake_hpu():
                 key_cache = torch.zeros(kv_cache_shape,
                                         dtype=self.dtype,
                                         device=device)
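
As a standalone illustration of the allocation path this hunk touches: on a real or fake HPU, the K and V caches are zero-initialized per layer. The hunk is cut off after the key-cache allocation, so the value-cache line and the non-HPU fallback below are reconstructed assumptions (the torch.empty branch mirrors upstream vLLM rather than anything shown in this diff).

from typing import List, Tuple, Union

import torch

KVCache = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]


def allocate_kv_cache(num_attention_layers: int,
                      kv_cache_shape: Tuple[int, ...],
                      dtype: torch.dtype,
                      device: str,
                      fake_hpu: bool) -> List[KVCache]:
    # Sketch of the per-layer allocation loop from _allocate_kv_cache.
    kv_cache: List[KVCache] = []
    for _ in range(num_attention_layers):
        if device == 'hpu' or fake_hpu:
            # Zero-initialized K/V tensors; on fake HPU `device` is assumed
            # to resolve to 'cpu', so this also runs without Gaudi hardware.
            key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
            value_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
            kv_cache.append((key_cache, value_cache))
        else:
            # Fallback branch is outside the hunk; torch.empty is an assumption.
            kv_cache.append(
                torch.empty(kv_cache_shape, dtype=dtype, device=device))
    return kv_cache


# Example: 2 layers, 8 blocks of 128 slots, 8 KV heads, head size 64, on CPU.
caches = allocate_kv_cache(2, (8, 128, 8, 64), torch.bfloat16, 'cpu', True)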
vllm/worker/habana_model_runner.py (6 changes: 4 additions & 2 deletions)

@@ -1059,7 +1059,8 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt,
         for _ in range(times):
             inputs = self.prepare_model_input(seqs)
             self.execute_model(inputs, kv_caches)
-            torch.hpu.synchronize()
+            if not is_fake_hpu():
+                torch.hpu.synchronize()
         self.profiler.end()
         gc.collect()
 
@@ -1145,7 +1146,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
         self.warmup_all_buckets(self.prompt_buckets, True, kv_caches)
         self.warmup_all_buckets(self.decode_buckets, False, kv_caches)
 
-        if not self.enforce_eager and htorch.utils.internal.is_lazy():
+        if not is_fake_hpu(
+        ) and not self.enforce_eager and htorch.utils.internal.is_lazy():
             assert self.mem_margin is not None, \
                 ("HabanaWorker.determine_num_available_blocks needs "
                  "to be called before warming up the model.")

vllm/worker/habana_worker.py (5 changes: 4 additions & 1 deletion)

@@ -132,7 +132,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         if is_fake_hpu():
-            return 128, 0
+            # self.model_runner.profile_run()
+            cache_block_size = self.get_cache_block_size_bytes()
+            fake_hpu_cache_alloc = 4 * 2**30  # take 4 GiB flat on fake hpu
+            return fake_hpu_cache_alloc // cache_block_size, 0
         with HabanaMemoryProfiler() as m:
             self.model_runner.profile_run()
             torch.hpu.synchronize()
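
A quick worked example of what the new fake-HPU branch returns: the flat 4 GiB budget is divided by the per-block KV-cache size to get the number of device blocks, with CPU swap blocks left at 0. The model dimensions below are made-up example values, not taken from this commit.

def cache_block_size_bytes(block_size: int, num_layers: int, num_kv_heads: int,
                           head_size: int, dtype_bytes: int) -> int:
    # K and V tensors for every layer, for every token slot in one block.
    return 2 * num_layers * block_size * num_kv_heads * head_size * dtype_bytes


fake_hpu_cache_alloc = 4 * 2**30  # 4 GiB flat budget, as hard-coded in the diff

# Hypothetical model: 32 layers, 8 KV heads of size 128, bf16, 128-token blocks.
block_bytes = cache_block_size_bytes(block_size=128, num_layers=32,
                                     num_kv_heads=8, head_size=128,
                                     dtype_bytes=2)
print(block_bytes)                          # 16777216 bytes (16 MiB) per block
print(fake_hpu_cache_alloc // block_bytes)  # 256 device blocks, plus 0 CPU blocks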
