Commit

format.sh
kzawora-intel committed Aug 13, 2024
1 parent ed414dc commit ceca996
Showing 5 changed files with 15 additions and 8 deletions.
vllm/hpu/utils.py (6 changes: 4 additions & 2 deletions)

@@ -17,11 +17,13 @@ def with_mark_steps(fn):
 
     @wraps(fn)
     def wrapped(*args, **kwargs):
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         result = fn(*args, **kwargs)
         del args
         del kwargs
-        htorch.core.mark_step()
+        if not is_fake_hpu():
+            htorch.core.mark_step()
         return result
 
     return wrapped
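
For readers without Gaudi hardware, here is a minimal standalone sketch of the pattern this hunk introduces: the graph-break calls (htorch.core.mark_step()) are skipped when running against a CPU-only "fake HPU". The is_fake_hpu() helper and the VLLM_USE_FAKE_HPU environment variable below are illustrative assumptions, not the exact definitions from vllm.utils.

import os
from functools import wraps


def is_fake_hpu() -> bool:
    # Assumed helper: treat the device as a fake HPU when this env var is set.
    return os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0'


def with_mark_steps(fn):
    """Insert htorch.core.mark_step() around `fn`, skipping it on fake HPU."""

    @wraps(fn)
    def wrapped(*args, **kwargs):
        if not is_fake_hpu():
            # Only touch the Habana runtime when a real HPU is present;
            # on a plain CPU machine this import would fail.
            import habana_frameworks.torch as htorch
            htorch.core.mark_step()
        result = fn(*args, **kwargs)
        del args, kwargs
        if not is_fake_hpu():
            import habana_frameworks.torch as htorch
            htorch.core.mark_step()
        return result

    return wrapped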
vllm/model_executor/models/opt.py (2 changes: 1 addition & 1 deletion)

@@ -100,6 +100,7 @@ def forward(
         kv_cache: torch.Tensor,
         attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
+        # import pdb; pdb.set_trace()
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.chunk(chunks=3, dim=-1)
         attn_output = self.attn(q, k, v, kv_cache, attn_metadata)

@@ -254,7 +255,6 @@ def forward(
         if self.project_in is not None:
             inputs_embeds, _ = self.project_in(inputs_embeds)
         hidden_states = inputs_embeds + pos_embeds
-
         for i in range(len(self.layers)):
             layer = self.layers[i]
             hidden_states = layer(hidden_states, kv_caches[i], attn_metadata)

vllm/worker/cache_engine.py (4 changes: 2 additions & 2 deletions)

@@ -6,7 +6,7 @@
 from vllm.attention import get_attn_backend
 from vllm.config import CacheConfig, DeviceConfig, ModelConfig, ParallelConfig
 from vllm.logger import init_logger
-from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size,
+from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, get_dtype_size, is_fake_hpu,
                         is_pin_memory_available)
 
 logger = init_logger(__name__)

@@ -78,7 +78,7 @@ def _allocate_kv_cache(
         pin_memory = is_pin_memory_available() if device == "cpu" else False
         kv_cache: List[torch.Tensor] = []
         for _ in range(self.num_attention_layers):
-            if device == 'hpu':
+            if device == 'hpu' or is_fake_hpu():
                 key_cache = torch.zeros(kv_cache_shape,
                                         dtype=self.dtype,
                                         device=device)
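
As a standalone illustration of the allocation path this hunk touches: on a real or fake HPU, the K and V caches are zero-initialized per layer. The hunk is cut off after the key-cache allocation, so the value-cache line and the non-HPU fallback below are reconstructed assumptions (the torch.empty branch mirrors upstream vLLM rather than anything shown in this diff).

from typing import List, Tuple, Union

import torch

KVCache = Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]


def allocate_kv_cache(num_attention_layers: int,
                      kv_cache_shape: Tuple[int, ...],
                      dtype: torch.dtype,
                      device: str,
                      fake_hpu: bool) -> List[KVCache]:
    # Sketch of the per-layer allocation loop from _allocate_kv_cache.
    kv_cache: List[KVCache] = []
    for _ in range(num_attention_layers):
        if device == 'hpu' or fake_hpu:
            # Zero-initialized K/V tensors; on fake HPU `device` is assumed
            # to resolve to 'cpu', so this also runs without Gaudi hardware.
            key_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
            value_cache = torch.zeros(kv_cache_shape, dtype=dtype, device=device)
            kv_cache.append((key_cache, value_cache))
        else:
            # Fallback branch is outside the hunk; torch.empty is an assumption.
            kv_cache.append(
                torch.empty(kv_cache_shape, dtype=dtype, device=device))
    return kv_cache


# Example: 2 layers, 8 blocks of 128 slots, 8 KV heads, head size 64, on CPU.
caches = allocate_kv_cache(2, (8, 128, 8, 64), torch.bfloat16, 'cpu', True)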
vllm/worker/habana_model_runner.py (6 changes: 4 additions & 2 deletions)

@@ -1059,7 +1059,8 @@ def warmup_scenario(self, batch_size, seq_len, is_prompt,
         for _ in range(times):
             inputs = self.prepare_model_input(seqs)
             self.execute_model(inputs, kv_caches)
-            torch.hpu.synchronize()
+            if not is_fake_hpu():
+                torch.hpu.synchronize()
         self.profiler.end()
         gc.collect()
 
@@ -1145,7 +1146,8 @@ def warmup_model(self, kv_caches: List[torch.Tensor]) -> None:
         self.warmup_all_buckets(self.prompt_buckets, True, kv_caches)
         self.warmup_all_buckets(self.decode_buckets, False, kv_caches)
 
-        if not self.enforce_eager and htorch.utils.internal.is_lazy():
+        if not is_fake_hpu(
+        ) and not self.enforce_eager and htorch.utils.internal.is_lazy():
             assert self.mem_margin is not None, \
                 ("HabanaWorker.determine_num_available_blocks needs "
                  "to be called before warming up the model.")

vllm/worker/habana_worker.py (5 changes: 4 additions & 1 deletion)

@@ -132,7 +132,10 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         if is_fake_hpu():
-            return 128, 0
+            # self.model_runner.profile_run()
+            cache_block_size = self.get_cache_block_size_bytes()
+            fake_hpu_cache_alloc = 4 * 2**30  # take 4 GiB flat on fake hpu
+            return fake_hpu_cache_alloc // cache_block_size, 0
         with HabanaMemoryProfiler() as m:
             self.model_runner.profile_run()
             torch.hpu.synchronize()
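
A quick worked example of what the new fake-HPU branch returns: the flat 4 GiB budget is divided by the per-block KV-cache size to get the number of device blocks, with CPU swap blocks left at 0. The model dimensions below are made-up example values, not taken from this commit.

def cache_block_size_bytes(block_size: int, num_layers: int, num_kv_heads: int,
                           head_size: int, dtype_bytes: int) -> int:
    # K and V tensors for every layer, for every token slot in one block.
    return 2 * num_layers * block_size * num_kv_heads * head_size * dtype_bytes


fake_hpu_cache_alloc = 4 * 2**30  # 4 GiB flat budget, as hard-coded in the diff

# Hypothetical model: 32 layers, 8 KV heads of size 128, bf16, 128-token blocks.
block_bytes = cache_block_size_bytes(block_size=128, num_layers=32,
                                     num_kv_heads=8, head_size=128,
                                     dtype_bytes=2)
print(block_bytes)                          # 16777216 bytes (16 MiB) per block
print(fake_hpu_cache_alloc // block_bytes)  # 256 device blocks, plus 0 CPU blocks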
