From 3e503d0606165975d07f6cba3a33e6ed4188f094 Mon Sep 17 00:00:00 2001 From: Dudi Lester Date: Wed, 7 Aug 2024 14:26:46 +0300 Subject: [PATCH 1/5] Support Mixtral quantization using INC --- vllm/hpu/ops.py | 84 ++++++++----- vllm/model_executor/layers/fused_moe/layer.py | 20 ++- .../model_executor/layers/quantization/inc.py | 116 ++++++++++++++++++ vllm/model_executor/model_loader/utils.py | 2 +- 4 files changed, 185 insertions(+), 37 deletions(-) create mode 100644 vllm/model_executor/layers/quantization/inc.py diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index c8f00c1cbd59d..a3dc2922e5b5f 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -88,37 +88,6 @@ def silu_and_mul(x: torch.Tensor) -> torch.Tensor: return F.silu(x[..., :d]) * x[..., d:] -def static_fused_moe(hidden_states, w1, w2, score, topk): - B, D = hidden_states.shape - num_experts = w1.shape[0] - routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, - topk, - dim=-1) - routing_weights /= routing_weights.sum(dim=-1, keepdim=True) - routing_weights = routing_weights.to(hidden_states.dtype) - final_hidden_states = torch.zeros((1, B, D), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights = torch.zeros((B, num_experts), - dtype=hidden_states.dtype, - device=hidden_states.device) - padded_weights.scatter_(-1, selected_experts, routing_weights) - padded_weights = padded_weights.reshape(-1, B, w1.shape[0]) - padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1) - - htorch.core.mark_step() - - for expert_idx in range(num_experts): - w_output = torch.matmul(hidden_states, w1[expert_idx].transpose(0, 1)) - w_output = silu_and_mul(w_output) - w_output = torch.matmul(w_output, w2[expert_idx].transpose(0, 1)) - final_hidden_states += w_output * padded_weights[expert_idx] - htorch.core.mark_step() - - return final_hidden_states.view(-1, D) - - @hpu_utils.with_mark_steps def prompt_attention( query: torch.Tensor, @@ -148,3 +117,56 @@ def prompt_attention( attn_weights = attn_weights.flatten(1, 2) attn_weights = attn_weights.transpose(1, 2) return attn_weights + + +class MoeMatmul(torch.nn.Module): + def __init__(self): + super().__init__() + + def set_weight(self, w): + self.weight = w + + def calc(self, state, expert_id, w): + self.weight = w[expert_id].transpose(0, 1) + return self.forward(state) + + def forward(self, state): + return torch.matmul(state, self.weight) + + +class StaticFusedMOE(torch.nn.Module): + def __init__(self, num_total_experts): + super().__init__() + self.w13_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) + self.w2_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) + self.num_total_experts = num_total_experts + + + def forward(self, hidden_states, w1, w2, score, topk): + B, D = hidden_states.shape + routing_weights = F.softmax(score, dim=1, dtype=torch.float32) + routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + final_hidden_states = torch.zeros((1, B, D), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights = torch.zeros((B, self.num_total_experts), + dtype=hidden_states.dtype, + device=hidden_states.device) + padded_weights.scatter_(-1, selected_experts, routing_weights) + padded_weights = padded_weights.reshape(-1, B, self.num_total_experts) + padded_weights = 
padded_weights.permute(2, 0, 1).unsqueeze(-1) + htorch.core.mark_step() + + for expert_idx in range(self.num_total_experts): + padded_weight = padded_weights[expert_idx] + current_state_static = hidden_states.reshape(-1, D) + w_output = self.w13_list[expert_idx].calc(current_state_static, expert_idx, w1) + w_output = silu_and_mul(w_output) + w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2) + current_hidden_states_static = w_output * padded_weight + final_hidden_states += current_hidden_states_static + htorch.core.mark_step() + + return final_hidden_states.view(-1, D) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index b49bf40d4746e..5ded0c2b5ea65 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -13,8 +13,6 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.utils import is_hpu -if is_hpu(): - from vllm.hpu.ops import static_fused_moe logger = init_logger(__name__) @@ -78,7 +76,7 @@ def apply( ) -> torch.Tensor: return self.forward(x, layer.w13_weight, layer.w2_weight, router_logits, top_k, renormalize, - use_grouped_topk, num_expert_group, topk_group) + use_grouped_topk, num_expert_group, topk_group, layer) def forward_cuda( self, @@ -91,6 +89,7 @@ def forward_cuda( use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int], + layer: Optional[torch.nn.Module], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.fused_moe import fused_moe return fused_moe(x, @@ -107,12 +106,12 @@ def forward_cuda( def forward_hpu(self, x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, router_logits: torch.Tensor, top_k: int, renormalize: bool, use_grouped_topk: bool, num_expert_group: Optional[int], - topk_group: Optional[int]): + topk_group: Optional[int], layer: Optional[torch.nn.Module],): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' - return static_fused_moe(x, w1, w2, router_logits, top_k) + return layer.hpu_static_fused_moe(x, w1, w2, router_logits, top_k) def forward_cpu(self, *args, **kwargs): raise NotImplementedError( @@ -129,6 +128,7 @@ def forward_tpu( use_grouped_topk: bool, num_expert_group: Optional[int], topk_group: Optional[int], + layer: Optional[torch.nn.Module], ) -> torch.Tensor: from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe assert not use_grouped_topk @@ -191,6 +191,9 @@ def __init__( assert num_expert_group is not None and topk_group is not None self.num_expert_group = num_expert_group self.topk_group = topk_group + if is_hpu(): + from vllm.hpu.ops import StaticFusedMOE + self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts) if quant_config is None: self.quant_method: Optional[QuantizeMethodBase] = ( @@ -207,6 +210,7 @@ def __init__( params_dtype=params_dtype, weight_loader=self.weight_loader) + def weight_loader(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, shard_id: int, expert_id: int): @@ -245,13 +249,19 @@ def weight_loader(self, param: torch.nn.Parameter, if shard_id == 0: param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] + if is_hpu(): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight(param_data[expert_id]) # w3, up_proj case: Load into second shard of w13. 
elif shard_id == 2: param_data[expert_id, shard_size:2 * shard_size, :] = loaded_weight[shard, :] + if is_hpu(): + self.hpu_static_fused_moe.w13_list[expert_id].set_weight(param_data[expert_id]) # w2, down_proj case: Load into only shard of w2. elif shard_id == 1: param_data[expert_id, :, :] = loaded_weight[:, shard] + if is_hpu(): + self.hpu_static_fused_moe.w2_list[expert_id].set_weight(param_data[expert_id]) else: raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py new file mode 100644 index 0000000000000..d2cca285670d8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/inc.py @@ -0,0 +1,116 @@ +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch.nn import Module +from torch.nn.parameter import Parameter +import torch.nn.functional as F + +from vllm import _custom_ops as ops +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) +from vllm.model_executor.utils import set_weight_attrs + +ACTIVATION_SCHEMES = ["static", "dynamic"] + +logger = init_logger(__name__) + + +class INCConfig(QuantizationConfig): + """Config class for FP8.""" + + def __init__( + self, + is_checkpoint_fp8_serialized: bool = False, + activation_scheme: str = "dynamic", + ) -> None: + self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized + if is_checkpoint_fp8_serialized: + logger.warning("Detected fp8 checkpoint. Please note that the " + "format is experimental and subject to change.") + if activation_scheme not in ACTIVATION_SCHEMES: + raise ValueError( + f"Unsupported activation scheme {activation_scheme}") + self.activation_scheme = activation_scheme + + @classmethod + def get_name(cls) -> str: + return "inc" + + @classmethod + def get_supported_act_dtypes(cls) -> List[torch.dtype]: + return [torch.bfloat16] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "INCConfig": + quant_method = cls.get_from_keys(config, ["quant_method"]) + is_checkpoint_fp8_serialized = ("fp8" in quant_method) + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + return cls(is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, + activation_scheme=activation_scheme) + + def get_quant_method(self, layer: torch.nn.Module, + prefix: str) -> Optional["INCLinearMethod"]: + if isinstance(layer, LinearBase): + return INCLinearMethod(self) + elif isinstance(layer, FusedMoE): + return UnquantizedFusedMoEMethod() + return None + + def get_scaled_act_names(self) -> List[str]: + return [] + + def get_min_capability(self) -> int: + # The AWQ kernel only supports Turing or newer GPUs. + return 75 + + @staticmethod + def get_config_filenames() -> List[str]: + return [] + +class INCLinearMethod(LinearMethodBase): + """Linear method for FP8. + Supports loading FP8 checkpoints with static weight scale and + dynamic/static activation scale. + Also supports loading quantized FP16/BF16 model checkpoints with dynamic + activation scaling. The weight scaling factor will be initialized after + the model weights are loaded. + Limitations: + 1. Only support per-tensor quantization due to torch._scaled_mm support. + 2. 
Only support float8_e4m3fn data type due to the limitation of + torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856) + + Args: + quant_config: The quantization config. + """ + + def __init__(self, quant_config: INCConfig, separate_bias_add: bool = False): + self.separate_bias_add = separate_bias_add + self.quant_config = quant_config + + def create_weights(self, layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + output_size_per_partition = sum(output_partition_sizes) + weight = Parameter(torch.empty(output_size_per_partition, + input_size_per_partition, + dtype=params_dtype), + requires_grad=False) + set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0}) + layer.register_parameter("weight", weight) + set_weight_attrs(weight, extra_weight_attrs) + + def apply(self, + layer: torch.nn.Module, + x: torch.Tensor, + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + weight = layer.weight + if self.separate_bias_add: + if bias is not None: + return F.linear(x, weight) + bias + return F.linear(x, weight) + return F.linear(x, weight, bias) \ No newline at end of file diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index f7e0f56c1a46e..a8b0a7b07ed8e 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -24,7 +24,7 @@ def get_model_architecture( # Special handling for quantized Mixtral. # FIXME(woosuk): This is a temporary hack. if (model_config.quantization is not None - and model_config.quantization != "fp8" + and model_config.quantization not in ["fp8", "inc"] and "MixtralForCausalLM" in architectures): architectures = ["QuantMixtralForCausalLM"] From 1a5fd1d4096384024acdc86ba610b5036e2de54b Mon Sep 17 00:00:00 2001 From: Roi Tiefenbrunn Date: Thu, 29 Aug 2024 12:21:35 +0300 Subject: [PATCH 2/5] Fix formatting errors --- vllm/hpu/ops.py | 17 ++++++++++++----- vllm/model_executor/layers/fused_moe/layer.py | 18 +++++++++++------- vllm/model_executor/layers/quantization/inc.py | 15 ++++++++------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py index ea0a9977bd4ca..673e3472ac8df 100644 --- a/vllm/hpu/ops.py +++ b/vllm/hpu/ops.py @@ -236,7 +236,9 @@ def dispatch_bgmv_embedding( out = out.squeeze(1) y += out * scale + class MoeMatmul(torch.nn.Module): + def __init__(self): super().__init__() @@ -252,17 +254,21 @@ def forward(self, state): class StaticFusedMOE(torch.nn.Module): + def __init__(self, num_total_experts): super().__init__() - self.w13_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) - self.w2_list = torch.nn.ModuleList([MoeMatmul() for _ in range(num_total_experts)]) + self.w13_list = torch.nn.ModuleList( + [MoeMatmul() for _ in range(num_total_experts)]) + self.w2_list = torch.nn.ModuleList( + [MoeMatmul() for _ in range(num_total_experts)]) self.num_total_experts = num_total_experts - def forward(self, hidden_states, w1, w2, score, topk): B, D = hidden_states.shape routing_weights = F.softmax(score, dim=1, dtype=torch.float32) - routing_weights, selected_experts = torch.topk(routing_weights, topk, dim=-1) + routing_weights, selected_experts = torch.topk(routing_weights, + topk, + dim=-1) routing_weights /= routing_weights.sum(dim=-1, keepdim=True) routing_weights = 
routing_weights.to(hidden_states.dtype) final_hidden_states = torch.zeros((1, B, D), @@ -278,7 +284,8 @@ def forward(self, hidden_states, w1, w2, score, topk): for expert_idx in range(self.num_total_experts): padded_weight = padded_weights[expert_idx] - w_output = self.w13_list[expert_idx].calc(hidden_states, expert_idx, w1) + w_output = self.w13_list[expert_idx].calc(hidden_states, + expert_idx, w1) w_output = silu_and_mul(w_output) w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2) final_hidden_states += w_output * padded_weight diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 5ded0c2b5ea65..23e2bbb5d9520 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -13,7 +13,6 @@ from vllm.model_executor.utils import set_weight_attrs from vllm.utils import is_hpu - logger = init_logger(__name__) @@ -76,7 +75,8 @@ def apply( ) -> torch.Tensor: return self.forward(x, layer.w13_weight, layer.w2_weight, router_logits, top_k, renormalize, - use_grouped_topk, num_expert_group, topk_group, layer) + use_grouped_topk, num_expert_group, topk_group, + layer) def forward_cuda( self, @@ -106,11 +106,13 @@ def forward_cuda( def forward_hpu(self, x: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, router_logits: torch.Tensor, top_k: int, renormalize: bool, use_grouped_topk: bool, num_expert_group: Optional[int], - topk_group: Optional[int], layer: Optional[torch.nn.Module],): + topk_group: Optional[int], + layer: Optional[torch.nn.Module]): assert not use_grouped_topk, 'use_grouped_topk must be False on HPU' assert num_expert_group is None, ('num_expert_group is ' 'not supported on HPU') assert topk_group is None, 'topk_group is not supported on HPU' + assert layer is not None, 'layer has to be provided on HP' return layer.hpu_static_fused_moe(x, w1, w2, router_logits, top_k) def forward_cpu(self, *args, **kwargs): @@ -210,7 +212,6 @@ def __init__( params_dtype=params_dtype, weight_loader=self.weight_loader) - def weight_loader(self, param: torch.nn.Parameter, loaded_weight: torch.Tensor, weight_name: str, shard_id: int, expert_id: int): @@ -250,18 +251,21 @@ def weight_loader(self, param: torch.nn.Parameter, param_data[expert_id, 0:shard_size, :] = loaded_weight[shard, :] if is_hpu(): - self.hpu_static_fused_moe.w13_list[expert_id].set_weight(param_data[expert_id]) + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + param_data[expert_id]) # w3, up_proj case: Load into second shard of w13. elif shard_id == 2: param_data[expert_id, shard_size:2 * shard_size, :] = loaded_weight[shard, :] if is_hpu(): - self.hpu_static_fused_moe.w13_list[expert_id].set_weight(param_data[expert_id]) + self.hpu_static_fused_moe.w13_list[expert_id].set_weight( + param_data[expert_id]) # w2, down_proj case: Load into only shard of w2. 
elif shard_id == 1: param_data[expert_id, :, :] = loaded_weight[:, shard] if is_hpu(): - self.hpu_static_fused_moe.w2_list[expert_id].set_weight(param_data[expert_id]) + self.hpu_static_fused_moe.w2_list[expert_id].set_weight( + param_data[expert_id]) else: raise ValueError( f"Shard id must be in [0,1,2] but got {shard_id}") diff --git a/vllm/model_executor/layers/quantization/inc.py b/vllm/model_executor/layers/quantization/inc.py index d246e8b0130dd..ec0141b61f58f 100644 --- a/vllm/model_executor/layers/quantization/inc.py +++ b/vllm/model_executor/layers/quantization/inc.py @@ -1,14 +1,13 @@ -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional import torch -from torch.nn import Module -from torch.nn.parameter import Parameter import torch.nn.functional as F +from torch.nn.parameter import Parameter -from vllm import _custom_ops as ops from vllm.logger import init_logger +from vllm.model_executor.layers.fused_moe.layer import ( + FusedMoE, UnquantizedFusedMoEMethod) from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase -from vllm.model_executor.layers.fused_moe.layer import FusedMoE, UnquantizedFusedMoEMethod from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.utils import set_weight_attrs @@ -56,7 +55,7 @@ def get_quant_method(self, layer: torch.nn.Module, if isinstance(layer, LinearBase): return INCLinearMethod(self) elif isinstance(layer, FusedMoE): - return UnquantizedFusedMoEMethod() + return UnquantizedFusedMoEMethod() return None def get_scaled_act_names(self) -> List[str]: @@ -88,7 +87,9 @@ class INCLinearMethod(LinearMethodBase): quant_config: The quantization config. """ - def __init__(self, quant_config: INCConfig, separate_bias_add: bool = False): + def __init__(self, + quant_config: INCConfig, + separate_bias_add: bool = False): self.separate_bias_add = separate_bias_add self.quant_config = quant_config From ca145799e4917ac190d8d13cdfa515128970e6f0 Mon Sep 17 00:00:00 2001 From: Roi Tiefenbrunn Date: Sun, 1 Sep 2024 13:14:11 +0300 Subject: [PATCH 3/5] Fix HabanaExecutorAsync bug when no driver_worker initialized --- vllm/executor/habana_executor.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index baeaec5afa371..dbd6c35d3afd7 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -191,7 +191,8 @@ def check_health(self) -> None: return def shutdown(self) -> None: - self.driver_worker.shutdown_inc() + if hasattr(self, "driver_worker") and self.driver_worker is not None: + self.driver_worker.shutdown_inc() def __del__(self): self.shutdown() From e7106dc6f1d1ae6ddd8a0ca8f74895da7c3493aa Mon Sep 17 00:00:00 2001 From: Roi Tiefenbrunn Date: Mon, 2 Sep 2024 11:15:57 +0300 Subject: [PATCH 4/5] Remove HabanaModelRunner D'TOR for mixtral run --- vllm/worker/habana_model_runner.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index a975dba6f5136..4761d50f4ca0e 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -16,6 +16,7 @@ import habana_frameworks.torch as htorch import torch +from neural_compressor.torch.quantization import finalize_calibration from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, @@ -1557,7 
+1558,6 @@ def prepare_model_input( virtual_engine=virtual_engine) def finish_measurements(self): - from neural_compressor.torch.quantization import finalize_calibration finalize_calibration(self.model.model) @torch.inference_mode() @@ -1680,9 +1680,7 @@ def shutdown_inc(self): if (model_config := getattr(self, "model_config", None)) and \ getattr(model_config, "quantization", None) == 'inc': print('inc shutdown start') - from neural_compressor.torch.quantization import ( - finalize_calibration) - finalize_calibration(self.model.model) + #finalize_calibration(self.model.model) print('inc shutdown') def __del__(self): From c6b132e6d36a675ecd4352c30e908fbdd782b3c8 Mon Sep 17 00:00:00 2001 From: Roi Tiefenbrunn Date: Tue, 3 Sep 2024 17:48:17 +0300 Subject: [PATCH 5/5] Revert Removal of call to finalize_calibration in HabanaModelRunner D'TOR --- vllm/worker/habana_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index 4761d50f4ca0e..5d1def001a1b7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -1680,7 +1680,7 @@ def shutdown_inc(self): if (model_config := getattr(self, "model_config", None)) and \ getattr(model_config, "quantization", None) == 'inc': print('inc shutdown start') - #finalize_calibration(self.model.model) + finalize_calibration(self.model.model) print('inc shutdown') def __del__(self):
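
---
For reference, a minimal usage sketch of the quantization path this series adds. It is not part of the patches: the model name, prompt, and sampling settings are illustrative assumptions, and on Gaudi the usual INC measurement/calibration configuration (not shown in this series) is still expected. The sketch only illustrates how the new "inc" quantization method is selected, so that INCConfig and the HPU StaticFusedMOE path are exercised instead of QuantMixtralForCausalLM.

    from vllm import LLM, SamplingParams

    # Assumed invocation, not taken from the patches. Passing quantization="inc"
    # routes linear layers through INCLinearMethod and keeps FusedMoE on
    # UnquantizedFusedMoEMethod (INCConfig.get_quant_method), and the
    # get_model_architecture change keeps MixtralForCausalLM instead of
    # switching to QuantMixtralForCausalLM.
    llm = LLM(
        model="mistralai/Mixtral-8x7B-Instruct-v0.1",  # illustrative model choice
        quantization="inc",
        dtype="bfloat16",  # INCConfig.get_supported_act_dtypes() lists torch.bfloat16
    )
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.0, max_tokens=32))
    print(outputs[0].outputs[0].text)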