Hoist the `finalize_calibration` import in habana_model_runner.py out of
the bodies of finish_measurements() and shutdown_inc() and into the
import block near the top of the file.

NOTE(review): the added import sits between `import torch` and the vllm
imports, which looks like module scope. If so, neural_compressor must be
importable whenever this module is imported, not only when calibration
is finalized. Confirm that is acceptable for every deployment.

NOTE(review): this patch was recovered from a copy whose line breaks and
leading whitespace had been collapsed. Line boundaries and blank context
lines follow the hunk header counts. Indentation is reconstructed by
convention, so confirm it against the target file or apply with
`git apply --ignore-whitespace`.

diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index a975dba6f5136..5d1def001a1b7 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -16,6 +16,7 @@
 
 import habana_frameworks.torch as htorch
 import torch
+from neural_compressor.torch.quantization import finalize_calibration
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
@@ -1557,7 +1558,6 @@ def prepare_model_input(
                                        virtual_engine=virtual_engine)
 
     def finish_measurements(self):
-        from neural_compressor.torch.quantization import finalize_calibration
         finalize_calibration(self.model.model)
 
     @torch.inference_mode()
@@ -1680,8 +1680,6 @@ def shutdown_inc(self):
         if (model_config := getattr(self, "model_config", None)) and \
             getattr(model_config, "quantization", None) == 'inc':
             print('inc shutdown start')
-            from neural_compressor.torch.quantization import (
-                finalize_calibration)
             finalize_calibration(self.model.model)
             print('inc shutdown')
 