diff --git a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
index ad0cf990..8284835b 100644
--- a/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
+++ b/ci/L0_backend_vllm/metrics_test/vllm_metrics_test.py
@@ -70,12 +70,10 @@ def get_metrics(self):
         return vllm_dict
 
-    def vllm_async_stream_infer(
+    def vllm_infer(
         self,
         prompts,
         sampling_parameters,
-        stream,
-        send_parameters_as_tensor,
         model_name,
     ):
         """
@@ -89,15 +87,15 @@ def vllm_async_stream_infer(
             request_data = create_vllm_request(
                 prompts[i],
                 i,
-                stream,
+                False,
                 sampling_parameters,
                 model_name,
-                send_parameters_as_tensor,
+                True,
             )
             self.triton_client.async_stream_infer(
                 model_name=model_name,
-                request_id=request_data["request_id"],
                 inputs=request_data["inputs"],
+                request_id=request_data["request_id"],
                 outputs=request_data["outputs"],
                 parameters=sampling_parameters,
             )
@@ -121,11 +119,9 @@ def test_vllm_metrics(self):
         }
 
         # Test vLLM metrics
-        self.vllm_async_stream_infer(
+        self.vllm_infer(
            prompts=self.prompts,
            sampling_parameters=self.sampling_parameters,
-            stream=False,
-            send_parameters_as_tensor=True,
            model_name=self.vllm_model_name,
         )
         expected_metrics_dict["vllm:prompt_tokens_total"] = 18
diff --git a/src/model.py b/src/model.py
index 650faccd..5e77e602 100644
--- a/src/model.py
+++ b/src/model.py
@@ -168,7 +168,7 @@ def init_engine(self):
         )
 
         # Add vLLM custom metrics
-        if not self.metrics:
+        if self.metrics:
             self.llm_engine.add_logger("triton", self.metrics)
 
     def setup_lora(self):
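
Note on the `src/model.py` hunk: the previous guard `if not self.metrics:` registered the vLLM stats logger only when no metrics object existed, which inverted the intended behavior. Below is a minimal sketch of the corrected flow, assuming `self.metrics` is set to a metrics object when metrics collection is enabled and left as `None` otherwise; `build_metrics_object()` and `metrics_enabled` are hypothetical placeholders, not names from the source.

```python
# Sketch only: illustrates the intent of the corrected guard in init_engine().
# build_metrics_object() and metrics_enabled are assumed placeholders.
self.metrics = build_metrics_object() if metrics_enabled else None

if self.metrics:
    # Attach the Triton stats logger only when a metrics object exists.
    self.llm_engine.add_logger("triton", self.metrics)
```

The test-side change mirrors the same simplification: `vllm_infer` now hardcodes `stream=False` and `send_parameters_as_tensor=True` when building requests, so `test_vllm_metrics` no longer passes those flags explicitly.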