From 1a3989eda4966ade46095ee64a7304f940c6b9b6 Mon Sep 17 00:00:00 2001
From: Kebe
Date: Thu, 16 May 2024 09:19:38 +0000
Subject: [PATCH] Add input and output tokens to response

Expose per-request token counts as two new INT32 model outputs
("input_tokens", "output_tokens") next to the existing "text_output".
Both are emitted as 1-D length-1 arrays so their rank matches the
"dims": [-1] declared for them in auto_complete_config.
---
 src/model.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/model.py b/src/model.py
index c9517208..c6ff2c90 100644
--- a/src/model.py
+++ b/src/model.py
@@ -69,7 +69,11 @@ def auto_complete_config(auto_complete_model_config):
             "optional": True,
         },
     ]
-    outputs = [{"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]}]
+    outputs = [
+        {"name": "text_output", "data_type": "TYPE_STRING", "dims": [-1]},
+        {"name": "input_tokens", "data_type": "TYPE_INT32", "dims": [-1]},
+        {"name": "output_tokens", "data_type": "TYPE_INT32", "dims": [-1]},
+    ]
 
     # Store the model configuration as a dictionary.
     config = auto_complete_model_config.as_dict()
@@ -108,6 +112,14 @@ def initialize(self, args):
             self.model_config, "text_output"
         )
         self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
+        output_tokens_config = pb_utils.get_output_config_by_name(
+            self.model_config, "output_tokens"
+        )
+        self.output_tokens_dtype = pb_utils.triton_string_to_numpy(output_tokens_config["data_type"])
+        input_tokens_config = pb_utils.get_output_config_by_name(
+            self.model_config, "input_tokens"
+        )
+        self.input_tokens_dtype = pb_utils.triton_string_to_numpy(input_tokens_config["data_type"])
 
         # Prepare vLLM engine
         self.init_engine()
@@ -313,10 +325,17 @@ def create_response(self, vllm_output, prepend_input):
         text_outputs = [
            (prompt + output.text).encode("utf-8") for output in vllm_output.outputs
         ]
+        output_tokens = sum(len(output.token_ids) for output in vllm_output.outputs)
         triton_output_tensor = pb_utils.Tensor(
-            "text_output", np.asarray(text_outputs, dtype=self.output_dtype)
+            "text_output", np.asarray(text_outputs, dtype=self.output_dtype),
         )
-        return pb_utils.InferenceResponse(output_tensors=[triton_output_tensor])
+        triton_tokens_tensor = pb_utils.Tensor(
+            "output_tokens", np.asarray([output_tokens], dtype=self.output_tokens_dtype),
+        )
+        triton_input_tokens_tensor = pb_utils.Tensor(
+            "input_tokens", np.asarray([len(vllm_output.prompt_token_ids)], dtype=self.input_tokens_dtype),
+        )
+        return pb_utils.InferenceResponse(output_tensors=[triton_output_tensor, triton_tokens_tensor, triton_input_tokens_tensor])
 
     def create_stream_response(self, vllm_output, previous_outputs_lengths):
         """