
Commit

resolving merge conflict
KrishnanPrash committed Aug 18, 2024
2 parents dd10a84 + 9598a80 commit f7067f5
Showing 16 changed files with 628 additions and 62 deletions.
14 changes: 6 additions & 8 deletions CMakeLists.txt
@@ -221,15 +221,13 @@ if(NOT TRITON_CORE_HEADERS_ONLY)
# Some libs are installed to ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib64 instead
# of ${TRITON_THIRD_PARTY_INSTALL_PREFIX}/{LIB}/lib on Centos
set (LIB_DIR "lib")
# /etc/os-release does not exist on Windows
if(EXISTS "/etc/os-release")
file(STRINGS /etc/os-release DISTRO REGEX "^NAME=")
string(REGEX REPLACE "NAME=\"(.*)\"" "\\1" DISTRO "${DISTRO}")
message(STATUS "Distro Name: ${DISTRO}")
if(DISTRO MATCHES "CentOS.*")
if(LINUX)
file(STRINGS "/etc/os-release" DISTRO_ID_LIKE REGEX "ID_LIKE")
if(${DISTRO_ID_LIKE} MATCHES "rhel|centos")
set (LIB_DIR "lib64")
endif()
endif()
endif(${DISTRO_ID_LIKE} MATCHES "rhel|centos")
endif(LINUX)
set(TRITON_CORE_HEADERS_ONLY OFF)

# Need to use ExternalProject for our builds so that we can get the
# correct dependencies between Triton shared library components and
78 changes: 75 additions & 3 deletions include/triton/core/tritonserver.h
@@ -64,6 +64,7 @@ struct TRITONSERVER_Server;
struct TRITONSERVER_ServerOptions;
struct TRITONSERVER_Metric;
struct TRITONSERVER_MetricFamily;
struct TRITONSERVER_MetricArgs;

///
/// TRITONSERVER API Version
Expand Down Expand Up @@ -91,7 +92,7 @@ struct TRITONSERVER_MetricFamily;
/// }
///
#define TRITONSERVER_API_VERSION_MAJOR 1
#define TRITONSERVER_API_VERSION_MINOR 33
#define TRITONSERVER_API_VERSION_MINOR 34

/// Get the TRITONBACKEND API version supported by the Triton shared
/// library. This value can be compared against the
@@ -2615,7 +2616,8 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_ServerInferAsync(
///
typedef enum TRITONSERVER_metrickind_enum {
TRITONSERVER_METRIC_KIND_COUNTER,
TRITONSERVER_METRIC_KIND_GAUGE
TRITONSERVER_METRIC_KIND_GAUGE,
TRITONSERVER_METRIC_KIND_HISTOGRAM
} TRITONSERVER_MetricKind;

/// Create a new metric family object. The caller takes ownership of the
@@ -2644,6 +2646,44 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricFamilyNew(
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
TRITONSERVER_MetricFamilyDelete(struct TRITONSERVER_MetricFamily* family);

/// Get the TRITONSERVER_MetricKind of the metric family.
///
/// \param family The metric family object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
TRITONSERVER_GetMetricFamilyKind(
struct TRITONSERVER_MetricFamily* family, TRITONSERVER_MetricKind* kind);
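As a minimal sketch (editor's illustration, not part of this diff), a caller might query a family's kind before deciding which update call applies; `family` is assumed to come from an earlier TRITONSERVER_MetricFamilyNew call and error handling is elided:

// Sketch, assuming #include "triton/core/tritonserver.h" and an existing
// `family` handle created with TRITONSERVER_MetricFamilyNew.
TRITONSERVER_MetricKind kind;
TRITONSERVER_Error* err = TRITONSERVER_GetMetricFamilyKind(family, &kind);
if (err == nullptr && kind == TRITONSERVER_METRIC_KIND_HISTOGRAM) {
  // Metrics in this family support TRITONSERVER_MetricObserve.
}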

/// Create a new metric args object. The caller takes ownership of the
/// TRITONSERVER_MetricArgs object and must call TRITONSERVER_MetricArgsDelete
/// to release the object.
///
/// \param args Returns the new metric args object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricArgsNew(
struct TRITONSERVER_MetricArgs** args);

/// Set metric args with histogram metric parameter.
///
/// \param args The metric args object to set.
/// \param buckets The array of bucket boundaries for the expected range of
/// observed values.
/// \param buckets_count The number of bucket boundaries.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error*
TRITONSERVER_MetricArgsSetHistogram(
struct TRITONSERVER_MetricArgs* args, const double* buckets,
const uint64_t buckets_count);

/// Delete a metric args object.
///
/// \param args The metric args object.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricArgsDelete(
struct TRITONSERVER_MetricArgs* args);
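Taken together, these three calls form the MetricArgs lifecycle. A hedged sketch (editor's illustration; the bucket boundaries are hypothetical values for a latency-style histogram, assumed to be cumulative upper bounds in increasing order, Prometheus-style):

// Sketch: create args, attach histogram bucket boundaries, and delete the
// args once a metric has been created from them.
TRITONSERVER_MetricArgs* args = nullptr;
TRITONSERVER_Error* err = TRITONSERVER_MetricArgsNew(&args);
const double buckets[] = {0.1, 0.5, 1.0, 2.5, 5.0};  // hypothetical bounds
if (err == nullptr) {
  err = TRITONSERVER_MetricArgsSetHistogram(
      args, buckets, sizeof(buckets) / sizeof(buckets[0]));
}
// ... pass `args` to TRITONSERVER_MetricNewWithArgs (sketched below) ...
TRITONSERVER_MetricArgsDelete(args);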

/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
@@ -2661,6 +2701,28 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricNew(
struct TRITONSERVER_MetricFamily* family,
const struct TRITONSERVER_Parameter** labels, const uint64_t label_count);

/// Create a new metric object. The caller takes ownership of the
/// TRITONSERVER_Metric object and must call
/// TRITONSERVER_MetricDelete to release the object. The caller is also
/// responsible for ownership of the labels passed in.
/// Each label can be deleted immediately after creating the metric with
/// TRITONSERVER_ParameterDelete if not re-using the labels.
/// Metric args can be deleted immediately after creating the metric with
/// TRITONSERVER_MetricArgsDelete if not re-using the metric args.
///
/// \param metric Returns the new metric object.
/// \param family The metric family to add this new metric to.
/// \param labels The array of labels to associate with this new metric.
/// \param label_count The number of labels.
/// \param args Metric args that store additional arguments to construct
/// particular metric types, e.g. histogram.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricNewWithArgs(
struct TRITONSERVER_Metric** metric,
struct TRITONSERVER_MetricFamily* family,
const struct TRITONSERVER_Parameter** labels, const uint64_t label_count,
const struct TRITONSERVER_MetricArgs* args);
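Continuing the sketch above, a histogram metric combines a HISTOGRAM-kind family with a MetricArgs object; the family name and description below are hypothetical, and passing no labels (nullptr with count 0) is an assumption modeled on TRITONSERVER_MetricNew:

// Sketch: create a histogram family, then a metric carrying the bucket
// args from the previous example.
TRITONSERVER_MetricFamily* family = nullptr;
TRITONSERVER_MetricFamilyNew(
    &family, TRITONSERVER_METRIC_KIND_HISTOGRAM,
    "example_request_latency_seconds" /* hypothetical name */,
    "Example request latency histogram" /* hypothetical description */);
TRITONSERVER_Metric* metric = nullptr;
TRITONSERVER_MetricNewWithArgs(
    &metric, family, nullptr /* labels */, 0 /* label_count */, args);
TRITONSERVER_MetricArgsDelete(args);  // allowed immediately if not re-used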

/// Delete a metric object.
/// All TRITONSERVER_Metric* objects should be deleted BEFORE their
/// corresponding TRITONSERVER_MetricFamily* objects have been deleted.
@@ -2705,7 +2767,17 @@ TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricIncrement(
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricSet(
struct TRITONSERVER_Metric* metric, double value);

/// Get the TRITONSERVER_MetricKind of metric and its corresponding family.
/// Sample an observation and count it to the appropriate bucket of a metric.
/// Supports metrics of kind TRITONSERVER_METRIC_KIND_HISTOGRAM and returns
/// TRITONSERVER_ERROR_UNSUPPORTED for unsupported TRITONSERVER_MetricKind.
///
/// \param metric The metric object to update.
/// \param value The value to record as an observation for the metric.
/// \return a TRITONSERVER_Error indicating success or failure.
TRITONSERVER_DECLSPEC struct TRITONSERVER_Error* TRITONSERVER_MetricObserve(
struct TRITONSERVER_Metric* metric, double value);
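With the metric from the previous sketch, recording an observation is then a single call; the 0.42 sample value is arbitrary:

// Sketch: sample one observation (e.g. a latency in seconds) into the
// histogram. On a COUNTER or GAUGE metric this returns
// TRITONSERVER_ERROR_UNSUPPORTED per the docs above.
TRITONSERVER_Error* err = TRITONSERVER_MetricObserve(metric, 0.42);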

/// Get the TRITONSERVER_MetricKind of metric of its corresponding family.
///
/// \param metric The metric object to query.
/// \param kind Returns the TRITONSERVER_MetricKind of metric.
12 changes: 6 additions & 6 deletions python/test/test_api.py
@@ -70,7 +70,7 @@
exit_on_error=True,
strict_model_config=False,
model_control_mode=tritonserver.ModelControlMode.EXPLICIT,
exit_timeout=10,
exit_timeout=30,
)


@@ -345,11 +345,6 @@ def test_ready(self):
server = tritonserver.Server(self._server_options).start()
self.assertTrue(server.ready())

@pytest.mark.xfail(
tritonserver.__version__ <= "2.48.0",
reason="Known issue on stop: Exit timeout expired. Exiting immediately",
raises=tritonserver.InternalError,
)
def test_stop(self):
server = tritonserver.Server(self._server_options).start(wait_until_ready=True)

@@ -362,6 +357,11 @@ def test_stop(self):
{
"backend": "python",
"parameters": {"decoupled": {"string_value": "False"}},
# Keep instance count low for fast startup/cleanup.
# Alternatively, KIND_CPU could be used here, but gpus/count are kept explicit.
"instance_group": [
{"kind": "KIND_GPU", "gpus": [0], "count": 1}
],
}
)
},
15 changes: 13 additions & 2 deletions python/tritonserver/_c/tritonserver_pybind.cc
@@ -1,4 +1,4 @@
// Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
@@ -1439,7 +1439,18 @@ class PyServer : public PyWrapper<struct TRITONSERVER_Server> {
return reinterpret_cast<uintptr_t>(this->Ptr());
}

void Stop() const { ThrowIfError(TRITONSERVER_ServerStop(triton_object_)); }
void Stop() const
{
// ServerStop blocks for the duration of the server exit timeout, so be
// sure to release the GIL. This allows request release callbacks to be
// interleaved while the server waits for live requests/models to
// complete. Without releasing the GIL, this function could acquire the
// GIL first and block Triton requests from being released/freed, in turn
// blocking the server's shutdown in a circular manner: the server would
// keep waiting, believing a model is still alive.
py::gil_scoped_release release;
ThrowIfError(TRITONSERVER_ServerStop(triton_object_));
}
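For comparison (an alternative idiom, not what this diff does), pybind11 can express the same GIL release declaratively when the method is bound, via py::call_guard; the binding names below are hypothetical:

// Sketch: release the GIL for the whole duration of every stop() call by
// attaching a call guard at binding time instead of inside the C++ body.
py::class_<PyServer>(m, "Server")
    .def("stop", &PyServer::Stop, py::call_guard<py::gil_scoped_release>());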

void RegisterModelRepository(
const std::string& repository_path,
63 changes: 42 additions & 21 deletions src/infer_request.cc
@@ -1015,6 +1015,17 @@ InferenceRequest::Normalize()
for (auto& pr : original_inputs_) {
auto& input = pr.second;
*input.MutableShape() = input.OriginalShape();

const inference::ModelInput* input_config;
RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
if (input_config->is_shape_tensor()) {
// For a shape tensor, mark that the input is a shape tensor.
input.SetIsShapeTensor();
} else if (input_config->is_non_linear_format_io()) {
// If a tensor uses a non-linear IO format, indicate that the input uses
// a non-linear IO format.
input.SetIsNonLinearFormatIo();
}
}
} else {
// Model does support Triton-style batching so each input tensor
@@ -1024,15 +1035,19 @@
batch_size_ = 0;
for (auto& pr : original_inputs_) {
auto& input = pr.second;
const inference::ModelInput* input_config;
RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));

// For a shape tensor, keep the tensor's shape as it is and mark
// that the input is a shape tensor.
const inference::ModelInput* input_config;
RETURN_IF_ERROR(model_raw_->GetInput(input.Name(), &input_config));
if (input_config->is_shape_tensor()) {
*input.MutableShape() = input.OriginalShape();
input.SetIsShapeTensor(true);
input.SetIsShapeTensor();
continue;
} else if (input_config->is_non_linear_format_io()) {
// If a tensor uses a non-linear IO format, indicate that the input uses
// a non-linear IO format.
input.SetIsNonLinearFormatIo();
}

if (input.OriginalShape().size() == 0) {
@@ -1182,28 +1197,26 @@ InferenceRequest::Normalize()
{
const auto& data_type = input.DType();

// FIXME: Skip byte size validation for TensorRT backend because it breaks
// shape-size assumption. See DLIS-6805 for proper fix for TRT backend
// reformat_free tensors.
bool skip_byte_size_check = false;
constexpr char trt_prefix[] = "tensorrt_";
const std::string& platform = model_raw_->Config().platform();
skip_byte_size_check |= (platform.rfind(trt_prefix) == 0);

if (!skip_byte_size_check) {
// Non-linear IO format input byte size validation will be handled in the
// TensorRT backend.
if (!input.IsNonLinearFormatIo()) {
TRITONSERVER_MemoryType input_memory_type;
// Because Triton expects STRING type to be in special format
// (prepend 4 bytes to specify string length), so need to add all the
// first 4 bytes for each element to find expected byte size
if (data_type == inference::DataType::TYPE_STRING) {
RETURN_IF_ERROR(
ValidateBytesInputs(input_id, input, &input_memory_type));

// FIXME: Temporarily skips byte size checks for GPU tensors. See
// DLIS-6820.
skip_byte_size_check |=
(input_memory_type == TRITONSERVER_MEMORY_GPU);
} else {
const auto& input_dims = input.ShapeWithBatchDim();
// Shape tensor with dynamic batching does not introduce a new
// dimension to the tensor but adds an additional value to the 1-D
// array.
const std::vector<int64_t>& input_dims =
input.IsShapeTensor() ? input.OriginalShape()
: input.ShapeWithBatchDim();
int64_t expected_byte_size = INT_MAX;
expected_byte_size =
triton::common::GetByteSize(data_type, input_dims);
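A worked illustration (editor's note, with assumed values) of why the shape-tensor branch validates against the original shape:

// Sketch: a TYPE_INT32 shape tensor whose model config declares dims [3].
// With dynamic batching the batch size is carried as one extra element of
// the same 1-D array, so the request arrives with OriginalShape() == {4}:
//   GetByteSize(TYPE_INT32, {4}) -> 4 * 4 = 16 bytes   (what is expected)
// Using ShapeWithBatchDim() would instead imply a batched 2-D tensor and
// compute the wrong expected byte size.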
@@ -1506,7 +1519,7 @@ InferenceRequest::ReportStatisticsCacheHit(MetricModelReporter* metric_reporter)
// Input
//
InferenceRequest::Input::Input()
: is_shape_tensor_(false), data_(new MemoryReference),
: tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
has_host_policy_specific_data_(false)
{
}
@@ -1515,16 +1528,17 @@ InferenceRequest::Input::Input(
const std::string& name, const inference::DataType datatype,
const int64_t* shape, const uint64_t dim_count)
: name_(name), datatype_(datatype),
original_shape_(shape, shape + dim_count), is_shape_tensor_(false),
data_(new MemoryReference), has_host_policy_specific_data_(false)
original_shape_(shape, shape + dim_count),
tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
has_host_policy_specific_data_(false)
{
}

InferenceRequest::Input::Input(
const std::string& name, const inference::DataType datatype,
const std::vector<int64_t>& shape)
: name_(name), datatype_(datatype), original_shape_(shape),
is_shape_tensor_(false), data_(new MemoryReference),
tensor_type_(TensorType::TENSOR), data_(new MemoryReference),
has_host_policy_specific_data_(false)
{
}
Expand All @@ -1540,9 +1554,16 @@ InferenceRequest::Input::SetMetadata(
}

Status
InferenceRequest::Input::SetIsShapeTensor(const bool is_shape_tensor)
InferenceRequest::Input::SetIsShapeTensor()
{
tensor_type_ = TensorType::SHAPE_TENSOR;
return Status::Success;
}

Status
InferenceRequest::Input::SetIsNonLinearFormatIo()
{
is_shape_tensor_ = is_shape_tensor;
tensor_type_ = TensorType::NON_LINEAR;
return Status::Success;
}

20 changes: 17 additions & 3 deletions src/infer_request.h
@@ -82,6 +82,8 @@ class InferenceRequest {
// Input tensor
class Input {
public:
enum class TensorType { TENSOR, SHAPE_TENSOR, NON_LINEAR };

Input();
Input(
const std::string& name, const inference::DataType datatype,
@@ -134,10 +136,22 @@
}

// Whether or not the input is a tensorrt shape tensor
bool IsShapeTensor() const { return is_shape_tensor_; }
bool IsShapeTensor() const
{
return tensor_type_ == TensorType::SHAPE_TENSOR;
}

// Specifies whether the input uses a non-linear IO format
bool IsNonLinearFormatIo() const
{
return tensor_type_ == TensorType::NON_LINEAR;
}

// Set the input to be treated as a shape tensor.
Status SetIsShapeTensor(const bool is_shape_tensor);
Status SetIsShapeTensor();

// Mark that the input uses a non-linear IO format
Status SetIsNonLinearFormatIo();
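Because the setters now drive a single three-state enum rather than an independent boolean, the two special formats are mutually exclusive by construction; a small sketch of hypothetical calling code:

// Sketch: the last setter wins and the other predicate reports false.
InferenceRequest::Input input;
input.SetIsNonLinearFormatIo();
assert(input.IsNonLinearFormatIo());
assert(!input.IsShapeTensor());  // NON_LINEAR, not SHAPE_TENSOR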

// The data for this input.
const std::shared_ptr<Memory>& Data() const { return data_; }
@@ -240,7 +254,7 @@ class InferenceRequest {
std::vector<int64_t> original_shape_;
std::vector<int64_t> shape_;
std::vector<int64_t> shape_with_batch_dim_;
bool is_shape_tensor_;
TensorType tensor_type_;
std::shared_ptr<Memory> data_;

bool has_host_policy_specific_data_;
(Diffs for the remaining changed files are not shown.)
