From e86117e09d6b9087e3f5e5777d95859314c277b9 Mon Sep 17 00:00:00 2001 From: John Cortell <109228283+jcortell68@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:34:42 -0400 Subject: [PATCH] Error out of counter splitting if group max is zero (#70) * Error out of counter splitting if group max is zero A counter group having a counter max of zero is invalid and will ultimately result in a hang when we try to split counters into multiple passes. This is one of various scenarios that result in a hang during counter splitting; see https://github.com/GPUOpen-Tools/gpu_performance_api/issues/69 This fixes only that specific scenario. We now check that the group max isn't zero, and if it is, we give up trying to split a public counter's HW counters into multiple passes. We log an error, too. Again, this isn't a comprehensive fix for issue 69. There could be other cases of bad data that result in a hang. Issue 69 should be fixed with a pass cap limit to cover all cases. But this commit still adds value in that it flags the specific invalid GPU counter metadata in addition to avoiding the hang. Change-Id: I56d7d2043ba92c1b6088f0fdd68f5ec844e7b823 --- include/gpu_performance_api/gpu_perf_api_types.h | 3 ++- source/gpu_perf_api_common/gpu_perf_api.cc | 3 ++- .../gpa_counter_scheduler_base.cc | 16 ++++++++++++++-- .../gpa_split_counters_interfaces.h | 7 ++++++- 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/gpu_performance_api/gpu_perf_api_types.h b/include/gpu_performance_api/gpu_perf_api_types.h index 44a9c052..e67eea10 100644 --- a/include/gpu_performance_api/gpu_perf_api_types.h +++ b/include/gpu_performance_api/gpu_perf_api_types.h @@ -170,7 +170,8 @@ typedef enum kGpaStatusErrorLibAlreadyLoaded = -41, kGpaStatusErrorOtherSessionActive = -42, kGpaStatusErrorException = -43, - kGpaStatusMin = kGpaStatusErrorException, + kGpaStatusErrorInvalidCounterGroupData = -44, + kGpaStatusMin = kGpaStatusErrorInvalidCounterGroupData, kGpaStatusInternal = 256, ///< Status codes used internally within GPUPerfAPI. } GpaStatus; diff --git a/source/gpu_perf_api_common/gpu_perf_api.cc b/source/gpu_perf_api_common/gpu_perf_api.cc index 665b5aa7..9a228ada 100644 --- a/source/gpu_perf_api_common/gpu_perf_api.cc +++ b/source/gpu_perf_api_common/gpu_perf_api.cc @@ -1659,7 +1659,8 @@ static const char* kErrorString[] = { GPA_ENUM_STRING_VAL(kGpaStatusErrorTimeout, "GPA Error: Attempt to Retrieve Data Failed due to Timeout."), GPA_ENUM_STRING_VAL(kGpaStatusErrorLibAlreadyLoaded, "GPA Error: Library Is Already Loaded."), GPA_ENUM_STRING_VAL(kGpaStatusErrorOtherSessionActive, "GPA Error: Other Session Is Active."), - GPA_ENUM_STRING_VAL(kGpaStatusErrorException, "GPA Error: Exception Occurred.")}; + GPA_ENUM_STRING_VAL(kGpaStatusErrorException, "GPA Error: Exception Occurred."), + GPA_ENUM_STRING_VAL(kGpaStatusErrorInvalidCounterGroupData, "GPA Error: Counter group data is invalid.")}; /// Size of kErrorString array. static size_t kErrorStringSize = sizeof(kErrorString) / sizeof(const char*); diff --git a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc index f022e590..73f6516f 100644 --- a/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc +++ b/source/gpu_perf_api_counter_generator/gpa_counter_scheduler_base.cc @@ -273,13 +273,25 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_ // Add the HW groups max's. for (unsigned int i = 0; i < hw_counters->internal_counter_groups_.size(); ++i) { - max_counters_per_group.push_back(hw_counters->internal_counter_groups_[i].max_active_discrete_counters); + auto count = hw_counters->internal_counter_groups_[i].max_active_discrete_counters; + if (count == 0) + { + GPA_LOG_DEBUG_ERROR("Hardware counter group '%s' has zero for max-counters-per-group.", hw_counters->internal_counter_groups_[i].name); + return kGpaStatusErrorInvalidCounterGroupData; + } + max_counters_per_group.push_back(count); } // Add the Additional groups max's. for (unsigned int i = 0; i < hw_counters->additional_group_count_; ++i) { - max_counters_per_group.push_back(hw_counters->additional_groups_[i].max_active_discrete_counters); + auto count = hw_counters->additional_groups_[i].max_active_discrete_counters; + if (count == 0) + { + GPA_LOG_DEBUG_ERROR("Hardware counter additional group '%s' has zero for max-counters-per-group.", hw_counters->additional_groups_[i].name); + return kGpaStatusErrorInvalidCounterGroupData; + } + max_counters_per_group.push_back(count); } GpaCounterGroupAccessor accessor(hw_counters->internal_counter_groups_, diff --git a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h index 875497db..cbd87e1f 100644 --- a/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h +++ b/source/gpu_perf_api_counter_generator/gpa_split_counters_interfaces.h @@ -16,10 +16,10 @@ #ifdef DEBUG_PUBLIC_COUNTER_SPLITTER #include -#include "gpu_perf_api_common/logging.h" #endif #include "gpu_perf_api_counter_generator/gpa_derived_counter.h" +#include "gpu_perf_api_common/logging.h" /// @brief Enum to represent the different SQ shader stages. enum GpaSqShaderStage @@ -378,6 +378,11 @@ class IGpaSplitCounters } unsigned int group_limit = max_counters_per_group[group_index]; + if (group_limit == 0) + { + GPA_LOG_DEBUG_ERROR("Group(%d) counter limit is zero.", group_index); + return false; + } // This should never occur. It indicates the counter relies on a block without any collectible events. assert(group_limit > 0);