Skip to content

Commit

Permalink
Error out of counter splitting if group max is zero (#70)
Browse files Browse the repository at this point in the history
* Error out of counter splitting if group max is zero

A counter group having a counter max of zero is invalid and
will ultimately result in a hang when we try to split counters
into multiple passes. This is one of various scenarios that
result in a hang during counter splitting; see

 #69

This fixes only that specific scenario. We now check that the
group max isn't zero, and if it is, we give up trying to split
a public counter's HW counters into multiple passes. We log
an error, too.

Again, this isn't a comprehensive fix for issue 69. There could
be other cases of bad data that result in a hang. Issue 69 should
be fixed with a pass cap limit to cover all cases. But this
commit still adds value in that it flags the specific invalid GPU
counter metadata in addition to avoiding the hang.

Change-Id: I56d7d2043ba92c1b6088f0fdd68f5ec844e7b823
  • Loading branch information
jcortell68 authored Jul 1, 2024
1 parent 25eb330 commit e86117e
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 5 deletions.
3 changes: 2 additions & 1 deletion include/gpu_performance_api/gpu_perf_api_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,8 @@ typedef enum
kGpaStatusErrorLibAlreadyLoaded = -41,
kGpaStatusErrorOtherSessionActive = -42,
kGpaStatusErrorException = -43,
kGpaStatusMin = kGpaStatusErrorException,
kGpaStatusErrorInvalidCounterGroupData = -44,
kGpaStatusMin = kGpaStatusErrorInvalidCounterGroupData,
kGpaStatusInternal = 256, ///< Status codes used internally within GPUPerfAPI.
} GpaStatus;

Expand Down
3 changes: 2 additions & 1 deletion source/gpu_perf_api_common/gpu_perf_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1659,7 +1659,8 @@ static const char* kErrorString[] = {
GPA_ENUM_STRING_VAL(kGpaStatusErrorTimeout, "GPA Error: Attempt to Retrieve Data Failed due to Timeout."),
GPA_ENUM_STRING_VAL(kGpaStatusErrorLibAlreadyLoaded, "GPA Error: Library Is Already Loaded."),
GPA_ENUM_STRING_VAL(kGpaStatusErrorOtherSessionActive, "GPA Error: Other Session Is Active."),
GPA_ENUM_STRING_VAL(kGpaStatusErrorException, "GPA Error: Exception Occurred.")};
GPA_ENUM_STRING_VAL(kGpaStatusErrorException, "GPA Error: Exception Occurred."),
GPA_ENUM_STRING_VAL(kGpaStatusErrorInvalidCounterGroupData, "GPA Error: Counter group data is invalid.")};

/// Size of kErrorString array.
static size_t kErrorStringSize = sizeof(kErrorString) / sizeof(const char*);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,13 +273,25 @@ GpaStatus GpaCounterSchedulerBase::GetNumRequiredPasses(GpaUInt32* num_required_
// Add the HW groups max's.
for (unsigned int i = 0; i < hw_counters->internal_counter_groups_.size(); ++i)
{
max_counters_per_group.push_back(hw_counters->internal_counter_groups_[i].max_active_discrete_counters);
auto count = hw_counters->internal_counter_groups_[i].max_active_discrete_counters;
if (count == 0)
{
GPA_LOG_DEBUG_ERROR("Hardware counter group '%s' has zero for max-counters-per-group.", hw_counters->internal_counter_groups_[i].name);
return kGpaStatusErrorInvalidCounterGroupData;
}
max_counters_per_group.push_back(count);
}

// Add the Additional groups max's.
for (unsigned int i = 0; i < hw_counters->additional_group_count_; ++i)
{
max_counters_per_group.push_back(hw_counters->additional_groups_[i].max_active_discrete_counters);
auto count = hw_counters->additional_groups_[i].max_active_discrete_counters;
if (count == 0)
{
GPA_LOG_DEBUG_ERROR("Hardware counter additional group '%s' has zero for max-counters-per-group.", hw_counters->additional_groups_[i].name);
return kGpaStatusErrorInvalidCounterGroupData;
}
max_counters_per_group.push_back(count);
}

GpaCounterGroupAccessor accessor(hw_counters->internal_counter_groups_,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@

#ifdef DEBUG_PUBLIC_COUNTER_SPLITTER
#include <sstream>
#include "gpu_perf_api_common/logging.h"
#endif

#include "gpu_perf_api_counter_generator/gpa_derived_counter.h"
#include "gpu_perf_api_common/logging.h"

/// @brief Enum to represent the different SQ shader stages.
enum GpaSqShaderStage
Expand Down Expand Up @@ -378,6 +378,11 @@ class IGpaSplitCounters
}

unsigned int group_limit = max_counters_per_group[group_index];
if (group_limit == 0)
{
GPA_LOG_DEBUG_ERROR("Group(%d) counter limit is zero.", group_index);
return false;
}

// This should never occur. It indicates the counter relies on a block without any collectible events.
assert(group_limit > 0);
Expand Down

0 comments on commit e86117e

Please sign in to comment.