Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[HIP] Implement coarse-grained memory advice for the HIP adapter #1249

Merged
5 changes: 4 additions & 1 deletion include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -3235,13 +3235,16 @@ typedef enum ur_usm_advice_flag_t {
UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST = UR_BIT(12), ///< Removes the affect of ::UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_HOST
UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST = UR_BIT(13), ///< Hint that the preferred memory location is the host
UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST = UR_BIT(14), ///< Removes the affect of ::UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST
UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY = UR_BIT(15), ///< Hint that memory coherence will be coarse-grained (up-to-date only at
///< kernel boundaries)
UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY = UR_BIT(16), ///< Removes the affect of ::UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY
/// @cond
UR_USM_ADVICE_FLAG_FORCE_UINT32 = 0x7fffffff
/// @endcond

} ur_usm_advice_flag_t;
/// @brief Bit Mask for validating ur_usm_advice_flags_t
#define UR_USM_ADVICE_FLAGS_MASK 0xffff8000
#define UR_USM_ADVICE_FLAGS_MASK 0xfffe0000
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This now has fewer bits set. Is this right? If so that's a somewhat unusual definition of "mask"

Copy link
Contributor Author

@GeorgeWeb GeorgeWeb Jan 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is also generated from the source generation scripts and initially threw me off too, but I think I got it now though. It is actually the inverse mask that UR uses for validation.


///////////////////////////////////////////////////////////////////////////////
/// @brief Handle of USM pool
Expand Down
26 changes: 26 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6277,6 +6277,12 @@ inline std::ostream &operator<<(std::ostream &os, ur_usm_advice_flag_t value) {
case UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST:
os << "UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST";
break;
case UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY:
os << "UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY";
break;
case UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY:
os << "UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY";
break;
default:
os << "unknown enumerator";
break;
Expand Down Expand Up @@ -6441,6 +6447,26 @@ inline ur_result_t printFlag<ur_usm_advice_flag_t>(std::ostream &os, uint32_t fl
}
os << UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION_HOST;
}

if ((val & UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY) == (uint32_t)UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know you're just copying from the surrounding style here, but if you're doing a bitwise AND of a single bit flag you only need to check against zero. Thus, the equality comparison and uint32_t cast is superfluous and just serves to make the line longer. Not going to block this patch, but it could do with cleaning up later, maybe

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, that could be brought up to UR because I think it was the generate cmake target that produced the code.

val ^= (uint32_t)UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY;
if (!first) {
os << " | ";
} else {
first = false;
}
os << UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY;
}

if ((val & UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY) == (uint32_t)UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY) {
val ^= (uint32_t)UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY;
if (!first) {
os << " | ";
} else {
first = false;
}
os << UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY;
}
if (val != 0) {
std::bitset<32> bits(val);
if (!first) {
Expand Down
6 changes: 6 additions & 0 deletions scripts/core/usm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,12 @@ etors:
- name: CLEAR_PREFERRED_LOCATION_HOST
value: "$X_BIT(14)"
desc: "Removes the affect of $X_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION_HOST"
- name: SET_NON_COHERENT_MEMORY
value: "$X_BIT(15)"
desc: "Hint that memory coherence will be coarse-grained (up-to-date only at kernel boundaries)"
- name: CLEAR_NON_COHERENT_MEMORY
value: "$X_BIT(16)"
desc: "Removes the affect of $X_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY"
GeorgeWeb marked this conversation as resolved.
Show resolved Hide resolved
--- #--------------------------------------------------------------------------
type: handle
desc: "Handle of USM pool"
Expand Down
7 changes: 5 additions & 2 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,13 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
}
}

std::array<ur_usm_advice_flags_t, 4> UnmappedMemAdviceFlags = {
std::array<ur_usm_advice_flags_t, 6> UnmappedMemAdviceFlags = {
UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY,
UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY,
UR_USM_ADVICE_FLAG_BIAS_CACHED, UR_USM_ADVICE_FLAG_BIAS_UNCACHED};
UR_USM_ADVICE_FLAG_BIAS_CACHED,
UR_USM_ADVICE_FLAG_BIAS_UNCACHED,
UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY,
UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY};

for (auto &UnmappedFlag : UnmappedMemAdviceFlags) {
if (URAdviceFlags & UnmappedFlag) {
Expand Down
65 changes: 42 additions & 23 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,30 +92,46 @@ ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
if (URAdviceFlags &
(UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY |
UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY |
UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED)) {
UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED
#if !defined(__HIP_PLATFORM_AMD__)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth it to add ifdef conditions for this? I thought that we don't really support HIP for Nvidia devices.

Copy link
Contributor Author

@GeorgeWeb GeorgeWeb Jan 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't actively support it but until we have completely dropped it officially there are a bunch of other entry points in the hip adapter that maintain codepaths for hip-nv platforms. Either way, I do not disagree to removing it and simplifying this function a bit this way.

| UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY |
UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY
#endif
)) {
return UR_RESULT_ERROR_INVALID_ENUMERATION;
}

using ur_to_hip_advice_t = std::pair<ur_usm_advice_flags_t, hipMemoryAdvise>;

static constexpr std::array<ur_to_hip_advice_t, 6>
URToHIPMemAdviseDeviceFlags{
std::make_pair(UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
hipMemAdviseSetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
hipMemAdviseUnsetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
hipMemAdviseSetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
hipMemAdviseUnsetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
hipMemAdviseSetAccessedBy),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
hipMemAdviseUnsetAccessedBy),
};
for (auto &FlagPair : URToHIPMemAdviseDeviceFlags) {
if (URAdviceFlags & FlagPair.first) {
UR_CHECK_ERROR(hipMemAdvise(DevPtr, Size, FlagPair.second, Device));
#if defined(__HIP_PLATFORM_AMD__)
constexpr size_t DeviceFlagCount = 8;
#else
constexpr size_t DeviceFlagCount = 6;
#endif
static constexpr std::array<ur_to_hip_advice_t, DeviceFlagCount>
URToHIPMemAdviseDeviceFlags {
std::make_pair(UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
GeorgeWeb marked this conversation as resolved.
Show resolved Hide resolved
hipMemAdviseSetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
hipMemAdviseUnsetReadMostly),
std::make_pair(UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
hipMemAdviseSetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
hipMemAdviseUnsetPreferredLocation),
std::make_pair(UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
hipMemAdviseSetAccessedBy),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
hipMemAdviseUnsetAccessedBy),
#if defined(__HIP_PLATFORM_AMD__)
std::make_pair(UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY,
hipMemAdviseSetCoarseGrain),
std::make_pair(UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY,
hipMemAdviseUnsetCoarseGrain),
#endif
};
for (const auto &[URAdvice, HIPAdvice] : URToHIPMemAdviseDeviceFlags) {
if (URAdviceFlags & URAdvice) {
UR_CHECK_ERROR(hipMemAdvise(DevPtr, Size, HIPAdvice, Device));
ldrumm marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand All @@ -130,10 +146,9 @@ ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
hipMemAdviseUnsetAccessedBy),
};

for (auto &FlagPair : URToHIPMemAdviseHostFlags) {
if (URAdviceFlags & FlagPair.first) {
UR_CHECK_ERROR(
hipMemAdvise(DevPtr, Size, FlagPair.second, hipCpuDeviceId));
for (const auto &[URAdvice, HIPAdvice] : URToHIPMemAdviseHostFlags) {
if (URAdviceFlags & URAdvice) {
UR_CHECK_ERROR(hipMemAdvise(DevPtr, Size, HIPAdvice, hipCpuDeviceId));
GeorgeWeb marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down Expand Up @@ -1615,6 +1630,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
pMem, size, hipMemAdviseUnsetPreferredLocation, DeviceID));
UR_CHECK_ERROR(
hipMemAdvise(pMem, size, hipMemAdviseUnsetAccessedBy, DeviceID));
#if defined(__HIP_PLATFORM_AMD__)
UR_CHECK_ERROR(
hipMemAdvise(pMem, size, hipMemAdviseUnsetCoarseGrain, DeviceID));
#endif
} else {
Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID);
// UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but
Expand Down
8 changes: 8 additions & 0 deletions test/conformance/enqueue/urEnqueueUSMAdvise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,11 @@ TEST_P(urEnqueueUSMAdviseTest, InvalidSizeTooLarge) {
urEnqueueUSMAdvise(queue, ptr, allocation_size * 2,
UR_USM_ADVICE_FLAG_DEFAULT, nullptr));
}

TEST_P(urEnqueueUSMAdviseTest, NonCoherentDeviceMemorySuccessOrWarning) {
ur_result_t result =
urEnqueueUSMAdvise(queue, ptr, allocation_size,
UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY, nullptr);
ASSERT_EQ(result,
result & (UR_RESULT_SUCCESS | UR_RESULT_ERROR_ADAPTER_SPECIFIC));
}