Skip to content

Commit

Permalink
Implemented workaround for hipMemset2D
Browse files Browse the repository at this point in the history
  • Loading branch information
konradkusiak97 committed Mar 1, 2024
1 parent 8499b57 commit 7a05c32
Showing 1 changed file with 56 additions and 17 deletions.
73 changes: 56 additions & 17 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -770,29 +770,68 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {
// Calculate the number of patterns, stride, number of times the pattern
// needs to be applied, and the number of times the first 32 bit pattern
// needs to be applied.
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
auto Pitch = NumberOfSteps * sizeof(uint8_t);
auto Height = Size / NumberOfSteps;
auto Count32 = Size / sizeof(uint32_t);

// Get 4-byte chunk of the pattern and call hipMemsetD32Async
auto Count32 = Size / sizeof(uint32_t);
auto Value = *(static_cast<const uint32_t *>(pPattern));
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
for (auto step = 4u; step < NumberOfSteps; ++step) {
// take 1 byte of the pattern
Value = *(static_cast<const uint8_t *>(pPattern) + step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) +
(step * sizeof(uint8_t)));
auto memsetRemainPattern = [&Stream, &pPattern,
&Ptr](const auto Size, const auto PatternSize) {
// Calculate the number of patterns, stride and the number of times the
// pattern needs to be applied.
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
auto Pitch = NumberOfSteps * sizeof(uint8_t);
auto Height = Size / NumberOfSteps;

// set all of the pattern chunks
UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
Height, Stream));
}
for (auto step = 4u; step < NumberOfSteps; ++step) {
// take 1 byte of the pattern
auto Value = *(static_cast<const uint8_t *>(pPattern) + step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = reinterpret_cast<void *>(
reinterpret_cast<uint8_t *>(Ptr) + (step * sizeof(uint8_t)));

// set all of the pattern chunks
UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
Height, Stream));
}
};
// There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D and
// hipMemset3D to behave incorrectly when acting on host pinned memory. In
// such a case the following part of memsetting the remaining part of the
// pattern is emulated with memcpy.
#if HIP_VERSION < 60000000
hipPointerAttribute_t ptrAttribs{};
UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr));

const bool ptrIsHost{ptrAttribs.memoryType == hipMemoryTypeHost};

// The memoryType member of ptrAttrbis is set to hipMemoryTypeHost for both
// hipHostMalloc and (incorrectly) for hipMallocManaged. So to make sure that
// the Ptr is corresponding to host pinned memory we need to additionally use
// a boolean member of ptrAttribs: isManaged.
if (ptrIsHost && !ptrAttribs.isManaged) {
const auto NumOfCopySteps = Size / PatternSize;
const auto Offset = sizeof(uint32_t);
const auto LeftPatternSize = PatternSize - Offset;
const auto OffsetPatternPtr = reinterpret_cast<const void *>(
reinterpret_cast<const uint8_t *>(pPattern) + Offset);

// Loop through the memory area to memset, advancing each time by the
// PatternSize and memcpy the left over pattern bits.
for (uint32_t i = 0; i < NumOfCopySteps; ++i) {
auto OffsetDstPtr = reinterpret_cast<void *>(
reinterpret_cast<uint8_t *>(Ptr) + Offset + i * PatternSize);
UR_CHECK_ERROR(hipMemcpyAsync(OffsetDstPtr, OffsetPatternPtr,
LeftPatternSize, hipMemcpyHostToHost,
Stream));
}
} else
memsetRemainPattern(Size, PatternSize);
#else
memsetRemainPattern(Size, PatternSize);
#endif
return UR_RESULT_SUCCESS;
}

Expand Down

0 comments on commit 7a05c32

Please sign in to comment.