From 7a05c32dcd5efe4294f724d8f8303a44cbcf8ab8 Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 28 Feb 2024 12:55:40 -0500 Subject: [PATCH] Implemented workaround for hipMemset2D --- source/adapters/hip/enqueue.cpp | 73 +++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 24ba905688..d331c0891d 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -770,29 +770,68 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, size_t Size, const void *pPattern, hipDeviceptr_t Ptr) { - // Calculate the number of patterns, stride, number of times the pattern - // needs to be applied, and the number of times the first 32 bit pattern - // needs to be applied. - auto NumberOfSteps = PatternSize / sizeof(uint8_t); - auto Pitch = NumberOfSteps * sizeof(uint8_t); - auto Height = Size / NumberOfSteps; - auto Count32 = Size / sizeof(uint32_t); // Get 4-byte chunk of the pattern and call hipMemsetD32Async + auto Count32 = Size / sizeof(uint32_t); auto Value = *(static_cast(pPattern)); UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream)); - for (auto step = 4u; step < NumberOfSteps; ++step) { - // take 1 byte of the pattern - Value = *(static_cast(pPattern) + step); - // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = reinterpret_cast(reinterpret_cast(Ptr) + - (step * sizeof(uint8_t))); + auto memsetRemainPattern = [&Stream, &pPattern, + &Ptr](const auto Size, const auto PatternSize) { + // Calculate the number of patterns, stride and the number of times the + // pattern needs to be applied. + auto NumberOfSteps = PatternSize / sizeof(uint8_t); + auto Pitch = NumberOfSteps * sizeof(uint8_t); + auto Height = Size / NumberOfSteps; - // set all of the pattern chunks - UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t), - Height, Stream)); - } + for (auto step = 4u; step < NumberOfSteps; ++step) { + // take 1 byte of the pattern + auto Value = *(static_cast(pPattern) + step); + + // offset the pointer to the part of the buffer we want to write to + auto OffsetPtr = reinterpret_cast( + reinterpret_cast(Ptr) + (step * sizeof(uint8_t))); + + // set all of the pattern chunks + UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t), + Height, Stream)); + } + }; + // There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D and + // hipMemset3D to behave incorrectly when acting on host pinned memory. In + // such a case the following part of memsetting the remaining part of the + // pattern is emulated with memcpy. +#if HIP_VERSION < 60000000 + hipPointerAttribute_t ptrAttribs{}; + UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr)); + + const bool ptrIsHost{ptrAttribs.memoryType == hipMemoryTypeHost}; + + // The memoryType member of ptrAttrbis is set to hipMemoryTypeHost for both + // hipHostMalloc and (incorrectly) for hipMallocManaged. So to make sure that + // the Ptr is corresponding to host pinned memory we need to additionally use + // a boolean member of ptrAttribs: isManaged. + if (ptrIsHost && !ptrAttribs.isManaged) { + const auto NumOfCopySteps = Size / PatternSize; + const auto Offset = sizeof(uint32_t); + const auto LeftPatternSize = PatternSize - Offset; + const auto OffsetPatternPtr = reinterpret_cast( + reinterpret_cast(pPattern) + Offset); + + // Loop through the memory area to memset, advancing each time by the + // PatternSize and memcpy the left over pattern bits. + for (uint32_t i = 0; i < NumOfCopySteps; ++i) { + auto OffsetDstPtr = reinterpret_cast( + reinterpret_cast(Ptr) + Offset + i * PatternSize); + UR_CHECK_ERROR(hipMemcpyAsync(OffsetDstPtr, OffsetPatternPtr, + LeftPatternSize, hipMemcpyHostToHost, + Stream)); + } + } else + memsetRemainPattern(Size, PatternSize); +#else + memsetRemainPattern(Size, PatternSize); +#endif return UR_RESULT_SUCCESS; }