From a513afb2256b62110ba272e808d27c2b38351aa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Tue, 21 Nov 2023 15:58:30 +0000 Subject: [PATCH] [CUDA] Fix synchronization issue in urEnqueueMemImageCopy For 1D images, urEnqueueMemImageCopy was using cuMemcpyAtoA, which does not have an asynchronous version. This means that, when the MemCpy happens between two arrays in device memory, the call will be asynchronous and might complete after the event returned by urEnqueueMemImageCopy finishes. This commit fixes the issue by using cuMemcpy2DAsync to copy 1D images by setting the height to 1. --- source/adapters/cuda/enqueue.cpp | 34 ++++++++++++++------------------ 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index c752c3fd14..7d3758e820 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -862,7 +862,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { } } -/// General ND memory copy operation for images (where N > 1). +/// General ND memory copy operation for images. /// This function requires the corresponding CUDA context to be at the top of /// the context stack /// If the source and/or destination is an array, SrcPtr and/or DstPtr @@ -877,14 +877,14 @@ static ur_result_t commonEnqueueMemImageNDCopy( UR_ASSERT(DstType == CU_MEMORYTYPE_ARRAY || DstType == CU_MEMORYTYPE_HOST, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (ImgType == UR_MEM_TYPE_IMAGE2D) { + if (ImgType == UR_MEM_TYPE_IMAGE1D || ImgType == UR_MEM_TYPE_IMAGE2D) { CUDA_MEMCPY2D CpyDesc; memset(&CpyDesc, 0, sizeof(CpyDesc)); CpyDesc.srcMemoryType = SrcType; if (SrcType == CU_MEMORYTYPE_ARRAY) { CpyDesc.srcArray = *static_cast(SrcPtr); CpyDesc.srcXInBytes = SrcOffset.x; - CpyDesc.srcY = SrcOffset.y; + CpyDesc.srcY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 
0 : SrcOffset.y; } else { CpyDesc.srcHost = SrcPtr; } @@ -892,12 +892,12 @@ static ur_result_t commonEnqueueMemImageNDCopy( if (DstType == CU_MEMORYTYPE_ARRAY) { CpyDesc.dstArray = *static_cast(DstPtr); CpyDesc.dstXInBytes = DstOffset.x; - CpyDesc.dstY = DstOffset.y; + CpyDesc.dstY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 0 : DstOffset.y; } else { CpyDesc.dstHost = DstPtr; } CpyDesc.WidthInBytes = Region.width; - CpyDesc.Height = Region.height; + CpyDesc.Height = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 1 : Region.height; UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, CuStream)); return UR_RESULT_SUCCESS; } @@ -1124,21 +1124,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( } ur_mem_type_t ImgType = std::get(hImageSrc->Mem).getImageType(); - if (ImgType == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray, - SrcByteOffsetX, BytesToCopy)); - } else { - ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, - region.depth}; - ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; - ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; - Result = commonEnqueueMemImageNDCopy( - CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY, - SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, + region.depth}; + ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; + ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; + + Result = commonEnqueueMemImageNDCopy( + CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY, + SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset); + if (Result != UR_RESULT_SUCCESS) { + return Result; } if (phEvent) {