Skip to content

Commit

Permalink
[CUDA] Fix synchronization issue in urEnqueueMemImageCopy
Browse files Browse the repository at this point in the history
For 1D images, urEnqueueMemImageCopy was using cuMemcpyAtoA, which has no
stream-based (Async) variant. This means that, when the copy happens
between two arrays in device memory, the call returns without waiting for
the copy to finish, so the copy might complete after the event returned by
urEnqueueMemImageCopy is reported as finished.

This commit fixes the issue by using cuMemcpy2DAsync to copy 1D images,
with the copy height set to 1.
  • Loading branch information
fabiomestre committed Nov 21, 2023
1 parent ce152a6 commit b414b50
Showing 1 changed file with 15 additions and 19 deletions.
34 changes: 15 additions & 19 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -862,7 +862,7 @@ static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) {
}
}

/// General ND memory copy operation for images (where N > 1).
/// General ND memory copy operation for images.
/// This function requires the corresponding CUDA context to be at the top of
/// the context stack
/// If the source and/or destination is an array, SrcPtr and/or DstPtr
Expand All @@ -877,27 +877,27 @@ static ur_result_t commonEnqueueMemImageNDCopy(
UR_ASSERT(DstType == CU_MEMORYTYPE_ARRAY || DstType == CU_MEMORYTYPE_HOST,
UR_RESULT_ERROR_INVALID_MEM_OBJECT);

if (ImgType == UR_MEM_TYPE_IMAGE2D) {
if (ImgType == UR_MEM_TYPE_IMAGE1D || ImgType == UR_MEM_TYPE_IMAGE2D) {
CUDA_MEMCPY2D CpyDesc;
memset(&CpyDesc, 0, sizeof(CpyDesc));
CpyDesc.srcMemoryType = SrcType;
if (SrcType == CU_MEMORYTYPE_ARRAY) {
CpyDesc.srcArray = *static_cast<const CUarray *>(SrcPtr);
CpyDesc.srcXInBytes = SrcOffset.x;
CpyDesc.srcY = SrcOffset.y;
CpyDesc.srcY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 0 : SrcOffset.y;
} else {
CpyDesc.srcHost = SrcPtr;
}
CpyDesc.dstMemoryType = DstType;
if (DstType == CU_MEMORYTYPE_ARRAY) {
CpyDesc.dstArray = *static_cast<CUarray *>(DstPtr);
CpyDesc.dstXInBytes = DstOffset.x;
CpyDesc.dstY = DstOffset.y;
CpyDesc.dstY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 0 : DstOffset.y;
} else {
CpyDesc.dstHost = DstPtr;
}
CpyDesc.WidthInBytes = Region.width;
CpyDesc.Height = Region.height;
CpyDesc.Height = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 1 : Region.height;
UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc, CuStream));
return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -1124,21 +1124,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
}

ur_mem_type_t ImgType = std::get<SurfaceMem>(hImageSrc->Mem).getImageType();
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
UR_CHECK_ERROR(cuMemcpyAtoA(DstArray, DstByteOffsetX, SrcArray,
SrcByteOffsetX, BytesToCopy));
} else {
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height,
region.depth};
ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z};
ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z};

Result = commonEnqueueMemImageNDCopy(
CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY,
SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset);
if (Result != UR_RESULT_SUCCESS) {
return Result;
}
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height,
region.depth};
ur_rect_offset_t SrcOffset = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z};
ur_rect_offset_t DstOffset = {DstByteOffsetX, dstOrigin.y, dstOrigin.z};

Result = commonEnqueueMemImageNDCopy(
CuStream, ImgType, AdjustedRegion, &SrcArray, CU_MEMORYTYPE_ARRAY,
SrcOffset, &DstArray, CU_MEMORYTYPE_ARRAY, DstOffset);
if (Result != UR_RESULT_SUCCESS) {
return Result;
}

if (phEvent) {
Expand Down

0 comments on commit b414b50

Please sign in to comment.