Skip to content

Commit

Permalink
Adds CUDA support
Browse files Browse the repository at this point in the history
  • Loading branch information
mfrancepillois committed Dec 4, 2023
1 parent eac29af commit 6af3881
Showing 1 changed file with 110 additions and 27 deletions.
137 changes: 110 additions & 27 deletions source/adapters/cuda/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,90 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
Params.Depth = 1;
}

// Helper function for enqueuing memory fills.
//
// Appends CUDA graph memset node(s) to the CUDA graph owned by CommandBuffer
// so that (Size / PatternSize) back-to-back copies of Pattern are written
// starting at the destination address.
//
// Parameters:
//   CommandBuffer  Command-buffer whose CudaGraph receives the node(s).
//   DstDevice      Destination. When DstType is CU_MEMORYTYPE_DEVICE this is
//                  a pointer to a CUdeviceptr holding the address; for any
//                  other DstType the pointer value itself is the address.
//   DstType        Selects how DstDevice is interpreted (see above).
//   Pattern        Host-readable pointer to PatternSize bytes of pattern.
//   PatternSize    Pattern size in bytes: 1, 2, 4, or a larger multiple of 4.
//   Size           Number of bytes to fill. The division by PatternSize
//                  truncates, so a trailing partial pattern is not written.
//   NumSyncPointsInWaitList, SyncPointWaitList
//                  Sync points the fill waits on; converted to graph nodes by
//                  getNodesFromSyncPoints.
//   SyncPoint      If non-null, receives the sync point of the last node
//                  added, which completes only once the whole fill is done.
//
// Returns UR_RESULT_ERROR_INVALID_SIZE for an unsupported PatternSize,
// otherwise the result of adding the node(s), or the ur_result_t thrown while
// doing so.
static ur_result_t enqueueCommandBufferFillHelper(
    ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice,
    const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize,
    size_t Size, uint32_t NumSyncPointsInWaitList,
    const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
    ur_exp_command_buffer_sync_point_t *SyncPoint) {
  // Patterns of 1, 2 or 4 bytes map directly onto a single memset node.
  const bool FitsSingleNode =
      (PatternSize == 1) || (PatternSize == 2) || (PatternSize == 4);

  // Reject sizes this helper cannot express: zero (would divide by zero
  // below) and anything that cannot be split into whole 4-byte words.
  if (!FitsSingleNode &&
      ((PatternSize == 0) || (PatternSize % sizeof(uint32_t) != 0))) {
    return UR_RESULT_ERROR_INVALID_SIZE;
  }

  ur_result_t Result = UR_RESULT_SUCCESS;

  // Translate the wait list into graph-node dependencies.
  std::vector<CUgraphNode> DepsList;
  UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
                                 SyncPointWaitList, DepsList));

  try {
    // Number of whole pattern repetitions to write.
    const size_t N = Size / PatternSize;
    const CUdeviceptr DstPtr =
        DstType == CU_MEMORYTYPE_DEVICE
            ? *static_cast<CUdeviceptr *>(DstDevice)
            : reinterpret_cast<CUdeviceptr>(DstDevice);

    // Last node added to the graph; it is the one tied to the sync point.
    CUgraphNode GraphNode{};

    if (FitsSingleNode) {
      // Read exactly PatternSize bytes. Going through a uint32_t pointer
      // unconditionally would read past the end of a 1- or 2-byte pattern.
      uint32_t Value = 0;
      switch (PatternSize) {
      case 1:
        Value = *static_cast<const uint8_t *>(Pattern);
        break;
      case 2:
        Value = *static_cast<const uint16_t *>(Pattern);
        break;
      default:
        Value = *static_cast<const uint32_t *>(Pattern);
        break;
      }

      // N rows of one element each, PatternSize bytes apart, i.e. a
      // contiguous run of N elements.
      CUDA_MEMSET_NODE_PARAMS NodeParams = {};
      NodeParams.dst = DstPtr;
      NodeParams.elementSize = PatternSize;
      NodeParams.height = N;
      NodeParams.pitch = PatternSize;
      NodeParams.value = Value;
      NodeParams.width = 1;

      Result = UR_CHECK_ERROR(cuGraphAddMemsetNode(
          &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
          DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
    } else {
      // CUDA has no memset functions that allow setting values more than 4
      // bytes. UR API lets you pass an arbitrary "pattern" to the buffer
      // fill, which can be more than 4 bytes. We must break up the pattern
      // into 4 byte values, and set the buffer using multiple strided nodes.
      // This means that one cuGraphAddMemsetNode call is made for every 4
      // bytes in the pattern.
      const size_t NumberOfSteps = PatternSize / sizeof(uint32_t);
      const auto *PatternWords = static_cast<const uint32_t *>(Pattern);

      for (size_t Step = 0; Step < NumberOfSteps; ++Step) {
        // Each node writes word `Step` of the pattern into every repetition:
        // start Step words into the buffer and stride by the pattern size.
        CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {};
        NodeParamsStep.dst = DstPtr + Step * sizeof(uint32_t);
        NodeParamsStep.elementSize = sizeof(uint32_t);
        NodeParamsStep.height = N;
        NodeParamsStep.pitch = PatternSize;
        NodeParamsStep.value = PatternWords[Step];
        NodeParamsStep.width = 1;

        // NOTE(review): UR_CHECK_ERROR presumably throws ur_result_t on
        // failure (see the catch below) - confirm.
        Result = UR_CHECK_ERROR(cuGraphAddMemsetNode(
            &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
            DepsList.size(), &NodeParamsStep,
            CommandBuffer->Device->getContext()));

        // Chain the next node after this one. The final node then depends,
        // transitively, on every earlier step and on the original wait list,
        // so a single sync point covers the whole fill.
        DepsList.assign(1, GraphNode);
      }
    }

    // Register the final node with the command-buffer and hand the sync point
    // back only if the caller asked for it.
    const auto NewSyncPoint =
        CommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
    if (SyncPoint != nullptr) {
      *SyncPoint = NewSyncPoint;
    }
  } catch (ur_result_t Err) {
    Result = Err;
  }
  return Result;
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
Expand Down Expand Up @@ -531,20 +615,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
(void)hCommandBuffer;
(void)hBuffer;
(void)pPattern;
(void)patternSize;
(void)offset;
(void)size;

(void)numSyncPointsInWaitList;
(void)pSyncPointWaitList;
(void)pSyncPoint;

detail::ur::die("Experimental Command-buffer feature is not "
"implemented for CUDA adapter.");
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
auto ArgsAreMultiplesOfPatternSize =
(offset % patternSize == 0) || (size % patternSize == 0);

auto PatternIsValid = (pPattern != nullptr);

auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
(patternSize > 0); // is a positive power of two
UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid &&
PatternSizeIsValid,
UR_RESULT_ERROR_INVALID_SIZE);

auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;

return enqueueCommandBufferFillHelper(
hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
Expand All @@ -553,19 +639,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
uint32_t numSyncPointsInWaitList,
const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
ur_exp_command_buffer_sync_point_t *pSyncPoint) {
(void)hCommandBuffer;
(void)pPtr;
(void)pPattern;
(void)patternSize;
(void)size;

(void)numSyncPointsInWaitList;
(void)pSyncPointWaitList;
(void)pSyncPoint;

detail::ur::die("Experimental Command-buffer feature is not "
"implemented for CUDA adapter.");
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;

auto PatternIsValid = (pPattern != nullptr);

auto PatternSizeIsValid = ((patternSize & (patternSize - 1)) == 0) &&
(patternSize > 0); // is a positive power of two

UR_ASSERT(PatternIsValid && PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE);
return enqueueCommandBufferFillHelper(
hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size,
numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
}

UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
Expand Down

0 comments on commit 6af3881

Please sign in to comment.