diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 9cf7561ba6..ea60fc8f06 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -99,6 +99,90 @@ static void setCopyParams(const void *SrcPtr, const CUmemorytype_enum SrcType,
   Params.Depth = 1;
 }
 
+// Helper function for adding memory fills to a command-buffer: appends the
+// memset node(s) that write Pattern repeatedly over Size bytes to the CUDA
+// graph of CommandBuffer, after the nodes named by SyncPointWaitList.
+//
+// DstDevice points to a CUdeviceptr when DstType is CU_MEMORYTYPE_DEVICE and
+// is the destination address itself otherwise. PatternSize must be a positive
+// power of two (else UR_RESULT_ERROR_INVALID_SIZE is returned). The sync point
+// of the last node is stored in *SyncPoint unless SyncPoint is null.
+static ur_result_t enqueueCommandBufferFillHelper(
+    ur_exp_command_buffer_handle_t CommandBuffer, void *DstDevice,
+    const CUmemorytype_enum DstType, const void *Pattern, size_t PatternSize,
+    size_t Size, uint32_t NumSyncPointsInWaitList,
+    const ur_exp_command_buffer_sync_point_t *SyncPointWaitList,
+    ur_exp_command_buffer_sync_point_t *SyncPoint) {
+  // PatternSize is a divisor below and must split evenly into memset elements.
+  if (PatternSize == 0 || (PatternSize & (PatternSize - 1)) != 0)
+    return UR_RESULT_ERROR_INVALID_SIZE;
+
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  std::vector<CUgraphNode> DepsList;
+  UR_CALL(getNodesFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
+                                 SyncPointWaitList, DepsList));
+
+  try {
+    const size_t N = Size / PatternSize;
+    const CUdeviceptr DstPtr = DstType == CU_MEMORYTYPE_DEVICE
+                                   ? *static_cast<CUdeviceptr *>(DstDevice)
+                                   : reinterpret_cast<CUdeviceptr>(DstDevice);
+
+    // CUDA memset nodes cannot set elements wider than 4 bytes, but the UR
+    // API accepts larger patterns. Such a pattern is split into 4-byte chunks
+    // and one strided cuGraphAddMemsetNode call is made per chunk; patterns of
+    // 1, 2 or 4 bytes need a single node.
+    const size_t ElementSize =
+        PatternSize < sizeof(uint32_t) ? PatternSize : sizeof(uint32_t);
+    const size_t NumberOfSteps = PatternSize / ElementSize;
+
+    for (size_t Step = 0; Step < NumberOfSteps; ++Step) {
+      // Read exactly ElementSize bytes of the pattern: loading a 1 or 2 byte
+      // pattern through a uint32_t pointer would read past its end.
+      uint32_t Value = 0;
+      switch (ElementSize) {
+      case 1:
+        Value = *static_cast<const uint8_t *>(Pattern);
+        break;
+      case 2:
+        Value = *static_cast<const uint16_t *>(Pattern);
+        break;
+      default:
+        Value = *(static_cast<const uint32_t *>(Pattern) + Step);
+        break;
+      }
+
+      // Each node writes one element per row, with rows PatternSize bytes
+      // apart, starting at this chunk's offset within the pattern.
+      CUDA_MEMSET_NODE_PARAMS NodeParams = {};
+      NodeParams.dst = DstPtr + (Step * ElementSize);
+      NodeParams.elementSize = ElementSize;
+      NodeParams.height = N;
+      NodeParams.pitch = PatternSize;
+      NodeParams.value = Value;
+      NodeParams.width = 1;
+
+      CUgraphNode GraphNode;
+      Result = UR_CHECK_ERROR(cuGraphAddMemsetNode(
+          &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
+          DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
+
+      // Chain the nodes so that the last one, whose sync point is reported to
+      // the caller, only completes after every chunk has been written.
+      DepsList.assign(1, GraphNode);
+
+      // Get sync point and register the cuNode with it.
+      const auto NodeSyncPoint =
+          CommandBuffer->AddSyncPoint(std::make_shared<CUgraphNode>(GraphNode));
+      if (SyncPoint != nullptr)
+        *SyncPoint = NodeSyncPoint;
+    }
+  } catch (ur_result_t Err) {
+    Result = Err;
+  }
+  return Result;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(
     ur_context_handle_t hContext, ur_device_handle_t hDevice,
     const ur_exp_command_buffer_desc_t *pCommandBufferDesc,
@@ -531,20 +615,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)hBuffer;
-  (void)pPattern;
-  (void)patternSize;
-  (void)offset;
-  (void)size;
-
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
-
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  // Check the pattern size first: it is the divisor of the checks below.
+  const bool PatternSizeIsValid = // is a positive power of two
+      (patternSize > 0) && ((patternSize & (patternSize - 1)) == 0);
+
+  // Both the offset and the size must be whole multiples of the pattern.
+  const bool ArgsAreMultiplesOfPatternSize =
+      PatternSizeIsValid && (offset % patternSize == 0) &&
+      (size % patternSize == 0);
+
+  UR_ASSERT((pPattern != nullptr) && ArgsAreMultiplesOfPatternSize,
+            UR_RESULT_ERROR_INVALID_SIZE);
+
+  auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+  return enqueueCommandBufferFillHelper(
+      hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
+      size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
@@ -553,19 +639,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp(
     uint32_t numSyncPointsInWaitList,
     const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList,
     ur_exp_command_buffer_sync_point_t *pSyncPoint) {
-  (void)hCommandBuffer;
-  (void)pPtr;
-  (void)pPattern;
-  (void)patternSize;
-  (void)size;
-
-  (void)numSyncPointsInWaitList;
-  (void)pSyncPointWaitList;
-  (void)pSyncPoint;
-
-  detail::ur::die("Experimental Command-buffer feature is not "
-                  "implemented for CUDA adapter.");
-  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  // Reject a null pattern, a pattern size that is not a positive power of two,
+  // and a size that is not a whole number of patterns (it would be truncated).
+  const bool PatternSizeIsValid =
+      (patternSize > 0) && ((patternSize & (patternSize - 1)) == 0);
+  UR_ASSERT((pPattern != nullptr) && PatternSizeIsValid &&
+                (size % patternSize == 0),
+            UR_RESULT_ERROR_INVALID_SIZE);
+  return enqueueCommandBufferFillHelper(
+      hCommandBuffer, pPtr, CU_MEMORYTYPE_UNIFIED, pPattern, patternSize, size,
+      numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint);
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(