Skip to content

Commit

Permalink
Merge branch 'main' into usm-p2p-add-test-and-macro
Browse files Browse the repository at this point in the history
  • Loading branch information
JackAKirk committed Mar 15, 2024
2 parents f39d41f + 6513abc commit ad198f9
Show file tree
Hide file tree
Showing 12 changed files with 310 additions and 214 deletions.
89 changes: 45 additions & 44 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,50 +115,51 @@ jobs:
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace"

fuzztest-build:
name: Build and run quick fuzztest scenarios
strategy:
matrix:
build_type: [Debug, Release]
compiler: [{c: clang, cxx: clang++}]

runs-on: 'ubuntu-22.04'

steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

- name: Install pip packages
run: pip install -r third_party/requirements.txt

- name: Download DPC++
run: |
sudo apt install libncurses5
wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz
tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz
- name: Setup DPC++
run: |
source ${{github.workspace}}/dpcpp_compiler/startup.sh
- name: Configure CMake
run: >
cmake
-B${{github.workspace}}/build
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}
-DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
-DUR_ENABLE_TRACING=ON
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
-DUR_BUILD_TESTS=ON
-DUR_USE_ASAN=ON
-DUR_USE_UBSAN=ON
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
- name: Build
run: cmake --build ${{github.workspace}}/build -j $(nproc)

- name: Fuzz test
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short"
# Disable short fuzz tests until the ubuntu-22.04 runner is fixed
# fuzztest-build:
# name: Build and run quick fuzztest scenarios
# strategy:
# matrix:
# build_type: [Debug, Release]
# compiler: [{c: clang, cxx: clang++}]

# runs-on: 'ubuntu-22.04'

# steps:
# - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

# - name: Install pip packages
# run: pip install -r third_party/requirements.txt

# - name: Download DPC++
# run: |
# sudo apt install libncurses5
# wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz
# tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz

# - name: Setup DPC++
# run: |
# source ${{github.workspace}}/dpcpp_compiler/startup.sh

# - name: Configure CMake
# run: >
# cmake
# -B${{github.workspace}}/build
# -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
# -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
# -DUR_ENABLE_TRACING=ON
# -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
# -DUR_BUILD_TESTS=ON
# -DUR_USE_ASAN=ON
# -DUR_USE_UBSAN=ON
# -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++

# - name: Build
# run: cmake --build ${{github.workspace}}/build -j $(nproc)

# - name: Fuzz test
# working-directory: ${{github.workspace}}/build
# run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short"

adapter-build-hw:
name: Build - Adapters on HW
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ jobs:
LD_LIBRARY_PATH=${{github.workspace}}/dpcpp_compiler/lib
cmake --build ${{github.workspace}}/build -j $(nproc)
- name: Fuzz long test
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-long"
# Disable long fuzz tests until the ubuntu-22.04 runner is fixed
# - name: Fuzz long test
# working-directory: ${{github.workspace}}/build
# run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-long"
10 changes: 10 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,16 @@ set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING
"List of sycl targets to build CTS device binaries for")
set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for")
set(UR_ADAPTER_LEVEL_ZERO_SOURCE_DIR "" CACHE PATH
"Path to external 'level_zero' adapter source dir")
set(UR_ADAPTER_OPENCL_SOURCE_DIR "" CACHE PATH
"Path to external 'opencl' adapter source dir")
set(UR_ADAPTER_CUDA_SOURCE_DIR "" CACHE PATH
"Path to external 'cuda' adapter source dir")
set(UR_ADAPTER_HIP_SOURCE_DIR "" CACHE PATH
"Path to external 'hip' adapter source dir")
set(UR_ADAPTER_NATIVE_CPU_SOURCE_DIR "" CACHE PATH
"Path to external 'native_cpu' adapter source dir")

# There's little reason not to generate the compile_commands.json
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
Expand Down
26 changes: 21 additions & 5 deletions source/adapters/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,37 @@ endfunction()

add_subdirectory(null)

# Add the build directory of the adapter called `name`.
#
# If the variable UR_ADAPTER_<NAME>_SOURCE_DIR (<NAME> being `name` in upper
# case) holds a true value, it is used as the adapter's source directory and
# its build tree is placed in ${CMAKE_CURRENT_BINARY_DIR}/<name>. Otherwise
# the subdirectory `name` of the current source directory is added.
#
# Arguments:
#   name - adapter directory name, e.g. level_zero
#
# Aborts configuration with FATAL_ERROR when the variable is set but does not
# refer to an existing directory.
function(add_ur_adapter_subdirectory name)
  # Expansions are quoted so that an empty value or a path containing ';'
  # stays a single argument instead of vanishing or being split.
  string(TOUPPER "${name}" NAME)
  if(UR_ADAPTER_${NAME}_SOURCE_DIR)
    if(NOT IS_DIRECTORY "${UR_ADAPTER_${NAME}_SOURCE_DIR}")
      message(FATAL_ERROR
        "UR_ADAPTER_${NAME}_SOURCE_DIR is not a directory: "
        "${UR_ADAPTER_${NAME}_SOURCE_DIR}")
    endif()
    # An explicit binary dir is passed because the source dir may lie outside
    # the current source tree.
    add_subdirectory(
      "${UR_ADAPTER_${NAME}_SOURCE_DIR}"
      "${CMAKE_CURRENT_BINARY_DIR}/${name}")
  else()
    add_subdirectory("${name}")
  endif()
endfunction()

if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_ALL)
add_subdirectory(level_zero)
add_ur_adapter_subdirectory(level_zero)
endif()

if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL)
add_subdirectory(cuda)
add_ur_adapter_subdirectory(cuda)
endif()

if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL)
add_subdirectory(hip)
add_ur_adapter_subdirectory(hip)
endif()

if(UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL)
add_subdirectory(opencl)
add_ur_adapter_subdirectory(opencl)
endif()
if(UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_ALL)
add_subdirectory(native_cpu)
add_ur_adapter_subdirectory(native_cpu)
endif()
77 changes: 61 additions & 16 deletions source/adapters/cuda/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,6 @@ static ur_result_t enqueueCommandBufferFillHelper(

try {
const size_t N = Size / PatternSize;
auto Value = *static_cast<const uint32_t *>(Pattern);
auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE
? *static_cast<CUdeviceptr *>(DstDevice)
: (CUdeviceptr)DstDevice;
Expand All @@ -183,9 +182,27 @@ static ur_result_t enqueueCommandBufferFillHelper(
NodeParams.elementSize = PatternSize;
NodeParams.height = N;
NodeParams.pitch = PatternSize;
NodeParams.value = Value;
NodeParams.width = 1;

// pattern size in bytes
switch (PatternSize) {
case 1: {
auto Value = *static_cast<const uint8_t *>(Pattern);
NodeParams.value = Value;
break;
}
case 2: {
auto Value = *static_cast<const uint16_t *>(Pattern);
NodeParams.value = Value;
break;
}
case 4: {
auto Value = *static_cast<const uint32_t *>(Pattern);
NodeParams.value = Value;
break;
}
}

UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
Expand All @@ -198,29 +215,54 @@ static ur_result_t enqueueCommandBufferFillHelper(
// CUDA has no memset functions that allow setting values more than 4
// bytes. UR API lets you pass an arbitrary "pattern" to the buffer
// fill, which can be more than 4 bytes. We must break up the pattern
// into 4 byte values, and set the buffer using multiple strided calls.
// This means that one cuGraphAddMemsetNode call is made for every 4 bytes
// in the pattern.
// into 1-byte values, and set the buffer using multiple strided calls.
// This means that one cuGraphAddMemsetNode call is made for every byte
// in the pattern.

size_t NumberOfSteps = PatternSize / sizeof(uint8_t);

size_t NumberOfSteps = PatternSize / sizeof(uint32_t);
// Shared pointer that will point to the last node created
std::shared_ptr<CUgraphNode> GraphNodePtr;
// Create a new node
CUgraphNode GraphNodeFirst;
// Update NodeParam
CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {};
NodeParamsStepFirst.dst = DstPtr;
NodeParamsStepFirst.elementSize = sizeof(uint32_t);
NodeParamsStepFirst.height = Size / sizeof(uint32_t);
NodeParamsStepFirst.pitch = sizeof(uint32_t);
NodeParamsStepFirst.value = *static_cast<const uint32_t *>(Pattern);
NodeParamsStepFirst.width = 1;

// we walk up the pattern in 4-byte steps, and call cuMemset for each
// 4-byte chunk of the pattern.
for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParamsStepFirst,
CommandBuffer->Device->getContext()));

// Get sync point and register the cuNode with it.
*SyncPoint = CommandBuffer->addSyncPoint(
std::make_shared<CUgraphNode>(GraphNodeFirst));

DepsList.clear();
DepsList.push_back(GraphNodeFirst);

// we walk up the pattern in 1-byte steps, and call cuMemset for each
// 1-byte chunk of the pattern.
for (auto Step = 4u; Step < NumberOfSteps; ++Step) {
// take 4 bytes of the pattern
auto Value = *(static_cast<const uint32_t *>(Pattern) + Step);
auto Value = *(static_cast<const uint8_t *>(Pattern) + Step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t));
auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t));

// Create a new node
CUgraphNode GraphNode;
// Update NodeParam
CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {};
NodeParamsStep.dst = (CUdeviceptr)OffsetPtr;
NodeParamsStep.elementSize = 4;
NodeParamsStep.height = N;
NodeParamsStep.pitch = PatternSize;
NodeParamsStep.elementSize = sizeof(uint8_t);
NodeParamsStep.height = Size / NumberOfSteps;
NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t);
NodeParamsStep.value = Value;
NodeParamsStep.width = 1;

Expand All @@ -229,9 +271,12 @@ static ur_result_t enqueueCommandBufferFillHelper(
DepsList.size(), &NodeParamsStep,
CommandBuffer->Device->getContext()));

GraphNodePtr = std::make_shared<CUgraphNode>(GraphNode);
// Get sync point and register the cuNode with it.
*SyncPoint = CommandBuffer->addSyncPoint(
std::make_shared<CUgraphNode>(GraphNode));
*SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr);

DepsList.clear();
DepsList.push_back(*GraphNodePtr.get());
}
}
} catch (ur_result_t Err) {
Expand Down
14 changes: 10 additions & 4 deletions source/adapters/cuda/image.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1006,17 +1006,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
ArrayDesc.Format = format;

CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = {};
mipmapDesc.numLevels = 1;
mipmapDesc.numLevels = pImageDesc->numMipLevel;
mipmapDesc.arrayDesc = ArrayDesc;

// External memory is mapped to a CUmipmappedArray
// If desired, a CUarray is retrieved from the mipmaps 0th level
CUmipmappedArray memMipMap;
UR_CHECK_ERROR(cuExternalMemoryGetMappedMipmappedArray(
&memMipMap, (CUexternalMemory)hInteropMem, &mipmapDesc));

CUarray memArray;
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0));
if (pImageDesc->numMipLevel > 1) {
*phImageMem = (ur_exp_image_mem_handle_t)memMipMap;
} else {
CUarray memArray;
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0));

*phImageMem = (ur_exp_image_mem_handle_t)memArray;
*phImageMem = (ur_exp_image_mem_handle_t)memArray;
}

} catch (ur_result_t Err) {
return Err;
Expand Down
22 changes: 11 additions & 11 deletions source/adapters/cuda/tracing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,20 @@
using tracing_event_t = xpti_td *;
using subscriber_handle_t = CUpti_SubscriberHandle;

using cuptiSubscribe_fn = CUPTIAPI
CUptiResult (*)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback,
void *userdata);
using cuptiSubscribe_fn =
CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *subscriber,
CUpti_CallbackFunc callback, void *userdata);

using cuptiUnsubscribe_fn = CUPTIAPI
CUptiResult (*)(CUpti_SubscriberHandle subscriber);
using cuptiUnsubscribe_fn =
CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle subscriber);

using cuptiEnableDomain_fn = CUPTIAPI
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain);
using cuptiEnableDomain_fn =
CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain);

using cuptiEnableCallback_fn = CUPTIAPI
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain, CUpti_CallbackId cbid);
using cuptiEnableCallback_fn =
CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain, CUpti_CallbackId cbid);

#define LOAD_CUPTI_SYM(p, lib, x) \
p.x = (cupti##x##_fn)ur_loader::LibLoader::getFunctionPtr(lib.get(), \
Expand Down
Loading

0 comments on commit ad198f9

Please sign in to comment.