Commit ac02423

Merge branch 'adapters' into fabio/opencl_ci

fabiomestre committed Nov 17, 2023
2 parents aea05a8 + 534071e
Showing 69 changed files with 4,143 additions and 136 deletions.
3 changes: 3 additions & 0 deletions .github/CODEOWNERS
@@ -10,6 +10,9 @@ source/adapters/hip @oneapi-src/unified-runtime-hip-write
# OpenCL adapter
source/adapters/opencl @oneapi-src/unified-runtime-opencl-write

# Native CPU adapter
source/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write

# Command-buffer experimental feature
source/adapters/**/command_buffer.* @oneapi-src/unified-runtime-command-buffer-write
scripts/core/EXP-COMMAND-BUFFER.rst @oneapi-src/unified-runtime-command-buffer-write
11 changes: 7 additions & 4 deletions .github/workflows/cmake.yml
@@ -13,7 +13,7 @@ jobs:
strategy:
matrix:
adapter: [
{name: OPENCL, runner: OPENCL, triplet: spir64}
{name: OPENCL, runner: OPENCL, platform: "Intel(R) OpenCL", triplet: spir64}
]
build_type: [Release]
compiler: [{c: clang, cxx: clang++}]
@@ -65,14 +65,17 @@ jobs:
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
-DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib
-DUR_CONFORMANCE_TARGET_TRIPLES=${{matrix.adapter.triplet}}
${{ matrix.adapter.name == 'HIP' && '-DAMD_ARCH=gfx1030' || '' }}
${{ matrix.adapter.name == 'HIP' && '-DUR_HIP_PLATFORM=AMD' || '' }}
- name: Build
# This is so that device binaries can find the sycl runtime library
run: cmake --build ${{github.workspace}}/build -j $(nproc)

- name: Test adapter specific
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180
run: |
ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180
# Temporarily disabling platform test for L0, because of hang
# See issue: #824
@@ -84,5 +87,5 @@ jobs:
- name: Test adapters
if: matrix.adapter.name != 'L0'
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180

run: |
env UR_CTS_ADAPTER_PLATFORM="${{matrix.adapter.platform}}" ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180
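
For reference, the conformance step above can be reproduced outside CI with a plain ctest invocation. A minimal sketch, assuming a Release build in ./build and the platform string from the new matrix entry:

    cd build
    UR_CTS_ADAPTER_PLATFORM="Intel(R) OpenCL" ctest -C Release --output-on-failure -L "conformance" --timeout 180
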
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -39,6 +39,7 @@ option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_OPENCL "build opencl adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_HIP "build hip adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_NATIVE_CPU "build native_cpu adapter from SYCL" OFF)
option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF)
option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF)
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
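
The new option follows the existing UR_BUILD_ADAPTER_* pattern and is OFF by default. A minimal configure sketch (the build directory and extra flags are illustrative, not taken from this commit):

    cmake -S . -B build -DUR_BUILD_ADAPTER_NATIVE_CPU=ON -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j $(nproc)
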
1 change: 1 addition & 0 deletions README.md
@@ -131,6 +131,7 @@ List of options provided by CMake:
| UR_BUILD_ADAPTER_OPENCL | Fetch and use opencl adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_NATIVE_CPU | Fetch and use native-cpu adapter from SYCL | ON/OFF | OFF |
| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
| UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD |
| UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` |
3 changes: 3 additions & 0 deletions source/adapters/CMakeLists.txt
@@ -47,3 +47,6 @@ endif()
if(UR_BUILD_ADAPTER_OPENCL)
add_subdirectory(opencl)
endif()
if(UR_BUILD_ADAPTER_NATIVE_CPU)
add_subdirectory(native_cpu)
endif()
17 changes: 9 additions & 8 deletions source/adapters/cuda/enqueue.cpp
@@ -121,7 +121,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,

for (auto &UnmappedFlag : UnmappedMemAdviceFlags) {
if (URAdviceFlags & UnmappedFlag) {
throw UR_RESULT_ERROR_INVALID_ENUMERATION;
setErrorMessage("Memory advice ignored because the CUDA backend does not "
"support some of the specified flags",
UR_RESULT_SUCCESS);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
}

@@ -1355,15 +1358,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
std::ignore = flags;

size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
ur_device_handle_t Device = hQueue->getContext()->getDevice();

// Certain cuda devices and Windows do not have support for some Unified
// Memory features. cuMemPrefetchAsync requires concurrent memory access
// for managed memory. Therfore, ignore prefetch hint if concurrent managed
// for managed memory. Therefore, ignore prefetch hint if concurrent managed
// memory access is not available.
if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
setErrorMessage("Prefetch hint ignored as device does not support "
@@ -1381,10 +1386,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

// flags is currently unused so fail if set
if (flags != 0)
return UR_RESULT_ERROR_INVALID_VALUE;

ur_result_t Result = UR_RESULT_SUCCESS;
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};

@@ -1415,7 +1416,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
UR_APIEXPORT ur_result_t UR_APICALL
urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
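
A note on the PointerRangeSize changes above: CU_POINTER_ATTRIBUTE_RANGE_SIZE fills a size_t, so querying it through an unsigned int is not safe on 64-bit builds. A minimal sketch of the corrected pattern (error handling elided, helper name illustrative):

    #include <cuda.h>
    #include <cstddef>

    // Returns the size of the allocation range that contains `ptr`.
    size_t getRangeSize(CUdeviceptr ptr) {
      size_t rangeSize = 0; // must be size_t, not unsigned int
      cuPointerGetAttribute(&rangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, ptr);
      return rangeSize;
    }
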
85 changes: 50 additions & 35 deletions source/adapters/cuda/program.cpp
@@ -165,6 +165,42 @@ ur_result_t getKernelNames(ur_program_handle_t) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
ur_result_t createProgram(ur_context_handle_t hContext,
ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary,
const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
UR_CHECK_ERROR(
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count));
}

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

UR_CHECK_ERROR(RetProgram->setBinary(pBinary_string, size));
*phProgram = RetProgram.release();

return UR_RESULT_SUCCESS;
}

/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
@@ -175,8 +211,8 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
ur_device_handle_t hDevice = hContext->getDevice();
auto pBinary = reinterpret_cast<const uint8_t *>(pIL);

return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
pProperties, phProgram);
return createProgram(hContext, hDevice, length, pBinary, pProperties,
phProgram);
}

/// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -185,7 +221,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
UR_APIEXPORT ur_result_t UR_APICALL
urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
const char *pOptions) {
return urProgramBuild(hContext, hProgram, pOptions);
UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
return UR_RESULT_SUCCESS;
}

/// Loads the images from a UR program into a CUmodule that can be
@@ -202,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
ScopedContext Active(hProgram->getContext());

hProgram->buildProgram(pOptions);
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;

} catch (ur_result_t Err) {
Result = Err;
@@ -241,6 +280,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
RetProgram->setBinary(static_cast<const char *>(CuBin), CuBinSize);

Result = RetProgram->buildProgram(pOptions);
RetProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
} catch (...) {
// Upon error attempt cleanup
UR_CHECK_ERROR(cuLinkDestroy(State));
@@ -287,6 +327,9 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice,
return ReturnValue(hProgram->BuildOptions.c_str());
case UR_PROGRAM_BUILD_INFO_LOG:
return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize);
case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: {
return ReturnValue(hProgram->BinaryType);
}
default:
break;
}
@@ -384,44 +427,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
return UR_RESULT_SUCCESS;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

ur_result_t Result = UR_RESULT_SUCCESS;
UR_CHECK_ERROR(
createProgram(hContext, hDevice, size, pBinary, pProperties, phProgram));
(*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
}
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

Result = RetProgram->setBinary(pBinary_string, size);
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

*phProgram = RetProgram.release();

return Result;
return UR_RESULT_SUCCESS;
}

// This entry point is only used for native specialization constants (SPIR-V),
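
With BinaryType now reported through urProgramGetBuildInfo, clients can distinguish a compiled object from a linked executable. A minimal sketch of such a query, assuming the usual UR info-query calling convention (this helper is illustrative, not part of the diff):

    #include <ur_api.h>

    // Returns the binary type recorded for `program` on `device`.
    ur_program_binary_type_t getBinaryType(ur_program_handle_t program,
                                           ur_device_handle_t device) {
      ur_program_binary_type_t type = UR_PROGRAM_BINARY_TYPE_NONE;
      urProgramGetBuildInfo(program, device, UR_PROGRAM_BUILD_INFO_BINARY_TYPE,
                            sizeof(type), &type, nullptr);
      return type;
    }
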
6 changes: 6 additions & 0 deletions source/adapters/cuda/program.hpp
@@ -25,6 +25,12 @@ struct ur_program_handle_t_ {
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;

/* The ur_program_binary_type_t property is defined individually for every
* device in a program. However, since the CUDA adapter only has 1 device per
* context / program, there is no need to keep track of its value for each
* device. */
ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;

// Metadata
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
KernelReqdWorkGroupSizeMD;
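
Read together with the program.cpp changes in this commit, the new member tracks a simple progression, roughly:

    // UR_PROGRAM_BINARY_TYPE_NONE             after program creation
    // UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT  after urProgramCompile / urProgramCreateWithBinary
    // UR_PROGRAM_BINARY_TYPE_EXECUTABLE       after urProgramBuild / urProgramLink
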
56 changes: 32 additions & 24 deletions source/adapters/hip/device.cpp
@@ -210,14 +210,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(uint64_t{MaxAlloc});
}
case UR_DEVICE_INFO_IMAGE_SUPPORTED: {
return ReturnValue(uint32_t{true});
return ReturnValue(true);
}
case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
// but some searching found as of SM 2.x 128 are supported.
return ReturnValue(128u);
}
case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
// but some searching found as of SM 2.x 128 are supported.
return ReturnValue(128u);
}
case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
@@ -339,7 +345,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(0u);
}
case UR_DEVICE_INFO_SINGLE_FP_CONFIG: {
uint64_t Config =
ur_device_fp_capability_flags_t Config =
UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
@@ -350,12 +356,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(Config);
}
case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: {
uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
ur_device_fp_capability_flags_t Config =
UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
return ReturnValue(Config);
}
case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: {
@@ -459,14 +466,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
// The mandated minimum capability:
uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
return ReturnValue(Capability);
}
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_PROPERTIES: {
// The mandated minimum capability:
uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
return ReturnValue(Capability);
}
case UR_DEVICE_INFO_BUILT_IN_KERNELS: {
@@ -730,9 +737,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}

case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
ur_memory_order_capability_flags_t Capabilities =
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
return ReturnValue(Capabilities);
}
case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
@@ -821,7 +829,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH:
case UR_DEVICE_INFO_BFLOAT16:
return UR_RESULT_ERROR_INVALID_ENUMERATION;
case UR_DEVICE_INFO_IL_VERSION:
case UR_DEVICE_INFO_ASYNC_BARRIER:
case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT:
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;

default:
break;
@@ -939,21 +950,18 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
if (pDeviceTimestamp) {
UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault));
UR_CHECK_ERROR(hipEventRecord(Event));
}
if (pHostTimestamp) {
using namespace std::chrono;
*pHostTimestamp =
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
.count();
}

if (pDeviceTimestamp) {
UR_CHECK_ERROR(hipEventSynchronize(Event));
float ElapsedTime = 0.0f;
UR_CHECK_ERROR(hipEventElapsedTime(&ElapsedTime,
ur_platform_handle_t_::EvBase, Event));
*pDeviceTimestamp = (uint64_t)(ElapsedTime * (double)1e6);
}

if (pHostTimestamp) {
using namespace std::chrono;
*pHostTimestamp =
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
.count();
}
return UR_RESULT_SUCCESS;
}
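
Since the device-info queries above now return the dedicated UR flag types instead of uint64_t, callers should pass a matching destination when querying these properties. A minimal sketch, assuming the usual urDeviceGetInfo calling convention (this helper is illustrative, not part of the diff):

    #include <ur_api.h>

    // Queries the single-precision FP capabilities reported by `device`.
    ur_device_fp_capability_flags_t getSingleFpConfig(ur_device_handle_t device) {
      ur_device_fp_capability_flags_t caps = 0;
      urDeviceGetInfo(device, UR_DEVICE_INFO_SINGLE_FP_CONFIG, sizeof(caps),
                      &caps, nullptr);
      return caps;
    }
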