Commit ac02423

Merge branch 'adapters' into fabio/opencl_ci

fabiomestre committed Nov 17, 2023
2 parents aea05a8 + 534071e
Showing 69 changed files with 4,143 additions and 136 deletions.
3 changes: 3 additions & 0 deletions .github/CODEOWNERS
@@ -10,6 +10,9 @@ source/adapters/hip @oneapi-src/unified-runtime-hip-write
# OpenCL adapter
source/adapters/opencl @oneapi-src/unified-runtime-opencl-write

# Native CPU adapter
source/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write

# Command-buffer experimental feature
source/adapters/**/command_buffer.* @oneapi-src/unified-runtime-command-buffer-write
scripts/core/EXP-COMMAND-BUFFER.rst @oneapi-src/unified-runtime-command-buffer-write
11 changes: 7 additions & 4 deletions .github/workflows/cmake.yml
@@ -13,7 +13,7 @@ jobs:
strategy:
matrix:
adapter: [
{name: OPENCL, runner: OPENCL, triplet: spir64}
{name: OPENCL, runner: OPENCL, platform: "Intel(R) OpenCL", triplet: spir64}
]
build_type: [Release]
compiler: [{c: clang, cxx: clang++}]
@@ -65,14 +65,17 @@ jobs:
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
-DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib
-DUR_CONFORMANCE_TARGET_TRIPLES=${{matrix.adapter.triplet}}
${{ matrix.adapter.name == 'HIP' && '-DAMD_ARCH=gfx1030' || '' }}
${{ matrix.adapter.name == 'HIP' && '-DUR_HIP_PLATFORM=AMD' || '' }}
- name: Build
# This is so that device binaries can find the sycl runtime library
run: cmake --build ${{github.workspace}}/build -j $(nproc)

- name: Test adapter specific
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180
run: |
ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180
# Temporarily disabling platform test for L0, because of hang
# See issue: #824
@@ -84,5 +87,5 @@ jobs:
- name: Test adapters
if: matrix.adapter.name != 'L0'
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180

run: |
env UR_CTS_ADAPTER_PLATFORM="${{matrix.adapter.platform}}" ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180
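
For reference, the conformance step above can be reproduced outside CI with a plain ctest invocation. A minimal sketch, assuming a Release build in ./build and the platform string from the new matrix entry:

    cd build
    UR_CTS_ADAPTER_PLATFORM="Intel(R) OpenCL" ctest -C Release --output-on-failure -L "conformance" --timeout 180
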
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -39,6 +39,7 @@ option(UR_BUILD_ADAPTER_L0 "build level 0 adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_OPENCL "build opencl adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_CUDA "build cuda adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_HIP "build hip adapter from SYCL" OFF)
option(UR_BUILD_ADAPTER_NATIVE_CPU "build native_cpu adapter from SYCL" OFF)
option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF)
option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF)
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
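
The new option follows the existing UR_BUILD_ADAPTER_* pattern and is OFF by default. A minimal configure sketch (the build directory and extra flags are illustrative, not taken from this commit):

    cmake -S . -B build -DUR_BUILD_ADAPTER_NATIVE_CPU=ON -DCMAKE_BUILD_TYPE=Release
    cmake --build build -j $(nproc)
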
1 change: 1 addition & 0 deletions README.md
@@ -131,6 +131,7 @@ List of options provided by CMake:
| UR_BUILD_ADAPTER_OPENCL | Fetch and use opencl adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_CUDA | Fetch and use cuda adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_HIP | Fetch and use hip adapter from SYCL | ON/OFF | OFF |
| UR_BUILD_ADAPTER_NATIVE_CPU | Fetch and use native-cpu adapter from SYCL | ON/OFF | OFF |
| UR_HIP_PLATFORM | Build hip adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
| UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD |
| UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` |
3 changes: 3 additions & 0 deletions source/adapters/CMakeLists.txt
@@ -47,3 +47,6 @@ endif()
if(UR_BUILD_ADAPTER_OPENCL)
add_subdirectory(opencl)
endif()
if(UR_BUILD_ADAPTER_NATIVE_CPU)
add_subdirectory(native_cpu)
endif()
17 changes: 9 additions & 8 deletions source/adapters/cuda/enqueue.cpp
@@ -121,7 +121,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,

for (auto &UnmappedFlag : UnmappedMemAdviceFlags) {
if (URAdviceFlags & UnmappedFlag) {
throw UR_RESULT_ERROR_INVALID_ENUMERATION;
setErrorMessage("Memory advice ignored because the CUDA backend does not "
"support some of the specified flags",
UR_RESULT_SUCCESS);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
}

@@ -1355,15 +1358,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
std::ignore = flags;

size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
ur_device_handle_t Device = hQueue->getContext()->getDevice();

// Certain cuda devices and Windows do not have support for some Unified
// Memory features. cuMemPrefetchAsync requires concurrent memory access
// for managed memory. Therfore, ignore prefetch hint if concurrent managed
// for managed memory. Therefore, ignore prefetch hint if concurrent managed
// memory access is not available.
if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
setErrorMessage("Prefetch hint ignored as device does not support "
@@ -1381,10 +1386,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

// flags is currently unused so fail if set
if (flags != 0)
return UR_RESULT_ERROR_INVALID_VALUE;

ur_result_t Result = UR_RESULT_SUCCESS;
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};

@@ -1415,7 +1416,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
UR_APIEXPORT ur_result_t UR_APICALL
urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
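
A note on the PointerRangeSize changes above: CU_POINTER_ATTRIBUTE_RANGE_SIZE fills a size_t, so querying it through an unsigned int is not safe on 64-bit builds. A minimal sketch of the corrected pattern (error handling elided, helper name illustrative):

    #include <cuda.h>
    #include <cstddef>

    // Returns the size of the allocation range that contains `ptr`.
    size_t getRangeSize(CUdeviceptr ptr) {
      size_t rangeSize = 0; // must be size_t, not unsigned int
      cuPointerGetAttribute(&rangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, ptr);
      return rangeSize;
    }
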
85 changes: 50 additions & 35 deletions source/adapters/cuda/program.cpp
@@ -165,6 +165,42 @@ ur_result_t getKernelNames(ur_program_handle_t) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
ur_result_t createProgram(ur_context_handle_t hContext,
ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary,
const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
UR_CHECK_ERROR(
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count));
}

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

UR_CHECK_ERROR(RetProgram->setBinary(pBinary_string, size));
*phProgram = RetProgram.release();

return UR_RESULT_SUCCESS;
}

/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
@@ -175,8 +211,8 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
ur_device_handle_t hDevice = hContext->getDevice();
auto pBinary = reinterpret_cast<const uint8_t *>(pIL);

return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
pProperties, phProgram);
return createProgram(hContext, hDevice, length, pBinary, pProperties,
phProgram);
}

/// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -185,7 +221,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
UR_APIEXPORT ur_result_t UR_APICALL
urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
const char *pOptions) {
return urProgramBuild(hContext, hProgram, pOptions);
UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
return UR_RESULT_SUCCESS;
}

/// Loads the images from a UR program into a CUmodule that can be
@@ -202,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
ScopedContext Active(hProgram->getContext());

hProgram->buildProgram(pOptions);
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;

} catch (ur_result_t Err) {
Result = Err;
@@ -241,6 +280,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
RetProgram->setBinary(static_cast<const char *>(CuBin), CuBinSize);

Result = RetProgram->buildProgram(pOptions);
RetProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
} catch (...) {
// Upon error attempt cleanup
UR_CHECK_ERROR(cuLinkDestroy(State));
@@ -287,6 +327,9 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice,
return ReturnValue(hProgram->BuildOptions.c_str());
case UR_PROGRAM_BUILD_INFO_LOG:
return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize);
case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: {
return ReturnValue(hProgram->BinaryType);
}
default:
break;
}
@@ -384,44 +427,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
return UR_RESULT_SUCCESS;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

ur_result_t Result = UR_RESULT_SUCCESS;
UR_CHECK_ERROR(
createProgram(hContext, hDevice, size, pBinary, pProperties, phProgram));
(*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
}
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

Result = RetProgram->setBinary(pBinary_string, size);
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

*phProgram = RetProgram.release();

return Result;
return UR_RESULT_SUCCESS;
}

// This entry point is only used for native specialization constants (SPIR-V),
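
With BinaryType now reported through urProgramGetBuildInfo, clients can distinguish a compiled object from a linked executable. A minimal sketch of such a query, assuming the usual UR info-query calling convention (this helper is illustrative, not part of the diff):

    #include <ur_api.h>

    // Returns the binary type recorded for `program` on `device`.
    ur_program_binary_type_t getBinaryType(ur_program_handle_t program,
                                           ur_device_handle_t device) {
      ur_program_binary_type_t type = UR_PROGRAM_BINARY_TYPE_NONE;
      urProgramGetBuildInfo(program, device, UR_PROGRAM_BUILD_INFO_BINARY_TYPE,
                            sizeof(type), &type, nullptr);
      return type;
    }
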
6 changes: 6 additions & 0 deletions source/adapters/cuda/program.hpp
@@ -25,6 +25,12 @@ struct ur_program_handle_t_ {
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;

/* The ur_program_binary_type_t property is defined individually for every
* device in a program. However, since the CUDA adapter only has 1 device per
* context / program, there is no need to keep track of its value for each
* device. */
ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;

// Metadata
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
KernelReqdWorkGroupSizeMD;
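
Read together with the program.cpp changes in this commit, the new member tracks a simple progression, roughly:

    // UR_PROGRAM_BINARY_TYPE_NONE             after program creation
    // UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT  after urProgramCompile / urProgramCreateWithBinary
    // UR_PROGRAM_BINARY_TYPE_EXECUTABLE       after urProgramBuild / urProgramLink
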
56 changes: 32 additions & 24 deletions source/adapters/hip/device.cpp
@@ -210,14 +210,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(uint64_t{MaxAlloc});
}
case UR_DEVICE_INFO_IMAGE_SUPPORTED: {
return ReturnValue(uint32_t{true});
return ReturnValue(true);
}
case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
// but some searching found as of SM 2.x 128 are supported.
return ReturnValue(128u);
}
case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
// but some searching found as of SM 2.x 128 are supported.
return ReturnValue(128u);
}
case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
@@ -339,7 +345,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(0u);
}
case UR_DEVICE_INFO_SINGLE_FP_CONFIG: {
uint64_t Config =
ur_device_fp_capability_flags_t Config =
UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
@@ -350,12 +356,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(Config);
}
case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: {
uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
ur_device_fp_capability_flags_t Config =
UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
return ReturnValue(Config);
}
case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: {
@@ -459,14 +466,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
// The mandated minimum capability:
uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
return ReturnValue(Capability);
}
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_PROPERTIES: {
// The mandated minimum capability:
uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
return ReturnValue(Capability);
}
case UR_DEVICE_INFO_BUILT_IN_KERNELS: {
@@ -730,9 +737,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}

case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
ur_memory_order_capability_flags_t Capabilities =
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
return ReturnValue(Capabilities);
}
case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
@@ -821,7 +829,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH:
case UR_DEVICE_INFO_BFLOAT16:
return UR_RESULT_ERROR_INVALID_ENUMERATION;
case UR_DEVICE_INFO_IL_VERSION:
case UR_DEVICE_INFO_ASYNC_BARRIER:
case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT:
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;

default:
break;
@@ -939,21 +950,18 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
if (pDeviceTimestamp) {
UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault));
UR_CHECK_ERROR(hipEventRecord(Event));
}
if (pHostTimestamp) {
using namespace std::chrono;
*pHostTimestamp =
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
.count();
}

if (pDeviceTimestamp) {
UR_CHECK_ERROR(hipEventSynchronize(Event));
float ElapsedTime = 0.0f;
UR_CHECK_ERROR(hipEventElapsedTime(&ElapsedTime,
ur_platform_handle_t_::EvBase, Event));
*pDeviceTimestamp = (uint64_t)(ElapsedTime * (double)1e6);
}

if (pHostTimestamp) {
using namespace std::chrono;
*pHostTimestamp =
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
.count();
}
return UR_RESULT_SUCCESS;
}
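
Since the device-info queries above now return the dedicated UR flag types instead of uint64_t, callers should pass a matching destination when querying these properties. A minimal sketch, assuming the usual urDeviceGetInfo calling convention (this helper is illustrative, not part of the diff):

    #include <ur_api.h>

    // Queries the single-precision FP capabilities reported by `device`.
    ur_device_fp_capability_flags_t getSingleFpConfig(ur_device_handle_t device) {
      ur_device_fp_capability_flags_t caps = 0;
      urDeviceGetInfo(device, UR_DEVICE_INFO_SINGLE_FP_CONFIG, sizeof(caps),
                      &caps, nullptr);
      return caps;
    }
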