[SYCL][CI] Fuse two post-commit builds into one (intel#9695)
aelovikov-intel authored Jun 7, 2023
1 parent d48a5fb commit 64bd508
Showing 8 changed files with 125 additions and 109 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/sycl_linux_build_and_test.yml
@@ -181,6 +181,11 @@ jobs:
cmake --build $GITHUB_WORKSPACE/build --target install-llvm-cov
cmake --build $GITHUB_WORKSPACE/build --target install-llvm-profdata
cmake --build $GITHUB_WORKSPACE/build --target install-compiler-rt
- name: Additional Install for "--shared-libs" build
if: ${{ contains(inputs.build_configure_extra_args, '--shared-libs') }}
run: |
cmake --build $GITHUB_WORKSPACE/build --target install-clang-libraries
cmake --build $GITHUB_WORKSPACE/build --target install-llvm-libraries
- name: Install lint utilities
# We install these into our nightly container that CI uses to run lint
21 changes: 6 additions & 15 deletions .github/workflows/sycl_post_commit.yml
@@ -23,32 +23,23 @@ jobs:
uses: ./.github/workflows/sycl_gen_test_matrix.yml
with:
lts_config: "l0_gen9;win_l0_gen12"
linux_default:
name: Linux Default
if: github.repository == 'intel/llvm'
needs: test_matrix
uses: ./.github/workflows/sycl_linux_build_and_test.yml
secrets: inherit
with:
build_cache_root: "/__w/llvm"
build_artifact_suffix: "post_commit"
lts_matrix: ${{ needs.test_matrix.outputs.lts_lx_matrix }}
cts_matrix: ${{ needs.test_matrix.outputs.cts_matrix }}
lts_aws_matrix: ${{ needs.test_matrix.outputs.lts_aws_matrix }}
build_configure_extra_args: --hip --cuda --enable-esimd-emulator --cmake-opt="-DSYCL_ENABLE_STACK_PRINTING=ON" --cmake-opt="-DSYCL_LIB_WITH_DEBUG_SYMBOL=ON"
linux_self_prod:
name: Linux (Self build + shared libraries + no-assertions)
if: github.repository == 'intel/llvm'
needs: test_matrix
uses: ./.github/workflows/sycl_linux_build_and_test.yml
with:
build_cache_root: "/__w/llvm"
build_cache_suffix: sprod_shared
build_artifact_suffix: sprod_shared
build_configure_extra_args: --shared-libs --no-assertions
build_configure_extra_args: --shared-libs --no-assertions --hip --cuda --enable-esimd-emulator --cmake-opt="-DSYCL_ENABLE_STACK_PRINTING=ON" --cmake-opt="-DSYCL_LIB_WITH_DEBUG_SYMBOL=ON"
# Docker image has last nightly pre-installed and added to the PATH
build_image: "ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:no-drivers"
build_image: "ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build"
cc: clang
cxx: clang++
lts_matrix: ${{ needs.test_matrix.outputs.lts_lx_matrix }}
cts_matrix: ${{ needs.test_matrix.outputs.cts_matrix }}
lts_aws_matrix: ${{ needs.test_matrix.outputs.lts_aws_matrix }}

windows_default:
name: Windows
1 change: 1 addition & 0 deletions devops/actions/e2e-tests/action.yml
@@ -37,6 +37,7 @@ runs:
run: |
echo "::group::CMake configuration"
export PATH=$PWD/toolchain/bin/:$PATH
export LD_LIBRARY_PATH=$PWD/toolchain/lib/:$LD_LIBRARY_PATH
cmake -GNinja -B./build-e2e -S./llvm/sycl/test-e2e -DSYCL_TEST_E2E_TARGETS="${{ inputs.targets }}" -DCMAKE_CXX_COMPILER="$PWD/toolchain/bin/clang++" -DLLVM_LIT="$PWD/llvm/llvm/utils/lit/lit.py" ${{ inputs.cmake_args }}
echo "::endgroup::"
- name: Run testing
107 changes: 57 additions & 50 deletions sycl/plugins/cuda/pi_cuda.cpp
@@ -451,7 +451,7 @@ bool hasExceededMaxRegistersPerBlock(pi_device device, pi_kernel kernel,
kernel->get()));

return blockSize * regsPerThread > size_t(maxRegsPerBlock);
};
}

} // anonymous namespace

@@ -1012,7 +1012,7 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
}
}

pi_result cuda_piPlatformGetInfo(pi_platform platform,
pi_result cuda_piPlatformGetInfo([[maybe_unused]] pi_platform platform,
pi_platform_info param_name,
size_t param_value_size, void *param_value,
size_t *param_value_size_ret) {
@@ -1176,7 +1176,7 @@ pi_result cuda_piextDeviceSelectBinary(pi_device device,
return PI_ERROR_INVALID_BINARY;
}

pi_result cuda_piextGetDeviceFunctionPointer(pi_device device,
pi_result cuda_piextGetDeviceFunctionPointer([[maybe_unused]] pi_device device,
pi_program program,
const char *func_name,
pi_uint64 *func_pointer_ret) {
@@ -2270,12 +2270,13 @@ pi_result cuda_piextDeviceCreateWithNativeHandle(pi_native_handle nativeHandle,
/// \param[out] retcontext Set to created context on success.
///
/// \return PI_SUCCESS on success, otherwise an error return code.
pi_result cuda_piContextCreate(const pi_context_properties *properties,
pi_uint32 num_devices, const pi_device *devices,
void (*pfn_notify)(const char *errinfo,
const void *private_info,
size_t cb, void *user_data),
void *user_data, pi_context *retcontext) {
pi_result cuda_piContextCreate(
[[maybe_unused]] const pi_context_properties *properties,
[[maybe_unused]] pi_uint32 num_devices, const pi_device *devices,
[[maybe_unused]] void (*pfn_notify)(const char *errinfo,
const void *private_info, size_t cb,
void *user_data),
[[maybe_unused]] void *user_data, pi_context *retcontext) {

assert(devices != nullptr);
// TODO: How to implement context callback?
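
A note on the signature rewrites above (and repeated throughout this file): unused-parameter warnings are silenced either by tagging a kept parameter with C++17's [[maybe_unused]] or, for entry points that never touch an argument, by dropping the parameter name altogether. A minimal sketch of the pattern, using hypothetical function names rather than real PI entry points:

// Keep the parameter name for readers, but allow it to go unused.
int query_info([[maybe_unused]] void *platform, int size) {
  return size == 0 ? -1 : 0; // `platform` intentionally ignored
}

// For stubs that never touch an argument, drop the name outright.
int create_from_native(void * /*native_handle*/, bool /*own_handle*/) {
  return -1; // not implemented
}

// The pre-C++17 style still used elsewhere in this file: an explicit void cast.
int legacy_style(void *handle) {
  (void)handle;
  return 0;
}
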
@@ -2350,9 +2351,10 @@ pi_result cuda_piextContextCreateWithNativeHandle(pi_native_handle nativeHandle,
/// Can trigger a manual copy depending on the mode.
/// \TODO Implement USE_HOST_PTR using cuHostRegister
///
pi_result cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags,
size_t size, void *host_ptr, pi_mem *ret_mem,
const pi_mem_properties *properties) {
pi_result
cuda_piMemBufferCreate(pi_context context, pi_mem_flags flags, size_t size,
void *host_ptr, pi_mem *ret_mem,
[[maybe_unused]] const pi_mem_properties *properties) {
// Need input memory object
assert(ret_mem != nullptr);
assert((properties == nullptr || *properties == 0) &&
@@ -2490,9 +2492,10 @@ pi_result cuda_piMemRelease(pi_mem memObj) {
/// A buffer partition (or a sub-buffer, in OpenCL terms) is simply implemented
/// as an offset over an existing CUDA allocation.
///
pi_result cuda_piMemBufferPartition(pi_mem parent_buffer, pi_mem_flags flags,
pi_buffer_create_type buffer_create_type,
void *buffer_create_info, pi_mem *memObj) {
pi_result cuda_piMemBufferPartition(
pi_mem parent_buffer, pi_mem_flags flags,
[[maybe_unused]] pi_buffer_create_type buffer_create_type,
void *buffer_create_info, pi_mem *memObj) {
assert((parent_buffer != nullptr) && "PI_ERROR_INVALID_MEM_OBJECT");
assert(parent_buffer->is_buffer() && "PI_ERROR_INVALID_MEM_OBJECTS");
assert(!parent_buffer->is_sub_buffer() && "PI_ERROR_INVALID_MEM_OBJECT");
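
The comment above describes a sub-buffer as nothing more than an offset into the parent's existing CUDA allocation. A rough sketch of that idea with a simplified stand-in type (not the plugin's _pi_mem):

#include <cstddef>
#include <cuda.h>

// No new device memory is allocated: the view records the parent's
// pointer plus a byte offset and derives its own pointer on demand.
struct SubBufferView {
  CUdeviceptr parent_ptr; // allocation owned by the parent buffer
  std::size_t offset;     // byte offset of this sub-buffer
  std::size_t size;       // size of the sub-buffer in bytes

  CUdeviceptr get() const { return parent_ptr + offset; }
};
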
@@ -2576,10 +2579,8 @@ pi_result cuda_piextMemGetNativeHandle(pi_mem mem,
/// \param[out] mem Set to the PI mem object created from native handle.
///
/// \return TBD
pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle nativeHandle,
pi_context context,
bool ownNativeHandle,
pi_mem *mem) {
pi_result cuda_piextMemCreateWithNativeHandle(pi_native_handle, pi_context,
bool, pi_mem *) {
sycl::detail::pi::die(
"Creation of PI mem from native handle not implemented");
return {};
@@ -3572,11 +3573,11 @@ pi_result cuda_piclProgramCreateWithSource(pi_context, pi_uint32, const char **,
/// used later on to extract functions (kernels).
/// See \ref _pi_program for implementation details.
///
pi_result cuda_piProgramBuild(pi_program program, pi_uint32 num_devices,
const pi_device *device_list, const char *options,
void (*pfn_notify)(pi_program program,
void *user_data),
void *user_data) {
pi_result cuda_piProgramBuild(
pi_program program, [[maybe_unused]] pi_uint32 num_devices,
[[maybe_unused]] const pi_device *device_list, const char *options,
[[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data),
[[maybe_unused]] void *user_data) {

assert(program != nullptr);
assert(num_devices == 1 || num_devices == 0);
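
For context on the build path above: with the CUDA driver API, "building" a program amounts to loading the PTX/cubin image into a CUmodule, from which kernels are later extracted by name. A hedged sketch of that flow with a hypothetical helper (error handling omitted):

#include <cuda.h>

// `image` stands for the device binary held by the pi_program;
// `kernel_name` is whatever the caller asks for later.
CUfunction load_kernel(const void *image, const char *kernel_name) {
  CUmodule module = nullptr;
  cuModuleLoadDataEx(&module, image, 0, nullptr, nullptr); // the "build"
  CUfunction func = nullptr;
  cuModuleGetFunction(&func, module, kernel_name);         // kernel lookup
  return func;
}
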
@@ -3609,10 +3610,11 @@ pi_result cuda_piProgramCreate(pi_context, const void *, size_t, pi_program *) {
/// Note: Only supports one device
///
pi_result cuda_piProgramCreateWithBinary(
pi_context context, pi_uint32 num_devices, const pi_device *device_list,
const size_t *lengths, const unsigned char **binaries,
size_t num_metadata_entries, const pi_device_binary_property *metadata,
pi_int32 *binary_status, pi_program *program) {
pi_context context, [[maybe_unused]] pi_uint32 num_devices,
[[maybe_unused]] const pi_device *device_list, const size_t *lengths,
const unsigned char **binaries, size_t num_metadata_entries,
const pi_device_binary_property *metadata, pi_int32 *binary_status,
pi_program *program) {
// Ignore unused parameter
(void)binary_status;

@@ -3686,13 +3688,12 @@ pi_result cuda_piProgramGetInfo(pi_program program, pi_program_info param_name,
/// programs.
/// \TODO Implement linker options, requires mapping of OpenCL to CUDA
///
pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
const pi_device *device_list, const char *options,
pi_uint32 num_input_programs,
const pi_program *input_programs,
void (*pfn_notify)(pi_program program,
void *user_data),
void *user_data, pi_program *ret_program) {
pi_result cuda_piProgramLink(
pi_context context, [[maybe_unused]] pi_uint32 num_devices,
[[maybe_unused]] const pi_device *device_list, const char *options,
pi_uint32 num_input_programs, const pi_program *input_programs,
[[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data),
[[maybe_unused]] void *user_data, pi_program *ret_program) {

assert(ret_program != nullptr);
assert(num_devices == 1 || num_devices == 0);
@@ -3751,10 +3752,12 @@ pi_result cuda_piProgramLink(pi_context context, pi_uint32 num_devices,
/// \TODO Implement asynchronous compilation
///
pi_result cuda_piProgramCompile(
pi_program program, pi_uint32 num_devices, const pi_device *device_list,
const char *options, pi_uint32 num_input_headers,
pi_program program, [[maybe_unused]] pi_uint32 num_devices,
[[maybe_unused]] const pi_device *device_list, const char *options,
[[maybe_unused]] pi_uint32 num_input_headers,
const pi_program *input_headers, const char **header_include_names,
void (*pfn_notify)(pi_program program, void *user_data), void *user_data) {
[[maybe_unused]] void (*pfn_notify)(pi_program program, void *user_data),
[[maybe_unused]] void *user_data) {
// Ignore unused parameters
(void)header_include_names;
(void)input_headers;
@@ -5178,9 +5181,10 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj,

/// USM: Implements USM Host allocations using CUDA Pinned Memory
///
pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
pi_usm_mem_properties *properties, size_t size,
pi_uint32 alignment) {
pi_result
cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,
[[maybe_unused]] pi_usm_mem_properties *properties,
size_t size, [[maybe_unused]] pi_uint32 alignment) {
assert(result_ptr != nullptr);
assert(context != nullptr);
assert(properties == nullptr || *properties == 0);
@@ -5200,10 +5204,11 @@ pi_result cuda_piextUSMHostAlloc(void **result_ptr, pi_context context,

/// USM: Implements USM device allocations using a normal CUDA device pointer
///
pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
pi_device device,
pi_usm_mem_properties *properties,
size_t size, pi_uint32 alignment) {
pi_result
cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,
[[maybe_unused]] pi_device device,
[[maybe_unused]] pi_usm_mem_properties *properties,
size_t size, [[maybe_unused]] pi_uint32 alignment) {
assert(result_ptr != nullptr);
assert(context != nullptr);
assert(device != nullptr);
Expand All @@ -5224,10 +5229,11 @@ pi_result cuda_piextUSMDeviceAlloc(void **result_ptr, pi_context context,

/// USM: Implements USM Shared allocations using CUDA Managed Memory
///
pi_result cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
pi_device device,
pi_usm_mem_properties *properties,
size_t size, pi_uint32 alignment) {
pi_result
cuda_piextUSMSharedAlloc(void **result_ptr, pi_context context,
[[maybe_unused]] pi_device device,
[[maybe_unused]] pi_usm_mem_properties *properties,
size_t size, [[maybe_unused]] pi_uint32 alignment) {
assert(result_ptr != nullptr);
assert(context != nullptr);
assert(device != nullptr);
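
The comments on the three USM entry points above spell out the mapping to CUDA allocation kinds. A minimal sketch of that mapping using the driver API (hypothetical wrappers; the real entry points also set the active context and validate properties and alignment):

#include <cuda.h>
#include <cstddef>

// Host USM: pinned (page-locked) host memory.
CUresult usm_host_alloc(void **ptr, std::size_t size) {
  return cuMemAllocHost(ptr, size);
}

// Device USM: an ordinary device allocation.
CUresult usm_device_alloc(CUdeviceptr *ptr, std::size_t size) {
  return cuMemAlloc(ptr, size);
}

// Shared USM: managed memory that migrates between host and device.
CUresult usm_shared_alloc(CUdeviceptr *ptr, std::size_t size) {
  return cuMemAllocManaged(ptr, size, CU_MEM_ATTACH_GLOBAL);
}
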
@@ -5568,7 +5574,8 @@ pi_result cuda_piextUSMEnqueueMemcpy2D(pi_queue queue, pi_bool blocking,

// Determine the direction of copy using cuPointerGetAttribute
// for both the src_ptr and dst_ptr
CUDA_MEMCPY2D cpyDesc = {0};
CUDA_MEMCPY2D cpyDesc;
memset(&cpyDesc, 0, sizeof(cpyDesc));

getUSMHostOrDevicePtr(src_ptr, &cpyDesc.srcMemoryType, &cpyDesc.srcDevice,
&cpyDesc.srcHost);
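
Two details from the hunk above are worth spelling out: the CUDA_MEMCPY2D descriptor is now zeroed with memset (an aggregate initializer such as `= {0}` can trip missing-field-initializer warnings in -Werror builds), and the copy direction is derived by querying each pointer's memory type. A hedged sketch of that query using hypothetical helper names (error handling omitted):

#include <cuda.h>
#include <cstring>

// Classify a USM pointer as host- or device-resident. For plain host
// pointers the query can fail; treating that as host memory is the
// usual fallback.
static CUmemorytype memory_type_of(const void *ptr) {
  CUmemorytype type = CU_MEMORYTYPE_HOST;
  cuPointerGetAttribute(&type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                        reinterpret_cast<CUdeviceptr>(ptr));
  return type;
}

static void init_copy_desc(CUDA_MEMCPY2D &desc, const void *src, void *dst) {
  std::memset(&desc, 0, sizeof(desc)); // zero every field, as in the hunk above
  desc.srcMemoryType = memory_type_of(src);
  desc.dstMemoryType = memory_type_of(dst);
}
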
3 changes: 2 additions & 1 deletion sycl/plugins/cuda/pi_cuda.hpp
@@ -561,7 +561,8 @@ struct _pi_queue {
};
{
unsigned int size = static_cast<unsigned int>(compute_streams_.size());
std::lock_guard compute_sync_guard(compute_stream_sync_mutex_);
std::lock_guard<std::mutex> compute_sync_guard(
compute_stream_sync_mutex_);
std::lock_guard<std::mutex> compute_guard(compute_stream_mutex_);
unsigned int start = last_sync_compute_streams_;
unsigned int end = num_compute_streams_ < size
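
On the pi_cuda.hpp hunk above: both spellings lock the same mutex; writing out the template argument simply avoids relying on class template argument deduction, which some warning setups (for example clang's -Wctad-maybe-unsupported) flag under -Werror. For illustration only:

#include <mutex>

std::mutex m;

void with_ctad()    { std::lock_guard guard(m); }             // C++17 CTAD
void without_ctad() { std::lock_guard<std::mutex> guard(m); } // explicit form
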
4 changes: 2 additions & 2 deletions sycl/plugins/cuda/tracing.cpp
@@ -35,8 +35,8 @@ constexpr int GMajVer = 0;
constexpr int GMinVer = 1;

#ifdef XPTI_ENABLE_INSTRUMENTATION
static void cuptiCallback(void *userdata, CUpti_CallbackDomain,
CUpti_CallbackId CBID, const void *CBData) {
static void cuptiCallback(void *, CUpti_CallbackDomain, CUpti_CallbackId CBID,
const void *CBData) {
if (xptiTraceEnabled()) {
const auto *CBInfo = static_cast<const CUpti_CallbackData *>(CBData);

(Diffs for the remaining two changed files did not load and are not shown.)
