diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 32a7814fa1c5c..fdaaf2f664fc0 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -44,6 +44,7 @@ jobs: target_devices: ext_oneapi_level_zero:gpu reset_gpu: true tests_selector: e2e + extra_lit_opts: --param gpu-intel-gen12=True - name: Intel OCL GPU runner: '["Linux", "gen12"]' @@ -52,6 +53,7 @@ jobs: target_devices: opencl:gpu reset_gpu: true tests_selector: e2e + extra_lit_opts: --param gpu-intel-gen12=True - name: OCL CPU (AMD) runner: '["Linux", "amdgpu"]' @@ -116,6 +118,7 @@ jobs: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }} + extra_lit_opts: --param gpu-intel-gen12=True nightly_build_upload: name: Nightly Build Upload diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2f379b04b9a82..975585390a95d 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -11158,16 +11158,13 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, // Create a comma separated list to pass along to the linker wrapper. SmallString<256> LibList; - // TODO: TargetTriple should not be used here for creating linker wrapper - // options. It should also not be passed to the linker wrapper. llvm::Triple TargetTriple; auto ToolChainRange = C.getOffloadToolChains(); for (auto &I : llvm::make_range(ToolChainRange.first, ToolChainRange.second)) { const ToolChain *TC = I.second; - // TODO: Third party AOT support needs to be added in new offloading - // model. - if (TC->getTriple().isSPIROrSPIRV()) { + // Note: For AMD targets, we do not pass any SYCL device libraries. 
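+    // Only SPIR-V and NVPTX offload toolchains reach the device-library
+    // collection below, so only those targets have their SYCL device
+    // libraries forwarded to the linker wrapper.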
+ if (TC->getTriple().isSPIROrSPIRV() || TC->getTriple().isNVPTX()) { TargetTriple = TC->getTriple(); SmallVector SYCLDeviceLibs; bool IsSPIR = TargetTriple.isSPIROrSPIRV(); diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 29dfb4a55150c..d704c09a47e65 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -1113,18 +1113,24 @@ static std::pair constructFreeFunctionKernelName( SemaSYCL &SemaSYCLRef, const FunctionDecl *FreeFunc, MangleContext &MC) { SmallString<256> Result; llvm::raw_svector_ostream Out(Result); + std::string NewName; std::string StableName; - MC.mangleName(FreeFunc, Out); - std::string MangledName(Out.str()); - size_t StartNums = MangledName.find_first_of("0123456789"); - size_t EndNums = MangledName.find_first_not_of("0123456789", StartNums); - size_t NameLength = - std::stoi(MangledName.substr(StartNums, EndNums - StartNums)); - size_t NewNameLength = 14 /*length of __sycl_kernel_*/ + NameLength; - std::string NewName = MangledName.substr(0, StartNums) + - std::to_string(NewNameLength) + "__sycl_kernel_" + - MangledName.substr(EndNums); + // Handle extern "C" + if (FreeFunc->getLanguageLinkage() == CLanguageLinkage) { + const IdentifierInfo *II = FreeFunc->getIdentifier(); + NewName = "__sycl_kernel_" + II->getName().str(); + } else { + MC.mangleName(FreeFunc, Out); + std::string MangledName(Out.str()); + size_t StartNums = MangledName.find_first_of("0123456789"); + size_t EndNums = MangledName.find_first_not_of("0123456789", StartNums); + size_t NameLength = + std::stoi(MangledName.substr(StartNums, EndNums - StartNums)); + size_t NewNameLength = 14 /*length of __sycl_kernel_*/ + NameLength; + NewName = MangledName.substr(0, StartNums) + std::to_string(NewNameLength) + + "__sycl_kernel_" + MangledName.substr(EndNums); + } StableName = NewName; return {NewName, StableName}; } diff --git a/clang/test/Driver/Inputs/libsycl-complex.new.o b/clang/test/Driver/Inputs/libsycl-complex.new.o deleted file mode 100644 index c7501e92dc7a9..0000000000000 Binary files a/clang/test/Driver/Inputs/libsycl-complex.new.o and /dev/null differ diff --git a/clang/test/Driver/Inputs/libsycl-complex.o b/clang/test/Driver/Inputs/libsycl-complex.o deleted file mode 100644 index 6804fe73e39c6..0000000000000 Binary files a/clang/test/Driver/Inputs/libsycl-complex.o and /dev/null differ diff --git a/clang/test/Driver/Inputs/libsycl-crt.new.o b/clang/test/Driver/Inputs/libsycl-crt.new.o deleted file mode 100644 index 2fd53a58c4aa8..0000000000000 Binary files a/clang/test/Driver/Inputs/libsycl-crt.new.o and /dev/null differ diff --git a/clang/test/Driver/Inputs/libsycl-crt.o b/clang/test/Driver/Inputs/libsycl-crt.o deleted file mode 100644 index ae78bcb15b1ac..0000000000000 Binary files a/clang/test/Driver/Inputs/libsycl-crt.o and /dev/null differ diff --git a/clang/test/Driver/Inputs/test-sycl-aot-cpu.o b/clang/test/Driver/Inputs/test-sycl-aot-cpu.o deleted file mode 100644 index 506b31bfe832d..0000000000000 Binary files a/clang/test/Driver/Inputs/test-sycl-aot-cpu.o and /dev/null differ diff --git a/clang/test/Driver/Inputs/test-sycl-aot-gen.o b/clang/test/Driver/Inputs/test-sycl-aot-gen.o deleted file mode 100644 index 65da7bd036447..0000000000000 Binary files a/clang/test/Driver/Inputs/test-sycl-aot-gen.o and /dev/null differ diff --git a/clang/test/Driver/Inputs/test-sycl.o b/clang/test/Driver/Inputs/test-sycl.o deleted file mode 100644 index 4fbd50977f084..0000000000000 Binary files a/clang/test/Driver/Inputs/test-sycl.o and /dev/null differ diff 
--git a/clang/test/Driver/linker-wrapper-sycl-win.cpp b/clang/test/Driver/linker-wrapper-sycl-win.cpp index 50804716c1939..82d5fa1dcab48 100644 --- a/clang/test/Driver/linker-wrapper-sycl-win.cpp +++ b/clang/test/Driver/linker-wrapper-sycl-win.cpp @@ -1,40 +1,112 @@ // REQUIRES: system-windows /// Check for list of commands for standalone clang-linker-wrapper run for sycl -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj,libsycl-complex.new.obj -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s +// ------- +// Generate .o file as linker wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.o +// +// Generate .o file as SYCL device library file. +// +// RUN: touch %t.devicelib.cpp +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s // CHK-CMDS: "{{.*}}spirv-to-ir-wrapper.exe" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-NEXT: "{{.*}}llvm-link.exe" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-NEXT: "{{.*}}sycl-post-link.exe"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // CHK-CMDS-NEXT: "{{.*}}llvm-spirv.exe"{{.*}} LLVM_SPIRV_OPTIONS -o {{.*}} // CHK-CMDS-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc // CHK-CMDS-NEXT: "{{.*}}llc.exe" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc -// CHK-CMDS-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}test-sycl.o +// CHK-CMDS-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o /// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for Intel GPU) -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj,libsycl-complex.new.obj -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl-aot-gen.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-GEN %s +// ------- +// Generate .o file as linker wrapper input. 
+// +// RUN: %clang %s -fsycl -fsycl-targets=intel_gpu_pvc -c --offload-new-driver -o %t1.o +// +// Generate .o file as SYCL device library file. +// +// RUN: touch %t1.devicelib.cpp +// RUN: %clang %t1.devicelib.cpp -fsycl -fsycl-targets=intel_gpu_pvc -c --offload-new-driver -o %t1.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t1.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t1.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-GEN %s // CHK-CMDS-AOT-GEN: "{{.*}}spirv-to-ir-wrapper.exe" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link.exe" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}sycl-post-link.exe"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-spirv.exe"{{.*}} LLVM_SPIRV_OPTIONS -o {{.*}} -/// -DFOO1 and -DFOO2 are backend options parsed from device image. -/// These options were specified when the input fat binary (test-sycl-aot-gen.o) was compiled. -// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}ocloc{{.*}} -output_no_suffix -spirv_input -device pvc -device_options pvc -ze-intel-enable-auto-large-GRF-mode -DFOO1 -DFOO2 -output {{.*}} -file {{.*}} +// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}ocloc{{.*}} -output_no_suffix -spirv_input -device pvc -output {{.*}} -file {{.*}} // CHK-CMDS-AOT-GEN-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llc.exe" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc -// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}test-sycl-aot-gen.o +// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o /// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for Intel CPU) -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj,libsycl-complex.new.obj -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl-aot-cpu.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-CPU %s +// ------- +// Generate .o file as linker wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=spir64_x86_64 -c --offload-new-driver -o %t2.o +// +// Generate .o file as SYCL device library file. 
+// +// RUN: touch %t2.devicelib.cpp +// RUN: %clang %t2.devicelib.cpp -fsycl -fsycl-targets=spir64_x86_64 -c --offload-new-driver -o %t2.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t2.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t2.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-CPU %s // CHK-CMDS-AOT-CPU: "{{.*}}spirv-to-ir-wrapper.exe" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-link.exe" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}sycl-post-link.exe"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-spirv.exe"{{.*}} LLVM_SPIRV_OPTIONS -o {{.*}} -/// -DFOO1 and -DFOO2 are backend options parsed from device image. -/// These options were specified when the input fat binary (test-sycl-aot-cpu.o) was compiled. -// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}opencl-aot.exe"{{.*}} --device=cpu -DFOO1 -DFOO2 -o {{.*}} +// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}opencl-aot.exe"{{.*}} --device=cpu -o {{.*}} // CHK-CMDS-AOT-CPU-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llc.exe" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc -// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}test-sycl-aot-cpu.o +// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o + +/// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for NVPTX) +// ------- +// Generate .o file as linker wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_50 -c -nocudalib -fno-sycl-libspirv --offload-new-driver -o %t3.o +// +// Generate .o file as SYCL device library file. 
+// +// RUN: touch %t3.devicelib.cpp +// RUN: %clang %t3.devicelib.cpp -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_50 -nocudalib -fno-sycl-libspirv -c --offload-new-driver -o %t3.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t3.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t3.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-NV %s +// CHK-CMDS-AOT-NV: "{{.*}}spirv-to-ir-wrapper.exe" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}llvm-link.exe" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}sycl-post-link.exe"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}clang.exe"{{.*}} -o [[CLANGOUT:.*]] --target=nvptx64-nvidia-cuda -march={{.*}} +// CHK-CMDS-AOT-NV-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}llc.exe" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o + +/// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for AMD) +// ------- +// Generate .o file as linker wrapper input. 
+// +// RUN: %clang %s -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx803 -fgpu-rdc -nogpulib -fno-sycl-libspirv -c --offload-new-driver -o %t4.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t4.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-AMD %s +// CHK-CMDS-AOT-AMD: "{{.*}}spirv-to-ir-wrapper.exe" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}llvm-link.exe" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}sycl-post-link.exe"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[FIRSTLLVMLINKOUT]].bc +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}clang.exe"{{.*}} -o [[CLANGOUT:.*]] --target=amdgcn-amd-amdhsa -mcpu={{.*}} +// CHK-CMDS-AOT-AMD-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}llc.exe" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o diff --git a/clang/test/Driver/linker-wrapper-sycl.cpp b/clang/test/Driver/linker-wrapper-sycl.cpp index c976ea4b012d2..2c5ddf27e5aea 100644 --- a/clang/test/Driver/linker-wrapper-sycl.cpp +++ b/clang/test/Driver/linker-wrapper-sycl.cpp @@ -1,44 +1,116 @@ // REQUIRES: system-linux /// Check for list of commands for standalone clang-linker-wrapper run for sycl -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--triple=spir64" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s +// ------- +// Generate .o file as linker wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.o +// +// Generate .o file as SYCL device library file. 
+// +// RUN: touch %t.devicelib.cpp +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s // CHK-CMDS: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // CHK-CMDS-NEXT: "{{.*}}llvm-spirv"{{.*}} LLVM_SPIRV_OPTIONS -o {{.*}} // CHK-CMDS-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc -// CHK-CMDS-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc -// CHK-CMDS-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}test-sycl.o +// CHK-CMDS-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]] [[WRAPPEROUT]].bc +// CHK-CMDS-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o /// check for PIC for device wrap compilation when using -shared -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--triple=spir64" "--linker-path=/usr/bin/ld" -shared "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-SHARED %s +// RUN: clang-linker-wrapper -sycl-device-libraries=%t.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--triple=spir64" "--linker-path=/usr/bin/ld" -shared "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-SHARED %s // CHK-SHARED: "{{.*}}llc"{{.*}} -relocation-model=pic /// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for Intel GPU) -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl-aot-gen.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-GEN %s +// ------- +// Generate .o file as linker 
wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=intel_gpu_pvc -c --offload-new-driver -o %t1.o +// +// Generate .o file as SYCL device library file. +// +// RUN: touch %t1.devicelib.cpp +// RUN: %clang %t1.devicelib.cpp -fsycl -fsycl-targets=intel_gpu_pvc -c --offload-new-driver -o %t1.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t1.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t1.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-GEN %s // CHK-CMDS-AOT-GEN: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-spirv"{{.*}} LLVM_SPIRV_OPTIONS -o {{.*}} -/// -DFOO1 and -DFOO2 are backend options parsed from device image. -/// These options were specified when the input fat binary (test-sycl-aot-gen.o) was compiled. -// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}ocloc"{{.*}} -output_no_suffix -spirv_input -device pvc -device_options pvc -ze-intel-enable-auto-large-GRF-mode -DFOO1 -DFOO2 -output {{.*}} -file {{.*}} +// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}ocloc"{{.*}} -output_no_suffix -spirv_input -device pvc {{.*}} -output {{.*}} -file {{.*}} // CHK-CMDS-AOT-GEN-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc // CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc -// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}test-sycl-aot-gen.o +// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o /// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for Intel CPU) -// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl-aot-cpu.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-CPU %s +// ------- +// Generate .o file as linker wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=spir64_x86_64 -c --offload-new-driver -o %t2.o +// +// Generate .o file as SYCL device library file. 
+// +// RUN: touch %t2.devicelib.cpp +// RUN: %clang %t2.devicelib.cpp -fsycl -fsycl-targets=spir64_x86_64 -c --offload-new-driver -o %t2.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t2.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t2.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-CPU %s // CHK-CMDS-AOT-CPU: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings -// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llvm-spirv"{{.*}} LLVM_SPIRV_OPTIONS -o {{.*}} -/// -DFOO1 and -DFOO2 are backend options parsed from device image. -/// These options were specified when the input fat binary (test-sycl-aot-cpu.o) was compiled. -// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}opencl-aot"{{.*}} --device=cpu -DFOO1 -DFOO2 -o {{.*}} +// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}opencl-aot"{{.*}} --device=cpu -o {{.*}} // CHK-CMDS-AOT-CPU-NEXT: offload-wrapper: input: {{.*}}, output: [[WRAPPEROUT:.*]].bc // CHK-CMDS-AOT-CPU-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc -// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}/ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}test-sycl-aot-cpu.o +// CHK-CMDS-AOT-CPU-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]].o HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o + +/// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for NVPTX) +// ------- +// Generate .o file as linker wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_50 -c -nocudalib -fno-sycl-libspirv --offload-new-driver -o %t3.o +// +// Generate .o file as SYCL device library file. 
+// +// RUN: touch %t3.devicelib.cpp +// RUN: %clang %t3.devicelib.cpp -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_50 -nocudalib -fno-sycl-libspirv -c --offload-new-driver -o %t3.devicelib.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-device-libraries=%t3.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t3.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-NV %s +// CHK-CMDS-AOT-NV: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}clang"{{.*}} -o [[CLANGOUT:.*]] --target=nvptx64-nvidia-cuda -march={{.*}} +// CHK-CMDS-AOT-NV-NEXT: offload-wrapper: input: [[WRAPPERIN:.*]], output: [[WRAPPEROUT:.*]] +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]] [[WRAPPEROUT]] +// CHK-CMDS-AOT-NV-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o + +/// Check for list of commands for standalone clang-linker-wrapper run for sycl (AOT for AMD) +// ------- +// Generate .o file as linker wrapper input. 
+// +// RUN: %clang %s -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx803 -fgpu-rdc -nogpulib -fno-sycl-libspirv -c --offload-new-driver -o %t4.o +// +// Run clang-linker-wrapper test +// +// RUN: clang-linker-wrapper -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t4.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-AMD %s +// CHK-CMDS-AOT-AMD: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[FIRSTLLVMLINKOUT]].bc +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}clang"{{.*}} -o [[CLANGOUT:.*]] --target=amdgcn-amd-amdhsa -mcpu={{.*}} +// CHK-CMDS-AOT-AMD-NEXT: offload-wrapper: input: [[WRAPPERIN:.*]], output: [[WRAPPEROUT:.*]] +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]] [[WRAPPEROUT]] +// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o diff --git a/clang/test/Driver/sycl-linker-wrapper-image.cpp b/clang/test/Driver/sycl-linker-wrapper-image.cpp index 9de339420599f..fb81abd4b7ef0 100644 --- a/clang/test/Driver/sycl-linker-wrapper-image.cpp +++ b/clang/test/Driver/sycl-linker-wrapper-image.cpp @@ -1,12 +1,22 @@ // REQUIRES: system-linux // This test check wrapping of SYCL binaries in clang-linker-wrapper. - +// +// Generate .o file as linker wrapper input. +// // RUN: %clang -cc1 -fsycl-is-device -disable-llvm-passes -triple=spir64-unknown-unknown %s -emit-llvm-bc -o %t.device.bc // RUN: clang-offload-packager -o %t.fat --image=file=%t.device.bc,kind=sycl,triple=spir64-unknown-unknown // RUN: %clang -cc1 %s -triple=x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.fat -// RUN: clang-linker-wrapper --print-wrapped-module --host-triple=x86_64-unknown-linux-gnu \ -// RUN: -sycl-device-library-location=%S/Inputs -sycl-post-link-options="-split=auto -symbols -properties" \ -// RUN: %t.o -o %t.out 2>&1 --linker-path="/usr/bin/ld" | FileCheck %s +// +// Generate .o file as SYCL device library file. 
+// +// RUN: touch %t.devicelib.cpp +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.devicelib.o +// +// Run clang-linker-wrapper test +// +//// RUN: clang-linker-wrapper --print-wrapped-module --host-triple=x86_64-unknown-linux-gnu \ +// RUN: -sycl-device-libraries=%t.devicelib.o \ +// RUN: -sycl-post-link-options="-split=auto -symbols -properties" %t.o -o %t.out 2>&1 --linker-path="/usr/bin/ld" | FileCheck %s template __attribute__((sycl_kernel)) void kernel(const Func &func) { diff --git a/clang/test/Driver/sycl-post-link-options-win.cpp b/clang/test/Driver/sycl-post-link-options-win.cpp index d68d4881c9404..110dcfb498163 100644 --- a/clang/test/Driver/sycl-post-link-options-win.cpp +++ b/clang/test/Driver/sycl-post-link-options-win.cpp @@ -4,13 +4,20 @@ // RUN: -Xdevice-post-link -O0 %s 2>&1 \ // RUN: | FileCheck -check-prefix OPTIONS_POSTLINK_JIT_OLD %s // OPTIONS_POSTLINK_JIT_OLD: sycl-post-link{{.*}} "-O2" "-device-globals" "-properties" "-spec-const=native" "-split=auto" "-emit-only-kernels-as-entry-points" "-emit-param-info" "-symbols" "-emit-exported-symbols" "-emit-imported-symbols" "-split-esimd" "-lower-esimd" "-O0" - -// RUN: %clang -cc1 %s -triple x86_64-pc-windows-msvc -emit-obj -o %t.elf.o -// RUN: clang-offload-packager -o %t.out --image=file=%t.elf.o,kind=sycl,triple=spir64 -// RUN: %clang -cc1 %s -triple x86_64-pc-windows-msvc -emit-obj -o %t.o \ -// RUN: -fembed-offload-object=%t.out +// ------- +// Generate .o file as linker wrapper input. +// +// RUN: %clang %s -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.o +// +// Generate .o file as SYCL device library file. +// +// RUN: touch %t.devicelib.cpp +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.devicelib.o +// +// Run clang-linker-wrapper test +// // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-pc-windows-msvc \ -// RUN: -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj \ +// RUN: -sycl-device-libraries=%t.devicelib.o \ // RUN: --sycl-post-link-options="-O2 -device-globals -O0" \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck --check-prefix OPTIONS_POSTLINK_JIT_NEW %s // OPTIONS_POSTLINK_JIT_NEW: sycl-post-link{{.*}} -spec-const=native -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -emit-imported-symbols -split-esimd -lower-esimd -O2 -device-globals -O0 diff --git a/clang/test/Driver/sycl-post-link-options.cpp b/clang/test/Driver/sycl-post-link-options.cpp index 55a6b94feb86e..96134faba4ae2 100644 --- a/clang/test/Driver/sycl-post-link-options.cpp +++ b/clang/test/Driver/sycl-post-link-options.cpp @@ -4,13 +4,20 @@ // RUN: -Xdevice-post-link -O0 %s 2>&1 \ // RUN: | FileCheck -check-prefix OPTIONS_POSTLINK_JIT_OLD %s // OPTIONS_POSTLINK_JIT_OLD: sycl-post-link{{.*}} "-O2" "-device-globals" "-properties" "-spec-const=native" "-split=auto" "-emit-only-kernels-as-entry-points" "-emit-param-info" "-symbols" "-emit-exported-symbols" "-emit-imported-symbols" "-split-esimd" "-lower-esimd" "-O0" - -// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o -// RUN: clang-offload-packager -o %t.out --image=file=%t.elf.o,kind=sycl,triple=spir64 -// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ -// RUN: -fembed-offload-object=%t.out +// +// Generate .o file as linker wrapper input. 
+// +// RUN: %clang %s -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.o +// +// Generate .o file as SYCL device library file. +// +// RUN: touch %t.devicelib.cpp +// RUN: %clang %t.devicelib.cpp -fsycl -fsycl-targets=spir64-unknown-unknown -c --offload-new-driver -o %t.devicelib.o +// +// Run clang-linker-wrapper test +// // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \ -// RUN: -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.o \ +// RUN: -sycl-device-libraries=%t.devicelib.o \ // RUN: --sycl-post-link-options="-O2 -device-globals -properties -O0" \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck --check-prefix OPTIONS_POSTLINK_JIT_NEW %s // OPTIONS_POSTLINK_JIT_NEW: sycl-post-link{{.*}} -spec-const=native -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -emit-imported-symbols -split-esimd -lower-esimd -O2 -device-globals -properties -O0 diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index eb2dbdfc55a8d..279123e3254b8 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -469,12 +469,9 @@ namespace sycl { // The list of files and its location are passed from driver. static Error getSYCLDeviceLibs(SmallVector &DeviceLibFiles, const ArgList &Args) { - StringRef SYCLDeviceLibLoc; + StringRef SYCLDeviceLibLoc(""); if (Arg *A = Args.getLastArg(OPT_sycl_device_library_location_EQ)) SYCLDeviceLibLoc = A->getValue(); - else - return createStringError(inconvertibleErrorCode(), - "SYCL device library location is invalid."); if (Arg *A = Args.getLastArg(OPT_sycl_device_lib_EQ)) { if (A->getValues().size() == 0) return createStringError( @@ -494,10 +491,15 @@ static Error getSYCLDeviceLibs(SmallVector &DeviceLibFiles, return Error::success(); } -// This routine is used to convert SPIR-V input files into LLVM IR files. -// If input is not a SPIR-V file, then the original file is returned. -// TODO: Add a check to identify SPIR-V files and exit early if the input is -// not a SPIR-V file. +/// This routine is used to convert SPIR-V input files into LLVM IR files. +/// 'llvm-spirv -r' command is used for this purpose. +/// If input is not a SPIR-V file, then the original file is returned. +/// TODO: Add a check to identify SPIR-V files and exit early if the input is +/// not a SPIR-V file. +/// 'Filename' is the input file that could be a SPIR-V file. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// llvm-spirv tool. static Expected convertSPIRVToIR(StringRef Filename, const ArgList &Args) { Expected SPIRVToIRWrapperPath = findProgram( @@ -524,10 +526,10 @@ static Expected convertSPIRVToIR(StringRef Filename, return *TempFileOrErr; } -// Add any sycl-post-link options that rely on a specific Triple in addition -// to user supplied options. -// NOTE: Any changes made here should be reflected in the similarly named -// function in clang/lib/Driver/ToolChains/Clang.cpp. +/// Add any sycl-post-link options that rely on a specific Triple in addition +/// to user supplied options. +/// NOTE: Any changes made here should be reflected in the similarly named +/// function in clang/lib/Driver/ToolChains/Clang.cpp. 
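+/// 'PostLinkArgs' is the list of options passed to the sycl-post-link tool
+/// and is updated in place with the triple-specific options.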
static void getTripleBasedSYCLPostLinkOpts(const ArgList &Args, SmallVector &PostLinkArgs, @@ -589,7 +591,11 @@ getTripleBasedSYCLPostLinkOpts(const ArgList &Args, PostLinkArgs.push_back("-generate-device-image-default-spec-consts"); } -// Run sycl-post-link tool +/// Run sycl-post-link tool for SYCL offloading. +/// 'InputFiles' is the list of input LLVM IR files. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// sycl-post-link tool. static Expected runSYCLPostLink(ArrayRef InputFiles, const ArgList &Args) { Expected SYCLPostLinkPath = @@ -692,10 +698,10 @@ struct Table { } }; -// Add any llvm-spirv option that relies on a specific Triple in addition -// to user supplied options. -// NOTE: Any changes made here should be reflected in the similarly named -// function in clang/lib/Driver/ToolChains/Clang.cpp. +/// Add any llvm-spirv option that relies on a specific Triple in addition +/// to user supplied options. +/// NOTE: Any changes made here should be reflected in the similarly named +/// function in clang/lib/Driver/ToolChains/Clang.cpp. static void getTripleBasedSPIRVTransOpts(const ArgList &Args, SmallVector &TranslatorArgs, @@ -766,22 +772,18 @@ getTripleBasedSPIRVTransOpts(const ArgList &Args, TranslatorArgs.push_back(Args.MakeArgString(ExtArg)); } -// Run LLVM to SPIR-V translation. -static Expected runLLVMToSPIRVTranslation(StringRef InputTable, +/// Run LLVM to SPIR-V translation. +/// Converts 'File' from LLVM bitcode to SPIR-V format using llvm-spirv tool. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// llvm-spirv tool. +static Expected runLLVMToSPIRVTranslation(StringRef File, const ArgList &Args) { - Table LiveSYCLTable; Expected LLVMToSPIRVPath = findProgram("llvm-spirv", {getMainExecutable("llvm-spirv")}); if (!LLVMToSPIRVPath) return LLVMToSPIRVPath.takeError(); - if (Error Err = LiveSYCLTable.populateSYCLTable(InputTable)) - return std::move(Err); - auto InputFiles = LiveSYCLTable.getListOfIRFiles(); - // For dry runs, add a 'dummy' file to emit the command - if (DryRun) - InputFiles.push_back("dummy"); - SmallVector CmdArgs; CmdArgs.push_back(*LLVMToSPIRVPath); const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); @@ -793,30 +795,26 @@ static Expected runLLVMToSPIRVTranslation(StringRef InputTable, /* KeepEmpty = */ false); CmdArgs.push_back("-o"); - for (unsigned I = 0; I < InputFiles.size(); ++I) { - const auto &File = InputFiles[I]; - // Create a new file to write the translated file to. - auto TempFileOrErr = - createOutputFile(sys::path::filename(ExecutableName), "spv"); - if (!TempFileOrErr) - return TempFileOrErr.takeError(); + // Create a new file to write the translated file to. 
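+  // The translator output uses a '.spv' suffix and is returned to the caller
+  // for AOT compilation or wrapping.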
+ auto TempFileOrErr = + createOutputFile(sys::path::filename(ExecutableName), "spv"); + if (!TempFileOrErr) + return TempFileOrErr.takeError(); - CmdArgs.push_back(*TempFileOrErr); - CmdArgs.push_back(File); - if (Error Err = executeCommands(*LLVMToSPIRVPath, CmdArgs)) - return std::move(Err); - // Replace bc file in SYCL table with spv file (avoid for dry runs) - if (!DryRun) - LiveSYCLTable.Entries[I].IRFile = *TempFileOrErr; - // Pop back last two items - CmdArgs.pop_back_n(2); - } - auto Output = LiveSYCLTable.writeSYCLTableToFile(); - if (!Output) - return Output.takeError(); - return *Output; + CmdArgs.push_back(*TempFileOrErr); + CmdArgs.push_back(File); + if (Error Err = executeCommands(*LLVMToSPIRVPath, CmdArgs)) + return std::move(Err); + return *TempFileOrErr; } +/// Adds all AOT backend options required for SYCL AOT compilation step to +/// 'CmdArgs'. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate backend options required to be passed +/// into the SYCL AOT compilation step. +/// IsCPU is a bool used to direct option generation. If IsCPU is false, then +/// options are generated for AOT compilation targeting Intel GPUs. static void addBackendOptions(const ArgList &Args, SmallVector &CmdArgs, bool IsCPU) { StringRef OptC = @@ -831,10 +829,14 @@ static void addBackendOptions(const ArgList &Args, return; } -/// Run AOT compilation for Intel CPU -static Error runAOTCompileIntelCPU(SmallVector InputFiles, - SmallVector OutputFiles, - const ArgList &Args) { +/// Run AOT compilation for Intel CPU. +/// Calls opencl-aot tool to generate device code for Intel CPU backend. +/// 'InputFile' is the input SPIR-V file. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// SYCL AOT compilation step. +static Expected runAOTCompileIntelCPU(StringRef InputFile, + const ArgList &Args) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); StringRef Arch(Args.getLastArgValue(OPT_arch_EQ)); SmallVector CmdArgs; @@ -846,31 +848,27 @@ static Error runAOTCompileIntelCPU(SmallVector InputFiles, CmdArgs.push_back(*OpenCLAOTPath); CmdArgs.push_back("--device=cpu"); addBackendOptions(Args, CmdArgs, /* IsCPU */ true); - for (unsigned I = 0; I < InputFiles.size(); ++I) { - const auto &File = InputFiles[I]; - // Create a new file to write the translated file to. - auto TempFileOrErr = - createOutputFile(sys::path::filename(ExecutableName), "out"); - if (!TempFileOrErr) - return TempFileOrErr.takeError(); - CmdArgs.push_back("-o"); - CmdArgs.push_back(*TempFileOrErr); - CmdArgs.push_back(File); - if (Error Err = executeCommands(*OpenCLAOTPath, CmdArgs)) - return std::move(Err); - // Pop back last three items - CmdArgs.pop_back_n(3); - // Accumulate AOT output files (avoid for dry runs) - if (!DryRun) - OutputFiles[I] = *TempFileOrErr; - } - return Error::success(); + // Create a new file to write the translated file to. 
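+  // The opencl-aot output uses a '.out' suffix and eventually replaces the
+  // SPIR-V file recorded in the SYCL table entry for this split module.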
+ auto TempFileOrErr = + createOutputFile(sys::path::filename(ExecutableName), "out"); + if (!TempFileOrErr) + return TempFileOrErr.takeError(); + CmdArgs.push_back("-o"); + CmdArgs.push_back(*TempFileOrErr); + CmdArgs.push_back(InputFile); + if (Error Err = executeCommands(*OpenCLAOTPath, CmdArgs)) + return std::move(Err); + return *TempFileOrErr; } /// Run AOT compilation for Intel GPU -static Error runAOTCompileIntelGPU(SmallVector &InputFiles, - SmallVector &OutputFiles, - const ArgList &Args) { +/// Calls ocloc tool to generate device code for Intel GPU backend. +/// 'InputFile' is the input SPIR-V file. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// SYCL AOT compilation step. +static Expected runAOTCompileIntelGPU(StringRef InputFile, + const ArgList &Args) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); StringRef Arch(Args.getLastArgValue(OPT_arch_EQ)); SmallVector CmdArgs; @@ -888,54 +886,33 @@ static Error runAOTCompileIntelGPU(SmallVector &InputFiles, CmdArgs.push_back(Arch); } addBackendOptions(Args, CmdArgs, /* IsCPU */ false); - for (unsigned I = 0; I < InputFiles.size(); ++I) { - const auto &File = InputFiles[I]; - // Create a new file to write the translated file to. - auto TempFileOrErr = - createOutputFile(sys::path::filename(ExecutableName), "out"); - if (!TempFileOrErr) - return TempFileOrErr.takeError(); - CmdArgs.push_back("-output"); - CmdArgs.push_back(*TempFileOrErr); - CmdArgs.push_back("-file"); - CmdArgs.push_back(File); - if (Error Err = executeCommands(*OclocPath, CmdArgs)) - return std::move(Err); - // Pop back last four items - CmdArgs.pop_back_n(4); - // Accumulate AOT output files (avoid for dry runs) - if (!DryRun) - OutputFiles[I] = *TempFileOrErr; - } - return Error::success(); + // Create a new file to write the translated file to. + auto TempFileOrErr = + createOutputFile(sys::path::filename(ExecutableName), "out"); + if (!TempFileOrErr) + return TempFileOrErr.takeError(); + CmdArgs.push_back("-output"); + CmdArgs.push_back(*TempFileOrErr); + CmdArgs.push_back("-file"); + CmdArgs.push_back(InputFile); + if (Error Err = executeCommands(*OclocPath, CmdArgs)) + return std::move(Err); + return *TempFileOrErr; } -/// Run AOT compilation -static Expected runAOTCompile(StringRef InputTable, +/// Run AOT compilation for Intel CPU/GPU. +/// 'InputFile' is the input SPIR-V file. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// SYCL AOT compilation step. 
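+/// Intel GPU AOT compilation is handled by the ocloc tool and Intel CPU AOT
+/// compilation by the opencl-aot tool.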
+static Expected runAOTCompile(StringRef InputFile, const ArgList &Args) { const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ)); - Table LiveSYCLTable; - if (Error Err = LiveSYCLTable.populateSYCLTable(InputTable)) - return std::move(Err); - auto InputFiles = LiveSYCLTable.getListOfIRFiles(); - // For dry runs, add a 'dummy' file to emit the command - if (DryRun) - InputFiles.push_back("dummy"); if (Triple.isSPIRAOT()) { - SmallVector OutputFiles(InputFiles.size()); if (Triple.getSubArch() == llvm::Triple::SPIRSubArch_gen) - if (Error E = runAOTCompileIntelGPU(InputFiles, OutputFiles, Args)) - return E; + return runAOTCompileIntelGPU(InputFile, Args); if (Triple.getSubArch() == llvm::Triple::SPIRSubArch_x86_64) - if (Error E = runAOTCompileIntelCPU(InputFiles, OutputFiles, Args)) - return E; - if (!DryRun) - for (int I = 0; I < OutputFiles.size(); ++I) - LiveSYCLTable.Entries[I].IRFile = OutputFiles[I]; - auto Output = LiveSYCLTable.writeSYCLTableToFile(); - if (!Output) - return Output.takeError(); - return *Output; + return runAOTCompileIntelCPU(InputFile, Args); } return createStringError(inconvertibleErrorCode(), "Unsupported SYCL Triple and Arch"); @@ -1090,7 +1067,11 @@ Expected wrapSYCLBinariesFromFile(StringRef InputFile, return OutputFilePath; } -// Run llc +/// Run llc tool for SYCL offloading. +/// 'InputFile' is the wrapped input file. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// llc tool. static Expected runCompile(StringRef &InputFile, const ArgList &Args) { // Create a new file to write the output of llc to. @@ -1131,7 +1112,12 @@ static Expected runWrapperAndCompile(StringRef &InputFile, return *OutputFileOrErr; } -// Link all SYCL input files into one before adding device library files. +/// Link all SYCL device input files into one before adding device library +/// files. Device linking is performed using llvm-link tool. +/// 'InputFiles' is the list of all LLVM IR device input files. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// llvm-link tool. Expected linkDeviceInputFiles(SmallVectorImpl &InputFiles, const ArgList &Args) { llvm::TimeTraceScope TimeScope("SYCL LinkDeviceInputFiles"); @@ -1163,7 +1149,12 @@ Expected linkDeviceInputFiles(SmallVectorImpl &InputFiles, return *OutFileOrErr; } -// Link all device library files and input file into one. +/// Link all device library files and input file into one LLVM IR file. This +/// linking is performed using llvm-link tool. +/// 'InputFiles' is the list of all LLVM IR device input files. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// llvm-link tool. static Expected linkDeviceLibFiles(SmallVectorImpl &InputFiles, const ArgList &Args) { @@ -1193,6 +1184,13 @@ linkDeviceLibFiles(SmallVectorImpl &InputFiles, return *OutFileOrErr; } +/// This function is used to link all SYCL device input files into a single +/// LLVM IR file. This file is in turn linked with all SYCL device library +/// files. +/// 'InputFiles' is the list of all LLVM IR device input files. +/// 'Args' encompasses all arguments required for linking and wrapping device +/// code and will be parsed to generate options required to be passed into the +/// llvm-link tool. 
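+/// For NVPTX targets, the libclc and CUDA libdevice files passed via
+/// --sycl-nvptx-device-libraries are linked in as well; for AMD targets, no
+/// SYCL device libraries are linked.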
static Expected linkDevice(ArrayRef InputFiles, const ArgList &Args) { SmallVector InputFilesVec; @@ -1236,6 +1234,37 @@ static Expected linkDevice(ArrayRef InputFiles, << "Compatible SYCL device library binary not found\n"; } + // For NVPTX backend we need to also link libclc and CUDA libdevice. + if (Triple.isNVPTX()) { + if (Arg *A = Args.getLastArg(OPT_sycl_nvptx_device_lib_EQ)) { + if (A->getValues().size() == 0) + return createStringError( + inconvertibleErrorCode(), + "Number of device library files cannot be zero."); + for (StringRef Val : A->getValues()) { + SmallString<128> LibName(Val); + if (llvm::sys::fs::exists(LibName)) + ExtractedDeviceLibFiles.emplace_back(std::string(LibName)); + else + return createStringError( + inconvertibleErrorCode(), + std::string(LibName) + + " SYCL device library file for NVPTX is not found."); + } + } + } + + // Make sure that SYCL device library files are available. + // Note: For AMD targets, we do not pass any SYCL device libraries. + if (ExtractedDeviceLibFiles.empty()) { + // TODO: Add NVPTX when ready + if (Triple.isSPIROrSPIRV()) + return createStringError( + inconvertibleErrorCode(), + " SYCL device library file list cannot be empty."); + return *LinkedFile; + } + for (auto &File : ExtractedDeviceLibFiles) InputFilesVec.emplace_back(File); // second llvm-link step @@ -1391,10 +1420,7 @@ Expected linkDevice(ArrayRef InputFiles, "For SPIR targets, Linking is supported only for JIT compilations " "and AOT compilations for Intel CPUs/GPUs"); if (IsSYCLKind) { - auto SYCLPostLinkFile = sycl::runSYCLPostLink(InputFiles, Args); - if (!SYCLPostLinkFile) - return SYCLPostLinkFile.takeError(); - auto SPVFile = sycl::runLLVMToSPIRVTranslation(*SYCLPostLinkFile, Args); + auto SPVFile = sycl::runLLVMToSPIRVTranslation(InputFiles[0], Args); if (!SPVFile) return SPVFile.takeError(); // TODO(NOM6): Add AOT support for other targets @@ -1405,12 +1431,7 @@ Expected linkDevice(ArrayRef InputFiles, (NeedAOTCompile) ? sycl::runAOTCompile(*SPVFile, Args) : *SPVFile; if (!AOTFile) return AOTFile.takeError(); - // TODO(NOM7): Remove this call and use community flow for bundle/wrap - auto OutputFile = sycl::runWrapperAndCompile( - NeedAOTCompile ? *AOTFile : *SPVFile, Args); - if (!OutputFile) - return OutputFile.takeError(); - return *OutputFile; + return NeedAOTCompile ? *AOTFile : *SPVFile; } // Return empty file return StringRef(""); @@ -2047,41 +2068,57 @@ Expected> linkAndWrapDeviceFiles( auto LinkerArgs = getLinkerArgs(Input, BaseArgs); DenseSet ActiveOffloadKinds; bool HasSYCLOffloadKind = false; + bool HasNonSYCLOffloadKinds = false; for (const auto &File : Input) { if (File.getBinary()->getOffloadKind() != OFK_None) ActiveOffloadKinds.insert(File.getBinary()->getOffloadKind()); if (File.getBinary()->getOffloadKind() == OFK_SYCL) HasSYCLOffloadKind = true; + else + HasNonSYCLOffloadKinds = true; } - - // First link and remove all the input files containing bitcode. - SmallVector InputFiles; - if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs)) - return Err; - - // Write any remaining device inputs to an output file for the linker. - for (const OffloadFile &File : Input) { - auto FileNameOrErr = writeOffloadFile(File); - if (!FileNameOrErr) - return FileNameOrErr.takeError(); - InputFiles.emplace_back(*FileNameOrErr); - } - if (HasSYCLOffloadKind) { - // Link the remaining device files using the device linker for SYCL + SmallVector InputFiles; + // Write device inputs to an output file for the linker. 
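+    // These files are then device-linked (together with any SYCL device
+    // libraries), split by sycl-post-link, and each split image is
+    // translated or AOT-compiled and written back into the SYCL table before
+    // wrapping and compiling for the host link.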
+ for (const OffloadFile &File : Input) { + auto FileNameOrErr = writeOffloadFile(File); + if (!FileNameOrErr) + return FileNameOrErr.takeError(); + InputFiles.emplace_back(*FileNameOrErr); + } + // Link the input device files using the device linker for SYCL // offload. auto TmpOutputOrErr = sycl::linkDevice(InputFiles, LinkerArgs); if (!TmpOutputOrErr) return TmpOutputOrErr.takeError(); SmallVector InputFilesSYCL; InputFilesSYCL.emplace_back(*TmpOutputOrErr); - - auto SYCLOutputOrErr = - Args.hasArg(OPT_embed_bitcode) - ? InputFilesSYCL.front() - : linkDevice(InputFilesSYCL, LinkerArgs, true /* IsSYCLKind */); - if (!SYCLOutputOrErr) - return SYCLOutputOrErr.takeError(); + auto SYCLPostLinkFile = sycl::runSYCLPostLink(InputFilesSYCL, LinkerArgs); + if (!SYCLPostLinkFile) + return SYCLPostLinkFile.takeError(); + sycl::Table LiveSYCLTable; + if (Error Err = LiveSYCLTable.populateSYCLTable(*SYCLPostLinkFile)) + return std::move(Err); + auto PostLinkedFiles = LiveSYCLTable.getListOfIRFiles(); + if (DryRun) + PostLinkedFiles.push_back("dummy"); + for (unsigned I = 0; I < PostLinkedFiles.size(); ++I) { + SmallVector Files; + Files.emplace_back(PostLinkedFiles[I]); + auto LinkedFileFinalOrErr = + linkDevice(Files, LinkerArgs, true /* IsSYCLKind */); + if (!LinkedFileFinalOrErr) + return LinkedFileFinalOrErr.takeError(); + if (!DryRun) + LiveSYCLTable.Entries[I].IRFile = *LinkedFileFinalOrErr; + } + auto WrapperInput = LiveSYCLTable.writeSYCLTableToFile(); + if (!WrapperInput) + return WrapperInput.takeError(); + // TODO(NOM7): Remove this call and use community flow for bundle/wrap + auto OutputFile = sycl::runWrapperAndCompile(*WrapperInput, LinkerArgs); + if (!OutputFile) + return OutputFile.takeError(); // SYCL offload kind images are all ready to be sent to host linker. // TODO: Currently, device code wrapping for SYCL offload happens in a @@ -2089,41 +2126,52 @@ Expected> linkAndWrapDeviceFiles( // This will eventually be refactored to use the 'common' wrapping logic // that is used for other offload kinds. std::scoped_lock Guard(ImageMtx); - WrappedOutput.push_back(*SYCLOutputOrErr); + WrappedOutput.push_back(*OutputFile); } + if (HasNonSYCLOffloadKinds) { + // First link and remove all the input files containing bitcode. + SmallVector InputFiles; + if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs)) + return Err; + + // Write any remaining device inputs to an output file for the linker. + for (const OffloadFile &File : Input) { + auto FileNameOrErr = writeOffloadFile(File); + if (!FileNameOrErr) + return FileNameOrErr.takeError(); + InputFiles.emplace_back(*FileNameOrErr); + } - // Link the remaining device files using the device linker. - auto OutputOrErr = !Args.hasArg(OPT_embed_bitcode) - ? linkDevice(InputFiles, LinkerArgs) - : InputFiles.front(); - if (!OutputOrErr) - return OutputOrErr.takeError(); + // Link the remaining device files using the device linker. + auto OutputOrErr = !Args.hasArg(OPT_embed_bitcode) + ? linkDevice(InputFiles, LinkerArgs) + : InputFiles.front(); + if (!OutputOrErr) + return OutputOrErr.takeError(); + // Store the offloading image for each linked output file. + for (OffloadKind Kind : ActiveOffloadKinds) { + llvm::ErrorOr> FileOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(*OutputOrErr); + if (std::error_code EC = FileOrErr.getError()) { + if (DryRun) + FileOrErr = MemoryBuffer::getMemBuffer(""); + else + return createFileError(*OutputOrErr, EC); + } - // Store the offloading image for each linked output file. 
- for (OffloadKind Kind : ActiveOffloadKinds) { - if (Kind == OFK_SYCL) - continue; - llvm::ErrorOr> FileOrErr = - llvm::MemoryBuffer::getFileOrSTDIN(*OutputOrErr); - if (std::error_code EC = FileOrErr.getError()) { - if (DryRun) - FileOrErr = MemoryBuffer::getMemBuffer(""); - else - return createFileError(*OutputOrErr, EC); + std::scoped_lock Guard(ImageMtx); + OffloadingImage TheImage{}; + TheImage.TheImageKind = + Args.hasArg(OPT_embed_bitcode) ? IMG_Bitcode : IMG_Object; + TheImage.TheOffloadKind = Kind; + TheImage.StringData["triple"] = + Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ)); + TheImage.StringData["arch"] = + Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ)); + TheImage.Image = std::move(*FileOrErr); + + Images[Kind].emplace_back(std::move(TheImage)); } - - std::scoped_lock Guard(ImageMtx); - OffloadingImage TheImage{}; - TheImage.TheImageKind = - Args.hasArg(OPT_embed_bitcode) ? IMG_Bitcode : IMG_Object; - TheImage.TheOffloadKind = Kind; - TheImage.StringData["triple"] = - Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ)); - TheImage.StringData["arch"] = - Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ)); - TheImage.Image = std::move(*FileOrErr); - - Images[Kind].emplace_back(std::move(TheImage)); } return Error::success(); }); diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td index d819f862caf26..ac5b2a3b3dda1 100644 --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -150,6 +150,9 @@ def sycl_device_lib_EQ : CommaJoined<["--", "-"], "sycl-device-libraries=">, def sycl_device_library_location_EQ : Joined<["--", "-"], "sycl-device-library-location=">, Flags<[WrapperOnlyOption]>, HelpText<"Location of SYCL device library files">; +def sycl_nvptx_device_lib_EQ : CommaJoined<["--", "-"], "sycl-nvptx-device-libraries=">, + Flags<[WrapperOnlyOption]>, + HelpText<"A comma separated list of nvptx-specific device libraries that are linked during the device link.">; // Options for SYCL backends and linker options for shared libraries. 
def sycl_backend_compile_options_EQ : diff --git a/libdevice/fallback-complex-fp64.cpp b/libdevice/fallback-complex-fp64.cpp index 5ca69c0100962..5c4bc67924998 100644 --- a/libdevice/fallback-complex-fp64.cpp +++ b/libdevice/fallback-complex-fp64.cpp @@ -311,8 +311,8 @@ double __complex__ __devicelib_cacos(double __complex__ z) { return CMPLX(z_real, -z_imag); return CMPLX(z_real, z_real); } - if (__spirv_IsInf(z_real)) - return CMPLX(__pi / 2.0, -z_real); + if (__spirv_IsInf(z_imag)) + return CMPLX(__pi / 2.0, -z_imag); if (z_real == 0 && (z_imag == 0 || __spirv_IsNan(z_imag))) return CMPLX(__pi / 2.0, -z_imag); double __complex__ w = diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp index 89452429fe1fc..5d54ffc4f665d 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp @@ -126,6 +126,7 @@ void SPIRVDecorate::encode(spv_ostream &O) const { break; case internal::DecorationFuncParamDescINTEL: SPIRVDecorateFuncParamDescAttr::encodeLiterals(Encoder, Literals); + break; case internal::DecorationHostAccessINTEL: SPIRVDecorateHostAccessINTELLegacy::encodeLiterals(Encoder, Literals); break; @@ -163,6 +164,7 @@ void SPIRVDecorate::decode(std::istream &I) { break; case internal::DecorationFuncParamDescINTEL: SPIRVDecorateFuncParamDescAttr::decodeLiterals(Decoder, Literals); + break; case internal::DecorationHostAccessINTEL: SPIRVDecorateHostAccessINTELLegacy::decodeLiterals(Decoder, Literals); break; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 3a8cfd0e2cc52..e7015e94b0ed3 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1280,7 +1280,8 @@ struct FunctionStackPoisoner : public InstVisitor { static void ExtendSpirKernelArgs(Module &M, FunctionAnalysisManager &FAM) { SmallVector SpirFixupFuncs; for (Function &F : M) { - if (F.getCallingConv() == CallingConv::SPIR_KERNEL) { + if (F.getCallingConv() == CallingConv::SPIR_KERNEL && + F.hasFnAttribute(Attribute::SanitizeAddress)) { SpirFixupFuncs.emplace_back(&F); } } @@ -1296,9 +1297,7 @@ static void ExtendSpirKernelArgs(Module &M, FunctionAnalysisManager &FAM) { Types.push_back(I->getType()); } - // New argument type: uintptr_t as(1)*, as it's allocated in USM buffer, and - // it can also be treated as a pointer point to the base address of private - // shadow memory + // New argument: uintptr_t as(1)*, which is allocated in shared USM buffer Types.push_back(IntptrTy->getPointerTo(kSpirOffloadGlobalAS)); FunctionType *NewFTy = FunctionType::get(F->getReturnType(), Types, false); @@ -1413,6 +1412,21 @@ PreservedAnalyses AddressSanitizerPass::run(Module &M, ClUseStackSafety ? 
&MAM.getResult(M) : nullptr; if (Triple(M.getTargetTriple()).isSPIR()) { + bool HasESIMDKernel = false; + + // ESIMD kernel doesn't support noinline functions, so we can't + // support sanitizer for it + for (Function &F : M) + if (F.hasMetadata("sycl_explicit_simd")) { + F.removeFnAttr(Attribute::SanitizeAddress); + HasESIMDKernel = true; + } + + // FIXME: we can't check if the kernel is ESIMD kernel at UR, so we + // have to disable ASan completely in this case + if (HasESIMDKernel) + return PreservedAnalyses::all(); + ExtendSpirKernelArgs(M, FAM); } diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/sycl_esimd.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/sycl_esimd.ll new file mode 100644 index 0000000000000..1e1bc38c2f189 --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/sycl_esimd.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-stack=0 -asan-globals=0 -S | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +@__spirv_BuiltInGlobalInvocationId = external addrspace(1) constant <3 x i64> + +; Function Attrs: sanitize_address +define spir_kernel void @esimd_kernel() #0 !sycl_explicit_simd !1 { +entry: + %0 = load i64, ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @__spirv_BuiltInGlobalInvocationId, i64 8), align 8 + ret void +} +; CHECK-NOT: {{ sanitize_address }} + +attributes #0 = { sanitize_address } +!1 = !{} diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 77fab2ebe5fb1..3e5e46823dc7c 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -1736,15 +1736,23 @@ passed an invalid event. The new handler methods, and queue shortcuts, defined by link:../supported/sycl_ext_oneapi_enqueue_barrier.asciidoc[sycl_ext_oneapi_enqueue_barrier] can only be used in graph nodes created using the Record & Replay API, as -barriers rely on events to enforce dependencies. For barriers with an empty -wait list parameter, the semantics are that the barrier node being added to -will depend on all the existing graph leaf nodes, not only the leaf nodes -that were added from the queue being recorded. +barriers rely on events to enforce dependencies. A synchronous exception will be thrown with error code `invalid` if a user -tries to add them to a graph using the Explicit API. Empty nodes created with -the `node::depends_on_all_leaves` property can be used instead of barriers when -a user is building a graph with the explicit API. +tries to add a barrier command to a graph using the explicit API. Empty nodes +created with the `node::depends_on_all_leaves` property can be used instead of +barriers when a user is building a graph with the explicit API. + +The semantics of barriers are defined in `sycl_ext_oneapi_enqueue_barrier` for +a single command-queue, and correlate as follows to a graph that may contain +nodes that are recorded from multiple queues and/or added by the explicit API: + +* Barriers with an empty wait list parameter will only depend on the leaf nodes + that were added to the graph from the queue the barrier command is being + recorded from. 
+ +* The only commands which have an implicit dependency on the barrier command + are those recorded from the same queue the barrier command was submitted to. ==== sycl_ext_oneapi_memcpy2d diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md index a35d69cb2a0fa..e787194beedda 100644 --- a/sycl/doc/syclcompat/README.md +++ b/sycl/doc/syclcompat/README.md @@ -78,18 +78,24 @@ those to learn to use the library. SYCLcompat provides a `dim3` class akin to that of CUDA or HIP programming models. `dim3` encapsulates other languages iteration spaces that are -represented with coordinate letters (x, y, z). +represented with coordinate letters (x, y, z). In SYCL, the fastest-moving +dimension is the one with the highest index, e.g. in a SYCL 2D range iteration +space, there are two dimensions, 0 and 1, and 1 will be the one that "moves +faster". For CUDA/HIP, the convention is reversed: `x` is the dimension which +moves fastest. `syclcompat::dim3` follows this convention, so that +`syclcompat::dim3(32, 4)` is equivalent to `sycl::range<2>(4, 32)`, and +`syclcompat::dim3(32, 4, 2)` is equivalent to `sycl::range<3>(2, 4, 32)`. ```cpp namespace syclcompat { class dim3 { public: - const size_t x, y, z; + unsigned int x, y, z; dim3(const sycl::range<3> &r); dim3(const sycl::range<2> &r); dim3(const sycl::range<1> &r); - constexpr dim3(size_t x, size_t y = 1, size_t z = 1); + constexpr dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1); constexpr size_t size(); @@ -106,12 +112,10 @@ inline dim3 operator-(const dim3 &a, const dim3 &b); } // syclcompat ``` -In SYCL, the fastest-moving dimension is the one with the highest index, e.g. in -a SYCL 2D range iteration space, there are two dimensions, 0 and 1, and 1 will -be the one that "moves faster". The compatibility headers for SYCL offer a -number of convenience functions that help the mapping between xyz-based -coordinates to SYCL iteration spaces in the different scopes available. In -addition to the global range, the following helper functions are also provided: +The compatibility headers for SYCL offer a number of convenience functions that +help the mapping between xyz-based coordinates to SYCL iteration spaces in the +different scopes available. In addition to the global range, the following +helper functions are also provided: ``` c++ namespace syclcompat { diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 3f1bb28268d39..50b385f3f0cab 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -209,8 +209,10 @@ enum PropKind : uint32_t { CallsIndirectly = 68, InputDataPlacement = 69, OutputDataPlacement = 70, + IncludeFiles = 71, + RegisteredKernelNames = 72, // PropKindSize must always be the last value. - PropKindSize = 71, + PropKindSize = 73, }; struct property_key_base_tag {}; diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 0d3e757c175b0..61b23ffd707d5 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -466,7 +466,7 @@ class __SYCL_EXPORT handler { /// \param Queue is a SYCL queue. /// \param IsHost indicates if this handler is created for SYCL host device. /// TODO: Unused. Remove with ABI break. - handler(std::shared_ptr Queue, bool IsHost); + handler(std::shared_ptr Queue, bool /*Unused*/); /// Constructs SYCL handler from the associated queue and the submission's /// primary and secondary queue. 
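Aside: a small usage sketch of the `syclcompat::dim3` mapping documented in the README hunk above (x is the fastest-moving CUDA/HIP dimension, which lands in the highest SYCL index). It assumes a SYCL compiler with the syclcompat headers on the include path; the header name follows the `dims.hpp` file touched later in this patch.

```cpp
#include <cassert>
#include <sycl/sycl.hpp>
#include <syclcompat/dims.hpp>

int main() {
  // CUDA/HIP ordering: x moves fastest; sycl::range puts the fastest-moving
  // dimension last, so the components are reversed between the two.
  syclcompat::dim3 block(32, 4, 2);
  sycl::range<3> range(2, 4, 32); // the equivalent iteration space

  syclcompat::dim3 from_range(range); // constructed with the reversed order
  assert(from_range.x == 32 && from_range.y == 4 && from_range.z == 2);
  assert(block.size() == range.size()); // 256 work-items either way
  return 0;
}
```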
@@ -476,11 +476,11 @@ class __SYCL_EXPORT handler { /// \param PrimaryQueue is the primary SYCL queue of the submission. /// \param SecondaryQueue is the secondary SYCL queue of the submission. This /// is null if no secondary queue is associated with the submission. - /// \param IsHost indicates if this handler is created for SYCL host device. /// TODO: Unused. Remove with ABI break. handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, bool IsHost); + std::shared_ptr SecondaryQueue, + bool /* Unused */); /// Constructs SYCL handler from queue. /// @@ -488,8 +488,8 @@ class __SYCL_EXPORT handler { /// \param IsHost indicates if this handler is created for SYCL host device. /// \param CallerNeedsEvent indicates if the event resulting from this handler /// is needed by the caller. - handler(std::shared_ptr Queue, bool IsHost, - bool CallerNeedsEvent); + handler(std::shared_ptr Queue, + bool /* ABI break: remove */, bool CallerNeedsEvent); /// Constructs SYCL handler from the associated queue and the submission's /// primary and secondary queue. @@ -504,8 +504,8 @@ class __SYCL_EXPORT handler { /// is needed by the caller. handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, bool IsHost, - bool CallerNeedsEvent); + std::shared_ptr SecondaryQueue, + bool /* ABI break: remove */, bool CallerNeedsEvent); /// Constructs SYCL handler from Graph. /// @@ -644,7 +644,7 @@ class __SYCL_EXPORT handler { ~handler() = default; // TODO: Private and unusued. Remove when ABI break is allowed. - bool is_host() { return MIsHost; } + bool is_host() { return false; } #ifdef __SYCL_DEVICE_ONLY__ // In device compilation accessor isn't inherited from host base classes, so @@ -923,12 +923,6 @@ class __SYCL_EXPORT handler { detail::KernelLambdaHasKernelHandlerArgT::value; - if (IsCallableWithKernelHandler && MIsHost) { - throw sycl::feature_not_supported( - "kernel_handler is not yet supported by host device.", - PI_ERROR_INVALID_OPERATION); - } - KernelType *KernelPtr = ResetHostKernel(KernelFunc); @@ -1077,8 +1071,7 @@ class __SYCL_EXPORT handler { std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool> copyAccToAccHelper(accessor Src, accessor Dst) { - if (!MIsHost && - IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range())) + if (IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range())) return false; range<1> LinearizedRange(Src.size()); @@ -1100,23 +1093,19 @@ class __SYCL_EXPORT handler { /// /// \param Src is a source SYCL accessor. /// \param Dst is a destination SYCL accessor. + // ABI break: to remove whole method template std::enable_if_t - copyAccToAccHelper(accessor Src, - accessor Dst) { - if (!MIsHost) - return false; - - single_task<__copyAcc2Acc>( - [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); }); - return true; + copyAccToAccHelper(accessor, + accessor) { + return false; } #ifndef __SYCL_DEVICE_ONLY__ + // ABI break: to remove whole method /// Copies the content of memory object accessed by Src into the memory /// pointed by Dst. /// @@ -1136,6 +1125,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies 1 element accessed by 0-dimensional accessor Src into the memory /// pointed by Dst. /// @@ -1153,6 +1143,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies the memory pointed by Src into the memory accessed by Dst. /// /// \param Src is a pointer to source memory. 
@@ -1170,6 +1161,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies 1 element pointed by Src to memory accessed by 0-dimensional /// accessor Dst. /// @@ -2282,7 +2274,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(range<1>{1}); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2319,7 +2311,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(false); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2359,7 +2351,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(false); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2398,7 +2390,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(true); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2725,14 +2717,6 @@ class __SYCL_EXPORT handler { "Invalid accessor target for the copy method."); static_assert(isValidModeForSourceAccessor(AccessMode), "Invalid accessor mode for the copy method."); -#ifndef __SYCL_DEVICE_ONLY__ - if (MIsHost) { - // TODO: Temporary implementation for host. Should be handled by memory - // manager. - copyAccToPtrHost(Src, Dst); - return; - } -#endif setType(detail::CG::CopyAccToPtr); detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src; @@ -2769,14 +2753,7 @@ class __SYCL_EXPORT handler { "Invalid accessor mode for the copy method."); // TODO: Add static_assert with is_device_copyable when vec is // device-copyable. -#ifndef __SYCL_DEVICE_ONLY__ - if (MIsHost) { - // TODO: Temporary implementation for host. Should be handled by memory - // manager. - copyPtrToAccHost(Src, Dst); - return; - } -#endif + setType(detail::CG::CopyPtrToAcc); detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst; @@ -2890,8 +2867,6 @@ class __SYCL_EXPORT handler { fill(accessor Dst, const T &Pattern) { - assert(!MIsHost && "fill() should no longer be callable on a host device."); - if (Dst.is_placeholder()) checkIfPlaceholderIsBoundToHandler(Dst); @@ -3429,7 +3404,7 @@ class __SYCL_EXPORT handler { /// Storage for the CG created when handling graph nodes added explicitly. 
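Aside: the handler.hpp hunks above drop the host-device special cases from `copy()` and `fill()`, so both entry points now always go through the memory manager. User-facing usage is unchanged; the following is a minimal sketch of the affected calls using standard SYCL 2020 API, with an arbitrary buffer size and fill value.

```cpp
#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue q;
  std::vector<int> host(1024, 0);
  sycl::buffer<int, 1> buf(sycl::range<1>(1024));

  // fill() on an accessor: no separate host path remains after this patch.
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc(buf, cgh, sycl::write_only);
    cgh.fill(acc, 42);
  });
  // copy(accessor -> pointer): likewise handled by the memory manager.
  q.submit([&](sycl::handler &cgh) {
    sycl::accessor acc(buf, cgh, sycl::read_only);
    cgh.copy(acc, host.data());
  });
  q.wait();
  return host[0] == 42 ? 0 : 1;
}
```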
std::unique_ptr MGraphNodeCG; - bool MIsHost = false; + bool MIsHost = false; // ABI break: to remove detail::code_location MCodeLoc = {}; bool MIsFinalized = false; diff --git a/sycl/include/sycl/kernel_bundle.hpp b/sycl/include/sycl/kernel_bundle.hpp index 5bba4735561a2..99663183ad8d3 100644 --- a/sycl/include/sycl/kernel_bundle.hpp +++ b/sycl/include/sycl/kernel_bundle.hpp @@ -814,6 +814,32 @@ build(const kernel_bundle &InputBundle, namespace ext::oneapi::experimental { +namespace detail { +struct create_bundle_from_source_props; +struct build_source_bundle_props; +} // namespace detail + +///////////////////////// +// PropertyT syclex::include_files +///////////////////////// +struct include_files + : detail::run_time_property_key { + include_files(); + include_files(const std::string &name, const std::string &content) { + record.emplace_back(std::make_pair(name, content)); + } + void add(const std::string &name, const std::string &content) { + record.emplace_back(std::make_pair(name, content)); + } + std::vector> record; +}; +using include_files_key = include_files; + +template <> +struct is_property_key_of + : std::true_type {}; + ///////////////////////// // PropertyT syclex::build_options ///////////////////////// @@ -826,8 +852,7 @@ struct build_options using build_options_key = build_options; template <> -struct is_property_key_of> +struct is_property_key_of : std::true_type {}; ///////////////////////// @@ -840,72 +865,132 @@ struct save_log : detail::run_time_property_key { using save_log_key = save_log; template <> -struct is_property_key_of> +struct is_property_key_of : std::true_type {}; ///////////////////////// -// syclex::is_source_kernel_bundle_supported +// PropertyT syclex::registered_kernel_names ///////////////////////// +struct registered_kernel_names + : detail::run_time_property_key { + std::vector kernel_names; + registered_kernel_names() {} + registered_kernel_names(const std::string &knArg) : kernel_names{knArg} {} + registered_kernel_names(const std::vector &knsArg) + : kernel_names(knsArg) {} + void add(const std::string &name) { kernel_names.push_back(name); } +}; +using registered_kernel_names_key = registered_kernel_names; + +template <> +struct is_property_key_of : std::true_type { +}; + +namespace detail { +// forward decls __SYCL_EXPORT bool is_source_kernel_bundle_supported(backend BE, source_language Language); -///////////////////////// -// syclex::create_kernel_bundle_from_source -///////////////////////// - __SYCL_EXPORT kernel_bundle -create_kernel_bundle_from_source(const context &SyclContext, - source_language Language, - const std::string &Source); +make_kernel_bundle_from_source( + const context &SyclContext, source_language Language, + const std::string &Source, + std::vector> IncludePairsVec); #if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) __SYCL_EXPORT kernel_bundle -create_kernel_bundle_from_source(const context &SyclContext, - source_language Language, - const std::vector &Bytes); +make_kernel_bundle_from_source( + const context &SyclContext, source_language Language, + const std::vector &Bytes, + std::vector> IncludePairsVec); #endif -///////////////////////// -// syclex::build(source_kb) => exe_kb -///////////////////////// -namespace detail { -// forward decl __SYCL_EXPORT kernel_bundle build_from_source(kernel_bundle &SourceKB, const std::vector &Devices, const std::vector &BuildOptions, - std::string *LogPtr); + std::string *LogPtr, + const std::vector &RegisteredKernelNames); } // namespace detail +///////////////////////// +// 
syclex::create_kernel_bundle_from_source +///////////////////////// +template < + typename PropertyListT = empty_properties_t, + typename = std::enable_if_t< + is_property_list_v && + detail::all_props_are_keys_of::value>> +kernel_bundle create_kernel_bundle_from_source( + const context &SyclContext, source_language Language, + const std::string &Source, PropertyListT props = {}) { + std::vector> IncludePairsVec; + if constexpr (props.template has_property()) { + IncludePairsVec = props.template get_property().record; + } + + return detail::make_kernel_bundle_from_source(SyclContext, Language, Source, + IncludePairsVec); +} + +#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) +template < + typename PropertyListT = empty_properties_t, + typename = std::enable_if_t< + is_property_list_v && + detail::all_props_are_keys_of::value>> +kernel_bundle create_kernel_bundle_from_source( + const context &SyclContext, source_language Language, + const std::vector &Bytes, PropertyListT props = {}) { + std::vector> IncludePairsVec; + if constexpr (props.template has_property()) { + IncludePairsVec = props.template get_property().record; + } + + return detail::make_kernel_bundle_from_source(SyclContext, Language, Bytes, + IncludePairsVec); +} +#endif + +///////////////////////// +// syclex::build(source_kb) => exe_kb +///////////////////////// + template && - detail::all_props_are_keys_of< - kernel_bundle, - PropertyListT>::value>> + detail::all_props_are_keys_of::value>> kernel_bundle build(kernel_bundle &SourceKB, const std::vector &Devices, PropertyListT props = {}) { std::vector BuildOptionsVec; std::string *LogPtr = nullptr; + std::vector RegisteredKernelNamesVec; if constexpr (props.template has_property()) { BuildOptionsVec = props.template get_property().opts; } if constexpr (props.template has_property()) { LogPtr = props.template get_property().log; } - return detail::build_from_source(SourceKB, Devices, BuildOptionsVec, LogPtr); + if constexpr (props.template has_property()) { + RegisteredKernelNamesVec = + props.template get_property().kernel_names; + } + return detail::build_from_source(SourceKB, Devices, BuildOptionsVec, LogPtr, + RegisteredKernelNamesVec); } template && - detail::all_props_are_keys_of< - kernel_bundle, - PropertyListT>::value>> + detail::all_props_are_keys_of::value>> kernel_bundle build(kernel_bundle &SourceKB, PropertyListT props = {}) { diff --git a/sycl/include/sycl/kernel_bundle_enums.hpp b/sycl/include/sycl/kernel_bundle_enums.hpp index 936b0de3879f9..fd53f8cd3a740 100644 --- a/sycl/include/sycl/kernel_bundle_enums.hpp +++ b/sycl/include/sycl/kernel_bundle_enums.hpp @@ -20,7 +20,7 @@ enum class bundle_state : char { namespace ext::oneapi::experimental { -enum class source_language : int { opencl = 0, spirv = 1 /* sycl, cuda */ }; +enum class source_language : int { opencl = 0, spirv = 1, sycl = 2 /* cuda */ }; // opencl versions struct cl_version { diff --git a/sycl/include/syclcompat/dims.hpp b/sycl/include/syclcompat/dims.hpp index 60f9927897304..1ae9ae62877f7 100644 --- a/sycl/include/syclcompat/dims.hpp +++ b/sycl/include/syclcompat/dims.hpp @@ -30,7 +30,7 @@ namespace syclcompat { class dim3 { public: - const size_t x, y, z; + unsigned int x, y, z; dim3(const sycl::range<3> &r) : x(r[2]), y(r[1]), z(r[0]) {} @@ -38,7 +38,8 @@ class dim3 { dim3(const sycl::range<1> &r) : x(r[0]), y(1), z(1) {} - constexpr dim3(size_t x, size_t y = 1, size_t z = 1) : x(x), y(y), z(z) {} + constexpr dim3(unsigned int x = 1, unsigned int y = 1, unsigned int z = 1) + : x(x), y(y), 
z(z) {} constexpr size_t size() const { return x * y * z; } diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index f915ef4e2cb8e..e72cdebf506c2 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -217,6 +217,7 @@ set(SYCL_COMMON_SOURCES "detail/jit_compiler.cpp" "detail/jit_device_binaries.cpp" "detail/kernel_compiler/kernel_compiler_opencl.cpp" + "detail/kernel_compiler/kernel_compiler_sycl.cpp" "detail/kernel_impl.cpp" "detail/kernel_program_cache.cpp" "detail/memory_manager.cpp" diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index 3273c4f3056c2..e4c7404c7b078 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -56,31 +56,20 @@ context::context(const std::vector &DeviceList, throw invalid_parameter_error("DeviceList is empty.", PI_ERROR_INVALID_VALUE); } - auto NonHostDeviceIter = std::find_if_not( - DeviceList.begin(), DeviceList.end(), [&](const device &CurrentDevice) { - return detail::getSyclObjImpl(CurrentDevice)->is_host(); - }); - if (NonHostDeviceIter == DeviceList.end()) - impl = std::make_shared(DeviceList[0], AsyncHandler, + + const auto &RefPlatform = + detail::getSyclObjImpl(DeviceList[0].get_platform())->getHandleRef(); + if (std::any_of(DeviceList.begin(), DeviceList.end(), + [&](const device &CurrentDevice) { + return (detail::getSyclObjImpl(CurrentDevice.get_platform()) + ->getHandleRef() != RefPlatform); + })) + throw invalid_parameter_error( + "Can't add devices across platforms to a single context.", + PI_ERROR_INVALID_DEVICE); + else + impl = std::make_shared(DeviceList, AsyncHandler, PropList); - else { - const device &NonHostDevice = *NonHostDeviceIter; - const auto &NonHostPlatform = - detail::getSyclObjImpl(NonHostDevice.get_platform())->getHandleRef(); - if (std::any_of(DeviceList.begin(), DeviceList.end(), - [&](const device &CurrentDevice) { - return ( - detail::getSyclObjImpl(CurrentDevice)->is_host() || - (detail::getSyclObjImpl(CurrentDevice.get_platform()) - ->getHandleRef() != NonHostPlatform)); - })) - throw invalid_parameter_error( - "Can't add devices across platforms to a single context.", - PI_ERROR_INVALID_DEVICE); - else - impl = std::make_shared(DeviceList, AsyncHandler, - PropList); - } } context::context(cl_context ClContext, async_handler AsyncHandler) { const auto &Plugin = sycl::detail::pi::getPlugin(); @@ -138,9 +127,8 @@ context::get_backend_info() const { cl_context context::get() const { return impl->get(); } bool context::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "context::is_host should not be called in implementation."); - return IsHost; + assert(false && "context::is_host should not be called in implementation."); + return false; } backend context::get_backend() const noexcept { return impl->getBackend(); } diff --git a/sycl/source/detail/backend_impl.hpp b/sycl/source/detail/backend_impl.hpp index ca23ceb48815c..0c160ed1920c4 100644 --- a/sycl/source/detail/backend_impl.hpp +++ b/sycl/source/detail/backend_impl.hpp @@ -15,7 +15,6 @@ inline namespace _V1 { namespace detail { template backend getImplBackend(const T &Impl) { - assert(!Impl->is_host() && "Cannot get the backend for host."); return Impl->getContextImplPtr()->getBackend(); } diff --git a/sycl/source/detail/bindless_images.cpp b/sycl/source/detail/bindless_images.cpp index 933b93bf0a025..a02fa47a0139c 100644 --- a/sycl/source/detail/bindless_images.cpp +++ b/sycl/source/detail/bindless_images.cpp @@ -862,10 +862,6 @@ __SYCL_EXPORT void *pitched_alloc_device(size_t 
*resultPitch, std::shared_ptr CtxImpl = sycl::detail::getSyclObjImpl(syclContext); - if (CtxImpl->is_host()) { - throw sycl::exception(sycl::make_error_code(sycl::errc::memory_allocation), - "Cannot allocate pitched memory on host!"); - } pi_context PiContext = CtxImpl->getHandleRef(); const sycl::detail::PluginPtr &Plugin = CtxImpl->getPlugin(); diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 835c732a40bf9..1795992594078 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -24,8 +24,7 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, sycl::detail::pi::PiEvent &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); - - assert(!(nullptr == HostPtr && BaseT::useHostPtr() && Context->is_host()) && + assert(!(nullptr == HostPtr && BaseT::useHostPtr() && !Context) && "Internal error. Allocating memory on the host " "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( @@ -71,10 +70,13 @@ buffer_impl::getNativeVector(backend BackendName) const { sycl::detail::pi::PiMem NativeMem = pi::cast(Cmd->getMemAllocation()); auto Ctx = Cmd->getWorkerContext(); - auto Platform = Ctx->getPlatformImpl(); // If Host Shared Memory is not supported then there is alloca for host that - // doesn't have platform - if (!Platform || (Platform->getBackend() != BackendName)) + // doesn't have context and platform + if (!Ctx) + continue; + PlatformImplPtr Platform = Ctx->getPlatformImpl(); + assert(Platform && "Platform must be present for device context"); + if (Platform->getBackend() != BackendName) continue; auto Plugin = Platform->getPlugin(); diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index add8bfa679b1a..910f731071837 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -33,9 +33,7 @@ context_impl::context_impl(const device &Device, async_handler AsyncHandler, : MOwnedByRuntime(true), MAsyncHandler(AsyncHandler), MDevices(1, Device), MContext(nullptr), MPlatform(detail::getSyclObjImpl(Device.get_platform())), - MPropList(PropList), - MHostContext(detail::getSyclObjImpl(Device)->is_host()), - MSupportBufferLocationByDevices(NotChecked) { + MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { MKernelProgramCache.setContextPtr(this); } @@ -43,7 +41,7 @@ context_impl::context_impl(const std::vector Devices, async_handler AsyncHandler, const property_list &PropList) : MOwnedByRuntime(true), MAsyncHandler(AsyncHandler), MDevices(Devices), - MContext(nullptr), MPlatform(), MPropList(PropList), MHostContext(false), + MContext(nullptr), MPlatform(), MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); std::vector DeviceIds; @@ -88,7 +86,7 @@ context_impl::context_impl(sycl::detail::pi::PiContext PiContext, bool OwnedByRuntime) : MOwnedByRuntime(OwnedByRuntime), MAsyncHandler(AsyncHandler), MDevices(DeviceList), MContext(PiContext), MPlatform(), - MHostContext(false), MSupportBufferLocationByDevices(NotChecked) { + MSupportBufferLocationByDevices(NotChecked) { if (!MDevices.empty()) { MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); } else { @@ -132,18 +130,11 @@ context_impl::context_impl(sycl::detail::pi::PiContext PiContext, } cl_context context_impl::get() const { - if (MHostContext) { - throw invalid_object_error( - "This 
instance of context doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_CONTEXT); - } // TODO catch an exception and put it to list of asynchronous exceptions getPlugin()->call(MContext); return pi::cast(MContext); } -bool context_impl::is_host() const { return MHostContext; } - context_impl::~context_impl() { try { // Free all events associated with the initialization of device globals. @@ -160,10 +151,8 @@ context_impl::~context_impl() { assert(LibProg.second && "Null program must not be kept in the cache"); getPlugin()->call(LibProg.second); } - if (!MHostContext) { - // TODO catch an exception and put it to list of asynchronous exceptions - getPlugin()->call(MContext); - } + // TODO catch an exception and put it to list of asynchronous exceptions + getPlugin()->call(MContext); } catch (std::exception &e) { __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~context_impl", e); } @@ -175,15 +164,10 @@ const async_handler &context_impl::get_async_handler() const { template <> uint32_t context_impl::get_info() const { - if (is_host()) - return 0; return get_context_info(this->getHandleRef(), this->getPlugin()); } template <> platform context_impl::get_info() const { - if (is_host()) - return createSyclObjFromImpl( - platform_impl::getHostPlatformImpl()); return createSyclObjFromImpl(MPlatform); } template <> @@ -199,8 +183,6 @@ context_impl::get_info() sycl::memory_order::relaxed, sycl::memory_order::acquire, sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet< sycl::memory_order, info::device::atomic_memory_order_capabilities>( @@ -216,8 +198,6 @@ context_impl::get_info() sycl::memory_scope::work_item, sycl::memory_scope::sub_group, sycl::memory_scope::work_group, sycl::memory_scope::device, sycl::memory_scope::system}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet< sycl::memory_scope, info::device::atomic_memory_scope_capabilities>( @@ -232,8 +212,6 @@ context_impl::get_info() const { sycl::memory_order::relaxed, sycl::memory_order::acquire, sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet( @@ -248,8 +226,6 @@ context_impl::get_info() const { sycl::memory_scope::work_item, sycl::memory_scope::sub_group, sycl::memory_scope::work_group, sycl::memory_scope::device, sycl::memory_scope::system}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet( diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index 87d5595c832be..9e3beffd4deeb 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -97,11 +97,6 @@ class context_impl { /// \return an instance of OpenCL cl_context. cl_context get() const; - /// Checks if this context is a host context. - /// - /// \return true if this context is a host context. - bool is_host() const; - /// Gets asynchronous exception handler. /// /// \return an instance of SYCL async_handler. @@ -190,7 +185,7 @@ class context_impl { } return false; - } else if (!is_host() && Device->getBackend() == backend::opencl) { + } else if (Device->getBackend() == backend::opencl) { // OpenCL does not support using descendants of context members within // that context yet. We make the exception in case it supports // component/composite devices. 
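Aside: returning to the kernel_bundle.hpp hunk earlier in this patch, the new `include_files` and `registered_kernel_names` properties (together with `source_language::sycl`) are consumed roughly as below. This is a sketch of the property plumbing only: it assumes a backend that supports the SYCL source language, and the virtual header name, source text, and kernel name are placeholders rather than a complete, buildable kernel.

```cpp
#include <string>
#include <sycl/sycl.hpp>

namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  sycl::queue q;

  // Placeholder source; a real program must supply kernel definitions that
  // the kernel-compiler support for source_language::sycl accepts.
  std::string header = "constexpr int kAnswer = 42;\n";
  std::string source = "/* SYCL source that defines my_kernel, omitted */";

  // The include_files property carries virtual-header name/content pairs.
  auto src_bundle = syclex::create_kernel_bundle_from_source(
      q.get_context(), syclex::source_language::sycl, source,
      syclex::properties{syclex::include_files{"answer.hpp", header}});

  // registered_kernel_names tells build() which kernels to register.
  auto exe_bundle = syclex::build(
      src_bundle,
      syclex::properties{syclex::registered_kernel_names{"my_kernel"}});
  (void)exe_bundle;
  return 0;
}
```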
@@ -272,7 +267,6 @@ class context_impl { sycl::detail::pi::PiContext MContext; PlatformImplPtr MPlatform; property_list MPropList; - bool MHostContext; CachedLibProgramsT MCachedLibPrograms; std::mutex MCachedLibProgramsMutex; mutable KernelProgramCache MKernelProgramCache; diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 8547a40d4b999..8d39a61027208 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -17,11 +17,6 @@ namespace sycl { inline namespace _V1 { namespace detail { -device_impl::device_impl() - : MIsHostDevice(true), MPlatform(platform_impl::getHostPlatformImpl()), - // assert is natively supported by host - MIsAssertFailSupported(true) {} - device_impl::device_impl(pi_native_handle InteropDeviceHandle, const PluginPtr &Plugin) : device_impl(InteropDeviceHandle, nullptr, nullptr, Plugin) {} @@ -39,8 +34,7 @@ device_impl::device_impl(sycl::detail::pi::PiDevice Device, device_impl::device_impl(pi_native_handle InteropDeviceHandle, sycl::detail::pi::PiDevice Device, PlatformImplPtr Platform, const PluginPtr &Plugin) - : MDevice(Device), MIsHostDevice(false), - MDeviceHostBaseTime(std::make_pair(0, 0)) { + : MDevice(Device), MDeviceHostBaseTime(std::make_pair(0, 0)) { bool InteroperabilityConstructor = false; if (Device == nullptr) { @@ -84,13 +78,11 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle, } device_impl::~device_impl() { - if (!MIsHostDevice) { - // TODO catch an exception and put it to list of asynchronous exceptions - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck(MDevice); - __SYCL_CHECK_OCL_CODE_NO_EXC(Err); - } + // TODO catch an exception and put it to list of asynchronous exceptions + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck(MDevice); + __SYCL_CHECK_OCL_CODE_NO_EXC(Err); } bool device_impl::is_affinity_supported( @@ -101,11 +93,6 @@ bool device_impl::is_affinity_supported( } cl_device_id device_impl::get() const { - if (MIsHostDevice) { - throw invalid_object_error( - "This instance of device doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_DEVICE); - } // TODO catch an exception and put it to list of asynchronous exceptions getPlugin()->call(MDevice); return pi::cast(getNative()); @@ -117,9 +104,6 @@ platform device_impl::get_platform() const { template typename Param::return_type device_impl::get_info() const { - if (is_host()) { - return get_device_info_host(); - } return get_device_info( MPlatform->getOrMakeDeviceImpl(MDevice, MPlatform)); } @@ -180,9 +164,6 @@ device_impl::get_backend_info() const { } bool device_impl::has_extension(const std::string &ExtensionName) const { - if (MIsHostDevice) - // TODO: implement extension management for host device; - return false; std::string AllExtensionNames = get_device_info_string(PiInfoCode::value); return (AllExtensionNames.find(ExtensionName) != std::string::npos); @@ -224,8 +205,6 @@ device_impl::create_sub_devices(const cl_device_partition_property *Properties, } std::vector device_impl::create_sub_devices(size_t ComputeUnits) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported(info::partition_property::partition_equally)) { throw sycl::feature_not_supported( "Device does not support " @@ -248,8 +227,6 @@ std::vector device_impl::create_sub_devices(size_t ComputeUnits) const { std::vector device_impl::create_sub_devices(const std::vector &Counts) 
const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported(info::partition_property::partition_by_counts)) { throw sycl::feature_not_supported( "Device does not support " @@ -291,8 +268,6 @@ device_impl::create_sub_devices(const std::vector &Counts) const { std::vector device_impl::create_sub_devices( info::partition_affinity_domain AffinityDomain) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported( info::partition_property::partition_by_affinity_domain)) { throw sycl::feature_not_supported( @@ -319,8 +294,6 @@ std::vector device_impl::create_sub_devices( } std::vector device_impl::create_sub_devices() const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported( info::partition_property::ext_intel_partition_by_cslice)) { throw sycl::feature_not_supported( @@ -354,7 +327,8 @@ bool device_impl::has(aspect Aspect) const { switch (Aspect) { case aspect::host: - return is_host(); + // Deprecated + return false; case aspect::cpu: return is_cpu(); case aspect::gpu: @@ -395,16 +369,14 @@ bool device_impl::has(aspect Aspect) const { case aspect::ext_intel_mem_channel: return get_info(); case aspect::usm_atomic_host_allocations: - return is_host() || - (get_device_info_impl:: get(MPlatform->getDeviceImpl(MDevice)) & PI_USM_CONCURRENT_ATOMIC_ACCESS); case aspect::usm_shared_allocations: return get_info(); case aspect::usm_atomic_shared_allocations: - return is_host() || - (get_device_info_impl:: get(MPlatform->getDeviceImpl(MDevice)) & PI_USM_CONCURRENT_ATOMIC_ACCESS); @@ -759,13 +731,6 @@ bool device_impl::has(aspect Aspect) const { return false; // This device aspect has not been implemented yet. } -std::shared_ptr device_impl::getHostDeviceImpl() { - static std::shared_ptr HostImpl = - std::make_shared(); - - return HostImpl; -} - bool device_impl::isAssertFailSupported() const { return MIsAssertFailSupported; } @@ -802,9 +767,6 @@ uint64_t device_impl::getCurrentDeviceTime() { uint64_t HostTime = duration_cast(steady_clock::now().time_since_epoch()) .count(); - if (MIsHostDevice) { - return HostTime; - } // To account for potential clock drift between host clock and device clock. 
// The value set is arbitrary: 200 seconds @@ -862,7 +824,8 @@ bool device_impl::isGetDeviceAndHostTimerSupported() { bool device_impl::extOneapiCanCompile( ext::oneapi::experimental::source_language Language) { try { - return is_source_kernel_bundle_supported(getBackend(), Language); + return sycl::ext::oneapi::experimental::detail:: + is_source_kernel_bundle_supported(getBackend(), Language); } catch (sycl::exception &) { return false; } diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 981b1e059a30e..a3344ecdd3870 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -64,48 +64,29 @@ class device_impl { /// For host device an exception is thrown /// /// \return non-constant reference to PI device - sycl::detail::pi::PiDevice &getHandleRef() { - if (MIsHostDevice) - throw invalid_object_error("This instance of device is a host instance", - PI_ERROR_INVALID_DEVICE); - - return MDevice; - } + sycl::detail::pi::PiDevice &getHandleRef() { return MDevice; } /// Get constant reference to PI device /// /// For host device an exception is thrown /// /// \return constant reference to PI device - const sycl::detail::pi::PiDevice &getHandleRef() const { - if (MIsHostDevice) - throw invalid_object_error("This instance of device is a host instance", - PI_ERROR_INVALID_DEVICE); - - return MDevice; - } - - /// Check if SYCL device is a host device - /// - /// \return true if SYCL device is a host device - bool is_host() const { return MIsHostDevice; } + const sycl::detail::pi::PiDevice &getHandleRef() const { return MDevice; } /// Check if device is a CPU device /// /// \return true if SYCL device is a CPU device - bool is_cpu() const { return (!is_host() && (MType == PI_DEVICE_TYPE_CPU)); } + bool is_cpu() const { return MType == PI_DEVICE_TYPE_CPU; } /// Check if device is a GPU device /// /// \return true if SYCL device is a GPU device - bool is_gpu() const { return (!is_host() && (MType == PI_DEVICE_TYPE_GPU)); } + bool is_gpu() const { return MType == PI_DEVICE_TYPE_GPU; } /// Check if device is an accelerator device /// /// \return true if SYCL device is an accelerator device - bool is_accelerator() const { - return (!is_host() && (MType == PI_DEVICE_TYPE_ACC)); - } + bool is_accelerator() const { return MType == PI_DEVICE_TYPE_ACC; } /// Return device type /// @@ -230,11 +211,6 @@ class device_impl { /// \return true if the SYCL device has the given feature. bool has(aspect Aspect) const; - /// Gets the single instance of the Host Device - /// - /// \return the host device_impl singleton - static std::shared_ptr getHostDeviceImpl(); - bool isAssertFailSupported() const; bool isRootDevice() const { return MRootDevice == nullptr; } @@ -327,7 +303,6 @@ class device_impl { sycl::detail::pi::PiDevice MDevice = 0; sycl::detail::pi::PiDeviceType MType; sycl::detail::pi::PiDevice MRootDevice = nullptr; - bool MIsHostDevice; PlatformImplPtr MPlatform; bool MIsAssertFailSupported = false; mutable std::string MDeviceName; diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 40477128a0533..4a1a410c6845d 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -1267,1042 +1267,6 @@ typename Param::return_type get_device_info(const DeviceImplPtr &Dev) { return get_device_info_impl::get(Dev); } -// SYCL host device information - -// Default template is disabled, all possible instantiations are -// specified explicitly. 
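Aside: the block deleted above implemented the host-device queries with a deleted primary template, so only explicitly specialized queries would compile. Below is a generic sketch of that pattern with hypothetical parameter tags and values; these are not the SYCL runtime's own types.

```cpp
#include <cstdint>
#include <string>

// Hypothetical query tags; the real code keys on SYCL info descriptors.
struct max_compute_units { using return_type = std::uint32_t; };
struct device_name       { using return_type = std::string; };

// Deleted primary template: any query without an explicit specialization
// below is rejected at compile time instead of silently misbehaving.
template <typename Param>
inline typename Param::return_type get_info_host() = delete;

template <> inline std::uint32_t get_info_host<max_compute_units>() {
  return 1; // "current value is the required minimum", as in the deleted code
}
template <> inline std::string get_info_host<device_name>() {
  return "SYCL host device";
}

int main() { return static_cast<int>(get_info_host<max_compute_units>()); }
```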
-template -inline typename Param::return_type get_device_info_host() = delete; - -template <> -inline std::vector get_device_info_host() { - return std::vector(); -} - -template <> -inline ext::oneapi::experimental::architecture -get_device_info_host() { - return ext::oneapi::experimental::architecture::x86_64; -} - -template <> -inline info::device_type get_device_info_host() { - return info::device_type::host; -} - -template <> inline uint32_t get_device_info_host() { - return 0x8086; -} - -template <> -inline uint32_t get_device_info_host() { - return std::thread::hardware_concurrency(); -} - -template <> -inline uint32_t get_device_info_host() { - return 3; -} - -template <> -inline range<1> get_device_info_host>() { - // current value is the required minimum - return {1}; -} - -template <> -inline range<2> get_device_info_host>() { - // current value is the required minimum - return {1, 1}; -} - -template <> -inline range<3> get_device_info_host>() { - // current value is the required minimum - return {1, 1, 1}; -} - -template <> -inline constexpr size_t get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>() { - // See handler.hpp for the maximum value : - return static_cast((std::numeric_limits::max)()); -} - -template <> -inline id<1> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<1>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit}; -} - -template <> -inline id<2> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<2>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit, Limit}; -} - -template <> -inline id<3> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<3>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit, Limit, Limit}; -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline constexpr size_t -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<1> -get_device_info_host() { - - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<1>>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<2> -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<2>>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<3> -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<3>>(); -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> 
-inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 0; -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Char); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Short); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Int); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Long); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Float); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Double); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Half); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getMaxClockFrequency(); -} - -template <> inline uint32_t get_device_info_host() { - return sizeof(void *) * 8; -} - -template <> -inline uint64_t get_device_info_host() { - return static_cast(OSUtil::getOSMemSize()); -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - const uint64_t a = get_device_info_host() / 4; - const uint64_t b = 128ul * 1024 * 1024; - return (a > b) ? a : b; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> inline bool get_device_info_host() { - return false; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_order::relaxed, memory_order::acquire, memory_order::release, - memory_order::acq_rel, memory_order::seq_cst}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_order::relaxed, memory_order::acquire, memory_order::release, - memory_order::acq_rel}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_scope::work_item, memory_scope::sub_group, - memory_scope::work_group, memory_scope::device, memory_scope::system}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_scope::work_item, memory_scope::sub_group, - memory_scope::work_group, memory_scope::device, memory_scope::system}; -} - -template <> -inline bool -get_device_info_host() { - return false; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 128; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 8; -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. 
Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height. Both are not known in this - // query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image width. Both are not known in this - // query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height/depth. Both are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image width/depth. Both are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. 
Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height/width, which are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // Not supported in SYCL - return 0; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 2048; -} - -template <> inline uint32_t get_device_info_host() { - // current value is the required minimum - return 16; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1024; -} - -template <> -inline uint32_t get_device_info_host() { - return 1024; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {info::fp_config::round_to_nearest, info::fp_config::inf_nan}; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {info::fp_config::fma, info::fp_config::round_to_nearest, - info::fp_config::round_to_zero, info::fp_config::round_to_inf, - info::fp_config::inf_nan, info::fp_config::denorm}; -} - -template <> -inline info::global_mem_cache_type -get_device_info_host() { - return info::global_mem_cache_type::read_write; -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getMemCacheLineSize(); -} - -template <> -inline uint64_t get_device_info_host() { - return PlatformUtil::getMemCacheSize(); -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - return 64 * 1024; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 8; -} - -template <> -inline info::local_mem_type -get_device_info_host() { - return info::local_mem_type::global; -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - return 32 * 1024; -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline size_t get_device_info_host() { - typedef std::ratio_divide - ns_period; - return ns_period::num / ns_period::den; -} - -template <> inline bool get_device_info_host() { - union { - uint16_t a; - uint8_t b[2]; - } u = {0x0100}; - - return u.b[1]; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline std::vector -get_device_info_host() { - return {info::execution_capability::exec_kernel}; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> -inline std::vector -get_device_info_host() { - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - return {}; -} - -template <> inline platform get_device_info_host() { - return createSyclObjFromImpl(platform_impl::getHostPlatformImpl()); -} - -template <> inline std::string get_device_info_host() { - return "SYCL host device"; -} - -template 
<> inline std::string get_device_info_host() { - return ""; -} - -template <> -inline std::string get_device_info_host() { - return "1.2"; -} - -template <> inline std::string get_device_info_host() { - return "FULL PROFILE"; -} - -template <> inline std::string get_device_info_host() { - return "1.2"; -} - -template <> -inline std::string get_device_info_host() { - return "not applicable"; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update when appropriate - return {}; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1024 * 1024; -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> inline device get_device_info_host() { - throw invalid_object_error( - "Partitioning to subdevices of the host device is not implemented", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update once subdevice creation is enabled - return 1; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subdevice creation is enabled - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subdevice creation is enabled - return {}; -} - -template <> -inline info::partition_property -get_device_info_host() { - return info::partition_property::no_partition; -} - -template <> -inline info::partition_affinity_domain -get_device_info_host() { - // TODO update once subdevice creation is enabled - return info::partition_affinity_domain::not_applicable; -} - -template <> -inline uint32_t get_device_info_host() { - // TODO update once subdevice creation is enabled - return 1; -} - -template <> -inline uint32_t get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool -get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> -inline std::string get_device_info_host() { - throw runtime_error( - "Backend version feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool -get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return false; -} - -// Specializations for intel extensions for Level Zero low-level -// detail device descriptors (not support on host). 
-template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the device ID is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline std::string -get_device_info_host() { - throw runtime_error( - "Obtaining the PCI address is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the EU count is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU SIMD width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t get_device_info_host() { - throw runtime_error( - "Obtaining the number of slices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error("Obtaining the number of subslices per slice is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU count per subslice is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the HW threads count per EU is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint64_t -get_device_info_host() { - throw runtime_error( - "Obtaining the maximum memory bandwidth is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline detail::uuid_type -get_device_info_host() { - throw runtime_error( - "Obtaining the device uuid is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline std::string get_device_info_host() { - throw runtime_error( - "Obtaining the PCI address is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the EU count is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU SIMD width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t get_device_info_host() { - throw runtime_error( - "Obtaining the number of slices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error("Obtaining the number of subslices per slice is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU count per subslice is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the HW threads count per EU is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// 
device::get_info() -template <> -inline uint64_t -get_device_info_host() { - throw runtime_error( - "Obtaining the maximum memory bandwidth is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO:Move to namespace ext::intel::info::device -template <> inline bool get_device_info_host() { - return false; -} - -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline detail::uuid_type -get_device_info_host() { - throw runtime_error( - "Obtaining the device uuid is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint64_t get_device_info_host() { - throw runtime_error( - "Obtaining the device free memory is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the device memory clock rate is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the device memory bus width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline int32_t -get_device_info_host() { - throw runtime_error( - "Obtaining max compute queue indices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host< - ext::codeplay::experimental::info::device::supports_fusion>() { - // No support for fusion on the host device. - return false; -} - -template <> -inline uint32_t get_device_info_host< - ext::codeplay::experimental::info::device::max_registers_per_work_group>() { - throw runtime_error("Obtaining the maximum number of available registers per " - "work-group is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::image_row_pitch_align>() { - throw runtime_error("Obtaining image pitch alignment is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_row_pitch>() { - throw runtime_error("Obtaining max image linear pitch is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::matrix_combinations>() { - throw runtime_error("Obtaining matrix combinations is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_width>() { - throw runtime_error("Obtaining max image linear width is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_height>() { - throw runtime_error("Obtaining max image linear height is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline float get_device_info_host< - ext::oneapi::experimental::info::device::mipmap_max_anisotropy>() { - throw runtime_error("Bindless image mipmaps are not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector get_device_info_host< - ext::oneapi::experimental::info::device::component_devices>() { - throw runtime_error("Host devices cannot be component devices.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline sycl::device 
get_device_info_host< - ext::oneapi::experimental::info::device::composite_device>() { - throw runtime_error("Host devices cannot be composite devices.", - PI_ERROR_INVALID_DEVICE); -} - -// Returns the list of all progress guarantees that can be requested for -// work_groups from the coordination level of root_group when using host device. -// First it calls getHostProgressGuarantee to get the strongest guarantee -// available and then calls getProgressGuaranteesUpTo to get a list of all -// guarantees that are either equal to the strongest guarantee or weaker than -// it. The next 5 definitions follow the same model but for different scopes. -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_group, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::sub_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::sub_group, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::sub_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::work_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::sub_group, - execution_scope::work_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::work_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::work_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::sub_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::sub_group)); -} - // Returns the list of all progress guarantees that can be requested for // work_groups from the coordination level of root_group when using the device // given by Dev. 
First it calls getProgressGuarantee to get the strongest diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 63d60e41ac7e8..097cef03b4d66 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -33,24 +33,13 @@ extern xpti::trace_event_data_t *GSYCLGraphEvent; #endif // If we do not yet have a context, use the default one. -void event_impl::ensureContextInitialized() { - if (MIsContextInitialized) +void event_impl::initContextIfNeeded() { + if (MContext || !MIsDefaultConstructed) return; - if (MHostEvent) { - QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue(); - this->setContextImpl(detail::getSyclObjImpl(HostQueue->get_context())); - } else { - const device SyclDevice; - this->setContextImpl(detail::queue_impl::getDefaultOrNew( - detail::getSyclObjImpl(SyclDevice))); - } -} - -bool event_impl::is_host() { - // Treat all devices that don't support interoperability as host devices to - // avoid attempts to call method get on such events. - return MHostEvent; + const device SyclDevice; + this->setContextImpl( + detail::queue_impl::getDefaultOrNew(detail::getSyclObjImpl(SyclDevice))); } event_impl::~event_impl() { @@ -63,7 +52,7 @@ event_impl::~event_impl() { } void event_impl::waitInternal(bool *Success) { - if (!MHostEvent && MEvent) { + if (!MIsHostEvent && MEvent) { // Wait for the native event sycl::detail::pi::PiResult Err = getPlugin()->call_nocheck(1, &MEvent); @@ -96,7 +85,7 @@ void event_impl::waitInternal(bool *Success) { } void event_impl::setComplete() { - if (MHostEvent || !MEvent) { + if (MIsHostEvent || !MEvent) { { std::unique_lock lock(MMutex); #ifndef NDEBUG @@ -129,36 +118,27 @@ const sycl::detail::pi::PiEvent &event_impl::getHandleRef() const { sycl::detail::pi::PiEvent &event_impl::getHandleRef() { return MEvent; } const ContextImplPtr &event_impl::getContextImpl() { - ensureContextInitialized(); + initContextIfNeeded(); return MContext; } const PluginPtr &event_impl::getPlugin() { - ensureContextInitialized(); + initContextIfNeeded(); return MContext->getPlugin(); } void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { - MHostEvent = Context->is_host(); + MIsHostEvent = Context == nullptr; MContext = Context; - MIsContextInitialized = true; } event_impl::event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext) - : MIsContextInitialized(true), MEvent(Event), - MContext(detail::getSyclObjImpl(SyclContext)), MHostEvent(false), + : MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), MState(HES_Complete) { - if (MContext->is_host()) { - throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), - "The syclContext must match the OpenCL context " - "associated with the clEvent. 
" + - codeToString(PI_ERROR_INVALID_CONTEXT)); - } - sycl::detail::pi::PiContext TempContext; getPlugin()->call( MEvent, PI_EVENT_INFO_CONTEXT, sizeof(sycl::detail::pi::PiContext), @@ -172,19 +152,18 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } event_impl::event_impl(const QueueImplPtr &Queue) - : MQueue{Queue}, - MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled}, - MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} { - this->setContextImpl(Queue->getContextImplPtr()); - if (Queue->is_host()) { + : MQueue{Queue}, MIsProfilingEnabled{!Queue || Queue->MIsProfilingEnabled}, + MFallbackProfiling{MIsProfilingEnabled && Queue && + Queue->isProfilingFallback()} { + if (Queue) + this->setContextImpl(Queue->getContextImplPtr()); + else { MState.store(HES_NotComplete); - if (Queue->has_property()) { - MHostProfilingInfo.reset(new HostProfilingInfo()); - if (!MHostProfilingInfo) - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "Out of host memory " + - codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); - } + MHostProfilingInfo.reset(new HostProfilingInfo()); + if (!MHostProfilingInfo) + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Out of host memory " + + codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); return; } MState.store(HES_Complete); @@ -285,7 +264,8 @@ void event_impl::wait_and_throw( void event_impl::checkProfilingPreconditions() const { std::weak_ptr EmptyPtr; - if (!EmptyPtr.owner_before(MQueue) && !MQueue.owner_before(EmptyPtr)) { + if (!MIsHostEvent && !EmptyPtr.owner_before(MQueue) && + !MQueue.owner_before(EmptyPtr)) { throw sycl::exception(make_error_code(sycl::errc::invalid), "Profiling information is unavailable as the event " "has no associated queue."); @@ -323,7 +303,7 @@ event_impl::get_profiling_info() { // made by forcing the re-sync of submit time to start time is less than // 0.5ms. These timing values were obtained empirically using an integrated // Intel GPU). - if (MEventFromSubmittedExecCommandBuffer && !MHostEvent && MEvent) { + if (MEventFromSubmittedExecCommandBuffer && !MIsHostEvent && MEvent) { uint64_t StartTime = get_event_profiling_info( this->getHandleRef(), this->getPlugin()); @@ -337,7 +317,7 @@ template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); - if (!MHostEvent) { + if (!MIsHostEvent) { if (MEvent) { auto StartTime = get_event_profiling_info( @@ -364,7 +344,7 @@ event_impl::get_profiling_info() { template <> uint64_t event_impl::get_profiling_info() { checkProfilingPreconditions(); - if (!MHostEvent) { + if (!MIsHostEvent) { if (MEvent) { auto EndTime = get_event_profiling_info( @@ -389,7 +369,7 @@ uint64_t event_impl::get_profiling_info() { } template <> uint32_t event_impl::get_info() { - if (!MHostEvent && MEvent) { + if (!MIsHostEvent && MEvent) { return get_event_info(this->getHandleRef(), this->getPlugin()); } @@ -402,7 +382,7 @@ event_impl::get_info() { if (MState == HES_Discarded) return info::event_command_status::ext_oneapi_unknown; - if (!MHostEvent) { + if (!MIsHostEvent) { // Command is enqueued and PiEvent is ready if (MEvent) return get_event_info( @@ -412,7 +392,7 @@ event_impl::get_info() { return sycl::info::event_command_status::submitted; } - return MHostEvent && MState.load() != HES_Complete + return MIsHostEvent && MState.load() != HES_Complete ? 
sycl::info::event_command_status::submitted : info::event_command_status::complete; } @@ -420,7 +400,7 @@ event_impl::get_info() { template <> typename info::platform::version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::opencl) { @@ -433,14 +413,15 @@ event_impl::get_backend_info() const { ->get_platform() .get_info(); } - return ""; // If the queue has been released, no platform will be associated - // so return empty string + // If the queue has been released, no platform will be associated + // so return empty string. + return ""; } template <> typename info::device::version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::opencl) { @@ -458,7 +439,7 @@ event_impl::get_backend_info() const { template <> typename info::device::backend_version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::ext_oneapi_level_zero) { @@ -477,11 +458,12 @@ void HostProfilingInfo::start() { StartTime = getTimestamp(); } void HostProfilingInfo::end() { EndTime = getTimestamp(); } pi_native_handle event_impl::getNative() { - ensureContextInitialized(); + if (isHost()) + return {}; + initContextIfNeeded(); auto Plugin = getPlugin(); - if (!MIsInitialized) { - MIsInitialized = true; + if (MIsDefaultConstructed && !MEvent) { auto TempContext = MContext.get()->getHandleRef(); Plugin->call(TempContext, &MEvent); } @@ -568,6 +550,12 @@ void event_impl::setSubmissionTime() { e.what()); std::rethrow_exception(std::current_exception()); } + } else { + // Returning host time + using namespace std::chrono; + MSubmitTime = + duration_cast(steady_clock::now().time_since_epoch()) + .count(); } } else { // Capture the host timestamp for a return value of function call @@ -591,6 +579,13 @@ bool event_impl::isCompleted() { info::event_command_status::complete; } +void event_impl::setCommand(void *Cmd) { + MCommand = Cmd; + auto TypedCommand = static_cast(Cmd); + if (TypedCommand) + MIsHostEvent = TypedCommand->getWorkerContext() == nullptr; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 91bef738450d3..e52ac40ad78d7 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,8 +49,8 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MHostEvent(State), MIsFlushed(true), - MState(State.value_or(HES_Complete)) { + : MIsFlushed(true), MState(State.value_or(HES_Complete)), + MIsDefaultConstructed(!State), MIsHostEvent(State) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept // event methods. This ::get() call uses static vars to read and parse the @@ -68,14 +68,6 @@ class event_impl { event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext); event_impl(const QueueImplPtr &Queue); - /// Checks if this event is a SYCL host event. 
- /// - /// All devices that do not support OpenCL interoperability are treated as - /// host device to avoid attempts to call method get on such events. - // - /// \return true if this event is a SYCL host event. - bool is_host(); - /// Waits for the event. /// /// Self is needed in order to pass shared_ptr to Scheduler. @@ -177,7 +169,7 @@ class event_impl { /// Scheduler mutex must be locked in write mode when this is called. /// /// @param Command is a generic pointer to Command object instance. - void setCommand(void *Command) { MCommand = Command; } + void setCommand(void *Command); /// Returns host profiling information. /// @@ -263,15 +255,6 @@ class event_impl { QueueImplPtr getSubmittedQueue() const { return MSubmittedQueue.lock(); }; - /// Checks if an event is in a fully intialized state. Default-constructed - /// events will return true only after having initialized its native event, - /// while other events will assume that they are fully initialized at - /// construction, relying on external sources to supply member data. - /// - /// \return true if the event is considered to be in a fully initialized - /// state. - bool isInitialized() const noexcept { return MIsInitialized; } - /// Checks if this event is complete. /// /// \return true if this event is complete. @@ -287,10 +270,11 @@ class event_impl { MPostCompleteEvents.push_back(Event); } - bool isContextInitialized() const noexcept { return MIsContextInitialized; } + bool isDefaultConstructed() const noexcept { return MIsDefaultConstructed; } ContextImplPtr getContextImplPtr() { - ensureContextInitialized(); + if (MIsDefaultConstructed) + initContextIfNeeded(); return MContext; } @@ -340,6 +324,8 @@ class event_impl { void setEnqueued() { MIsEnqueued = true; } + bool isHost() { return MIsHostEvent; } + void markAsProfilingTagEvent() { MProfilingTagEvent = true; } bool isProfilingTagEvent() const noexcept { return MProfilingTagEvent; } @@ -353,17 +339,12 @@ class event_impl { void instrumentationEpilog(void *TelementryEvent, const std::string &Name, int32_t StreamID, uint64_t IId) const; void checkProfilingPreconditions() const; - // Events constructed without a context will lazily use the default context - // when needed. - void ensureContextInitialized(); - bool MIsInitialized = true; - bool MIsContextInitialized = false; + sycl::detail::pi::PiEvent MEvent = nullptr; // Stores submission time of command associated with event uint64_t MSubmitTime = 0; uint64_t MHostBaseTime = 0; ContextImplPtr MContext; - bool MHostEvent = true; std::unique_ptr MHostProfilingInfo; void *MCommand = nullptr; std::weak_ptr MQueue; @@ -416,6 +397,20 @@ class event_impl { std::shared_ptr Context); std::atomic_bool MIsEnqueued{false}; + + // Events constructed without a context will lazily use the default context + // when needed. + void initContextIfNeeded(); + // Event class represents 3 different kinds of operations: + // | type | has PI event | MContext | MIsHostTask | MIsDefaultConstructed | + // | dev | true | !nullptr | false | false | + // | host | false | nullptr | true | false | + // |default| * | * | false | true | + // Default constructed event is created with empty ctor in host code, MContext + // is lazily initialized with default device context on first context query. + // MEvent is lazily created in first pi handle query. 
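// A standalone sketch of the weak_ptr idiom used by checkProfilingPreconditions()
// earlier in this event_impl change: an event "has no associated queue" exactly
// when MQueue is owner-equivalent to a default-constructed std::weak_ptr, i.e.
// neither owner_before() the other. Queue below is a placeholder type, not the
// runtime's queue_impl.
#include <cassert>
#include <memory>

struct Queue {};

static bool hasNoAssociatedQueue(const std::weak_ptr<Queue> &W) {
  std::weak_ptr<Queue> Empty;
  // Owner-equivalence with an empty weak_ptr means W was never attached to a
  // shared_ptr (or was value-initialized).
  return !Empty.owner_before(W) && !W.owner_before(Empty);
}

int main() {
  std::weak_ptr<Queue> Unset;
  assert(hasNoAssociatedQueue(Unset));

  auto Q = std::make_shared<Queue>();
  std::weak_ptr<Queue> Attached = Q;
  assert(!hasNoAssociatedQueue(Attached));

  Q.reset(); // expired is not the same as empty: the owner is still recorded
  assert(!hasNoAssociatedQueue(Attached));
  return 0;
}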
+ bool MIsDefaultConstructed = false; + bool MIsHostEvent = false; }; } // namespace detail diff --git a/sycl/source/detail/filter_selector_impl.cpp b/sycl/source/detail/filter_selector_impl.cpp index 4b5f8e836ee6d..0043622d62483 100644 --- a/sycl/source/detail/filter_selector_impl.cpp +++ b/sycl/source/detail/filter_selector_impl.cpp @@ -99,9 +99,6 @@ filter_selector_impl::filter_selector_impl(const std::string &Input) } int filter_selector_impl::operator()(const device &Dev) const { - assert(!sycl::detail::getSyclObjImpl(Dev)->is_host() && - "filter_selector_impl should not be used with host."); - int Score = REJECT_DEVICE_SCORE; for (auto &Filter : mFilters) { diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 9ef8ce262932f..de4a6fd3bd22e 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -357,9 +357,6 @@ graph_impl::add(const std::shared_ptr &Impl, const std::shared_ptr &NodeImpl = std::make_shared(); - // Add any deps from the vector of extra dependencies - Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end()); - MNodeStorage.push_back(NodeImpl); addDepsToNode(NodeImpl, Deps); @@ -492,20 +489,12 @@ graph_impl::add(node_type NodeType, // list Deps.insert(Deps.end(), UniqueDeps.begin(), UniqueDeps.end()); - // Add any deps from the extra dependencies vector - Deps.insert(Deps.end(), MExtraDependencies.begin(), MExtraDependencies.end()); - const std::shared_ptr &NodeImpl = std::make_shared(NodeType, std::move(CommandGroup)); MNodeStorage.push_back(NodeImpl); addDepsToNode(NodeImpl, Deps); - // Set barrier nodes as prerequisites (new start points) for subsequent nodes - if (NodeImpl->MCGType == sycl::detail::CG::Barrier) { - MExtraDependencies.push_back(NodeImpl); - } - return NodeImpl; } @@ -614,12 +603,17 @@ void graph_impl::makeEdge(std::shared_ptr Src, removeRoot(Dest); // remove receiver from root node list } -std::vector graph_impl::getExitNodesEvents() { +std::vector graph_impl::getExitNodesEvents( + std::weak_ptr RecordedQueue) { std::vector Events; + auto RecordedQueueSP = RecordedQueue.lock(); for (auto &Node : MNodeStorage) { if (Node->MSuccessors.empty()) { - Events.push_back(getEventForNode(Node)); + auto EventForNode = getEventForNode(Node); + if (EventForNode->getSubmittedQueue() == RecordedQueueSP) { + Events.push_back(getEventForNode(Node)); + } } } diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 80837181ec056..9d13a87ed13e9 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -1184,26 +1184,26 @@ class graph_impl { size_t getNumberOfNodes() const { return MNodeStorage.size(); } /// Traverse the graph recursively to get the events associated with the - /// output nodes of this graph. + /// output nodes of this graph associated with a specific queue. + /// @param[in] Queue The queue exit nodes must have been recorded from. /// @return vector of events associated to exit nodes. - std::vector getExitNodesEvents(); - - /// Removes all Barrier nodes from the list of extra dependencies - /// MExtraDependencies. - /// @return vector of events associated to previous barrier nodes. 
std::vector - removeBarriersFromExtraDependencies() { - std::vector Events; - for (auto It = MExtraDependencies.begin(); - It != MExtraDependencies.end();) { - if ((*It)->MCGType == sycl::detail::CG::Barrier) { - Events.push_back(getEventForNode(*It)); - It = MExtraDependencies.erase(It); - } else { - ++It; - } - } - return Events; + getExitNodesEvents(std::weak_ptr Queue); + + /// Store the last barrier node that was submitted to the queue. + /// @param[in] Queue The queue the barrier was recorded from. + /// @param[in] BarrierNodeImpl The created barrier node. + void setBarrierDep(std::weak_ptr Queue, + std::shared_ptr BarrierNodeImpl) { + MBarrierDependencyMap[Queue] = BarrierNodeImpl; + } + + /// Get the last barrier node that was submitted to the queue. + /// @param[in] Queue The queue to find the last barrier node of. An empty + /// shared_ptr is returned if no barrier node has been recorded to the queue. + std::shared_ptr + getBarrierDep(std::weak_ptr Queue) { + return MBarrierDependencyMap[Queue]; } private: @@ -1281,11 +1281,11 @@ class graph_impl { /// presence of the assume_buffer_outlives_graph property. bool MAllowBuffers = false; - /// List of nodes that must be added as extra dependencies to new nodes when - /// added to this graph. - /// This list is mainly used by barrier nodes which must be considered - /// as predecessors for all nodes subsequently added to the graph. - std::list> MExtraDependencies; + /// Mapping from queues to barrier nodes. For each queue the last barrier + /// node recorded to the graph from the queue is stored. + std::map, std::shared_ptr, + std::owner_less>> + MBarrierDependencyMap; }; /// Class representing the implementation of command_graph. diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 1bdb2ddbd4697..901fd34b4cce8 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -31,9 +31,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { // throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. - if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) || - SyclEventImplPtr->isNOP()) { + if (SyclEventImplPtr->isDefaultConstructed() || SyclEventImplPtr->isNOP()) { continue; } // The fusion command and its event are associated with a non-host context, @@ -41,7 +39,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { bool NoPiEvent = SyclEventImplPtr->MCommand && !static_cast(SyclEventImplPtr->MCommand)->producesPiEvent(); - if (SyclEventImplPtr->is_host() || + if (SyclEventImplPtr->isHost() || SyclEventImplPtr->getContextImpl() != Context || NoPiEvent) { // Call wait, because the command for the event might not have been // enqueued when kernel fusion is happening. 
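// The per-queue barrier bookkeeping added to graph_impl above stores the last
// recorded barrier node in a std::map keyed by std::weak_ptr. weak_ptr has no
// operator<, so std::owner_less supplies the ordering and two keys compare
// equivalent when they share an owner. A minimal sketch of that pattern with
// placeholder Queue/Node types standing in for queue_impl/node_impl:
#include <cassert>
#include <map>
#include <memory>

struct Queue {};
struct Node {};

using BarrierMap =
    std::map<std::weak_ptr<Queue>, std::shared_ptr<Node>,
             std::owner_less<std::weak_ptr<Queue>>>;

int main() {
  BarrierMap LastBarrier;

  auto Q = std::make_shared<Queue>();
  auto Barrier = std::make_shared<Node>();

  // Record the most recent barrier node seen for this queue.
  LastBarrier[Q] = Barrier;

  // A separately constructed weak_ptr to the same queue finds the same entry,
  // because owner_less compares control blocks rather than pointer values.
  std::weak_ptr<Queue> SameQueue = Q;
  assert(LastBarrier[SameQueue] == Barrier);
  return 0;
}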
diff --git a/sycl/source/detail/image_impl.cpp b/sycl/source/detail/image_impl.cpp index 0b512ae1aedbe..e5bacd33fc70d 100644 --- a/sycl/source/detail/image_impl.cpp +++ b/sycl/source/detail/image_impl.cpp @@ -471,6 +471,8 @@ bool image_impl::checkImageFormat( } std::vector image_impl::getDevices(const ContextImplPtr Context) { + if (!Context) + return {}; return Context->get_info(); } diff --git a/sycl/source/detail/kernel_bundle_impl.hpp b/sycl/source/detail/kernel_bundle_impl.hpp index 55586b6d2b5ac..35a5b690ccb8a 100644 --- a/sycl/source/detail/kernel_bundle_impl.hpp +++ b/sycl/source/detail/kernel_bundle_impl.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -329,12 +330,15 @@ class kernel_bundle_impl { } } + using include_pairs_t = + std::vector>; // oneapi_ext_kernel_compiler // construct from source string kernel_bundle_impl(const context &Context, syclex::source_language Lang, - const std::string &Src) + const std::string &Src, include_pairs_t IncludePairsVec) : MContext(Context), MDevices(Context.get_devices()), - MState(bundle_state::ext_oneapi_source), Language(Lang), Source(Src) {} + MState(bundle_state::ext_oneapi_source), Language(Lang), Source(Src), + IncludePairs(IncludePairsVec) {} // oneapi_ext_kernel_compiler // construct from source bytes @@ -348,16 +352,19 @@ class kernel_bundle_impl { // interop constructor kernel_bundle_impl(context Ctx, std::vector Devs, device_image_plain &DevImage, - std::vector KNames) + std::vector KNames, + syclex::source_language Lang) : kernel_bundle_impl(Ctx, Devs, DevImage) { MState = bundle_state::executable; KernelNames = KNames; + Language = Lang; } std::shared_ptr build_from_source(const std::vector Devices, const std::vector &BuildOptions, - std::string *LogPtr) { + std::string *LogPtr, + const std::vector &RegisteredKernelNames) { assert(MState == bundle_state::ext_oneapi_source && "bundle_state::ext_oneapi_source required"); @@ -397,6 +404,12 @@ class kernel_bundle_impl { [](std::byte B) { return static_cast(B); }); return Result; } + if (Language == syclex::source_language::sycl) { + const auto &SourceStr = std::get(this->Source); + return syclex::detail::SYCL_to_SPIRV(SourceStr, IncludePairs, + BuildOptions, LogPtr, + RegisteredKernelNames); + } throw sycl::exception( make_error_code(errc::invalid), "OpenCL C and SPIR-V are the only supported languages at this time"); @@ -437,11 +450,22 @@ class kernel_bundle_impl { PiProgram); device_image_plain DevImg{DevImgImpl}; return std::make_shared(MContext, MDevices, DevImg, - KernelNames); + KernelNames, Language); + } + + std::string adjust_kernel_name(const std::string &Name, + syclex::source_language Lang) { + // Once name demangling support is in, we won't need this. + if (Lang != syclex::source_language::sycl) + return Name; + + bool isMangled = Name.find("__sycl_kernel_") != std::string::npos; + return isMangled ? 
Name : "__sycl_kernel_" + Name; } bool ext_oneapi_has_kernel(const std::string &Name) { - auto it = std::find(KernelNames.begin(), KernelNames.end(), Name); + auto it = std::find(KernelNames.begin(), KernelNames.end(), + adjust_kernel_name(Name, Language)); return it != KernelNames.end(); } @@ -454,9 +478,11 @@ class kernel_bundle_impl { "kernel_bundles successfully built from " "kernel_bundle."); + std::string AdjustedName = adjust_kernel_name(Name, Language); if (!ext_oneapi_has_kernel(Name)) throw sycl::exception(make_error_code(errc::invalid), - "kernel '" + Name + "' not found in kernel_bundle"); + "kernel '" + AdjustedName + + "' not found in kernel_bundle"); assert(MDeviceImages.size() > 0); const std::shared_ptr &DeviceImageImpl = @@ -465,7 +491,8 @@ class kernel_bundle_impl { ContextImplPtr ContextImpl = getSyclObjImpl(MContext); const PluginPtr &Plugin = ContextImpl->getPlugin(); sycl::detail::pi::PiKernel PiKernel = nullptr; - Plugin->call(PiProgram, Name.c_str(), &PiKernel); + Plugin->call(PiProgram, AdjustedName.c_str(), + &PiKernel); // Kernel created by piKernelCreate is implicitly retained. std::shared_ptr KernelImpl = std::make_shared( @@ -710,11 +737,14 @@ class kernel_bundle_impl { SpecConstMapT MSpecConstValues; bool MIsInterop = false; bundle_state MState; - // ext_oneapi_kernel_compiler : Source, Languauge, KernelNames - const syclex::source_language Language = syclex::source_language::opencl; + + // ext_oneapi_kernel_compiler : Source, Languauge, KernelNames, IncludePairs + // Language is for both state::source and state::executable. + syclex::source_language Language = syclex::source_language::opencl; const std::variant> Source; // only kernel_bundles created from source have KernelNames member. std::vector KernelNames; + include_pairs_t IncludePairs; }; } // namespace detail diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp new file mode 100644 index 0000000000000..2967786673a50 --- /dev/null +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.cpp @@ -0,0 +1,270 @@ +//==-- kernel_compiler_opencl.cpp OpenCL kernel compilation support -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "kernel_compiler_sycl.hpp" +#include // make_error_code + +#if __GNUC__ && __GNUC__ < 8 + +// std::filesystem is not availalbe for GCC < 8 +// and much of the cross-platform file handling code depends upon it. +// Given that this extension is experimental and that the file +// handling aspects are most likely temporary, it makes sense to +// simply not support GCC<8. 
+ +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { +namespace detail { + +bool SYCL_Compilation_Available() { return false; } + +spirv_vec_t +SYCL_to_SPIRV(const std::string &SYCLSource, include_pairs_t IncludePairs, + const std::vector &UserArgs, std::string *LogPtr, + const std::vector &RegisteredKernelNames) { + (void)SYCLSource; + (void)IncludePairs; + (void)UserArgs; + (void)LogPtr; + (void)RegisteredKernelNames; + throw sycl::exception(sycl::errc::build, + "kernel_compiler does not support GCC<8"); +} +} // namespace detail +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl + +#else + +#include +#include +#include +#include +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { +namespace detail { + +std::string generateSemiUniqueId() { + // Get the current time as a time_t object. + std::time_t CurrentTime = std::time(nullptr); + + // Convert time_t to a string with format YYYYMMDD_HHMMSS. + std::tm *LocalTime = std::localtime(&CurrentTime); + std::stringstream Ss; + Ss << std::put_time(LocalTime, "%Y%m%d_%H%M%S"); + + // Amend with random number. + std::random_device Rd; + int RandomNumber = Rd() % 900 + 100; + Ss << "_" << std::setfill('0') << std::setw(3) << RandomNumber; + + return Ss.str(); +} + +std::filesystem::path prepareWS(const std::string &Id) { + const std::filesystem::path TmpDirectoryPath = + std::filesystem::temp_directory_path(); + std::filesystem::path NewDirectoryPath = TmpDirectoryPath / Id; + + try { + std::filesystem::create_directories(NewDirectoryPath); + } catch (const std::filesystem::filesystem_error &E) { + throw sycl::exception(sycl::errc::build, E.what()); + } + + return NewDirectoryPath; +} + +std::string userArgsAsString(const std::vector &UserArguments) { + return std::accumulate(UserArguments.begin(), UserArguments.end(), + std::string(""), + [](const std::string &A, const std::string &B) { + return A.empty() ? B : A + " " + B; + }); +} + +void outputPreamble(std::ofstream &Os, const std::filesystem::path &FilePath, + const std::string &Id, + const std::vector &UserArgs) { + + Os << "/*\n"; + Os << " clang++ -fsycl -o " << Id << ".bin "; + Os << userArgsAsString(UserArgs); + Os << " -fno-sycl-dead-args-optimization -fsycl-dump-device-code=./ " << Id; + Os << ".cpp \n */" << std::endl; +} + +std::filesystem::path +outputCpp(const std::filesystem::path &ParentDir, const std::string &Id, + std::string RawCodeString, const std::vector &UserArgs, + const std::vector &RegisteredKernelNames) { + std::filesystem::path FilePath = ParentDir / (Id + ".cpp"); + std::ofstream Outfile(FilePath, std::ios::out | std::ios::trunc); + + if (Outfile.is_open()) { + outputPreamble(Outfile, FilePath, Id, UserArgs); + Outfile << RawCodeString << std::endl; + + // Temporarily needed until -c works with -fsycl-dump-spirv. 
+ Outfile << "int main() {\n"; + for (const std::string &KernelName : RegisteredKernelNames) { + Outfile << " " << KernelName << ";\n"; + } + Outfile << " return 0;\n}\n" << std::endl; + + Outfile.close(); + } else { + throw sycl::exception(sycl::errc::build, + "Failed to open .cpp file for write: " + + FilePath.string()); + } + + return FilePath; +} + +void outputIncludeFiles(const std::filesystem::path &Dirpath, + include_pairs_t IncludePairs) { + using pairStrings = std::pair; + for (pairStrings p : IncludePairs) { + std::filesystem::path FilePath = Dirpath / p.first; + std::ofstream outfile(FilePath, std::ios::out | std::ios::trunc); + if (outfile.is_open()) { + outfile << p.second << std::endl; + + outfile.close(); + } else { + throw sycl::exception(sycl::errc::build, + "Failed to open include file for write: " + + FilePath.string()); + } + } +} + +std::string getCompilerName() { +#ifdef __WIN32 + std::string Compiler = "clang++.exe"; +#else + std::string Compiler = "clang++"; +#endif + return Compiler; +} + +void invokeCompiler(const std::filesystem::path &FPath, + const std::filesystem::path &DPath, const std::string &Id, + const std::vector &UserArgs, + std::string *LogPtr) { + + std::filesystem::path FilePath(FPath); + std::filesystem::path ParentDir(DPath); + std::filesystem::path TargetPath = ParentDir / (Id + ".bin"); + std::filesystem::path LogPath = ParentDir / "compilation_log.txt"; + std::string Compiler = getCompilerName(); + + std::string Command = + Compiler + " -fsycl -o " + TargetPath.make_preferred().string() + " " + + userArgsAsString(UserArgs) + + " -fno-sycl-dead-args-optimization -fsycl-dump-device-code=" + + ParentDir.make_preferred().string() + " " + + FilePath.make_preferred().string() + " 2> " + + LogPath.make_preferred().string(); + + int Result = std::system(Command.c_str()); + + // Read the log file contents into the log variable. + std::string CompileLog; + std::ifstream LogStream; + LogStream.open(LogPath); + if (LogStream.is_open()) { + std::stringstream LogBuffer; + LogBuffer << LogStream.rdbuf(); + CompileLog.append(LogBuffer.str()); + if (LogPtr != nullptr) + LogPtr->append(LogBuffer.str()); + + } else if (Result == 0 && LogPtr != nullptr) { + // If there was a compilation problem, we want to report that (below), + // not a mere "missing log" error. + throw sycl::exception(sycl::errc::build, + "failure retrieving compilation log"); + } + + if (Result != 0) { + throw sycl::exception(sycl::errc::build, + "Compile failure: " + std::to_string(Result) + " " + + CompileLog); + } +} + +std::filesystem::path findSpv(const std::filesystem::path &ParentDir, + const std::string &Id) { + std::regex PatternRegex(Id + R"(.*\.spv)"); + + // Iterate through all files in the directory matching the pattern. + for (const auto &Entry : std::filesystem::directory_iterator(ParentDir)) { + if (Entry.is_regular_file() && + std::regex_match(Entry.path().filename().string(), PatternRegex)) { + return Entry.path(); // Return the path if it matches the SPV pattern. + } + } + + // File not found, throw. 
+ throw sycl::exception(sycl::errc::build, "SPIRV output matching " + Id + + " missing from " + + ParentDir.filename().string()); +} + +spirv_vec_t loadSpvFromFile(const std::filesystem::path &FileName) { + std::ifstream SpvStream(FileName, std::ios::binary); + SpvStream.seekg(0, std::ios::end); + size_t Size = SpvStream.tellg(); + SpvStream.seekg(0); + spirv_vec_t Spv(Size); + SpvStream.read(reinterpret_cast(Spv.data()), Size); + + return Spv; +} + +spirv_vec_t +SYCL_to_SPIRV(const std::string &SYCLSource, include_pairs_t IncludePairs, + const std::vector &UserArgs, std::string *LogPtr, + const std::vector &RegisteredKernelNames) { + // clang-format off + const std::string id = generateSemiUniqueId(); + const std::filesystem::path ParentDir = prepareWS(id); + std::filesystem::path FilePath = outputCpp(ParentDir, id, SYCLSource, UserArgs, RegisteredKernelNames); + outputIncludeFiles(ParentDir, IncludePairs); + invokeCompiler(FilePath, ParentDir, id, UserArgs, LogPtr); + std::filesystem::path SpvPath = findSpv(ParentDir, id); + return loadSpvFromFile(SpvPath); + // clang-format on +} + +bool SYCL_Compilation_Available() { + // Is compiler on $PATH ? We try to invoke it. + std::string id = generateSemiUniqueId(); + const std::filesystem::path tmp = std::filesystem::temp_directory_path(); + std::filesystem::path DumpPath = tmp / (id + "_version.txt"); + std::string Compiler = getCompilerName(); + std::string TestCommand = + Compiler + " --version &> " + DumpPath.make_preferred().string(); + int result = std::system(TestCommand.c_str()); + + return (result == 0); +} + +} // namespace detail +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl +#endif diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp new file mode 100644 index 0000000000000..dfff9ac839e84 --- /dev/null +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_sycl.hpp @@ -0,0 +1,38 @@ +//==-- kernel_compiler_sycl.hpp SYCL kernel compilation support -==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include // __SYCL_EXPORT +#include + +#include // std::accumulate +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { +namespace detail { + +using spirv_vec_t = std::vector; +using include_pairs_t = std::vector>; + +spirv_vec_t +SYCL_to_SPIRV(const std::string &Source, include_pairs_t IncludePairs, + const std::vector &UserArgs, std::string *LogPtr, + const std::vector &RegisteredKernelNames); + +bool SYCL_Compilation_Available(); + +} // namespace detail +} // namespace ext::oneapi::experimental + +} // namespace _V1 +} // namespace sycl diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index 0696123e94450..8502f3489b9c7 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -77,9 +77,7 @@ kernel_impl::kernel_impl(ContextImplPtr Context, ProgramImplPtr ProgramImpl) kernel_impl::~kernel_impl() { try { // TODO catch an exception and put it to list of asynchronous exceptions - if (!is_host()) { - getPlugin()->call(MKernel); - } + getPlugin()->call(MKernel); } catch (std::exception &e) { __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~kernel_impl", e); } diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 1e56e6da4dc53..1a1542d0d409b 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -103,20 +103,10 @@ class kernel_impl { /// /// \return a valid cl_kernel instance cl_kernel get() const { - if (is_host()) { - throw invalid_object_error( - "This instance of kernel doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_KERNEL); - } getPlugin()->call(MKernel); return pi::cast(MKernel); } - /// Check if the associated SYCL context is a SYCL host context. - /// - /// \return true if this SYCL kernel is a host kernel. 
- bool is_host() const { return MContext->is_host(); } - const PluginPtr &getPlugin() const { return MContext->getPlugin(); } /// Query information from the kernel object using the info::kernel_info @@ -217,11 +207,6 @@ template inline typename Param::return_type kernel_impl::get_info() const { static_assert(is_kernel_info_desc::value, "Invalid kernel information descriptor"); - if (is_host()) { - // TODO implement - assert(0 && "Not implemented"); - } - if constexpr (std::is_same_v) checkIfValidForNumArgsInfoQuery(); @@ -248,9 +233,6 @@ kernel_impl::get_info(const device &Device) const { "is a built-in kernel."); } - if (is_host()) { - return get_kernel_device_specific_info_host(Device); - } return get_kernel_device_specific_info( this->getHandleRef(), getSyclObjImpl(Device)->getHandleRef(), getPlugin()); @@ -260,10 +242,6 @@ template inline typename Param::return_type kernel_impl::get_info(const device &Device, const sycl::range<3> &WGSize) const { - if (is_host()) { - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); - } return get_kernel_device_specific_info_with_input( this->getHandleRef(), getSyclObjImpl(Device)->getHandleRef(), WGSize, getPlugin()); diff --git a/sycl/source/detail/kernel_info.hpp b/sycl/source/detail/kernel_info.hpp index 12256158eed49..79c0f73c952de 100644 --- a/sycl/source/detail/kernel_info.hpp +++ b/sycl/source/detail/kernel_info.hpp @@ -137,79 +137,6 @@ uint32_t get_kernel_device_specific_info_with_input( return Result; } -template -inline typename Param::return_type -get_kernel_device_specific_info_host(const sycl::device &Device) = delete; - -template <> -inline sycl::range<3> get_kernel_device_specific_info_host< - info::kernel_device_specific::global_work_size>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::work_group_size>(const sycl::device &Dev) { - return Dev.get_info(); -} - -template <> -inline sycl::range<3> get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_work_group_size>( - const sycl::device &) { - return {0, 0, 0}; -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::preferred_work_group_size_multiple>( - const sycl::device &Dev) { - return get_kernel_device_specific_info_host< - info::kernel_device_specific::work_group_size>(Dev); -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::private_mem_size>(const sycl::device &) { - return 0; -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::ext_codeplay_num_regs>(const sycl::device &) { - return 0; -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::max_num_sub_groups>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::max_sub_group_size>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_num_sub_groups>( - const sycl::device &) { - throw invalid_object_error("This instance 
of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_sub_group_size>( - const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 37ede74578ed7..6cfa71d156062 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -266,7 +266,7 @@ void MemoryManager::releaseMemObj(ContextImplPtr TargetContext, return; } - if (TargetContext->is_host()) { + if (!TargetContext) { MemObj->releaseHostMem(MemAllocation); return; } @@ -299,7 +299,6 @@ void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, return UserPtr; return MemObj->allocateHostMem(); - ; } void *MemoryManager::allocateInteropMemObject( @@ -398,7 +397,7 @@ void *MemoryManager::allocateMemBuffer( const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { void *MemPtr; - if (TargetContext->is_host()) + if (!TargetContext) MemPtr = allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); else if (UserPtr && InteropContext) @@ -420,7 +419,7 @@ void *MemoryManager::allocateMemImage( const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { - if (TargetContext->is_host()) + if (!TargetContext) return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); if (UserPtr && InteropContext) @@ -438,7 +437,7 @@ void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext, waitForEvents(DepEvents); OutEvent = nullptr; - if (TargetContext->is_host()) + if (!TargetContext) return static_cast(static_cast(ParentMemObj) + Offset); size_t SizeInBytes = ElemSize; @@ -507,6 +506,7 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr, const detail::EventImplPtr &OutEventImpl) { (void)SrcAccessRange; assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(TgtQueue && "Destination mem object queue must be not nullptr"); const sycl::detail::pi::PiQueue Queue = TgtQueue->getHandleRef(); const PluginPtr &Plugin = TgtQueue->getPlugin(); @@ -585,6 +585,7 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, const detail::EventImplPtr &OutEventImpl) { (void)DstAccessRange; assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(SrcQueue && "Source mem object queue is expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -666,6 +667,8 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(SrcQueue && "Source mem object and target mem object queues are " + "expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -775,23 +778,23 @@ void MemoryManager::copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { - if (SrcQueue->is_host()) { - if (TgtQueue->is_host()) - copyH2H(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, 
SrcSize, - SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, - std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, - DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); + if (!SrcQueue) { + if (!TgtQueue) + copyH2H(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, + SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, nullptr, + DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, + std::move(DepEvents), OutEvent, OutEventImpl); else - copyH2D(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + copyH2D(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, SrcAccessRange, SrcOffset, SrcElemSize, pi::cast(DstMem), std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); } else { - if (TgtQueue->is_host()) + if (!TgtQueue) copyD2H(SYCLMemObj, pi::cast(SrcMem), std::move(SrcQueue), DimSrc, SrcSize, SrcAccessRange, SrcOffset, - SrcElemSize, (char *)DstMem, std::move(TgtQueue), DimDst, DstSize, + SrcElemSize, (char *)DstMem, nullptr, DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); else @@ -812,6 +815,7 @@ void MemoryManager::fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(Queue && "Fill should be called only with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); @@ -857,7 +861,7 @@ void *MemoryManager::map(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, unsigned int ElementSize, std::vector DepEvents, sycl::detail::pi::PiEvent &OutEvent) { - if (Queue->is_host()) { + if (!Queue) { throw runtime_error("Not supported configuration of map requested", PI_ERROR_INVALID_OPERATION); } @@ -902,7 +906,11 @@ void MemoryManager::unmap(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, std::vector DepEvents, sycl::detail::pi::PiEvent &OutEvent) { - // Host queue is not supported here. + // Execution on host is not supported here. + if (!Queue) { + throw runtime_error("Not supported configuration of unmap requested", + PI_ERROR_INVALID_OPERATION); + } // All DepEvents are to the same Context. // Using the plugin of the Queue. 
@@ -917,9 +925,7 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!SrcQueue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_usm."); - + assert(SrcQueue && "USM copy must be called with a valid device queue"); if (!Len) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -949,9 +955,7 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_usm."); - + assert(Queue && "USM fill must be called with a valid device queue"); if (!Length) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -978,9 +982,7 @@ void MemoryManager::prefetch_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in prefetch_usm."); - + assert(Queue && "USM prefetch must be called with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -994,9 +996,7 @@ void MemoryManager::advise_usm( std::vector /*DepEvents*/, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in advise_usm."); - + assert(Queue && "USM advise must be called with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1010,9 +1010,7 @@ void MemoryManager::copy_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in copy_2d_usm."); - + assert(Queue && "USM copy 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1088,9 +1086,7 @@ void MemoryManager::fill_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_2d_usm."); - + assert(Queue && "USM fill 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1118,9 +1114,7 @@ void MemoryManager::memset_2d_usm( char Value, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_2d_usm."); - + assert(Queue && "USM memset 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1151,6 +1145,8 @@ memcpyToDeviceGlobalUSM(QueueImplPtr Queue, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && + "Copy to device global USM must be called with a valid device queue"); // Get or allocate USM memory for the 
device_global. DeviceGlobalUSMMem &DeviceGlobalUSM = DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(Queue); @@ -1252,6 +1248,9 @@ static void memcpyToDeviceGlobalDirect( size_t NumBytes, size_t Offset, const void *Src, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { + assert( + Queue && + "Direct copy to device global must be called with a valid device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1266,6 +1265,8 @@ static void memcpyFromDeviceGlobalDirect( size_t NumBytes, size_t Offset, void *Dest, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { + assert(Queue && "Direct copy from device global must be called with a valid " + "device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1626,8 +1627,6 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *Mem, size_t Length, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { - assert(!Context->is_host() && "Host queue not supported in prefetch_usm."); - const PluginPtr &Plugin = Context->getPlugin(); Plugin->call( CommandBuffer, Mem, Length, _pi_usm_migration_flags(0), Deps.size(), @@ -1640,8 +1639,6 @@ void MemoryManager::ext_oneapi_advise_usm_cmd_buffer( size_t Length, pi_mem_advice Advice, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { - assert(!Context->is_host() && "Host queue not supported in advise_usm."); - const PluginPtr &Plugin = Context->getPlugin(); Plugin->call( CommandBuffer, Mem, Length, Advice, Deps.size(), Deps.data(), @@ -1659,9 +1656,8 @@ void MemoryManager::copy_image_bindless( sycl::detail::pi::PiImageRegion CopyExtent, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in copy_image_bindless."); + assert(Queue && + "Copy image bindless must be called with a valid device queue"); assert((Flags == (sycl::detail::pi::PiImageCopyFlags) ext::oneapi::experimental::image_copy_flags::HtoD || Flags == (sycl::detail::pi::PiImageCopyFlags) diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 06ba2e2a25313..a47fedaedfe02 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -88,7 +88,6 @@ class MemoryManager { static void *allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, bool HostPtrReadOnly, size_t Size, const sycl::property_list &PropsList); - static void * allocateInteropMemObject(ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp index 00f66a28a5de8..2b4dbf8c92fd2 100644 --- a/sycl/source/detail/platform_impl.cpp +++ b/sycl/source/detail/platform_impl.cpp @@ -30,12 +30,6 @@ namespace detail { using PlatformImplPtr = std::shared_ptr; -PlatformImplPtr platform_impl::getHostPlatformImpl() { - static PlatformImplPtr HostImpl = std::make_shared(); - - return HostImpl; -} - PlatformImplPtr platform_impl::getOrMakePlatformImpl(sycl::detail::pi::PiPlatform PiPlatform, const PluginPtr &Plugin) { @@ -85,9 +79,6 @@ static bool IsBannedPlatform(platform Platform) { // is disabled as well. 
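The USM and device_global hunks above all make the same substitution: the old assert(!Queue->getContextImplPtr()->is_host() && ...), which would dereference a null pointer once the host queue object is gone, becomes a plain assert(Queue && ...) at the top of each entry point, before the plugin is fetched. A small sketch of that guard ordering with stand-in types (Plugin, prefetch and this prefetch_usm are simplified placeholders, not the real signatures):

    #include <cassert>
    #include <cstddef>
    #include <memory>

    // Simplified stand-ins; the real entry points fetch a PluginPtr from the
    // queue and issue the corresponding PI call.
    struct Plugin {
      void prefetch(const void *, std::size_t) {}
    };
    struct queue_impl {
      Plugin &getPlugin() { return P; }
      Plugin P;
    };
    using QueueImplPtr = std::shared_ptr<queue_impl>;

    // Shape shared by the USM/device_global paths after the patch: validate
    // the queue pointer first, then talk to the plugin.
    void prefetch_usm(void *Mem, const QueueImplPtr &Queue, std::size_t Length) {
      assert(Queue && "USM prefetch must be called with a valid device queue");
      Queue->getPlugin().prefetch(Mem, Length);
    }

    int main() {
      auto Q = std::make_shared<queue_impl>();
      int Buf[8] = {};
      prefetch_usm(Buf, Q, sizeof(Buf)); // fine: non-null device queue
      // prefetch_usm(Buf, nullptr, 0);  // would trip the assert in debug builds
    }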
// auto IsMatchingOpenCL = [](platform Platform, const std::string_view name) { - if (getSyclObjImpl(Platform)->is_host()) - return false; - const bool HasNameMatch = Platform.get_info().find( name) != std::string::npos; const auto Backend = detail::getSyclObjImpl(Platform)->getBackend(); @@ -466,15 +457,9 @@ platform_impl::get_devices(info::device_type DeviceType) const { ods_target_list *OdsTargetList = SYCLConfig::get(); - if (is_host() && (DeviceType == info::device_type::host || - DeviceType == info::device_type::all)) { - Res.push_back( - createSyclObjFromImpl(device_impl::getHostDeviceImpl())); - } - // If any DeviceType other than host was requested for host platform, // an empty vector will be returned. - if (is_host() || DeviceType == info::device_type::host) + if (DeviceType == info::device_type::host) return Res; pi_uint32 NumDevices = 0; @@ -556,9 +541,6 @@ platform_impl::get_devices(info::device_type DeviceType) const { } bool platform_impl::has_extension(const std::string &ExtensionName) const { - if (is_host()) - return false; - std::string AllExtensionNames = get_platform_info_string_impl( MPlatform, getPlugin(), detail::PiInfoCode::value); @@ -580,9 +562,6 @@ pi_native_handle platform_impl::getNative() const { template typename Param::return_type platform_impl::get_info() const { - if (is_host()) - return get_platform_info_host(); - return get_platform_info(this->getHandleRef(), getPlugin()); } diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index 34537c7191af6..dfb2597bf417b 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -32,9 +32,6 @@ class device_impl; // TODO: implement parameters treatment for host device class platform_impl { public: - /// Constructs platform_impl for a SYCL host platform. - platform_impl() : MHostPlatform(true) {} - /// Constructs platform_impl from a plug-in interoperability platform /// handle. /// @@ -89,9 +86,6 @@ class platform_impl { template typename Param::return_type get_backend_info() const; - /// \return true if this SYCL platform is a host platform. - bool is_host() const { return MHostPlatform; }; - /// Returns the backend of this platform. backend getBackend(void) const { return MBackend; } @@ -106,14 +100,7 @@ class platform_impl { } /// \return an instance of OpenCL cl_platform_id. - cl_platform_id get() const { - if (is_host()) { - throw invalid_object_error( - "This instance of platform doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_PLATFORM); - } - return pi::cast(MPlatform); - } + cl_platform_id get() const { return pi::cast(MPlatform); } /// Returns raw underlying plug-in platform handle. /// @@ -122,13 +109,7 @@ class platform_impl { /// is in use. /// /// \return a raw plug-in platform handle. - const sycl::detail::pi::PiPlatform &getHandleRef() const { - if (is_host()) - throw invalid_object_error("This instance of platform is a host instance", - PI_ERROR_INVALID_PLATFORM); - - return MPlatform; - } + const sycl::detail::pi::PiPlatform &getHandleRef() const { return MPlatform; } /// Returns all available SYCL platforms in the system. /// @@ -140,17 +121,13 @@ class platform_impl { static std::vector get_platforms(); // \return the Plugin associated with this platform. - const PluginPtr &getPlugin() const { - assert(!MHostPlatform && "Plugin is not available for Host."); - return MPlugin; - } + const PluginPtr &getPlugin() const { return MPlugin; } /// Sets the platform implementation to use another plugin. 
/// /// \param PluginPtr is a pointer to a plugin instance /// \param Backend is the backend that we want this platform to use void setPlugin(PluginPtr &PluginPtr, backend Backend) { - assert(!MHostPlatform && "Plugin is not available for Host"); MPlugin = PluginPtr; MBackend = Backend; } @@ -192,14 +169,6 @@ class platform_impl { getOrMakeDeviceImpl(sycl::detail::pi::PiDevice PiDevice, const std::shared_ptr &PlatformImpl); - /// Static functions that help maintain platform uniquess and - /// equality of comparison - - /// Returns the host platform impl - /// - /// \return the host platform impl - static std::shared_ptr getHostPlatformImpl(); - /// Queries the cache to see if the specified PiPlatform has been seen /// before. If so, return the cached platform_impl, otherwise create a new /// one and cache it. @@ -238,7 +207,6 @@ class platform_impl { filterDeviceFilter(std::vector &PiDevices, ListT *FilterList) const; - bool MHostPlatform = false; sycl::detail::pi::PiPlatform MPlatform = 0; backend MBackend; diff --git a/sycl/source/detail/platform_info.hpp b/sycl/source/detail/platform_info.hpp index 42c41b5063cf5..70bcd626024d9 100644 --- a/sycl/source/detail/platform_info.hpp +++ b/sycl/source/detail/platform_info.hpp @@ -59,36 +59,6 @@ get_platform_info(sycl::detail::pi::PiPlatform Plt, const PluginPtr &Plugin) { return split_string(Result, ' '); } -// Host platform information methods -template -inline typename Param::return_type get_platform_info_host() = delete; - -template <> -inline std::string get_platform_info_host() { - return "FULL PROFILE"; -} - -template <> -inline std::string get_platform_info_host() { - return "1.2"; -} - -template <> inline std::string get_platform_info_host() { - return "SYCL host platform"; -} - -template <> -inline std::string get_platform_info_host() { - return ""; -} - -template <> -inline std::vector -get_platform_info_host() { - // TODO update when appropriate - return {}; -} - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index 6952c6d45ebd2..ed78ecb440ef7 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -72,9 +72,8 @@ program_impl::program_impl( } MDevices = ProgramList[0]->MDevices; std::vector DevicesSorted; - if (!is_host()) { - DevicesSorted = sort_devices_by_cl_device_id(MDevices); - } + DevicesSorted = sort_devices_by_cl_device_id(MDevices); + check_device_feature_support(MDevices); std::list> Locks; for (const auto &Prg : ProgramList) { @@ -85,35 +84,32 @@ program_impl::program_impl( "Not all programs are associated with the same context", PI_ERROR_INVALID_PROGRAM); } - if (!is_host()) { - std::vector PrgDevicesSorted = - sort_devices_by_cl_device_id(Prg->MDevices); - if (PrgDevicesSorted != DevicesSorted) { - throw invalid_object_error( - "Not all programs are associated with the same devices", - PI_ERROR_INVALID_PROGRAM); - } + + std::vector PrgDevicesSorted = + sort_devices_by_cl_device_id(Prg->MDevices); + if (PrgDevicesSorted != DevicesSorted) { + throw invalid_object_error( + "Not all programs are associated with the same devices", + PI_ERROR_INVALID_PROGRAM); } } - if (!is_host()) { - std::vector Devices(get_pi_devices()); - std::vector Programs; - bool NonInterOpToLink = false; - for (const auto &Prg : ProgramList) { - if (!Prg->MLinkable && NonInterOpToLink) - continue; - NonInterOpToLink |= !Prg->MLinkable; - Programs.push_back(Prg->MProgram); - } - const PluginPtr &Plugin = 
getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MContext->getHandleRef(), Devices.size(), Devices.data(), - LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, - nullptr, &MProgram); - Plugin->checkPiResult(Err); + std::vector Devices(get_pi_devices()); + std::vector Programs; + bool NonInterOpToLink = false; + for (const auto &Prg : ProgramList) { + if (!Prg->MLinkable && NonInterOpToLink) + continue; + NonInterOpToLink |= !Prg->MLinkable; + Programs.push_back(Prg->MProgram); } + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MContext->getHandleRef(), Devices.size(), Devices.data(), + LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, + nullptr, &MProgram); + Plugin->checkPiResult(Err); } program_impl::program_impl(ContextImplPtr Context, @@ -209,7 +205,7 @@ program_impl::program_impl(ContextImplPtr Context, program_impl::~program_impl() { try { // TODO catch an exception and put it to list of asynchronous exceptions - if (!is_host() && MProgram != nullptr) { + if (MProgram != nullptr) { const PluginPtr &Plugin = getPlugin(); Plugin->call(MProgram); } @@ -220,11 +216,6 @@ program_impl::~program_impl() { cl_program program_impl::get() const { throw_if_state_is(program_state::none); - if (is_host()) { - throw invalid_object_error( - "This instance of program doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_PROGRAM); - } getPlugin()->call(MProgram); return pi::cast(MProgram); } @@ -233,50 +224,43 @@ void program_impl::compile_with_kernel_name(std::string KernelName, std::string CompileOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::none); - if (!is_host()) { - create_pi_program_with_kernel_name( - KernelName, - /*JITCompilationIsRequired=*/(!CompileOptions.empty())); - compile(CompileOptions); - } + create_pi_program_with_kernel_name( + KernelName, + /*JITCompilationIsRequired=*/(!CompileOptions.empty())); + compile(CompileOptions); MState = program_state::compiled; } void program_impl::link(std::string LinkOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::compiled); - if (!is_host()) { - check_device_feature_support(MDevices); - std::vector Devices(get_pi_devices()); - const PluginPtr &Plugin = getPlugin(); - const char *LinkOpts = SYCLConfig::get(); - if (!LinkOpts) { - LinkOpts = LinkOptions.c_str(); - } + check_device_feature_support(MDevices); + std::vector Devices(get_pi_devices()); + const PluginPtr &Plugin = getPlugin(); + const char *LinkOpts = SYCLConfig::get(); + if (!LinkOpts) { + LinkOpts = LinkOptions.c_str(); + } - // Plugin resets MProgram with a new pi_program as a result of the call to - // "piProgramLink". Thus, we need to release MProgram before the call to - // piProgramLink. - if (MProgram != nullptr) - Plugin->call(MProgram); + // Plugin resets MProgram with a new pi_program as a result of the call to + // "piProgramLink". Thus, we need to release MProgram before the call to + // piProgramLink. 
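The comment above captures the one ordering constraint in program_impl::link(): MProgram is both an input to and the output of the link call, so the old handle must be released before the call overwrites it, otherwise it leaks. A toy model of that release-then-relink ordering (Runtime and its create/release/link are illustrative stand-ins for the PI calls, not the plugin API):

    #include <cassert>
    #include <cstdio>

    // Toy handle bookkeeping standing in for the PI plugin; create/release/link
    // loosely mirror program creation, release and piProgramLink.
    struct Runtime {
      int live = 0;
      int create() { ++live; return live; }
      void release(int) { --live; }
      int link(int input) { (void)input; ++live; return live; }
    };

    int main() {
      Runtime RT;
      int MProgram = RT.create(); // compiled program held by program_impl

      // Same ordering as in the hunk below: the link call stores a brand new
      // handle into MProgram, so the old one has to be released first or it
      // would never be freed.
      int Old = MProgram;
      RT.release(Old);
      MProgram = RT.link(Old);

      assert(RT.live == 1 && "only the newly linked program is still alive");
      std::printf("live program handles after link: %d\n", RT.live);
    }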
+ if (MProgram != nullptr) + Plugin->call(MProgram); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MContext->getHandleRef(), Devices.size(), Devices.data(), LinkOpts, - /*num_input_programs*/ 1, &MProgram, nullptr, nullptr, &MProgram); - Plugin->checkPiResult(Err); - MLinkOptions = LinkOptions; - MBuildOptions = LinkOptions; - } + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MContext->getHandleRef(), Devices.size(), Devices.data(), LinkOpts, + /*num_input_programs*/ 1, &MProgram, nullptr, nullptr, &MProgram); + Plugin->checkPiResult(Err); + MLinkOptions = LinkOptions; + MBuildOptions = LinkOptions; MState = program_state::linked; } bool program_impl::has_kernel(std::string KernelName, - bool IsCreatedFromSource) const { + bool /*IsCreatedFromSource*/) const { throw_if_state_is(program_state::none); - if (is_host()) { - return !IsCreatedFromSource; - } std::vector Devices(get_pi_devices()); pi_uint64 function_ptr; @@ -303,14 +287,6 @@ kernel program_impl::get_kernel(std::string KernelName, std::shared_ptr PtrToSelf, bool IsCreatedFromSource) const { throw_if_state_is(program_state::none); - if (is_host()) { - if (IsCreatedFromSource) - throw invalid_object_error("This instance of program is a host instance", - PI_ERROR_INVALID_PROGRAM); - - return createSyclObjFromImpl( - std::make_shared(MContext, PtrToSelf)); - } auto [Kernel, ArgMask] = get_pi_kernel_arg_mask_pair(KernelName); return createSyclObjFromImpl(std::make_shared( Kernel, MContext, PtrToSelf, IsCreatedFromSource, nullptr, ArgMask)); @@ -318,8 +294,6 @@ kernel program_impl::get_kernel(std::string KernelName, std::vector> program_impl::get_binaries() const { throw_if_state_is(program_state::none); - if (is_host()) - return {}; std::vector> Result; const PluginPtr &Plugin = getPlugin(); @@ -393,24 +367,23 @@ std::pair program_impl::get_pi_kernel_arg_mask_pair(const std::string &KernelName) const { std::pair Result; - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MProgram, KernelName.c_str(), &Result.first); - if (Err == PI_ERROR_INVALID_KERNEL_NAME) { - throw invalid_object_error( - "This instance of program does not contain the kernel requested", - Err); - } - Plugin->checkPiResult(Err); + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MProgram, KernelName.c_str(), &Result.first); + if (Err == PI_ERROR_INVALID_KERNEL_NAME) { + throw invalid_object_error( + "This instance of program does not contain the kernel requested", Err); + } + Plugin->checkPiResult(Err); - // Some PI Plugins (like OpenCL) require this call to enable USM - // For others, PI will turn this into a NOP. - if (getContextImplPtr()->getPlatformImpl()->supports_usm()) - Plugin->call( - Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); + // Some PI Plugins (like OpenCL) require this call to enable USM + // For others, PI will turn this into a NOP. + if (getContextImplPtr()->getPlatformImpl()->supports_usm()) + Plugin->call( + Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); - return Result; + return Result; } std::vector diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index 32a0c7fd38bfe..67c02e95734ab 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -134,9 +134,6 @@ class program_impl { /// not retained before return. 
const sycl::detail::pi::PiProgram &getHandleRef() const { return MProgram; } - /// \return true if this SYCL program is a host program. - bool is_host() const { return MContext->is_host(); } - /// Compiles the SYCL kernel function into the encapsulated raw program. /// /// The kernel function is defined by its name. This member function @@ -215,16 +212,11 @@ class program_impl { /// \return the SYCL context that this program was constructed with. context get_context() const { - if (is_host()) - return context(); return createSyclObjFromImpl(MContext); } /// \return the Plugin associated with the context of this program. - const PluginPtr &getPlugin() const { - assert(!is_host() && "Plugin is not available for Host."); - return MContext->getPlugin(); - } + const PluginPtr &getPlugin() const { return MContext->getPlugin(); } ContextImplPtr getContextImplPtr() const { return MContext; } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 5b873039cd4a1..588254743701f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -26,7 +26,9 @@ namespace sycl { inline namespace _V1 { namespace detail { -std::atomic queue_impl::MNextAvailableQueueID = 0; +// Treat 0 as reserved for host task traces +std::atomic queue_impl::MNextAvailableQueueID = 1; + thread_local bool NestedCallsDetector = false; class NestedCallsTracker { public: @@ -56,10 +58,9 @@ getPIEvents(const std::vector &DepEvents) { template <> uint32_t queue_impl::get_info() const { sycl::detail::pi::PiResult result = PI_SUCCESS; - if (!is_host()) - getPlugin()->call( - MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result, - nullptr); + getPlugin()->call( + MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result, + nullptr); return result; } @@ -157,8 +158,7 @@ event queue_impl::memset(const std::shared_ptr &Self, SYCL_STREAM_NAME, "memory_transfer_node"); PrepareNotify.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); xpti::addMetadata(TEvent, "memory_ptr", reinterpret_cast(Ptr)); xpti::addMetadata(TEvent, "value_set", Value); xpti::addMetadata(TEvent, "memory_size", Count); @@ -206,8 +206,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, SYCL_STREAM_NAME, "memory_transfer_node"); PrepareNotify.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); xpti::addMetadata(TEvent, "src_memory_ptr", reinterpret_cast(Src)); xpti::addMetadata(TEvent, "dest_memory_ptr", reinterpret_cast(Dest)); @@ -304,12 +303,12 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. - if (is_host() || MEmulateOOO) + if (MEmulateOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events // for unenqueued commands and host tasks. 
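The rule in the comment above, applied in the branch that follows, reduces to a small decision: once the backend queue supports piQueueFinish, the queue only needs to remember events that piQueueFinish cannot cover, i.e. those without a native handle (host tasks, unenqueued commands). A rough model with stand-in types (EventStub and decideTracking are illustrative, not the runtime's API):

    #include <cstdio>

    // Reduced stand-in for event_impl: only the bits the decision looks at.
    struct EventStub {
      bool ownedByCommand;  // still attached to an unenqueued command
      bool hasNativeHandle; // false for host tasks / not-yet-enqueued work
    };

    enum class Tracking { None, Shared, Weak };

    // Rough model of queue_impl::addEvent after the patch: with piQueueFinish
    // available (no OOO emulation) the queue only keeps its own record of
    // events that piQueueFinish cannot cover.
    Tracking decideTracking(const EventStub &E, bool EmulateOOO) {
      if (!E.ownedByCommand)
        return EmulateOOO ? Tracking::Shared : Tracking::None;
      if (EmulateOOO || !E.hasNativeHandle)
        return Tracking::Weak;
      return Tracking::None; // piQueueFinish will wait for it
    }

    int main() {
      EventStub HostTask{true, false};
      EventStub EnqueuedKernel{true, true};
      std::printf("host task: %d, enqueued kernel: %d\n",
                  (int)decideTracking(HostTask, /*EmulateOOO=*/false),
                  (int)decideTracking(EnqueuedKernel, /*EmulateOOO=*/false));
    }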
- else if (is_host() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { + else if (MEmulateOOO || EImpl->getHandleRef() == nullptr) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); @@ -320,7 +319,7 @@ void queue_impl::addEvent(const event &Event) { /// but some events have no other owner. In this case, /// addSharedEvent will have the queue track the events via a shared pointer. void queue_impl::addSharedEvent(const event &Event) { - assert(is_host() || MEmulateOOO); + assert(MEmulateOOO); std::lock_guard Lock(MMutex); // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not @@ -355,8 +354,7 @@ event queue_impl::submit_impl(const std::function &CGF, bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess) { - handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue, - CallerNeedsEvent); + handler Handler(Self, PrimaryQueue, SecondaryQueue, false, CallerNeedsEvent); Handler.saveCodeLoc(Loc); { @@ -370,6 +368,9 @@ event queue_impl::submit_impl(const std::function &CGF, const CG::CGTYPE Type = Handler.getType(); event Event = detail::createSyclObjFromImpl( std::make_shared()); + std::vector Streams; + if (Type == CG::Kernel) + Streams = std::move(Handler.MStreamStorage); if (PostProcess) { bool IsKernel = Type == CG::Kernel; @@ -387,6 +388,19 @@ event queue_impl::submit_impl(const std::function &CGF, finalizeHandler(Handler, Event); addEvent(Event); + + auto EventImpl = detail::getSyclObjImpl(Event); + for (auto &Stream : Streams) { + // We don't want stream flushing to be blocking operation that is why submit + // a host task to print stream buffer. It will fire up as soon as the kernel + // finishes execution. + event FlushEvent = submit_impl( + [&](handler &ServiceCGH) { Stream->generateFlushCommand(ServiceCGH); }, + Self, PrimaryQueue, SecondaryQueue, /*CallerNeedsEvent*/ true, Loc, {}); + EventImpl->attachEventToComplete(detail::getSyclObjImpl(FlushEvent)); + registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent)); + } + return Event; } @@ -438,9 +452,6 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, &EventImpl->getHandleRef(), EventImpl); } - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr : MExtGraphDeps.LastEventPtr; @@ -494,19 +505,7 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, xpti_at::active, &QWaitInstanceNo); IId = QWaitInstanceNo; if (WaitEvent) { - device D = get_device(); - std::string DevStr; - if (getSyclObjImpl(D)->is_host()) - DevStr = "HOST"; - else if (D.is_cpu()) - DevStr = "CPU"; - else if (D.is_gpu()) - DevStr = "GPU"; - else if (D.is_accelerator()) - DevStr = "ACCELERATOR"; - else - DevStr = "UNKNOWN"; - xpti::addMetadata(WaitEvent, "sycl_device_type", DevStr); + xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this)); if (HasSourceInfo) { xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName()); xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName()); @@ -598,7 +597,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // directly. 
Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. - const bool SupportsPiFinish = !is_host() && !MEmulateOOO; + const bool SupportsPiFinish = !MEmulateOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { if (std::shared_ptr EventImplSharedPtr = @@ -667,15 +666,13 @@ bool queue_impl::ext_oneapi_empty() const { info::event_command_status::complete; } - // Check the status of the backend queue if this is not a host queue. - if (!is_host()) { - pi_bool IsReady = false; - getPlugin()->call( - MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, - nullptr); - if (!IsReady) - return false; - } + // Check the status of the backend queue. + pi_bool IsReady = false; + getPlugin()->call( + MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, + nullptr); + if (!IsReady) + return false; // We may have events like host tasks which are not submitted to the backend // queue so we need to get their status separately. @@ -689,7 +686,7 @@ bool queue_impl::ext_oneapi_empty() const { EventImplWeakPtrIt != MEventsWeak.end(); ++EventImplWeakPtrIt) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtrIt->lock()) - if (EventImplSharedPtr->is_host() && + if (EventImplSharedPtr->isHost() && EventImplSharedPtr ->get_info() != info::event_command_status::complete) @@ -733,8 +730,8 @@ void queue_impl::doUnenqueuedCommandCleanup( std::remove_if( Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(), [](const EventImplPtr &CommandEvent) { - return (CommandEvent->is_host() ? CommandEvent->isCompleted() - : CommandEvent->isEnqueued()); + return (CommandEvent->isHost() ? CommandEvent->isCompleted() + : CommandEvent->isEnqueued()); }), Deps.UnenqueuedCmdEvents.end()); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 4878134ec1e92..ccaf52cccd408 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -14,9 +14,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -26,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -92,7 +93,7 @@ class queue_impl { /// \param PropList is a list of properties to use for queue construction. queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList) - : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList){}; + : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList) {}; /// Constructs a SYCL queue with an async_handler and property_list provided /// form a device and a context. 
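ext_oneapi_empty() above now always asks the backend queue first and then separately scans the tracked events for unfinished host tasks, which never reach the backend. A rough sketch of that shape (queueEmpty and EventStub are stand-ins; the real code queries PI_EXT_ONEAPI_QUEUE_INFO_EMPTY and walks the queue's weakly tracked events):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Stand-in for an event the queue still tracks itself (host tasks never
    // reach the backend queue, so the backend query cannot see them).
    struct EventStub {
      bool isHostTask;
      bool complete;
    };

    // The queue is empty only if the backend says so *and* no tracked
    // host-task event is still incomplete.
    bool queueEmpty(bool BackendSaysEmpty, const std::vector<EventStub> &Tracked) {
      if (!BackendSaysEmpty)
        return false;
      return std::none_of(Tracked.begin(), Tracked.end(), [](const EventStub &E) {
        return E.isHostTask && !E.complete;
      });
    }

    int main() {
      std::vector<EventStub> Tracked = {{/*isHostTask=*/true, /*complete=*/false}};
      std::printf("with a pending host task: %d\n",
                  (int)queueEmpty(/*BackendSaysEmpty=*/true, Tracked));
      Tracked[0].complete = true;
      std::printf("after the host task finished: %d\n",
                  (int)queueEmpty(true, Tracked));
    }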
@@ -106,7 +107,7 @@ class queue_impl { queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler), - MPropList(PropList), MHostQueue(MDevice->is_host()), + MPropList(PropList), MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -122,8 +123,7 @@ class queue_impl { if (MDevice->has(aspect::queue_profiling)) { // When piGetDeviceAndHostTimer is not supported, compute the // profiling time OpenCL version < 2.1 case - if (!getDeviceImplPtr()->is_host() && - !getDeviceImplPtr()->isGetDeviceAndHostTimerSupported()) + if (!getDeviceImplPtr()->isGetDeviceAndHostTimerSupported()) MFallbackProfiling = true; } else { throw sycl::exception(make_error_code(errc::feature_not_supported), @@ -152,7 +152,7 @@ class queue_impl { "Cannot enable fusion if device does not support fusion"); } if (!Context->isDeviceValid(Device)) { - if (!Context->is_host() && Context->getBackend() == backend::opencl) + if (Context->getBackend() == backend::opencl) throw sycl::invalid_object_error( "Queue cannot be constructed with the given context and device " "since the device is not a member of the context (descendants of " @@ -164,13 +164,12 @@ class queue_impl { "descendant of its member.", PI_ERROR_INVALID_DEVICE); } - if (!MHostQueue) { - const QueueOrder QOrder = - MIsInorder ? QueueOrder::Ordered : QueueOrder::OOO; - MQueues.push_back(createQueue(QOrder)); - // This section is the second part of the instrumentation that uses the - // tracepoint information and notifies - } + + const QueueOrder QOrder = + MIsInorder ? QueueOrder::Ordered : QueueOrder::OOO; + MQueues.push_back(createQueue(QOrder)); + // This section is the second part of the instrumentation that uses the + // tracepoint information and notifies // We enable XPTI tracing events using the TLS mechanism; if the code // location data is available, then the tracing data will be rich. @@ -194,16 +193,13 @@ class queue_impl { if (MDevice) { xpti::addMetadata(TEvent, "sycl_device_name", MDevice->getDeviceName()); - xpti::addMetadata( - TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + xpti::addMetadata(TEvent, "sycl_device", + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); - if (!MHostQueue) - xpti::addMetadata(TEvent, "queue_handle", - reinterpret_cast(getHandleRef())); + xpti::addMetadata(TEvent, "queue_handle", + reinterpret_cast(getHandleRef())); }); // Also publish to TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); @@ -259,15 +255,12 @@ class queue_impl { if (MDevice) { xpti::addMetadata(TEvent, "sycl_device_name", MDevice->getDeviceName()); - xpti::addMetadata( - TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + xpti::addMetadata(TEvent, "sycl_device", + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); - if (!MHostQueue) - xpti::addMetadata(TEvent, "queue_handle", getHandleRef()); + xpti::addMetadata(TEvent, "queue_handle", getHandleRef()); }); // Also publish to TLS before notification xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); @@ -285,7 +278,7 @@ class queue_impl { /// \param AsyncHandler is a SYCL asynchronous exception handler. 
queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler) - : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false), + : MContext(Context), MAsyncHandler(AsyncHandler), MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -305,7 +298,6 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), - MHostQueue(false), MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -331,10 +323,8 @@ class queue_impl { } #endif throw_asynchronous(); - if (!MHostQueue) { - cleanup_fusion_cmd(); - getPlugin()->call(MQueues[0]); - } + cleanup_fusion_cmd(); + getPlugin()->call(MQueues[0]); } catch (std::exception &e) { __SYCL_REPORT_EXCEPTION_TO_STREAM("exception in ~queue_impl", e); } @@ -342,11 +332,6 @@ class queue_impl { /// \return an OpenCL interoperability queue handle. cl_command_queue get() { - if (MHostQueue) { - throw invalid_object_error( - "This instance of queue doesn't support OpenCL interoperability", - PI_ERROR_INVALID_QUEUE); - } getPlugin()->call(MQueues[0]); return pi::cast(MQueues[0]); } @@ -365,16 +350,11 @@ class queue_impl { /// \return an associated SYCL device. device get_device() const { return createSyclObjFromImpl(MDevice); } - /// \return true if this queue is a SYCL host queue. - bool is_host() const { return MHostQueue; } - /// \return true if the discard event property was set at time of creation. bool hasDiscardEventsProperty() const { return MDiscardEvents; } /// \return true if this queue allows for discarded events. - bool supportsDiscardingPiEvents() const { - return MHostQueue ? true : MIsInorder; - } + bool supportsDiscardingPiEvents() const { return MIsInorder; } bool isInOrder() const { return MIsInorder; } @@ -706,7 +686,7 @@ class queue_impl { MExceptions.PushBack(ExceptionPtr); } - ThreadPool &getThreadPool() { + static ThreadPool &getThreadPool() { return GlobalHandler::instance().getHostTaskThreadPool(); } @@ -787,6 +767,11 @@ class queue_impl { // tasks and host tasks is applicable for out of order queues only. Not neede // for in order ones. void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask); + + static ContextImplPtr getContext(const QueueImplPtr &Queue) { + return Queue ? Queue->getContextImplPtr() : nullptr; + } + // Must be called under MMutex protection void doUnenqueuedCommandCleanup( const std::shared_ptr @@ -848,13 +833,12 @@ class queue_impl { EventToBuildDeps = getSyclObjImpl(EventRet); } else { const CG::CGTYPE Type = Handler.getType(); - + std::lock_guard Lock{MMutex}; // The following code supports barrier synchronization if host task is // involved in the scenario. Native barriers cannot handle host task // dependency so in the case where some commands were not enqueued // (blocked), we track them to prevent barrier from being enqueued // earlier. - std::lock_guard Lock{MMutex}; { std::lock_guard RequestLock(MMissedCleanupRequestsMtx); for (auto &UpdatedGraph : MMissedCleanupRequests) @@ -975,7 +959,6 @@ class queue_impl { /// Iterator through MQueues. size_t MNextQueueIdx = 0; - const bool MHostQueue = false; /// Indicates that a native out-of-order queue could not be created and we /// need to emulate it with multiple native in-order queues. 
bool MEmulateOOO = false; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index fd467a5b178db..b9ff259906f95 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -56,7 +56,7 @@ namespace detail { // Global graph for the application extern xpti::trace_event_data_t *GSYCLGraphEvent; -bool CurrentCodeLocationValid() { +static bool CurrentCodeLocationValid() { detail::tls_code_loc_t Tls; auto CodeLoc = Tls.query(); auto FileName = CodeLoc.fileName(); @@ -74,8 +74,33 @@ void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent, static_cast(TraceEvent), InstanceID, Addr); } + +static size_t deviceToID(const device &Device) { + return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); +} + +static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { + xpti::addMetadata(TraceEvent, "sycl_device_type", + queueDeviceToString(Queue.get())); + if (Queue) { + xpti::addMetadata(TraceEvent, "sycl_device", + deviceToID(Queue->get_device())); + xpti::addMetadata(TraceEvent, "sycl_device_name", + getSyclObjImpl(Queue->get_device())->getDeviceName()); + } +} + +static unsigned long long getQueueID(const QueueImplPtr &Queue) { + return Queue ? Queue->getQueueID() : 0; +} #endif +static ContextImplPtr getContext(const QueueImplPtr &Queue) { + if (Queue) + return Queue->getContextImplPtr(); + return nullptr; +} + #ifdef __SYCL_ENABLE_GNU_DEMANGLING struct DemangleHandle { char *p; @@ -95,19 +120,6 @@ static std::string demangleKernelName(std::string Name) { static std::string demangleKernelName(std::string Name) { return Name; } #endif -static std::string deviceToString(device Device) { - if (getSyclObjImpl(Device)->is_host()) - return "HOST"; - else if (Device.is_cpu()) - return "CPU"; - else if (Device.is_gpu()) - return "GPU"; - else if (Device.is_accelerator()) - return "ACCELERATOR"; - else - return "UNKNOWN"; -} - void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, std::function Func) { @@ -142,15 +154,6 @@ void applyFuncOnFilteredArgs( } } -#ifdef XPTI_ENABLE_INSTRUMENTATION -static size_t deviceToID(const device &Device) { - if (getSyclObjImpl(Device)->is_host()) - return 0; - else - return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); -} -#endif - static std::string accessModeToString(access::mode Mode) { switch (Mode) { case access::mode::read: @@ -240,11 +243,8 @@ Command::getPiEvents(const std::vector &EventImpls) const { // At this stage dependency is definitely pi task and need to check if // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - // MWorkerQueue in command is always not null. So check if - // EventImpl->getWorkerQueue != nullptr is implicit. - if (EventImpl->getWorkerQueue() == WorkerQueue && - WorkerQueue->isInOrder() && !isHostTask()) + if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue && + MWorkerQueue->isInOrder() && !isHostTask()) continue; RetPiEvents.push_back(EventImpl->getHandleRef()); @@ -265,7 +265,7 @@ std::vector Command::getPiEventsBlocking( // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. // Skip host task and NOP events also. 
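The new commands.cpp helpers above (addDeviceMetadata, getQueueID, getContext) share one convention: a null queue means host, so each degrades to a neutral value instead of dereferencing the pointer: queue id 0, no device handle or name, a null context. A simplified sketch (queue_impl here is a two-field stub and the printf calls stand in for xpti::addMetadata):

    #include <cstdio>
    #include <memory>
    #include <string>

    // Simplified stand-ins for queue_impl and the XPTI metadata sink.
    struct queue_impl {
      unsigned long long ID;
      std::string DeviceName;
    };
    using QueueImplPtr = std::shared_ptr<queue_impl>;

    // A null queue means "host", so the helpers return a neutral value
    // instead of dereferencing the pointer.
    unsigned long long getQueueID(const QueueImplPtr &Queue) {
      return Queue ? Queue->ID : 0; // 0 stays reserved for host activity
    }

    void addDeviceMetadata(const QueueImplPtr &Queue) {
      std::printf("sycl_device_type: %s\n", Queue ? "device" : "host");
      if (Queue) // device id/name only exist for a real backend queue
        std::printf("sycl_device_name: %s\n", Queue->DeviceName.c_str());
    }

    int main() {
      QueueImplPtr HostSide; // null
      auto Gpu = std::make_shared<queue_impl>(queue_impl{42, "some_gpu"});
      addDeviceMetadata(HostSide);
      std::printf("queue_id: %llu\n", getQueueID(HostSide));
      addDeviceMetadata(Gpu);
      std::printf("queue_id: %llu\n", getQueueID(Gpu));
    }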
- if (!EventImpl->isContextInitialized() || EventImpl->is_host() || + if (EventImpl->isDefaultConstructed() || EventImpl->isHost() || EventImpl->isNOP()) continue; // In this path nullptr native event means that the command has not been @@ -282,11 +282,8 @@ std::vector Command::getPiEventsBlocking( // At this stage dependency is definitely pi task and need to check if // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - // MWorkerQueue in command is always not null. So check if - // EventImpl->getWorkerQueue != nullptr is implicit. - if (EventImpl->getWorkerQueue() == WorkerQueue && - WorkerQueue->isInOrder() && !isHostTask()) + if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue && + MWorkerQueue->isInOrder() && !isHostTask()) continue; RetPiEvents.push_back(EventImpl->getHandleRef()); @@ -324,7 +321,7 @@ struct EnqueueNativeCommandData { std::function func; }; -void InteropFreeFunc(pi_queue InteropQueue, void *InteropData) { +void InteropFreeFunc(pi_queue, void *InteropData) { auto *Data = reinterpret_cast(InteropData); return Data->func(Data->ih); } @@ -358,12 +355,12 @@ class DispatchHostTask { PluginWithEvents.first->call(RawEvents.size(), RawEvents.data()); } catch (const sycl::exception &) { - CGHostTask &HostTask = static_cast(MThisCmd->getCG()); - HostTask.MQueue->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + std::current_exception()); return false; } catch (...) { - CGHostTask &HostTask = static_cast(MThisCmd->getCG()); - HostTask.MQueue->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + std::current_exception()); return false; } } @@ -405,7 +402,8 @@ class DispatchHostTask { std::exception_ptr EPtr = std::make_exception_ptr(sycl::exception( make_error_code(errc::runtime), std::string("Couldn't wait for host-task's dependencies"))); - HostTask.MQueue->reportAsyncException(EPtr); + + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(EPtr); // reset host-task's lambda and quit HostTask.MHostTask.reset(); Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); @@ -415,6 +413,8 @@ class DispatchHostTask { try { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { + assert(HostTask.MQueue && + "Host task submissions should have an associated queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; @@ -465,7 +465,8 @@ class DispatchHostTask { } } #endif - HostTask.MQueue->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + CurrentException); } HostTask.MHostTask.reset(); @@ -482,7 +483,8 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) 
{ auto CurrentException = std::current_exception(); - HostTask.MQueue->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + CurrentException); } } }; @@ -495,8 +497,13 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { +#ifndef NDEBUG + for (const EventImplPtr &Event : EventImpls) + assert(!Event->isHost() && + "Only non-host events are expected to be waited for here"); +#endif if (!EventImpls.empty()) { - if (Queue->is_host()) { + if (!Queue) { // Host queue can wait for events from different contexts, i.e. it may // contain events with different contexts in its MPreparedDepsEvents. // OpenCL 2.1 spec says that clWaitForEvents will return @@ -527,15 +534,9 @@ void Command::waitForEvents(QueueImplPtr Queue, RawEvents.size(), RawEvents.data()); } } else { -#ifndef NDEBUG - for (const EventImplPtr &Event : EventImpls) - assert(Event->getContextImpl().get() && - "Only non-host events are expected to be waited for here"); -#endif - std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); const PluginPtr &Plugin = Queue->getPlugin(); if (MEvent != nullptr) @@ -562,7 +563,8 @@ Command::Command( MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); MEvent->setCommand(this); - MEvent->setContextImpl(MQueue->getContextImplPtr()); + if (MQueue) + MEvent->setContextImpl(MQueue->getContextImplPtr()); MEvent->setStateIncomplete(); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; @@ -746,16 +748,14 @@ void Command::makeTraceEventEpilog() { Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector &ToCleanUp) { - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); - - // 1. Async work is not supported for host device. - // 2. Non-host events can be ignored if they are not fully initialized. - // 3. Some types of commands do not produce PI events after they are - // enqueued - // (e.g. alloca). Note that we can't check the pi event to make that - // distinction since the command might still be unenqueued at this point. - bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized()); + const ContextImplPtr &WorkerContext = getWorkerContext(); + + // 1. Non-host events can be ignored if they are not fully initialized. + // 2. Some types of commands do not produce PI events after they are + // enqueued (e.g. alloca). Note that we can't check the pi event to make that + // distinction since the command might still be unenqueued at this point. 
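The two rules in the comment above decide whether a dependency can be expressed as a PI event at all; only then does the context comparison further below choose between waiting on that event and inserting a connection host task. A reduced model (the stand-in types and classify() omit the prepared-deps bookkeeping of the real processDepEvent):

    #include <cstdio>

    // Reduced model of the dependency classification in Command::processDepEvent.
    struct DepEventStub {
      bool isHost;              // host-task event, never has a PI event
      bool defaultConstructed;  // never initialized, nothing to wait on
      bool cmdProducesPiEvent;  // false for e.g. alloca commands
      int context;              // 0 plays the role of the null (host) context
    };

    enum class DepAction { HandleOnHost, WaitOnPiEvent, ConnectViaHostTask };

    DepAction classify(const DepEventStub &Dep, int WorkerContext) {
      bool PiEventExpected = !Dep.isHost && !Dep.defaultConstructed &&
                             Dep.cmdProducesPiEvent;
      if (!PiEventExpected)
        return DepAction::HandleOnHost; // waited on directly, no PI event
      // A connection command is only needed when both contexts are real
      // backend contexts and they differ.
      if (WorkerContext != 0 && Dep.context != WorkerContext)
        return DepAction::ConnectViaHostTask;
      return DepAction::WaitOnPiEvent;
    }

    int main() {
      DepEventStub SameContext{false, false, true, 1};
      DepEventStub OtherContext{false, false, true, 2};
      DepEventStub AllocaDep{false, false, false, 1};
      std::printf("%d %d %d\n", (int)classify(SameContext, 1),
                  (int)classify(OtherContext, 1), (int)classify(AllocaDep, 1));
    }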
+ bool PiEventExpected = + (!DepEvent->isHost() && !DepEvent->isDefaultConstructed()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -770,7 +770,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext != WorkerContext && !WorkerContext->is_host()) { + if (DepEventContext != WorkerContext && WorkerContext) { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); } else @@ -779,15 +779,12 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, return ConnectionCmd; } -const ContextImplPtr &Command::getWorkerContext() const { +ContextImplPtr Command::getWorkerContext() const { + if (!MQueue) + return nullptr; return MQueue->getContextImplPtr(); } -const QueueImplPtr &Command::getWorkerQueue() const { - assert(MWorkerQueue && "MWorkerQueue must not be nullptr"); - return MWorkerQueue; -} - bool Command::producesPiEvent() const { return true; } bool Command::supportsPostEnqueueCleanup() const { return true; } @@ -926,7 +923,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, else { MEvent->setEnqueued(); if (MShouldCompleteEventIfPossible && - (MEvent->is_host() || MEvent->getHandleRef() == nullptr)) + (MEvent->isHost() || MEvent->getHandleRef() == nullptr)) MEvent->setComplete(); // Consider the command is successfully enqueued if return code is @@ -1042,16 +1039,12 @@ void AllocaCommandBase::emitInstrumentationData() { // Set the relevant meta data properties for this command if (MTraceEvent && MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); } #endif } @@ -1101,7 +1094,7 @@ pi_int32 AllocaCommand::enqueueImp() { void *HostPtr = nullptr; if (!MIsLeaderAlloca) { - if (MQueue->is_host()) { + if (!MQueue) { // Do not need to make allocation if we have a linked device allocation Command::waitForEvents(MQueue, EventImpls, Event); @@ -1111,9 +1104,9 @@ pi_int32 AllocaCommand::enqueueImp() { } // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. 
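getWorkerContext() above now returns the ContextImplPtr by value rather than by const reference, presumably because a host command has no context to refer to: handing back nullptr through a reference would mean binding to a temporary shared_ptr. A self-contained sketch of the same shape with stub types (context_impl, queue_impl and Command below are simplified stand-ins):

    #include <cassert>
    #include <memory>

    struct context_impl {};
    struct queue_impl {
      std::shared_ptr<context_impl> Ctx = std::make_shared<context_impl>();
      const std::shared_ptr<context_impl> &getContextImplPtr() const { return Ctx; }
    };
    using ContextImplPtr = std::shared_ptr<context_impl>;
    using QueueImplPtr = std::shared_ptr<queue_impl>;

    struct Command {
      QueueImplPtr MQueue; // null for host-side commands after this patch

      // Returned by value: a host command has no context to reference, so the
      // function can hand back an empty shared_ptr without dangling.
      ContextImplPtr getWorkerContext() const {
        if (!MQueue)
          return nullptr;
        return MQueue->getContextImplPtr();
      }
    };

    int main() {
      Command HostCmd{nullptr};
      Command DeviceCmd{std::make_shared<queue_impl>()};
      assert(!HostCmd.getWorkerContext() && "host command has no worker context");
      assert(DeviceCmd.getWorkerContext() && "device command exposes its context");
    }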
- MMemAllocation = MemoryManager::allocate( - MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr, - std::move(EventImpls), Event); + MMemAllocation = MemoryManager::allocate(getContext(MQueue), getSYCLMemObj(), + MInitFromUserData, HostPtr, + std::move(EventImpls), Event); return PI_SUCCESS; } @@ -1122,7 +1115,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "ALLOCA ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1171,7 +1164,7 @@ void AllocaSubBufCommand::emitInstrumentationData() { xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]); xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1181,7 +1174,7 @@ void *AllocaSubBufCommand::getMemAllocation() const { // In some cases parent`s memory allocation might change (e.g., after // map/unmap operations). If parent`s memory allocation changes, sub-buffer // memory allocation should be changed as well. - if (MQueue->is_host()) { + if (!MQueue) { return static_cast( static_cast(MParentAlloca->getMemAllocation()) + MRequirement.MOffsetInBytes); @@ -1195,7 +1188,7 @@ pi_int32 AllocaSubBufCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MMemAllocation = MemoryManager::allocateMemSubBuffer( - MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(), + getContext(MQueue), MParentAlloca->getMemAllocation(), MRequirement.MElemSize, MRequirement.MOffsetInBytes, MRequirement.MAccessRange, std::move(EventImpls), Event); @@ -1208,8 +1201,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device()) - << "\\n"; + Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1242,17 +1234,13 @@ void ReleaseCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "allocation_type", commandToName(MAllocaCmd->getType())); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1266,9 +1254,9 @@ pi_int32 ReleaseCommand::enqueueImp() { // On host side we only allocate memory for full buffers. // Thus, deallocating sub buffers leads to double memory freeing. 
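The comment above states the invariant the sub-buffer check just below relies on: host allocations exist only for full buffers, so a host sub-buffer shares its parent's memory and must not be released on its own. A reduced model of that decision (skipRelease and AllocaKind are illustrative names, not the scheduler's types):

    #include <cstdio>

    // Reduced model of the sub-buffer check in ReleaseCommand::enqueueImp.
    enum class AllocaKind { FullBuffer, SubBuffer };

    // On the host side (null queue) only full buffers own an allocation; a
    // host sub-buffer aliases its parent's memory, so releasing it separately
    // would free the same pointer twice.
    bool skipRelease(bool AlreadySkipping, bool QueueIsNull, AllocaKind Kind) {
      return AlreadySkipping || (QueueIsNull && Kind == AllocaKind::SubBuffer);
    }

    int main() {
      std::printf("host sub-buffer release skipped: %d\n",
                  (int)skipRelease(false, /*QueueIsNull=*/true,
                                   AllocaKind::SubBuffer));
      std::printf("device sub-buffer release skipped: %d\n",
                  (int)skipRelease(false, /*QueueIsNull=*/false,
                                   AllocaKind::SubBuffer));
    }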
- SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF; + SkipRelease |= !MQueue && MAllocaCmd->getType() == ALLOCA_SUB_BUF; - const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host(); + const bool CurAllocaIsHost = !MAllocaCmd->getQueue(); bool NeedUnmap = false; if (MAllocaCmd->MLinkedAllocaCmd) { @@ -1292,7 +1280,7 @@ pi_int32 ReleaseCommand::enqueueImp() { : MAllocaCmd->getQueue(); EventImplPtr UnmapEventImpl(new event_impl(Queue)); - UnmapEventImpl->setContextImpl(Queue->getContextImplPtr()); + UnmapEventImpl->setContextImpl(getContext(Queue)); UnmapEventImpl->setStateIncomplete(); sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef(); @@ -1315,9 +1303,9 @@ pi_int32 ReleaseCommand::enqueueImp() { if (SkipRelease) Command::waitForEvents(MQueue, EventImpls, Event); else { - MemoryManager::release( - MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(), - MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event); + MemoryManager::release(getContext(MQueue), MAllocaCmd->getSYCLMemObj(), + MAllocaCmd->getMemAllocation(), + std::move(EventImpls), Event); } return PI_SUCCESS; } @@ -1326,7 +1314,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "RELEASE ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1366,16 +1354,12 @@ void MapMemObject::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1385,7 +1369,7 @@ pi_int32 MapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); *MDstPtr = MemoryManager::map( @@ -1400,7 +1384,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "MAP ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1431,16 +1415,12 @@ void UnMapMemObject::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - 
getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1462,16 +1442,16 @@ bool UnMapMemObject::producesPiEvent() const { // an event waitlist and Level Zero plugin attempts to batch these commands, // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. - return MQueue->getDeviceImplPtr()->getBackend() != - backend::ext_oneapi_level_zero || - MEvent->getHandleRef() != nullptr; + return MQueue && (MQueue->getDeviceImplPtr()->getBackend() != + backend::ext_oneapi_level_zero || + MEvent->getHandleRef() != nullptr); } pi_int32 UnMapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(), @@ -1485,7 +1465,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "UNMAP ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1507,11 +1487,11 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstAllocaCmd(DstAllocaCmd) { - if (!MSrcQueue->is_host()) { + if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } - MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue; + MWorkerQueue = !MQueue ? MSrcQueue : MQueue; MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); @@ -1528,31 +1508,26 @@ void MemCpyCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get())); - xpti::addMetadata( - CmdTraceEvent, "copy_to", - reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get())); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); + xpti::addMetadata(CmdTraceEvent, "copy_to", + MQueue ? 
deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif } -const ContextImplPtr &MemCpyCommand::getWorkerContext() const { - return getWorkerQueue()->getContextImplPtr(); +ContextImplPtr MemCpyCommand::getWorkerContext() const { + if (!MWorkerQueue) + return nullptr; + return MWorkerQueue->getContextImplPtr(); } bool MemCpyCommand::producesPiEvent() const { @@ -1571,7 +1546,7 @@ bool MemCpyCommand::producesPiEvent() const { // an event waitlist and Level Zero plugin attempts to batch these commands, // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. - return MQueue->is_host() || + return !MQueue || MQueue->getDeviceImplPtr()->getBackend() != backend::ext_oneapi_level_zero || MEvent->getHandleRef() != nullptr; @@ -1584,7 +1559,7 @@ pi_int32 MemCpyCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); MemoryManager::copy( MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), @@ -1600,11 +1575,9 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n"; - Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host() - << "\\n"; - Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host() - << "\\n"; + Stream << "MEMCPY ON " << queueDeviceToString(MQueue.get()) << "\\n"; + Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; + Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; Stream << "\"];" << std::endl; @@ -1658,7 +1631,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue.get()) << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1685,11 +1658,11 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, : Command(CommandType::COPY_MEMORY, std::move(DstQueue)), MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) { - if (!MSrcQueue->is_host()) { + if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } - MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue; + MWorkerQueue = !MQueue ? 
MSrcQueue : MQueue; MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); @@ -1706,35 +1679,30 @@ void MemCpyCommandHost::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get())); - xpti::addMetadata( - CmdTraceEvent, "copy_to", - reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get())); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); + xpti::addMetadata(CmdTraceEvent, "copy_to", + MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif } -const ContextImplPtr &MemCpyCommandHost::getWorkerContext() const { - return getWorkerQueue()->getContextImplPtr(); +ContextImplPtr MemCpyCommandHost::getWorkerContext() const { + if (!MWorkerQueue) + return nullptr; + return MWorkerQueue->getContextImplPtr(); } pi_int32 MemCpyCommandHost::enqueueImp() { - const QueueImplPtr &Queue = getWorkerQueue(); + const QueueImplPtr &Queue = MWorkerQueue; waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); @@ -1750,7 +1718,7 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); MemoryManager::copy( MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange, @@ -1761,8 +1729,7 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } -EmptyCommand::EmptyCommand(QueueImplPtr Queue) - : Command(CommandType::EMPTY_TASK, std::move(Queue)) { +EmptyCommand::EmptyCommand() : Command(CommandType::EMPTY_TASK, nullptr) { emitInstrumentationDataProxy(); } @@ -1805,18 +1772,13 @@ void EmptyCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1845,7 +1807,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - 
Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1878,18 +1840,13 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -2041,6 +1998,7 @@ void instrumentationAddExtraKernelMetadata( if (!SyclKernel->isCreatedFromSource()) EliminatedArgMask = SyclKernel->getKernelArgMask(); } else { + assert(Queue && "Kernel submissions should have an associated queue"); std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName); @@ -2105,12 +2063,7 @@ void instrumentationFillCommonData(const std::string &KernelName, if (CGKernelInstanceNo > 1) return; - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(Queue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(Queue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(Queue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, Queue); if (!KernelName.empty()) { xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName); } @@ -2160,9 +2113,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - Queue->getQueueID()); - + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue)); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, SyclKernel, Queue, CGArgs); @@ -2207,7 +2158,7 @@ void ExecCGCommand::emitInstrumentationData() { if (CmdTraceEvent) { xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -2230,7 +2181,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "EXEC CG ON " << queueDeviceToString(MQueue.get()) << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2326,8 +2277,7 @@ void SetArgBasedOnType( const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, - size_t NextTrueIndex) { + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) { switch (Arg.MType) { case 
kernel_param_kind_t::kind_stream: break; @@ -2391,13 +2341,6 @@ void SetArgBasedOnType( break; } case kernel_param_kind_t::kind_specialization_constants_buffer: { - if (IsHost) { - throw sycl::exception( - sycl::make_error_code(sycl::errc::feature_not_supported), - "SYCL2020 specialization constants are not yet supported on host " - "device " + - codeToString(PI_ERROR_INVALID_OPERATION)); - } assert(DeviceImageImpl != nullptr); sycl::detail::pi::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); @@ -2429,13 +2372,13 @@ static pi_result SetKernelParamsAndLaunch( const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, bool IsCooperative) { + assert(Queue && "Kernel submissions should have an associated queue"); const PluginPtr &Plugin = Queue->getPlugin(); auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc, - Queue->get_context(), Queue->is_host(), Arg, - NextTrueIndex); + Queue->get_context(), Arg, NextTrueIndex); }; applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); @@ -2555,7 +2498,7 @@ pi_int32 enqueueImpCommandBufferKernel( &getMemAllocationFunc](sycl::detail::ArgDesc &Arg, size_t NextTrueIndex) { sycl::detail::SetArgBasedOnType(Plugin, PiKernel, DeviceImageImpl, - getMemAllocationFunc, Ctx, false, Arg, + getMemAllocationFunc, Ctx, Arg, NextTrueIndex); }; // Copy args for modification @@ -2620,7 +2563,7 @@ pi_int32 enqueueImpKernel( const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative) { - + assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); auto DeviceImpl = Queue->getDeviceImplPtr(); @@ -2735,6 +2678,8 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read) { + assert(Queue && + "ReadWrite host pipe submissions should have an associated queue"); detail::HostPipeMapEntry *hostPipeEntry = ProgramManager::getInstance().getHostPipeEntry(PipeName); @@ -2784,6 +2729,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, } pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { + assert(MQueue && "Command buffer enqueue should have an associated queue"); // Wait on host command dependencies waitForPreparedHostEvents(); @@ -2791,7 +2737,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { // submissions of the command buffer itself will not receive dependencies on // them, e.g. initial copies from host to device std::vector EventImpls = MPreparedDepsEvents; - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); std::vector RawEvents = getPiEvents(EventImpls); if (!RawEvents.empty()) { const PluginPtr &Plugin = MQueue->getPlugin(); @@ -2951,13 +2897,13 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); // We can omit creating a PI event and create a "discarded" event if either // the queue has the discard property or the command has been explicitly // marked as not needing an event, e.g. 
if the user did not ask for one, and // if the queue supports discarded PI event and there are no requirements. - bool DiscardPiEvent = (MQueue->MDiscardEvents || !MEventNeeded) && + bool DiscardPiEvent = MQueue && (MQueue->MDiscardEvents || !MEventNeeded) && MQueue->supportsDiscardingPiEvents() && MCommandGroup->getRequirements().size() == 0; sycl::detail::pi::PiEvent *Event = @@ -2979,10 +2925,9 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { MemoryManager::copy( AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, - Req->MElemSize, Copy->getDst(), - Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, - Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, - Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent); + Req->MElemSize, Copy->getDst(), nullptr, Req->MDims, Req->MAccessRange, + Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize, + std::move(RawEvents), MEvent->getHandleRef(), MEvent); return PI_SUCCESS; } @@ -2991,11 +2936,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Requirement *Req = (Requirement *)(Copy->getDst()); AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - Scheduler::getInstance().getDefaultHostQueue(); - MemoryManager::copy( - AllocaCmd->getSYCLMemObj(), Copy->getSrc(), - Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, + AllocaCmd->getSYCLMemObj(), Copy->getSrc(), nullptr, Req->MDims, Req->MAccessRange, Req->MAccessRange, /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, @@ -3035,29 +2977,12 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Kernel: { + assert(MQueue && "Kernel submissions should have an associated queue"); CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; std::vector &Args = ExecKernel->MArgs; - if (MQueue->is_host()) { - for (ArgDesc &Arg : Args) - if (kernel_param_kind_t::kind_accessor == Arg.MType) { - Requirement *Req = (Requirement *)(Arg.MPtr); - AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - Req->MData = AllocaCmd->getMemAllocation(); - } - if (!RawEvents.empty()) { - // Assuming that the events are for devices to the same Plugin. - const PluginPtr &Plugin = EventImpls[0]->getPlugin(); - Plugin->call(RawEvents.size(), &RawEvents[0]); - } - - ExecKernel->MHostKernel->call(NDRDesc, - getEvent()->getHostProfilingInfo()); - return PI_SUCCESS; - } - auto getMemAllocationFunc = [this](Requirement *Req) { AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); // getAllocaForReq may return nullptr if Req is a default constructed @@ -3173,8 +3098,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Req->MSYCLMemObj->MRecord->MAllocaCommands; for (AllocaCommandBase *AllocaCmd : AllocaCmds) - if (HostTask->MQueue->getContextImplPtr() == - AllocaCmd->getQueue()->getContextImplPtr()) { + if (getContext(HostTask->MQueue) == + getContext(AllocaCmd->getQueue())) { auto MemArg = reinterpret_cast(AllocaCmd->getMemAllocation()); ReqToMem.emplace_back(std::make_pair(Req, MemArg)); @@ -3199,7 +3124,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { // submitted to report exception origin properly. 
copySubmissionCodeLocation(); - MQueue->getThreadPool().submit( + queue_impl::getThreadPool().submit( DispatchHostTask(this, std::move(ReqToMem), std::move(ReqPiMem))); MShouldCompleteEventIfPossible = false; @@ -3207,10 +3132,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Barrier: { - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. - return PI_SUCCESS; - } + assert(MQueue && "Barrier submission should have an associated queue"); const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); @@ -3220,12 +3142,12 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::BarrierWaitlist: { + assert(MQueue && "Barrier submission should have an associated queue"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = getPiEventsBlocking(Events); - if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) { - // NOP for host device. + if (PiEvents.empty()) { // If Events is empty, then the barrier has no effect. return PI_SUCCESS; } @@ -3288,6 +3210,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { typeSize, RawEvents, EventImpl, read); } case CG::CGTYPE::ExecCommandBuffer: { + assert(MQueue && + "Command buffer submissions should have an associated queue"); CGExecCommandBuffer *CmdBufferCG = static_cast(MCommandGroup.get()); if (MEvent != nullptr) @@ -3311,11 +3235,9 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreWait: { + assert(MQueue && + "Semaphore wait submissions should have an associated queue"); CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. - return PI_SUCCESS; - } const detail::PluginPtr &Plugin = MQueue->getPlugin(); auto OptWaitValue = SemWait->getWaitValue(); @@ -3327,11 +3249,9 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreSignal: { + assert(MQueue && + "Semaphore signal submissions should have an associated queue"); CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. - return PI_SUCCESS; - } const detail::PluginPtr &Plugin = MQueue->getPlugin(); auto OptSignalValue = SemSignal->getSignalValue(); @@ -3447,19 +3367,14 @@ void KernelFusionCommand::emitInstrumentationData() { // This function is called in the constructor of the command. At this point // the kernel fusion list is still empty, so we don't have a terrible lot of // information we could attach to this node here. 
- if (MFirstInstance && CmdTraceEvent) { - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); - } + if (MFirstInstance && CmdTraceEvent) + addDeviceMetadata(CmdTraceEvent, MQueue); + if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + getQueueID(MQueue)); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, @@ -3473,7 +3388,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n" + Stream << "KERNEL FUSION on " << queueDeviceToString(MQueue.get()) << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index ea1a5b5111149..471bf66264c54 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -33,7 +33,6 @@ class node_impl; namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION -bool CurrentCodeLocationValid(); void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr); @@ -223,11 +222,7 @@ class Command { /// Get the context of the queue this command will be submitted to. Could /// differ from the context of MQueue for memory copy commands. - virtual const ContextImplPtr &getWorkerContext() const; - - /// Get the queue this command will be submitted to. Could differ from MQueue - /// for memory copy commands. - const QueueImplPtr &getWorkerQueue() const; + virtual ContextImplPtr getWorkerContext() const; /// Returns true iff the command produces a PI event on non-host devices. virtual bool producesPiEvent() const; @@ -377,12 +372,12 @@ class Command { std::string MSubmissionFileName; std::string MSubmissionFunctionName; - // This flag allows to control whether host event should be set complete - // after successfull enqueue of command. Event is considered as host event if - // either it's is_host() return true or there is no backend representation - // of event (i.e. getHandleRef() return reference to nullptr value). - // By default the flag is set to true due to most of host operations are - // synchronous. The only asynchronous operation currently is host-task. + // This flag allows to control whether event should be set complete + // after successfull enqueue of command. Event is considered as "host" event + // if there is no backend representation of event (i.e. getHandleRef() return + // reference to nullptr value). By default the flag is set to true due to most + // of host operations are synchronous. The only asynchronous operation + // currently is host-task. bool MShouldCompleteEventIfPossible = true; /// Indicates that the node will be freed by graph cleanup. Such nodes should @@ -415,7 +410,7 @@ class Command { /// implement lock in the graph, or to merge several nodes into one. 
class EmptyCommand : public Command { public: - EmptyCommand(QueueImplPtr Queue); + EmptyCommand(); void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MRequirements[0]; } @@ -587,7 +582,7 @@ class MemCpyCommand : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - const ContextImplPtr &getWorkerContext() const final; + ContextImplPtr getWorkerContext() const final; bool producesPiEvent() const final; private: @@ -611,7 +606,7 @@ class MemCpyCommandHost : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - const ContextImplPtr &getWorkerContext() const final; + ContextImplPtr getWorkerContext() const final; private: pi_int32 enqueueImp() final; @@ -804,8 +799,7 @@ void SetArgBasedOnType( const detail::plugin &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, - size_t NextTrueIndex); + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex); void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index a17c45720733c..f8397016fce41 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -49,17 +49,18 @@ static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { LHS->MOffsetInBytes); } -static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { - // Consider two different host contexts to be the same to avoid additional - // allocation on the host - return LHS == RHS || (LHS->is_host() && RHS->is_host()); -} - /// Checks if current requirement is requirement for sub buffer. static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } +static bool isOnSameContext(const ContextImplPtr Context, + const QueueImplPtr &Queue) { + // Covers case for host usage (nullptr == nullptr) and existing device + // contexts comparison. + return Context == queue_impl::getContext(Queue); +} + /// Checks if the required access mode is allowed under the current one. 
static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { @@ -249,7 +250,7 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, "shouldn't lead to any enqueuing (no linked " "alloca or exceeding the leaf limit)."); } else - MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(), + MemObject->MRecord.reset(new MemObjRecord{queue_impl::getContext(Queue), LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -288,8 +289,8 @@ void Scheduler::GraphBuilder::addNodeToLeaves( UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - AllocaCommandBase *AllocaCmd = - findAllocaForReq(Record, Req, Queue->getContextImplPtr()); + auto Context = queue_impl::getContext(Queue); + AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Context); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); @@ -297,8 +298,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( // dependencies become invalid if requirement is stored by pointer. const Requirement *StoredReq = UpdateCommand->getRequirement(); - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq(Record, Req, Context); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -323,7 +323,7 @@ static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, assert(AllocaCmdSrc->MIsActive && "Expected source alloca command to be active"); - if (AllocaCmdSrc->getQueue()->is_host()) { + if (!AllocaCmdSrc->getQueue()) { UnMapMemObject *UnMapCmd = new UnMapMemObject( AllocaCmdDst, *AllocaCmdDst->getRequirement(), &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue()); @@ -351,8 +351,8 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if (!AllocaCmdDst) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + auto Context = queue_impl::getContext(Queue); + std::set Deps = findDepsForReq(Record, Req, Context); Deps.insert(AllocaCmdDst); // Get parent allocation of sub buffer to perform full copy of whole buffer if (IsSuitableSubReq(Req)) { @@ -368,8 +368,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext) && + bool Res = isOnSameContext(Record->MCurContext, AllocaCmd->getQueue()) && // Looking for a parent buffer alloca command AllocaCmd->getType() == Command::CommandType::ALLOCA; return Res; @@ -404,7 +403,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if ((Req->MAccessMode == access::mode::discard_write) || (Req->MAccessMode == access::mode::discard_read_write)) { - Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurContext = Context; return nullptr; } else { // Full copy of buffer is needed to avoid loss of data that may be caused @@ -426,15 +425,14 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue); for (Command *Cmd : ToCleanUp) cleanupCommand(Cmd); - Record->MCurContext = 
Queue->getContextImplPtr(); + Record->MCurContext = Context; return NewCmd; } Command *Scheduler::GraphBuilder::remapMemoryObject( MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd, std::vector &ToEnqueue) { - assert(HostAllocaCmd->getQueue()->is_host() && - "Host alloca command expected"); + assert(!HostAllocaCmd->getQueue() && "Host alloca command expected"); assert(HostAllocaCmd->MIsActive && "Active alloca command expected"); AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd; @@ -480,7 +478,6 @@ Command *Scheduler::GraphBuilder::remapMemoryObject( Command * Scheduler::GraphBuilder::addCopyBack(Requirement *Req, std::vector &ToEnqueue) { - QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue(); SYCLMemObjI *MemObj = Req->MSYCLMemObj; MemObjRecord *Record = getMemObjRecord(MemObj); if (Record && MPrintOptionsArray[BeforeAddCopyBack]) @@ -490,14 +487,13 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, if (nullptr == Record || !Record->MMemModified) return nullptr; - std::set Deps = - findDepsForReq(Record, Req, HostQueue->getContextImplPtr()); + std::set Deps = findDepsForReq(Record, Req, nullptr); AllocaCommandBase *SrcAllocaCmd = findAllocaForReq(Record, Req, Record->MCurContext); auto MemCpyCmdUniquePtr = std::make_unique( *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData, - SrcAllocaCmd->getQueue(), std::move(HostQueue)); + SrcAllocaCmd->getQueue(), nullptr); if (!MemCpyCmdUniquePtr) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); @@ -531,19 +527,17 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, auto SYCLMemObj = static_cast(Req->MSYCLMemObj); SYCLMemObj->handleWriteAccessorCreation(); } - - const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue(); - - MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req); + // Host accessor is not attached to any queue so no QueueImplPtr object to be + // sent to getOrInsertMemObjRecord. + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req); if (MPrintOptionsArray[BeforeAddHostAcc]) printGraphAsDot("before_addHostAccessor"); markModifiedIfWrite(Record, Req); AllocaCommandBase *HostAllocaCmd = - getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue); + getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); - if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext)) { + if (isOnSameContext(Record->MCurContext, HostAllocaCmd->getQueue())) { if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer ? 
(static_cast( @@ -553,15 +547,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, ToEnqueue); } } else - insertMemoryMove(Record, Req, HostQueue, ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); Command *UpdateHostAccCmd = - insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue); + insertUpdateHostReqCmd(Record, Req, nullptr, ToEnqueue); // Need empty command to be blocked until host accessor is destructed - EmptyCommand *EmptyCmd = - addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue, - Command::BlockReason::HostAccessor, ToEnqueue); + EmptyCommand *EmptyCmd = addEmptyCmd( + UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue); Req->MBlockedCmd = EmptyCmd; @@ -572,14 +565,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, } Command *Scheduler::GraphBuilder::addCGUpdateHost( - std::unique_ptr CommandGroup, const QueueImplPtr &HostQueue, + std::unique_ptr CommandGroup, std::vector &ToEnqueue) { auto UpdateHost = static_cast(CommandGroup.get()); Requirement *Req = UpdateHost->getReqToUpdate(); - MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req); - return insertMemoryMove(Record, Req, HostQueue, ToEnqueue); + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req); + return insertMemoryMove(Record, Req, nullptr, ToEnqueue); } /// Start the search for the record from list of "leaf" commands and check if @@ -625,9 +618,10 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req); // Going through copying memory between contexts is not supported. - if (Dep.MDepCommand) - CanBypassDep &= - sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr()); + if (Dep.MDepCommand) { + auto DepQueue = Dep.MDepCommand->getQueue(); + CanBypassDep &= isOnSameContext(Context, DepQueue); + } if (!CanBypassDep) { RetDeps.insert(DepCmd); @@ -666,7 +660,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( bool AllowConst) { auto IsSuitableAlloca = [&Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context); + bool Res = isOnSameContext(Context, AllocaCmd->getQueue()); if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -684,10 +678,15 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (const char *HUMConfig = SYCLConfig::get()) { if (std::strcmp(HUMConfig, "0") == 0) - return Ctx->is_host(); + return Ctx == nullptr; if (std::strcmp(HUMConfig, "1") == 0) return true; } + // host task & host accessor is covered with no device context but provide + // required support. 
+ if (Ctx == nullptr) + return true; + for (const device &Device : Ctx->getDevices()) { if (!Device.get_info()) return false; @@ -702,9 +701,9 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - - AllocaCommandBase *AllocaCmd = findAllocaForReq( - Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false); + auto Context = queue_impl::getContext(Queue); + AllocaCommandBase *AllocaCmd = + findAllocaForReq(Record, Req, Context, /*AllowConst=*/false); if (!AllocaCmd) { std::vector ToCleanUp; @@ -734,8 +733,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // TODO the case where the first alloca is made with a discard mode and // the user pointer is read-only is still not handled: it leads to // unnecessary copy on devices with unified host memory support. - const bool HostUnifiedMemory = - checkHostUnifiedMemory(Queue->getContextImplPtr()); + const bool HostUnifiedMemory = checkHostUnifiedMemory(Context); SYCLMemObjI *MemObj = Req->MSYCLMemObj; const bool InitFromUserData = Record->MAllocaCommands.empty() && (HostUnifiedMemory || MemObj->isInterop()); @@ -751,16 +749,14 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // There's no need to make a host allocation if the buffer is not // initialized with user data. if (MemObj->hasUserDataPtr()) { - QueueImplPtr DefaultHostQueue = - Scheduler::getInstance().getDefaultHostQueue(); AllocaCommand *HostAllocaCmd = new AllocaCommand( - DefaultHostQueue, FullReq, true /* InitFromUserData */, + nullptr, FullReq, true /* InitFromUserData */, nullptr /* LinkedAllocaCmd */, MemObj->isHostPointerReadOnly() /* IsConst */); Record->MAllocaCommands.push_back(HostAllocaCmd); Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); - Record->MCurContext = DefaultHostQueue->getContextImplPtr(); + Record->MCurContext = nullptr; } } } else { @@ -772,7 +768,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. // Can setup link between cl and host allocations only - if (Queue->is_host() != Record->MCurContext->is_host()) { + if ((Context == nullptr) != (Record->MCurContext == nullptr)) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the @@ -784,7 +780,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( bool PinnedHostMemory = MemObj->usesPinnedHostMemory(); bool HostUnifiedMemoryOnNonHostDevice = - Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext) + Queue == nullptr ? checkHostUnifiedMemory(Record->MCurContext) : HostUnifiedMemory; if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) { AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq( @@ -824,14 +820,13 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // construction, host allocation doesn't. So, device allocation should // always be active here. Also if the "follower" command is a device one // we have to change current context to the device one. 
- if (Queue->is_host()) { + if (Queue == nullptr) { AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; - Record->MCurContext = Queue->getContextImplPtr(); + Record->MCurContext = Context; - std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + std::set Deps = findDepsForReq(Record, Req, Context); for (Command *Dep : Deps) { Command *ConnCmd = AllocaCmd->addDep( DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); @@ -871,10 +866,8 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( Command *Cmd, const std::vector &Reqs, - const QueueImplPtr &Queue, Command::BlockReason Reason, - std::vector &ToEnqueue, const bool AddDepsToLeaves) { - EmptyCommand *EmptyCmd = - new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue()); + Command::BlockReason Reason, std::vector &ToEnqueue) { + EmptyCommand *EmptyCmd = new EmptyCommand(); if (!EmptyCmd) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); @@ -884,9 +877,9 @@ EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( EmptyCmd->MBlockReason = Reason; for (Requirement *Req : Reqs) { - MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req); + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req); AllocaCommandBase *AllocaCmd = - getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); + getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); EmptyCmd->addRequirement(Cmd, AllocaCmd, Req); } // addRequirement above call addDep that already will add EmptyCmd as user for @@ -894,19 +887,17 @@ EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( if (!Reqs.size()) Cmd->addUser(EmptyCmd); - if (AddDepsToLeaves) { - const std::vector &Deps = Cmd->MDeps; - std::vector ToCleanUp; - for (const DepDesc &Dep : Deps) { - const Requirement *Req = Dep.MDepRequirement; - MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); + const std::vector &Deps = Cmd->MDeps; + std::vector ToCleanUp; + for (const DepDesc &Dep : Deps) { + const Requirement *Req = Dep.MDepRequirement; + MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); - updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp); - addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue); - } - for (Command *Cmd : ToCleanUp) - cleanupCommand(Cmd); + updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp); + addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue); } + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); return EmptyCmd; } @@ -1069,8 +1060,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue); - isSameCtx = - sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext); + isSameCtx = isOnSameContext(Record->MCurContext, QueueForAlloca); } // If there is alloca command we need to check if the latest memory is in @@ -1078,7 +1068,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. 
- if (Record->MCurContext->is_host() && + if (!Record->MCurContext && !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer @@ -1096,21 +1086,20 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isInteropTask) { const detail::CGHostTask &HT = static_cast(CG); - if (HT.MQueue->getContextImplPtr() != Record->MCurContext) { + if (!isOnSameContext(Record->MCurContext, HT.MQueue)) { NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (!Queue->is_host() && !Record->MCurContext->is_host()) + } else if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) - insertMemoryMove(Record, Req, - Scheduler::getInstance().getDefaultHostQueue(), - ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } + std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd) { @@ -1350,9 +1339,8 @@ Command *Scheduler::GraphBuilder::connectDepEvent( /* DepEvents = */ {DepEvent}), CG::CodeplayHostTask, /* Payload */ {})); - ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue(), - /*EventNeeded=*/true); + ConnectCmd = new ExecCGCommand(std::move(ConnectCG), nullptr, + /*EventNeeded=*/true); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } @@ -1714,7 +1702,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); - isSameCtx = sameCtx(Queue->getContextImplPtr(), Record->MCurContext); + isSameCtx = isOnSameContext(Record->MCurContext, Queue); } if (!isSameCtx) { @@ -1723,17 +1711,15 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (!Queue->is_host() && !Record->MCurContext->is_host()) + if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) - insertMemoryMove(Record, Req, - Scheduler::getInstance().getDefaultHostQueue(), - ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd.get()) { diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 78fd300460526..fbea6f14dea3d 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -99,21 +99,6 @@ EventImplPtr Scheduler::addCG( EventImplPtr NewEvent = nullptr; const CG::CGTYPE Type = CommandGroup->getType(); std::vector AuxiliaryCmds; - std::vector Streams; - - if (Type == CG::Kernel) { - auto *CGExecKernelPtr = static_cast(CommandGroup.get()); - Streams = CGExecKernelPtr->getStreams(); - CGExecKernelPtr->clearStreams(); - // Stream's flush buffer memory is mainly initialized in stream's __init - // method. However, this method is not available on host device. - // Initializing stream's flush buffer on the host side in a separate task. 
- if (Queue->is_host()) { - for (const StreamImplPtr &Stream : Streams) { - Stream->initStreamHost(Queue); - } - } - } std::vector> AuxiliaryResources; AuxiliaryResources = CommandGroup->getAuxiliaryResources(); CommandGroup->clearAuxiliaryResources(); @@ -125,14 +110,13 @@ EventImplPtr Scheduler::addCG( Command *NewCmd = nullptr; switch (Type) { case CG::UpdateHost: - NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), - DefaultHostQueue, AuxiliaryCmds); + NewCmd = + MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), AuxiliaryCmds); NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { - auto Result = - MGraphBuilder.addCG(std::move(CommandGroup), DefaultHostQueue, - AuxiliaryCmds, EventNeeded); + auto Result = MGraphBuilder.addCG(std::move(CommandGroup), nullptr, + AuxiliaryCmds, EventNeeded); NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; @@ -152,10 +136,6 @@ EventImplPtr Scheduler::addCG( if (ShouldEnqueue) { enqueueCommandForCG(NewEvent, AuxiliaryCmds); - - for (const auto &StreamImplPtr : Streams) { - StreamImplPtr->flush(NewEvent); - } } if (!AuxiliaryResources.empty()) @@ -227,7 +207,7 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { { WriteLockT Lock = acquireWriteLock(); NewCmd = MGraphBuilder.addCopyBack(Req, AuxiliaryCmds); - // Command was not creted because there were no operations with + // Command was not created because there were no operations with // buffer. if (!NewCmd) return nullptr; @@ -252,7 +232,9 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); } catch (...) { - NewCmd->getQueue()->reportAsyncException(std::current_exception()); + auto WorkerQueue = NewCmd->getEvent()->getWorkerQueue(); + assert(WorkerQueue && "WorkerQueue for CopyBack command must be not null"); + WorkerQueue->reportAsyncException(std::current_exception()); } EventImplPtr NewEvent = NewCmd->getEvent(); cleanupCommands(ToCleanUp); @@ -395,18 +377,6 @@ void Scheduler::enqueueUnblockedCommands( } } -Scheduler::Scheduler() { - sycl::device HostDevice = - createSyclObjFromImpl(device_impl::getHostDeviceImpl()); - sycl::context HostContext{HostDevice}; - DefaultHostQueue = QueueImplPtr( - new queue_impl(detail::getSyclObjImpl(HostDevice), - detail::getSyclObjImpl(HostContext), /*AsyncHandler=*/{}, - /*PropList=*/{sycl::property::queue::enable_profiling()})); -} - -Scheduler::~Scheduler() { DefaultHostQueue.reset(); } - void Scheduler::releaseResources(BlockingT Blocking) { // There might be some commands scheduled for post enqueue cleanup that // haven't been freed because of the graph mutex being locked at the time, @@ -727,12 +697,10 @@ bool CheckEventReadiness(const ContextImplPtr &Context, // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. // NOP events also don't represent actual dependencies. - if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) || - SyclEventImplPtr->isNOP()) { + if (SyclEventImplPtr->isDefaultConstructed() || SyclEventImplPtr->isNOP()) { return true; } - if (SyclEventImplPtr->is_host()) { + if (SyclEventImplPtr->isHost()) { return SyclEventImplPtr->isCompleted(); } // Cross-context dependencies can't be passed to the backend directly. 
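Note on the convention used throughout the scheduler changes above: with the host-device paths removed, a null `QueueImplPtr` (and consequently a null `ContextImplPtr`) now stands for host usage, replacing the old `is_host()` checks; helpers such as `getContext(Queue)`, `isOnSameContext(...)` and `queueDeviceToString(...)` guard that null case. The following is a minimal, self-contained sketch of that pattern only — the struct definitions, the `"HOST"` string and the member names are simplified stand-ins for illustration, not the actual SYCL runtime classes.

```cpp
#include <iostream>
#include <memory>
#include <string>

// Hypothetical stand-ins for queue_impl / context_impl, for illustration only.
struct context_impl {};
struct queue_impl {
  std::shared_ptr<context_impl> Ctx;
  std::string DeviceName;
};
using QueueImplPtr = std::shared_ptr<queue_impl>;
using ContextImplPtr = std::shared_ptr<context_impl>;

// A null queue means "host": return a null context instead of dereferencing.
ContextImplPtr getContext(const QueueImplPtr &Queue) {
  return Queue ? Queue->Ctx : nullptr;
}

// Host usage compares equal via nullptr == nullptr; device contexts compare by pointer.
bool isOnSameContext(const ContextImplPtr &Context, const QueueImplPtr &Queue) {
  return Context == getContext(Queue);
}

// Printing helper in the spirit of queueDeviceToString: guard the null (host) case.
std::string queueDeviceToString(const queue_impl *Queue) {
  return Queue ? Queue->DeviceName : "HOST"; // exact string is illustrative
}

int main() {
  QueueImplPtr Host;                                     // no queue -> host usage
  QueueImplPtr Device = std::make_shared<queue_impl>();
  Device->Ctx = std::make_shared<context_impl>();
  Device->DeviceName = "gpu";

  std::cout << std::boolalpha
            << isOnSameContext(nullptr, Host) << "\n"      // true: host vs host
            << isOnSameContext(Device->Ctx, Host) << "\n"  // false: device ctx vs host
            << queueDeviceToString(Host.get()) << "\n";    // HOST
}
```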
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 124fc1181116c..e9a8f4d566282 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -204,7 +204,6 @@ struct MemObjRecord { LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} - // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -217,8 +216,8 @@ struct MemObjRecord { // The context which has the latest state of the memory object. ContextImplPtr MCurContext; - // The mode this object can be accessed with from the host context. - // Valid only if the current context is host. + // The mode this object can be accessed from the host (host_accessor). + // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; // The flag indicates that the content of the memory object was/will be @@ -452,10 +451,6 @@ class Scheduler { /// \return true if an instance of the scheduler object exists. static bool isInstanceAlive(); - QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; } - - const QueueImplPtr &getDefaultHostQueue() const { return DefaultHostQueue; } - static MemObjRecord *getMemObjRecord(const Requirement *const Req); void deferMemObjRelease(const std::shared_ptr &MemObj); @@ -470,8 +465,6 @@ class Scheduler { bool isInFusionMode(QueueIdT Queue); - Scheduler(); - ~Scheduler(); void releaseResources(BlockingT Blocking = BlockingT::BLOCKING); bool isDeferredMemObjectsEmpty(); @@ -613,7 +606,6 @@ class Scheduler { /// /// \return a command that represents command group execution. Command *addCGUpdateHost(std::unique_ptr CommandGroup, - const QueueImplPtr &HostQueue, std::vector &ToEnqueue); /// Enqueues a command to update memory to the latest state. @@ -750,10 +742,8 @@ class Scheduler { EmptyCommand *addEmptyCmd(Command *Cmd, const std::vector &Req, - const QueueImplPtr &Queue, Command::BlockReason Reason, - std::vector &ToEnqueue, - const bool AddDepsToLeaves = true); + std::vector &ToEnqueue); void createGraphForCommand(Command *NewCmd, CG &CG, bool isInteropTask, std::vector &Reqs, @@ -967,8 +957,6 @@ class Scheduler { MAuxiliaryResources; std::mutex MAuxiliaryResourcesMutex; - QueueImplPtr DefaultHostQueue; - friend class Command; friend class DispatchHostTask; friend class queue_impl; diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 4ff380d7295c5..f74599a4353b9 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -66,67 +66,42 @@ size_t stream_impl::get_work_item_buffer_size() const { return MaxStatementSize_; } -void stream_impl::initStreamHost(QueueImplPtr Queue) { - // Real size of full flush buffer is saved only in buffer_impl field of - // FlushBuf object. - size_t FlushBufSize = getSyclObjImpl(FlushBuf_)->size(); +void stream_impl::generateFlushCommand(handler &cgh) { + // Create accessor to the flush buffer even if not using it yet. Otherwise + // kernel will be a leaf for the flush buffer and scheduler will not be able + // to cleanup the kernel. TODO: get rid of finalize method by using host + // accessor to the flush buffer. 
+ host_accessor FlushBuffHostAcc(FlushBuf_, + cgh); + host_accessor BufHostAcc( + Buf_, cgh, range<1>(BufferSize_), id<1>(OffsetSize)); - auto Q = createSyclObjFromImpl(Queue); - Q.submit([&](handler &cgh) { - auto FlushBufAcc = FlushBuf_.get_access( - cgh, range<1>(1), id<1>(0)); - cgh.host_task([=] { - char *FlushBufPtr = FlushBufAcc.get_pointer(); - std::memset(FlushBufPtr, 0, FlushBufSize); - }); + cgh.host_task([=] { + if (!BufHostAcc.empty()) { + // SYCL 2020, 4.16: + // > If the totalBufferSize or workItemBufferSize limits are exceeded, + // > it is implementation-defined whether the streamed characters + // > exceeding the limit are output, or silently ignored/discarded, and + // > if output it is implementation-defined whether those extra + // > characters exceeding the workItemBufferSize limit count toward the + // > totalBufferSize limit. Regardless of this implementation defined + // > behavior of output exceeding the limits, no undefined or erroneous + // > behavior is permitted of an implementation when the limits are + // > exceeded. + // + // Defend against zero-sized buffers (although they'd have no practical + // use). + printf("%s", &(BufHostAcc[0])); + } + fflush(stdout); }); } -void stream_impl::flush(const EventImplPtr &LeadEvent) { - // We don't want stream flushing to be blocking operation that is why submit a - // host task to print stream buffer. It will fire up as soon as the kernel - // finishes execution. - auto Q = detail::createSyclObjFromImpl( - sycl::detail::Scheduler::getInstance().getDefaultHostQueue()); - event Event = Q.submit([&](handler &cgh) { - auto BufHostAcc = - Buf_.get_access( - cgh, range<1>(BufferSize_), id<1>(OffsetSize)); - // Create accessor to the flush buffer even if not using it yet. Otherwise - // kernel will be a leaf for the flush buffer and scheduler will not be able - // to cleanup the kernel. TODO: get rid of finalize method by using host - // accessor to the flush buffer. - auto FlushBufHostAcc = - FlushBuf_ - .get_access( - cgh); - cgh.host_task([=] { - if (!BufHostAcc.empty()) { - // SYCL 2020, 4.16: - // > If the totalBufferSize or workItemBufferSize limits are exceeded, - // > it is implementation-defined whether the streamed characters - // > exceeding the limit are output, or silently ignored/discarded, and - // > if output it is implementation-defined whether those extra - // > characters exceeding the workItemBufferSize limit count toward the - // > totalBufferSize limit. Regardless of this implementation defined - // > behavior of output exceeding the limits, no undefined or erroneous - // > behavior is permitted of an implementation when the limits are - // > exceeded. - // - // Defend against zero-sized buffers (although they'd have no practical - // use). 
- printf("%s", &(BufHostAcc[0])); - } - fflush(stdout); - }); - }); - if (LeadEvent) { - LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); - LeadEvent->getSubmittedQueue()->registerStreamServiceEvent( - detail::getSyclObjImpl(Event)); - } -} +// ABI break: remove +void stream_impl::initStreamHost(QueueImplPtr) {} + +// ABI break: remove +void stream_impl::flush(const EventImplPtr &) {} } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index 1578a8d7cb508..59b4c77e057c3 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -37,13 +37,11 @@ class stream_impl { // buffer and offset in the flush buffer GlobalOffsetAccessorT accessGlobalOffset(handler &CGH); - // Initialize flush buffers on host. - void initStreamHost(QueueImplPtr Queue); + // ABI break: remove + void initStreamHost(QueueImplPtr); - // Enqueue task to copy stream buffer to the host and print the contents - // The host task event is then registered for post processing in the - // LeadEvent as well as in queue LeadEvent associated with. - void flush(const EventImplPtr &LeadEvent); + // ABI break: remove + void flush(const EventImplPtr &); size_t size() const noexcept; @@ -57,6 +55,8 @@ class stream_impl { return PropList_.get_property(); } + void generateFlushCommand(handler &cgh); + private: // Size of the stream buffer size_t BufferSize_; diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 792d321b6334e..009f910be440c 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -33,12 +33,6 @@ SYCLMemObjT::SYCLMemObjT(pi_native_handle MemObject, const context &SyclContext, MUserPtr(nullptr), MShadowCopy(nullptr), MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr), MHostPtrProvided(true), MOwnNativeHandle(OwnNativeHandle) { - if (MInteropContext->is_host()) - throw sycl::invalid_parameter_error( - "Creation of interoperability memory object using host context is " - "not allowed", - PI_ERROR_INVALID_CONTEXT); - sycl::detail::pi::PiContext Context = nullptr; const PluginPtr &Plugin = getPlugin(); @@ -84,12 +78,6 @@ SYCLMemObjT::SYCLMemObjT(pi_native_handle MemObject, const context &SyclContext, MUserPtr(nullptr), MShadowCopy(nullptr), MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr), MHostPtrProvided(true), MOwnNativeHandle(OwnNativeHandle) { - if (MInteropContext->is_host()) - throw sycl::invalid_parameter_error( - "Creation of interoperability memory object using host context is " - "not allowed", - PI_ERROR_INVALID_CONTEXT); - sycl::detail::pi::PiContext Context = nullptr; const PluginPtr &Plugin = getPlugin(); @@ -185,25 +173,18 @@ size_t SYCLMemObjT::getBufSizeForContext(const ContextImplPtr &Context, bool SYCLMemObjT::isInterop() const { return MOpenCLInterop; } -void SYCLMemObjT::determineHostPtr(const ContextImplPtr &Context, +void SYCLMemObjT::determineHostPtr(const ContextImplPtr & /*Context*/, bool InitFromUserData, void *&HostPtr, bool &HostPtrReadOnly) { // The data for the allocation can be provided via either the user pointer // (InitFromUserData, can be read-only) or a runtime-allocated read-write // HostPtr. We can have one of these scenarios: - // 1. The allocation is the first one and on host. InitFromUserData == true. - // 2. The allocation is the first one and isn't on host. InitFromUserData + // 1. The allocation is the first one and isn't on host. 
InitFromUserData // varies based on unified host memory support and whether or not the data can // be discarded. - // 3. The allocation is not the first one and is on host. InitFromUserData == - // false, HostPtr == nullptr. This can only happen if the allocation command - // is not linked since it would be a no-op otherwise. Attempt to reuse the - // user pointer if it's read-write, but do not copy its contents if it's not. - // 4. The allocation is not the first one and not on host. InitFromUserData == + // 2. The allocation is not the first one and not on host. InitFromUserData == // false, HostPtr is provided if the command is linked. The host pointer is // guaranteed to be reused in this case. - if (Context->is_host() && !MOpenCLInterop && !MHostPtrReadOnly) - InitFromUserData = true; if (InitFromUserData) { assert(!HostPtr && "Cannot init from user data and reuse host ptr provided " @@ -228,8 +209,9 @@ void SYCLMemObjT::detachMemoryObject( !MOwnNativeHandle || (MInteropContext && !MInteropContext->isOwnedByRuntime()); - if (MRecord && MRecord->MCurContext->isOwnedByRuntime() && - !InteropObjectsUsed && (!MHostPtrProvided || MIsInternal)) { + if (MRecord && MRecord->MCurContext && + MRecord->MCurContext->isOwnedByRuntime() && !InteropObjectsUsed && + (!MHostPtrProvided || MIsInternal)) { bool okToDefer = GlobalHandler::instance().isOkToDefer(); if (okToDefer) Scheduler::getInstance().deferMemObjRelease(Self); diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp old mode 100755 new mode 100644 index ecf63bc63e427..7237e88be440f --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -73,62 +73,47 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, return nullptr; std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - if (CtxImpl->is_host()) { - if (!Alignment) { - // worst case default - Alignment = 128; + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + pi_result Error = PI_ERROR_INVALID_VALUE; + + switch (Kind) { + case alloc::host: { + std::array Props; + auto PropsIter = Props.begin(); + + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + Ctxt.get_platform().has_extension( + "cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); } - aligned_allocator Alloc(Alignment); - try { - RetVal = Alloc.allocate(Size); - } catch (const std::bad_alloc &) { - // Conform with Specification behavior - RetVal = nullptr; - } - } else { - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - pi_result Error = PI_ERROR_INVALID_VALUE; - - switch (Kind) { - case alloc::host: { - std::array Props; - auto PropsIter = Props.begin(); - - if (PropList.has_property() && - Ctxt.get_platform().has_extension( - "cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } - - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list - - Error = Plugin->call_nocheck( - &RetVal, C, Props.data(), Size, Alignment); - - break; - } - case alloc::device: - case alloc::shared: - case alloc::unknown: { - RetVal = nullptr; - Error = PI_ERROR_INVALID_VALUE; - break; - } - } + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); 
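+    // The allocation entry point presumably expects a zero-terminated
+    // property list; the last slot of the fixed-size Props array is reserved
+    // for the terminating 0 written below, and the assert keeps it in bounds.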
+ *PropsIter++ = 0; // null-terminate property list - // Error is for debugging purposes. - // The spec wants a nullptr returned, not an exception. - if (Error != PI_SUCCESS) - return nullptr; + Error = Plugin->call_nocheck( + &RetVal, C, Props.data(), Size, Alignment); + + break; + } + case alloc::device: + case alloc::shared: + case alloc::unknown: { + RetVal = nullptr; + Error = PI_ERROR_INVALID_VALUE; + break; } + } + + // Error is for debugging purposes. + // The spec wants a nullptr returned, not an exception. + if (Error != PI_SUCCESS) + return nullptr; #ifdef XPTI_ENABLE_INSTRUMENTATION xpti::addMetadata(PrepareNotify.traceEvent(), "memory_ptr", reinterpret_cast(RetVal)); @@ -154,98 +139,79 @@ void *alignedAllocInternal(size_t Alignment, size_t Size, if (Size == 0) return nullptr; - if (CtxImpl->is_host()) { - if (Kind == alloc::unknown) { - RetVal = nullptr; - } else { - if (!Alignment) { - // worst case default - Alignment = 128; - } - - aligned_allocator Alloc(Alignment); - try { - RetVal = Alloc.allocate(Size); - } catch (const std::bad_alloc &) { - // Conform with Specification behavior - RetVal = nullptr; - } - } - } else { - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - pi_result Error = PI_ERROR_INVALID_VALUE; - pi_device Id; - - switch (Kind) { - case alloc::device: { - Id = DevImpl->getHandleRef(); - - std::array Props; - auto PropsIter = Props.begin(); - - // Buffer location is only supported on FPGA devices - if (PropList.has_property() && - DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } - - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list - - Error = Plugin->call_nocheck( - &RetVal, C, Id, Props.data(), Size, Alignment); - - break; + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + pi_result Error = PI_ERROR_INVALID_VALUE; + pi_device Id; + + switch (Kind) { + case alloc::device: { + Id = DevImpl->getHandleRef(); + + std::array Props; + auto PropsIter = Props.begin(); + + // Buffer location is only supported on FPGA devices + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); } - case alloc::shared: { - Id = DevImpl->getHandleRef(); - - std::array Props; - auto PropsIter = Props.begin(); - - if (PropList.has_property< - sycl::ext::oneapi::property::usm::device_read_only>()) { - *PropsIter++ = PI_MEM_ALLOC_FLAGS; - *PropsIter++ = PI_MEM_ALLOC_DEVICE_READ_ONLY; - } - - if (PropList.has_property() && - DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } - - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list - - Error = Plugin->call_nocheck( - &RetVal, C, Id, Props.data(), Size, Alignment); - - break; - } - case alloc::host: - case alloc::unknown: { - RetVal = nullptr; - Error = PI_ERROR_INVALID_VALUE; - break; + + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list + + Error = 
Plugin->call_nocheck( + &RetVal, C, Id, Props.data(), Size, Alignment); + + break; + } + case alloc::shared: { + Id = DevImpl->getHandleRef(); + + std::array Props; + auto PropsIter = Props.begin(); + + if (PropList.has_property< + sycl::ext::oneapi::property::usm::device_read_only>()) { + *PropsIter++ = PI_MEM_ALLOC_FLAGS; + *PropsIter++ = PI_MEM_ALLOC_DEVICE_READ_ONLY; } + + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); } - // Error is for debugging purposes. - // The spec wants a nullptr returned, not an exception. - if (Error != PI_SUCCESS) - return nullptr; + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list + + Error = Plugin->call_nocheck( + &RetVal, C, Id, Props.data(), Size, Alignment); + + break; } + case alloc::host: + case alloc::unknown: { + RetVal = nullptr; + Error = PI_ERROR_INVALID_VALUE; + break; + } + } + + // Error is for debugging purposes. + // The spec wants a nullptr returned, not an exception. + if (Error != PI_SUCCESS) + return nullptr; return RetVal; } @@ -284,14 +250,9 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt, void freeInternal(void *Ptr, const context_impl *CtxImpl) { if (Ptr == nullptr) return; - if (CtxImpl->is_host()) { - // need to use alignedFree here for Windows - detail::OSUtil::alignedFree(Ptr); - } else { - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - Plugin->call(C, Ptr); - } + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + Plugin->call(C, Ptr); } void free(void *Ptr, const context &Ctxt, @@ -578,10 +539,6 @@ alloc get_pointer_type(const void *Ptr, const context &Ctxt) { std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - // Everything on a host device is just system malloc so call it host - if (CtxImpl->is_host()) - return alloc::host; - pi_context PICtx = CtxImpl->getHandleRef(); pi_usm_type AllocTy; @@ -631,10 +588,6 @@ device get_pointer_device(const void *Ptr, const context &Ctxt) { std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - // Just return the host device in the host context - if (CtxImpl->is_host()) - return Ctxt.get_devices()[0]; - // Check if ptr is a host allocation if (get_pointer_type(Ptr, Ctxt) == alloc::host) { auto Devs = CtxImpl->getDevices(); diff --git a/sycl/source/detail/xpti_registry.cpp b/sycl/source/detail/xpti_registry.cpp index c08e620b0583d..1884f5cd34265 100644 --- a/sycl/source/detail/xpti_registry.cpp +++ b/sycl/source/detail/xpti_registry.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include +#include #include #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -362,6 +363,20 @@ void XPTIRegistry::sampledImageHostAccessorNotification( #endif } +std::string queueDeviceToString(const queue_impl *const &Queue) { + if (!Queue) + return "HOST"; + auto Device = Queue->get_device(); + if (Device.is_cpu()) + return "CPU"; + else if (Device.is_gpu()) + return "GPU"; + else if (Device.is_accelerator()) + return "ACCELERATOR"; + else + return "UNKNOWN"; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index 681e2841c027b..356679a75c2fb 100644 --- 
a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -319,6 +319,9 @@ class XPTIScope { }; // class XPTIScope #endif +class queue_impl; +std::string queueDeviceToString(const detail::queue_impl *const &Queue); + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 20df5cf47256a..423ff7be44121 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -71,9 +71,8 @@ std::vector device::get_devices(info::device_type deviceType) { cl_device_id device::get() const { return impl->get(); } bool device::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "device::is_host should not be called in implementation."); - return IsHost; + assert(false && "device::is_host should not be called in implementation."); + return false; } bool device::is_cpu() const { return impl->is_cpu(); } diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index a7bae8055c10b..69d62f354ea4c 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -38,9 +38,8 @@ bool event::operator==(const event &rhs) const { return rhs.impl == impl; } bool event::operator!=(const event &rhs) const { return !(*this == rhs); } bool event::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "event::is_host should not be called in implementation."); - return IsHost; + assert(false && "event::is_host should not be called in implementation."); + return false; } void event::wait() { impl->wait(impl); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 10ce364310912..aee154fa19dbd 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -81,29 +81,28 @@ void *getValueFromDynamicParameter( } // namespace detail /// TODO: Unused. Remove with ABI break. -handler::handler(std::shared_ptr Queue, bool IsHost) - : handler(Queue, IsHost, /*CallerNeedsEvent=*/true) {} +handler::handler(std::shared_ptr Queue, bool) + : handler(Queue, false, /*CallerNeedsEvent=*/true) {} /// TODO: Unused. Remove with ABI break. handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool IsHost) - : handler(Queue, PrimaryQueue, SecondaryQueue, IsHost, + std::shared_ptr SecondaryQueue, bool) + : handler(Queue, PrimaryQueue, SecondaryQueue, false, /*CallerNeedsEvent=*/true) {} -handler::handler(std::shared_ptr Queue, bool IsHost, +handler::handler(std::shared_ptr Queue, bool, bool CallerNeedsEvent) - : handler(Queue, Queue, nullptr, IsHost, CallerNeedsEvent) {} + : handler(Queue, Queue, nullptr, false, CallerNeedsEvent) {} handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool IsHost, bool CallerNeedsEvent) + std::shared_ptr SecondaryQueue, bool, + bool CallerNeedsEvent) : MImpl(std::make_shared(std::move(PrimaryQueue), std::move(SecondaryQueue), CallerNeedsEvent)), - MQueue(std::move(Queue)), MIsHost(IsHost) {} + MQueue(std::move(Queue)), MIsHost(false) {} handler::handler( std::shared_ptr Graph) @@ -287,17 +286,10 @@ event handler::finalize() { detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); #endif - if (MQueue->is_host()) { - MHostKernel->call(MNDRDesc, (NewEvent) - ? 
NewEvent->getHostProfilingInfo() - : nullptr); - Result = PI_SUCCESS; - } else { - Result = enqueueImpKernel( - MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel, - MKernelName.c_str(), RawEvents, NewEvent, nullptr, - MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative); - } + Result = enqueueImpKernel(MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, + MKernel, MKernelName.c_str(), RawEvents, + NewEvent, nullptr, MImpl->MKernelCacheConfig, + MImpl->MKernelIsCooperative); #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { @@ -339,7 +331,7 @@ event handler::finalize() { if (PI_SUCCESS != EnqueueKernel()) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); - else if (NewEvent->is_host() || NewEvent->getHandleRef() == nullptr) + else if (NewEvent->isHost() || NewEvent->getHandleRef() == nullptr) NewEvent->setComplete(); NewEvent->setEnqueued(); @@ -421,19 +413,6 @@ event handler::finalize() { case detail::CG::Barrier: case detail::CG::BarrierWaitlist: { if (auto GraphImpl = getCommandGraph(); GraphImpl != nullptr) { - // if no event to wait for was specified, we add all exit - // nodes/events of the graph - if (MEventsWaitWithBarrier.size() == 0) { - MEventsWaitWithBarrier = GraphImpl->getExitNodesEvents(); - // Graph-wide barriers take precedence over previous one. - // We therefore remove the previous ones from ExtraDependencies list. - // The current barrier is then added to this list in the graph_impl. - std::vector EventsBarriers = - GraphImpl->removeBarriersFromExtraDependencies(); - MEventsWaitWithBarrier.insert(std::end(MEventsWaitWithBarrier), - std::begin(EventsBarriers), - std::end(EventsBarriers)); - } CGData.MEvents.insert(std::end(CGData.MEvents), std::begin(MEventsWaitWithBarrier), std::end(MEventsWaitWithBarrier)); @@ -551,6 +530,7 @@ event handler::finalize() { // it to the graph to create a node, rather than submit it to the scheduler. if (auto GraphImpl = MQueue->getCommandGraph(); GraphImpl) { auto EventImpl = std::make_shared(); + EventImpl->setSubmittedQueue(MQueue); std::shared_ptr NodeImpl = nullptr; @@ -582,7 +562,17 @@ event handler::finalize() { // queue. GraphImpl->setLastInorderNode(MQueue, NodeImpl); } else { - NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); + auto LastBarrierRecordedFromQueue = GraphImpl->getBarrierDep(MQueue); + if (LastBarrierRecordedFromQueue) { + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), + {LastBarrierRecordedFromQueue}); + } else { + NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup)); + } + + if (NodeImpl->MCGType == sycl::detail::CG::Barrier) { + GraphImpl->setBarrierDep(MQueue, NodeImpl); + } } // Associate an event with this new node and return the event. @@ -923,7 +913,7 @@ void handler::ext_oneapi_barrier(const std::vector &WaitList) { auto EventImpl = detail::getSyclObjImpl(Event); // We could not wait for host task events in backend. // Adding them as dependency to enable proper scheduling. 
- if (EventImpl->is_host()) { + if (EventImpl->isHost()) { depends_on(EventImpl); } MEventsWaitWithBarrier.push_back(EventImpl); diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp index ff14c0a879078..625eb995c47d3 100644 --- a/sycl/source/kernel.cpp +++ b/sycl/source/kernel.cpp @@ -31,9 +31,8 @@ kernel::kernel(cl_kernel ClKernel, const context &SyclContext) cl_kernel kernel::get() const { return impl->get(); } bool kernel::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "kernel::is_host should not be called in implementation."); - return IsHost; + assert(false && "kernel::is_host should not be called in implementation."); + return false; } context kernel::get_context() const { diff --git a/sycl/source/kernel_bundle.cpp b/sycl/source/kernel_bundle.cpp index aace54af59ac2..85a6d0a561bd2 100644 --- a/sycl/source/kernel_bundle.cpp +++ b/sycl/source/kernel_bundle.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -362,19 +363,22 @@ using source_kb = kernel_bundle; using exe_kb = kernel_bundle; using kernel_bundle_impl = sycl::detail::kernel_bundle_impl; +namespace detail { + ///////////////////////// -// syclex::is_source_kernel_bundle_supported +// syclex::detail::is_source_kernel_bundle_supported ///////////////////////// bool is_source_kernel_bundle_supported(backend BE, source_language Language) { // Support is limited to the opencl and level_zero backends. bool BE_Acceptable = (BE == sycl::backend::ext_oneapi_level_zero) || (BE == sycl::backend::opencl); if (BE_Acceptable) { - // At the moment, OpenCL and SPIR-V are the only supported languages. if (Language == source_language::opencl) { return detail::OpenCLC_Compilation_Available(); } else if (Language == source_language::spirv) { return true; + } else if (Language == source_language::sycl) { + return detail::SYCL_Compilation_Available(); } } @@ -383,12 +387,15 @@ bool is_source_kernel_bundle_supported(backend BE, source_language Language) { } ///////////////////////// -// syclex::create_kernel_bundle_from_source +// syclex::detail::create_kernel_bundle_from_source ///////////////////////// -source_kb create_kernel_bundle_from_source(const context &SyclContext, - source_language Language, - const std::string &Source) { +using include_pairs_t = std::vector>; + +source_kb make_kernel_bundle_from_source(const context &SyclContext, + source_language Language, + const std::string &Source, + include_pairs_t IncludePairs) { // TODO: if we later support a "reason" why support isn't present // (like a missing shared library etc.) it'd be nice to include it in // the exception message here. @@ -397,15 +404,21 @@ source_kb create_kernel_bundle_from_source(const context &SyclContext, throw sycl::exception(make_error_code(errc::invalid), "kernel_bundle creation from source not supported"); + // throw if include not supported? 
awaiting guidance + // if(!IncludePairs.empty() && is_include_supported(Languuage)){ throw invalid + // } + std::shared_ptr KBImpl = - std::make_shared(SyclContext, Language, Source); + std::make_shared(SyclContext, Language, Source, + IncludePairs); return sycl::detail::createSyclObjFromImpl(KBImpl); } -source_kb -create_kernel_bundle_from_source(const context &SyclContext, - source_language Language, - const std::vector &Bytes) { +source_kb make_kernel_bundle_from_source(const context &SyclContext, + source_language Language, + const std::vector &Bytes, + include_pairs_t IncludePairs) { + (void)IncludePairs; backend BE = SyclContext.get_backend(); if (!is_source_kernel_bundle_supported(BE, Language)) throw sycl::exception(make_error_code(errc::invalid), @@ -419,17 +432,17 @@ create_kernel_bundle_from_source(const context &SyclContext, ///////////////////////// // syclex::detail::build_from_source(source_kb) => exe_kb ///////////////////////// -namespace detail { -exe_kb build_from_source(source_kb &SourceKB, - const std::vector &Devices, - const std::vector &BuildOptions, - std::string *LogPtr) { +exe_kb +build_from_source(source_kb &SourceKB, const std::vector &Devices, + const std::vector &BuildOptions, + std::string *LogPtr, + const std::vector &RegisteredKernelNames) { std::vector UniqueDevices = sycl::detail::removeDuplicateDevices(Devices); std::shared_ptr sourceImpl = getSyclObjImpl(SourceKB); - std::shared_ptr KBImpl = - sourceImpl->build_from_source(UniqueDevices, BuildOptions, LogPtr); + std::shared_ptr KBImpl = sourceImpl->build_from_source( + UniqueDevices, BuildOptions, LogPtr, RegisteredKernelNames); return sycl::detail::createSyclObjFromImpl(KBImpl); } diff --git a/sycl/source/platform.cpp b/sycl/source/platform.cpp index a2ee714952be9..179c8c09d0825 100644 --- a/sycl/source/platform.cpp +++ b/sycl/source/platform.cpp @@ -41,10 +41,8 @@ bool platform::has_extension(const std::string &ExtensionName) const { } bool platform::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && - "platform::is_host should not be called in implementation."); - return IsHost; + assert(false && "platform::is_host should not be called in implementation."); + return false; } std::vector platform::get_devices(info::device_type DeviceType) const { diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 9648431a5a429..9977b526349b3 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -96,9 +96,8 @@ queue::ext_oneapi_get_graph() const { } bool queue::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "queue::is_host should not be called in implementation."); - return IsHost; + assert(false && "queue::is_host should not be called in implementation."); + return false; } void queue::throw_asynchronous() { impl->throw_asynchronous(); } @@ -263,7 +262,7 @@ event queue::ext_oneapi_submit_barrier(const std::vector &WaitList, bool AllEventsEmptyOrNop = std::all_of( begin(WaitList), end(WaitList), [&](const event &Event) -> bool { auto EventImpl = detail::getSyclObjImpl(Event); - return !EventImpl->isContextInitialized() || EventImpl->isNOP(); + return EventImpl->isDefaultConstructed() || EventImpl->isNOP(); }); if (is_in_order() && !impl->getCommandGraph() && !impl->MDiscardEvents && !impl->MIsProfilingEnabled && AllEventsEmptyOrNop) { diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec.cpp index 549dc13ed76c6..bca68c2e6d290 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_vec.cpp +++ 
b/sycl/test-e2e/BFloat16/bfloat16_vec.cpp @@ -10,6 +10,9 @@ // TODO enable opaque pointers support on CPU. // UNSUPPORTED: cpu || accelerator +// https://github.com/intel/llvm/issues/14397 +// UNSUPPORTED: windows && gpu-intel-gen12 + // RUN: %{build} -o %t.out // RUN: %{run} %t.out // RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t2.out %} diff --git a/sycl/test-e2e/Basic/stream/blocking_pipes_and_stream.cpp b/sycl/test-e2e/Basic/stream/blocking_pipes_and_stream.cpp index d61f1f69234a3..12e2c062999d3 100644 --- a/sycl/test-e2e/Basic/stream/blocking_pipes_and_stream.cpp +++ b/sycl/test-e2e/Basic/stream/blocking_pipes_and_stream.cpp @@ -1,4 +1,4 @@ -// REQUIRES: accelerator +// REQUIRES: accelerator, TEMPORARY_DISABLED // RUN: %{build} -o %t.out // RUN: %{run} %t.out | FileCheck %s diff --git a/sycl/test-e2e/Basic/stream/stream.cpp b/sycl/test-e2e/Basic/stream/stream.cpp index fe3429cffad2b..04680d969841e 100644 --- a/sycl/test-e2e/Basic/stream/stream.cpp +++ b/sycl/test-e2e/Basic/stream/stream.cpp @@ -9,6 +9,9 @@ // //===----------------------------------------------------------------------===// +// https://github.com/intel/llvm/issues/14397 +// UNSUPPORTED: windows && gpu-intel-gen12 + #include #include diff --git a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp index d8d645f6ac7dc..1140ef603d336 100644 --- a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp @@ -1,5 +1,8 @@ // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-finite-math-only%} %else %{-fno-finite-math-only%} +// https://github.com/intel/llvm/issues/14397 +// UNSUPPORTED: windows && gpu-intel-gen12 + // RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 121e911c0474c..56dfbc081fb06 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -34,62 +34,57 @@ int main() { // Expected that the allowlist filter is not set if (getenv("PRINT_PLATFORM_INFO")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { - - std::string Name = Platform.get_info(); - std::string Ver = Platform.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); - - std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name - << "}},PlatformVersion:{{" << Ver << "}}"; - - return 0; - } - throw std::runtime_error("Non host device is not found"); + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { + std::string Name = Platform.get_info(); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); + + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; + + return 0; + } + throw std::runtime_error("No device is found"); } // Expected that the allowlist filter is not set if (getenv("PRINT_DEVICE_INFO")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { - const sycl::device Dev = Platform.get_devices().at(0); - std::string Name = Dev.get_info(); - std::string Ver = Dev.get_info(); + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { + const sycl::device Dev = Platform.get_devices().at(0); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name - << "}},DriverVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; + } + throw std::runtime_error("No device is found"); } // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result if (getenv("TEST_DEVICE_AVAILABLE")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { - if (Platform.get_devices().size() != 1) - throw std::runtime_error("Expected only one non host device."); + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { + if (Platform.get_devices().size() != 1) + throw std::runtime_error("Expected only one device."); - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; + } + throw std::runtime_error("No device is found"); } // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) - throw std::runtime_error("Expected no non host device is available"); + if (!sycl::platform::get_platforms().empty()) + throw std::runtime_error("Expected no device is available"); return 0; } diff --git a/sycl/test-e2e/ESIMD/api/functional/operators/operator_logical_not.cpp b/sycl/test-e2e/ESIMD/api/functional/operators/operator_logical_not.cpp index 54688b894f661..a118b55d36977 100644 --- a/sycl/test-e2e/ESIMD/api/functional/operators/operator_logical_not.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/operators/operator_logical_not.cpp @@ -5,9 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// TODO This test freezed on Linux OS, enable test on linux once this issue will -// be fixed. 
-// REQUIRES: windows // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // diff --git a/sycl/test-e2e/ESIMD/api/replicate_smoke.cpp b/sycl/test-e2e/ESIMD/api/replicate_smoke.cpp index b491624c8cbdc..fc3e2b0daa857 100644 --- a/sycl/test-e2e/ESIMD/api/replicate_smoke.cpp +++ b/sycl/test-e2e/ESIMD/api/replicate_smoke.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// https://github.com/intel/llvm/issues/10369 -// UNSUPPORTED: gpu // // UNSUPPORTED: gpu-intel-pvc // TODO: remove fno-fast-math option once the issue is investigated and the test @@ -17,16 +15,13 @@ // // The test checks main functionality of the esimd::replicate_vs_w_hs function. -// Temporarily disable while the failure is being investigated. -// UNSUPPORTED: windows - #include "../esimd_test_utils.hpp" using namespace sycl; using namespace sycl::ext::intel::esimd; using bfloat16 = sycl::ext::oneapi::bfloat16; using tfloat32 = sycl::ext::intel::experimental::esimd::tfloat32; - +namespace syclex = sycl::ext::oneapi::experimental; template struct char_to_int { using type = typename std::conditional< sizeof(T) == 1, @@ -182,6 +177,8 @@ int main(int argc, char **argv) { queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); auto dev = q.get_device(); const bool doublesSupported = dev.has(sycl::aspect::fp64); + syclex::architecture arch = + dev.get_info(); std::cout << "Running on " << dev.get_info() << "\n"; bool passed = true; @@ -192,7 +189,8 @@ int main(int argc, char **argv) { passed &= test(q); passed &= test(q); passed &= test(q); - passed &= test(q); + if (arch > syclex::architecture::intel_gpu_dg1) + passed &= test(q); passed &= test(q); #ifdef USE_TF32 passed &= test(q); diff --git a/sycl/test-e2e/ESIMD/api/replicate_smoke_pvc.cpp b/sycl/test-e2e/ESIMD/api/replicate_smoke_pvc.cpp index 44d2cec95848a..8ecae61fa38b0 100644 --- a/sycl/test-e2e/ESIMD/api/replicate_smoke_pvc.cpp +++ b/sycl/test-e2e/ESIMD/api/replicate_smoke_pvc.cpp @@ -12,9 +12,6 @@ // The test checks main functionality of the esimd::replicate_vs_w_hs function. // PVC variant of the test - adds tfloat32. -// Temporarily disable while the failure is being investigated. -// UNSUPPORTED: windows - #define USE_TF32 #include "replicate_smoke.cpp" diff --git a/sycl/test-e2e/ESIMD/fp_in_phi.cpp b/sycl/test-e2e/ESIMD/fp_in_phi.cpp index 4ca19beac646e..9450c46c833de 100644 --- a/sycl/test-e2e/ESIMD/fp_in_phi.cpp +++ b/sycl/test-e2e/ESIMD/fp_in_phi.cpp @@ -5,10 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// On Windows vector compute backend (as a part of IGC) uses llvm-7 and llvm-7 -// based spirv translator. This test should start working on Windows when the -// llvm version is switched to 9. -// UNSUPPORTED: windows // RUN: %{build} -Xclang -fsycl-allow-func-ptr -o %t.out // RUN: %{run} %t.out // UNSUPPORTED: ze_debug diff --git a/sycl/test-e2e/ESIMD/fp_in_select.cpp b/sycl/test-e2e/ESIMD/fp_in_select.cpp index 78f50bdc1d820..12b8ea3c0e25e 100644 --- a/sycl/test-e2e/ESIMD/fp_in_select.cpp +++ b/sycl/test-e2e/ESIMD/fp_in_select.cpp @@ -5,14 +5,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// On Windows vector compute backend (as a part of IGC) uses llvm-7 and llvm-7 -// based spirv translator. 
This test should start working on Windows when the -// llvm version is switched to 9. -// UNSUPPORTED: windows // RUN: %{build} -Xclang -fsycl-allow-func-ptr -o %t.out // RUN: %{run} %t.out -// The test fails on JITing due to use of too many registers -// REQUIRES: TEMPORARY_DISBLED // // The test checks that ESIMD kernels correctly handle function pointers as // arguments of LLVM's select function. diff --git a/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp b/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp index 226a9898e9d7c..22848b9b288f5 100644 --- a/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp +++ b/sycl/test-e2e/ESIMD/noinline_call_from_func.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// Test currently timeouts on Windows Level Zero and OpenCL -// UNSUPPORTED: windows // RUN: %{build} -o %t.out // RUN: env IGC_FunctionControl=3 IGC_ForceInlineStackCallWithImplArg=1 %{run} %t.out // diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_shortcut_with_kb.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_shortcut_with_kb.cpp index 7176def8ec7de..485266c8a3d06 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_shortcut_with_kb.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_shortcut_with_kb.cpp @@ -21,8 +21,8 @@ constexpr size_t N = 1024; int main() { sycl::queue Q; - if (!oneapiext::is_source_kernel_bundle_supported( - Q.get_backend(), oneapiext::source_language::opencl)) { + if (!Q.get_device().ext_oneapi_can_compile( + oneapiext::source_language::opencl)) { std::cout << "Backend does not support OpenCL C source kernel bundle extension: " << Q.get_backend() << std::endl; diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event_and_kb.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event_and_kb.cpp index 0e8988574fd5d..b2731740a60e3 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event_and_kb.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_event_and_kb.cpp @@ -21,8 +21,8 @@ int main() { sycl::queue Q; int Memory[N] = {0}; - if (!oneapiext::is_source_kernel_bundle_supported( - Q.get_backend(), oneapiext::source_language::opencl)) { + if (!Q.get_device().ext_oneapi_can_compile( + oneapiext::source_language::opencl)) { std::cout << "Backend does not support OpenCL C source kernel bundle extension: " << Q.get_backend() << std::endl; diff --git a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_kb.cpp b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_kb.cpp index be29a73b87ee0..2651a803b509c 100644 --- a/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_kb.cpp +++ b/sycl/test-e2e/EnqueueFunctions/kernel_submit_with_kb.cpp @@ -21,8 +21,8 @@ int main() { sycl::queue Q; int Memory[N] = {0}; - if (!oneapiext::is_source_kernel_bundle_supported( - Q.get_backend(), oneapiext::source_language::opencl)) { + if (!Q.get_device().ext_oneapi_can_compile( + oneapiext::source_language::opencl)) { std::cout << "Backend does not support OpenCL C source kernel bundle extension: " << Q.get_backend() << std::endl; diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp similarity index 92% rename from sycl/test-e2e/KernelCompiler/kernel_compiler.cpp rename to sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp index 356a26f7be2e8..79b72ee19b0ae 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_opencl.cpp @@ -1,4 +1,4 @@ 
-//==- kernel_compiler.cpp --- kernel_compiler extension tests -------------==// +//==- kernel_compiler_opencl.cpp --- kernel_compiler extension tests -------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -85,12 +85,13 @@ void test_build_and_run() { sycl::context ctx{d}; sycl::queue q{ctx, d}; - bool ok = syclex::is_source_kernel_bundle_supported( - ctx.get_backend(), syclex::source_language::opencl); + bool ok = + q.get_device().ext_oneapi_can_compile(syclex::source_language::opencl); if (!ok) { - std::cout << "Apparently this backend does not support OpenCL C source " + std::cout << "Apparently this device does not support OpenCL C source " "kernel bundle extension: " - << ctx.get_backend() << std::endl; + << q.get_device().get_info() + << std::endl; return; } @@ -141,8 +142,8 @@ void test_error() { sycl::context ctx{d}; sycl::queue q{ctx, d}; - bool ok = syclex::is_source_kernel_bundle_supported( - ctx.get_backend(), syclex::source_language::opencl); + bool ok = + q.get_device().ext_oneapi_can_compile(syclex::source_language::opencl); if (!ok) { return; } diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp index c0e8a7dda85ae..38567fe6ee0b3 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp @@ -175,9 +175,7 @@ void testKernelsFromSpvFile(std::string kernels_file, return bundle.ext_oneapi_get_kernel(name); }; - sycl::device d; - sycl::context ctx{d}; - sycl::queue q{ctx, d}; + sycl::queue q; auto bundle = loadKernelsFromFile(q, kernels_file); // Test simple kernel. diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp new file mode 100644 index 0000000000000..fc294f49fad45 --- /dev/null +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl.cpp @@ -0,0 +1,276 @@ +//==- kernel_compiler_sycl.cpp --- kernel_compiler extension tests -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: (opencl || level_zero) +// UNSUPPORTED: accelerator + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +auto constexpr AddEmH = R"===( + int AddEm(int a, int b){ + return a + b + 5; + } +)==="; + +// TODO: remove SYCL_EXTERNAL once it is no longer needed. 
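+// A quick orientation for the source string below: it defines two free-function
+// kernels that are compiled at run time. ff_cp is declared extern "C", so it can
+// later be looked up by its plain name, while ff_templated is a template and is
+// therefore only reachable through its mangled name.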
+auto constexpr SYCLSource = R"===(
+#include
+#include "AddEm.h"
+
+// use extern "C" to avoid name mangling
+extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>))
+void ff_cp(int *ptr) {
+
+  // intentionally using deprecated routine, as opposed to this_work_item::get_nd_item<1>()
+  sycl::nd_item<1> Item = sycl::ext::oneapi::experimental::this_nd_item<1>();
+
+  sycl::id<1> GId = Item.get_global_id();
+  ptr[GId.get(0)] = AddEm(GId.get(0), 37);
+}
+
+// this name will be mangled
+template
+SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>))
+void ff_templated(T *ptr) {
+
+  sycl::nd_item<1> Item = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+
+  sycl::id<1> GId = Item.get_global_id();
+  ptr[GId.get(0)] = GId.get(0) + 39;
+}
+)===";
+
+auto constexpr BadSource = R"===(
+#include
+
+extern "C" SYCL_EXTERNAL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>))
+void ff_cp(int *ptr) {
+
+  sycl::nd_item<1> Item = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+
+  sycl::id<1> GId = Item.get_global_id() + no semi colon !!
+  ptr[GId.get(0)] = GId.get(0) + 41;
+}
+)===";
+
+auto constexpr ESIMDSource = R"===(
+#include
+#include
+
+using namespace sycl::ext::intel::esimd;
+
+constexpr int VL = 16;
+
+extern "C" SYCL_EXTERNAL SYCL_ESIMD_KERNEL SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((sycl::ext::oneapi::experimental::nd_range_kernel<1>))
+void vector_add_esimd(float *A, float *B, float *C) {
+  sycl::nd_item<1> item = sycl::ext::oneapi::this_work_item::get_nd_item<1>();
+  unsigned int i = item.get_global_id(0);
+  unsigned int offset = i * VL;
+
+  simd va(A + offset);
+  simd vb(B + offset);
+  simd vc = va + vb;
+  vc.copy_to(C + offset);
+}
+)===";
+
+void test_1(sycl::queue &Queue, sycl::kernel &Kernel, int seed) {
+  constexpr int Range = 10;
+  int *usmPtr = sycl::malloc_shared(Range, Queue);
+  int start = 3;
+
+  sycl::nd_range<1> R1{{Range}, {1}};
+
+  bool Passa = true;
+
+  memset(usmPtr, 0, Range * sizeof(int));
+  Queue.submit([&](sycl::handler &Handler) {
+    Handler.set_arg(0, usmPtr);
+    Handler.parallel_for(R1, Kernel);
+  });
+  Queue.wait();
+
+  for (int i = 0; i < Range; i++) {
+    std::cout << usmPtr[i] << " ";
+    assert(usmPtr[i] == i + seed);
+  }
+  std::cout << std::endl;
+
+  sycl::free(usmPtr, Queue);
+}
+
+void test_build_and_run() {
+  namespace syclex = sycl::ext::oneapi::experimental;
+  using source_kb = sycl::kernel_bundle;
+  using exe_kb = sycl::kernel_bundle;
+
+  sycl::queue q;
+  sycl::context ctx = q.get_context();
+
+  bool ok =
+      q.get_device().ext_oneapi_can_compile(syclex::source_language::sycl);
+  if (!ok) {
+    std::cout << "Apparently this device does not support SYCL source "
+                 "kernel bundle extension: "
+              << q.get_device().get_info()
+              << std::endl;
+    return;
+  }
+
+  // Create from source.
+  source_kb kbSrc = syclex::create_kernel_bundle_from_source(
+      ctx, syclex::source_language::sycl, SYCLSource,
+      syclex::properties{syclex::include_files{"AddEm.h", AddEmH}});
+
+  // Double check kernel_bundle.get_source() / get_backend().
+  sycl::context ctxRes = kbSrc.get_context();
+  assert(ctxRes == ctx);
+  sycl::backend beRes = kbSrc.get_backend();
+  assert(beRes == ctx.get_backend());
+
+  // Compilation of empty prop list, no devices.
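+  // Presumably this one-argument overload builds for every device in the
+  // source bundle's context with default options; an explicit equivalent
+  // would be syclex::build(kbSrc, kbSrc.get_devices(), syclex::properties{}),
+  // and the overload with devices and properties is exercised just below.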
+ exe_kb kbExe1 = syclex::build(kbSrc); + + // Compilation with props and devices + std::string log; + std::vector flags{"-g", "-fno-fast-math"}; + std::vector devs = kbSrc.get_devices(); + exe_kb kbExe2 = syclex::build( + kbSrc, devs, + syclex::properties{// syclex::build_options{flags}, + syclex::save_log{&log}, + syclex::registered_kernel_names{"ff_templated"}}); + assert(log.find("warning: 'this_nd_item<1>' is deprecated") != + std::string::npos); + + // clang-format off + + // extern "C" was used, so the name "ff_cp" is not mangled and can be used directly. + sycl::kernel k = kbExe2.ext_oneapi_get_kernel("ff_cp"); + + // The templated function name will have been mangled. Mapping from original + // name to mangled is not yet supported. So we cannot yet do this: + // sycl::kernel k2 = kbExe2.ext_oneapi_get_kernel("ff_templated"); + + // Instead, we can TEMPORARILY use the mangled name. Once demangling is supported + // this might no longer work. + sycl::kernel k2 = kbExe2.ext_oneapi_get_kernel("_Z26__sycl_kernel_ff_templatedIiEvPT_"); + + // clang-format on + + // Test the kernels. + test_1(q, k, 37 + 5); // AddEm will add 5 more. + test_1(q, k2, 39); +} + +void test_error() { + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::queue q; + sycl::context ctx = q.get_context(); + + bool ok = + q.get_device().ext_oneapi_can_compile(syclex::source_language::sycl); + if (!ok) { + return; + } + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl, BadSource); + try { + exe_kb kbExe = syclex::build(kbSrc); + assert(false && "we should not be here"); + } catch (sycl::exception &e) { + // yas! + assert(e.code() == sycl::errc::build); + assert(std::string(e.what()).find( + "error: expected ';' at end of declaration") != + std::string::npos); + } +} + +void test_esimd() { + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::queue q; + sycl::context ctx = q.get_context(); + + if (!q.get_device().has(sycl::aspect::ext_intel_esimd)) { + std::cout << "Device does not support ESIMD" << std::endl; + return; + } + + bool ok = + q.get_device().ext_oneapi_can_compile(syclex::source_language::sycl); + if (!ok) { + return; + } + + std::string log; + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl, ESIMDSource); + exe_kb kbExe = + syclex::build(kbSrc, syclex::properties{syclex::save_log{&log}}); + + // extern "C" was used, so the name "vector_add_esimd" is not mangled and can + // be used directly. + sycl::kernel k = kbExe.ext_oneapi_get_kernel("vector_add_esimd"); + + // Now test it. + constexpr int VL = 16; // this constant also in ESIMDSource string. + constexpr int size = VL * 16; + + float *A = sycl::malloc_shared(size, q); + float *B = sycl::malloc_shared(size, q); + float *C = sycl::malloc_shared(size, q); + for (size_t i = 0; i < size; i++) { + A[i] = float(1); + B[i] = float(2); + C[i] = 0.0f; + } + sycl::range<1> GlobalRange{size / VL}; + sycl::range<1> LocalRange{1}; + sycl::nd_range<1> NDRange{GlobalRange, LocalRange}; + + q.submit([&](sycl::handler &h) { + h.set_arg(0, A); + h.set_arg(1, B); + h.set_arg(2, C); + h.parallel_for(NDRange, k); + }).wait(); + + // Check. 
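+  // Expected result: vector_add_esimd computed C[i] = A[i] + B[i], and the
+  // buffers were initialized to 1.0f and 2.0f above, so every element of C
+  // should now be 3.0f.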
+ for (size_t i = 0; i < size; i++) { + assert(C[i] == 3.0f); + } + + sycl::free(A, q); + sycl::free(B, q); + sycl::free(C, q); +} + +int main() { + +#ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER + test_build_and_run(); + test_error(); + test_esimd(); +#else + static_assert(false, "Kernel Compiler feature test macro undefined"); +#endif + return 0; +} diff --git a/sycl/test-e2e/Regression/acos.cpp b/sycl/test-e2e/Regression/acos.cpp new file mode 100644 index 0000000000000..59cb130be0756 --- /dev/null +++ b/sycl/test-e2e/Regression/acos.cpp @@ -0,0 +1,39 @@ +// REQUIRES: aspect-fp64 +// UNSUPPORTED: cuda || hip + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +#include +#include +#include + +using namespace sycl; +using T = std::complex; + +int main() { + queue q; + const double pi = std::atan2(+0., -0.); + T data[][2] = {{{-2, -INFINITY}, {pi / 2, INFINITY}}, + {{-0., INFINITY}, {pi / 2, -INFINITY}}}; + int N = std::size(data); + auto *p = malloc_shared(N, q); + + q.single_task([=] { + for (int i = 0; i < N; ++i) { + p[i] = std::acos(data[i][0]); + } + }).wait(); + + int fails = 0; + for (int i = 0; i < N; ++i) { + auto actual = p[i]; + auto expected = data[i][1]; + if (expected != actual) { + std::cout << i << " fail:" + << "expected = " << expected << ", actual = " << actual << "\n"; + ++fails; + } + } + + return fails; +} diff --git a/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp b/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp index 5b71a60a54051..a7d4c6493b8b5 100644 --- a/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp +++ b/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp @@ -1,9 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// Assertion `!MHostPlatform && "Plugin is not available for Host."' failed on -// Nvidia. -// XFAIL: hip_nvidia #include diff --git a/sycl/test-e2e/Tracing/code_location_queue_submit.cpp b/sycl/test-e2e/Tracing/code_location_queue_submit.cpp index 6ebfe43e936e5..ce780f5e81725 100644 --- a/sycl/test-e2e/Tracing/code_location_queue_submit.cpp +++ b/sycl/test-e2e/Tracing/code_location_queue_submit.cpp @@ -5,8 +5,7 @@ // Test tracing of the code location data for queue.submit in case of failure // (exception generation) -// First queue creation (id = 0) is queue created on line 15. -// The second queue is a host queue created on first scheduler usage. +// First queue creation (id = 0) is queue created on line 17. #include #include @@ -19,16 +18,10 @@ int main() { unsigned char *HostAllocDst = NULL; // CHECK: [SYCL] Queue create: // CHECK-DAG: queue_handle : {{.*}} - // CHECK-DAG: queue_id : 0 - // CHECK-DAG: is_inorder : false - // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : {{.*}} - // CHECK-DAG: sycl_context : {{.*}} - // CHECK-NEXT: [SYCL] Queue create: // CHECK-DAG: queue_id : 1 // CHECK-DAG: is_inorder : false // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : SYCL host device + // CHECK-DAG: sycl_device_name : {{.*}} // CHECK-DAG: sycl_context : {{.*}} // CHECK: [SYCL] Runtime reports: // CHECK-NEXT: what: NULL pointer argument in memory copy operation. 
-30 (PI_ERROR_INVALID_VALUE) @@ -44,6 +37,6 @@ int main() { sycl::free(HostAllocSrc, Q); } // CHECK-NEXT: [SYCL] Queue destroy: - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 return !ExceptionCaught; } diff --git a/sycl/test-e2e/Tracing/task_execution.cpp b/sycl/test-e2e/Tracing/task_execution.cpp index d591c20b8f6c0..b4932df0eda55 100644 --- a/sycl/test-e2e/Tracing/task_execution.cpp +++ b/sycl/test-e2e/Tracing/task_execution.cpp @@ -15,38 +15,32 @@ int main() { Q.copy(AllocDst, AllocSrc, 1).wait(); // CHECK: [SYCL] Queue create: // CHECK-DAG: queue_handle : {{.*}} - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: is_inorder : false // CHECK-DAG: sycl_device : {{.*}} // CHECK-DAG: sycl_device_name : {{.*}} // CHECK-DAG: sycl_context : {{.*}} // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: memory_size : 1 // CHECK-DAG: value_set : 0 // CHECK-DAG: memory_ptr : {{.*}} // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: memory_size : 1 // CHECK-DAG: dest_memory_ptr : {{.*}} // CHECK-DAG: src_memory_ptr : {{.*}} // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) - // CHECK-NEXT: [SYCL] Queue create: - // CHECK-DAG: queue_id : 1 - // CHECK-DAG: is_inorder : false - // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : SYCL host device - // CHECK-DAG: sycl_context : {{.*}} Q.single_task([]() {}).wait(); // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) // CHECK-DAG: enqueue_kernel_data : {{.*}} // CHECK-DAG: sym_column_no : {{.*}} - // CHECK-DAG: sym_line_no : 43 + // CHECK-DAG: sym_line_no : 37 // CHECK-DAG: sym_source_file_name : {{.*}}task_execution.cpp - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_function_name : typeinfo name for main::E2ETestKernel // CHECK-DAG: from_source : {{.*}} // CHECK-DAG: sycl_device_name : {{.*}} @@ -55,7 +49,7 @@ int main() { // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Queue destroy: - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 sycl::free(AllocSrc, Q); sycl::free(AllocDst, Q); } diff --git a/sycl/test-e2e/Tracing/task_execution_handler.cpp b/sycl/test-e2e/Tracing/task_execution_handler.cpp index 0563275f81312..a208fe6655bda 100644 --- a/sycl/test-e2e/Tracing/task_execution_handler.cpp +++ b/sycl/test-e2e/Tracing/task_execution_handler.cpp @@ -16,7 +16,7 @@ int main() { { cgh.memset(AllocSrc, 0, 1); }) .wait(); // CHECK: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_column_no : {{.*}} // CHECK-DAG: sym_function_name : {{.*}} // CHECK-DAG: kernel_name : {{.*}} @@ -27,7 +27,7 @@ int main() { // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_column_no : {{.*}} // CHECK-DAG: sym_function_name : {{.*}} // CHECK-DAG: kernel_name : {{.*}} diff --git a/sycl/test-e2e/no_sycl_hpp_in_e2e_tests.cpp b/sycl/test-e2e/no_sycl_hpp_in_e2e_tests.cpp index 7b2aa6b3b5883..74f8e1aed8aff 100644 --- 
a/sycl/test-e2e/no_sycl_hpp_in_e2e_tests.cpp +++ b/sycl/test-e2e/no_sycl_hpp_in_e2e_tests.cpp @@ -7,7 +7,7 @@ // CHECK-DAG: no_sycl_hpp_in_e2e_tests.cpp // CHECK-DAG: lit.cfg.py // -// CHECK-NUM-MATCHES: 3 +// CHECK-NUM-MATCHES: 4 // // This test verifies that `` isn't used in E2E tests. Instead, // fine-grained includes should used, see diff --git a/sycl/test-e2e/syclcompat/dim.cpp b/sycl/test-e2e/syclcompat/dim.cpp index b4f1a1595bde5..7eea85960933b 100644 --- a/sycl/test-e2e/syclcompat/dim.cpp +++ b/sycl/test-e2e/syclcompat/dim.cpp @@ -35,6 +35,33 @@ int main() { assert(d3.y == 1); assert(d3.z == 1); } + std::cout << "Testing Empty Construct" << std::endl; + { + syclcompat::dim3 d3; + assert(d3.x == 1); + assert(d3.y == 1); + assert(d3.z == 1); + } + std::cout << "Testing Empty Construct & Update" << std::endl; + { + syclcompat::dim3 d3; + d3.x = 1; + d3.y = 2; + d3.z = 3; + + assert(d3.x == 1); + assert(d3.y == 2); + assert(d3.z == 3); + } + std::cout << "Testing Empty Construct & Update 2" << std::endl; + { + syclcompat::dim3 d3; + d3.x = 32; + + assert(d3.x == 32); + assert(d3.y == 1); + assert(d3.z == 1); + } std::cout << "Testing Convert" << std::endl; { syclcompat::dim3 d3(512); diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 99fb95d92fa72..4a7d0dcdcb0eb 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2988,17 +2988,8 @@ _ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE0EE _ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE1EE7compileIJSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISE_EEEEES8_IhSaIhEERKSE_DpRKT_ _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE -_ZN4sycl3_V13ext6oneapi12experimental10mem_adviseENS0_5queueEPvmiRKNS0_6detail13code_locationE _ZN4sycl3_V13ext6oneapi10level_zero6detail11make_deviceERKNS0_8platformEm -_ZN4sycl3_V13ext6oneapi12experimental12physical_memC1ERKNS0_6deviceERKNS0_7contextEm -_ZN4sycl3_V13ext6oneapi12experimental12physical_memC2ERKNS0_6deviceERKNS0_7contextEm -_ZN4sycl3_V13ext6oneapi12experimental15get_access_modeEPKvmRKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental15set_access_modeEPKvmNS3_19address_access_modeERKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental16free_virtual_memEmmRKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental19get_mem_granularityERKNS0_6deviceERKNS0_7contextENS3_16granularity_modeE -_ZN4sycl3_V13ext6oneapi12experimental19get_mem_granularityERKNS0_7contextENS3_16granularity_modeE -_ZN4sycl3_V13ext6oneapi12experimental19reserve_virtual_memEmmRKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental5unmapEPKvmRKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental10mem_adviseENS0_5queueEPvmiRKNS0_6detail13code_locationE _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_16image_descriptorERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental12create_imageENS3_16image_mem_handleERKNS3_22bindless_image_samplerERKNS3_16image_descriptorERKNS0_5queueE @@ -3009,6 +3000,8 @@ _ZN4sycl3_V13ext6oneapi12experimental12create_imageERNS3_9image_memERKNS3_16imag _ZN4sycl3_V13ext6oneapi12experimental12create_imageERNS3_9image_memERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE 
_ZN4sycl3_V13ext6oneapi12experimental12create_imageERNS3_9image_memERKNS3_22bindless_image_samplerERKNS3_16image_descriptorERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental12create_imageERNS3_9image_memERKNS3_22bindless_image_samplerERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental12physical_memC1ERKNS0_6deviceERKNS0_7contextEm +_ZN4sycl3_V13ext6oneapi12experimental12physical_memC2ERKNS0_6deviceERKNS0_7contextEm _ZN4sycl3_V13ext6oneapi12experimental14free_image_memENS3_16image_mem_handleENS3_10image_typeERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental14free_image_memENS3_16image_mem_handleENS3_10image_typeERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental14free_image_memENS3_16image_mem_handleERKNS0_5queueE @@ -3017,10 +3010,16 @@ _ZN4sycl3_V13ext6oneapi12experimental15alloc_image_memERKNS3_16image_descriptorE _ZN4sycl3_V13ext6oneapi12experimental15alloc_image_memERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental15free_mipmap_memENS3_16image_mem_handleERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental15free_mipmap_memENS3_16image_mem_handleERKNS0_6deviceERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental15get_access_modeEPKvmRKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental15get_image_rangeENS3_16image_mem_handleERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental15get_image_rangeENS3_16image_mem_handleERKNS0_6deviceERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental15set_access_modeEPKvmNS3_19address_access_modeERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental16alloc_mipmap_memERKNS3_16image_descriptorERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental16alloc_mipmap_memERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental16free_virtual_memEmmRKNS0_7contextE +_ZN4sycl3_V13ext6oneapi12experimental19get_mem_granularityERKNS0_6deviceERKNS0_7contextENS3_16granularity_modeE +_ZN4sycl3_V13ext6oneapi12experimental19get_mem_granularityERKNS0_7contextENS3_16granularity_modeE +_ZN4sycl3_V13ext6oneapi12experimental19reserve_virtual_memEmmRKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental20destroy_image_handleERNS3_20sampled_image_handleERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental20destroy_image_handleERNS3_20sampled_image_handleERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental20destroy_image_handleERNS3_22unsampled_image_handleERKNS0_5queueE @@ -3060,9 +3059,6 @@ _ZN4sycl3_V13ext6oneapi12experimental25map_external_memory_arrayENS3_18interop_m _ZN4sycl3_V13ext6oneapi12experimental25map_external_memory_arrayENS3_18interop_mem_handleERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_semaphore_handleERKNS0_5queueE _ZN4sycl3_V13ext6oneapi12experimental26destroy_external_semaphoreENS3_24interop_semaphore_handleERKNS0_6deviceERKNS0_7contextE -_ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE -_ZN4sycl3_V13ext6oneapi12experimental32create_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKSt6vectorISt4byteSaIS9_EE -_ZN4sycl3_V13ext6oneapi12experimental33is_source_kernel_bundle_supportedENS0_7backendENS3_15source_languageE _ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi1EEEvNS0_5rangeIXT_EEE _ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi2EEEvNS0_5rangeIXT_EEE 
_ZN4sycl3_V13ext6oneapi12experimental4node12update_rangeILi3EEEvNS0_5rangeIXT_EEE @@ -3070,11 +3066,12 @@ _ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi1EEEvNS0_8nd_range _ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi2EEEvNS0_8nd_rangeIXT_EEE _ZN4sycl3_V13ext6oneapi12experimental4node15update_nd_rangeILi3EEEvNS0_8nd_rangeIXT_EEE _ZN4sycl3_V13ext6oneapi12experimental4node19get_node_from_eventENS0_5eventE +_ZN4sycl3_V13ext6oneapi12experimental5unmapEPKvmRKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC1ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implC2ERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD1Ev _ZN4sycl3_V13ext6oneapi12experimental6detail14image_mem_implD2Ev -_ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bundleILNS0_12bundle_stateE3EEERKSt6vectorINS0_6deviceESaISA_EERKS9_INSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISK_EEPSK_ +_ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bundleILNS0_12bundle_stateE3EEERKSt6vectorINS0_6deviceESaISA_EERKS9_INSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISK_EEPSK_SO_ _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPKvm _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base14updateAccessorEPKNS0_6detail16AccessorBaseHostE _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv @@ -3098,6 +3095,9 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_5 _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC1ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC2ERKNS0_5queueERKNS0_13property_listE _ZN4sycl3_V13ext6oneapi12experimental6detail24modifiable_command_graphC2ERKNS0_7contextERKNS0_6deviceERKNS0_13property_listE +_ZN4sycl3_V13ext6oneapi12experimental6detail30make_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt6vectorISt4pairISE_SE_ESaISJ_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail30make_kernel_bundle_from_sourceERKNS0_7contextENS3_15source_languageERKSt6vectorISt4byteSaISA_EES9_ISt4pairINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESL_ESaISM_EE +_ZN4sycl3_V13ext6oneapi12experimental6detail33is_source_kernel_bundle_supportedENS0_7backendENS3_15source_languageE _ZN4sycl3_V13ext6oneapi12experimental6memcpyENS0_5queueEPvPKvmRKNS0_6detail13code_locationE _ZN4sycl3_V13ext6oneapi12experimental6memsetENS0_5queueEPvimRKNS0_6detail13code_locationE _ZN4sycl3_V13ext6oneapi12experimental9image_memC1ERKNS3_16image_descriptorERKNS0_5queueE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 9b80d2eb69c8b..00d45e8913778 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -569,10 +569,10 @@ ??0half@host_half_impl@detail@_V1@sycl@@QEAA@AEBM@Z ??0half@host_half_impl@detail@_V1@sycl@@QEAA@G@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z -??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N1@Z 
-??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N1@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z ??0host_selector@_V1@sycl@@QEAA@$$QEAV012@@Z ??0host_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0host_selector@_V1@sycl@@QEAA@XZ @@ -609,10 +609,6 @@ ??0kernel_id@_V1@sycl@@AEAA@PEBD@Z ??0kernel_id@_V1@sycl@@QEAA@$$QEAV012@@Z ??0kernel_id@_V1@sycl@@QEAA@AEBV012@@Z -??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z -??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z -??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBVqueue@45@_K@Z -??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBVdevice@45@AEBVcontext@45@_K@Z ??0modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA@AEBV?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z ??0modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z @@ -621,6 +617,10 @@ ??0node@experimental@oneapi@ext@_V1@sycl@@AEAA@AEBV?$shared_ptr@Vnode_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z ??0node@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z ??0node@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBVdevice@45@AEBVcontext@45@_K@Z +??0physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBVqueue@45@_K@Z ??0platform@_V1@sycl@@AEAA@AEBVdevice@12@@Z ??0platform@_V1@sycl@@AEAA@V?$shared_ptr@Vplatform_impl@detail@_V1@sycl@@@std@@@Z ??0platform@_V1@sycl@@QEAA@$$QEAV012@@Z @@ -683,9 +683,9 @@ ??1kernel@_V1@sycl@@QEAA@XZ ??1kernel_bundle_plain@detail@_V1@sycl@@QEAA@XZ ??1kernel_id@_V1@sycl@@QEAA@XZ -??1physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1node@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ +??1physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1platform@_V1@sycl@@QEAA@XZ ??1queue@_V1@sycl@@QEAA@XZ ??1sampler@_V1@sycl@@QEAA@XZ @@ -768,12 +768,12 @@ ??4kernel_bundle_plain@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4kernel_id@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4kernel_id@_V1@sycl@@QEAAAEAV012@AEBV012@@Z -??4physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z -??4physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??4modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@$$QEAV0123456@@Z ??4modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@AEBV0123456@@Z ??4node@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z ??4node@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z +??4physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +??4physical_mem@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??4platform@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z ??4platform@_V1@sycl@@QEAAAEAV012@AEBV012@@Z ??4queue@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z @@ -3933,7 +3933,7 @@ ?begin@kernel_bundle_plain@detail@_V1@sycl@@IEBAPEBVdevice_image_plain@234@XZ 
?begin_recording@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEAVqueue@67@AEBVproperty_list@67@@Z ?begin_recording@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAAXAEBV?$vector@Vqueue@_V1@sycl@@V?$allocator@Vqueue@_V1@sycl@@@std@@@std@@AEBVproperty_list@67@@Z -?build_from_source@detail@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$01@56@AEAV?$kernel_bundle@$02@56@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@PEAV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z +?build_from_source@detail@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$01@56@AEAV?$kernel_bundle@$02@56@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@PEAV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@2@Z ?build_impl@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@AEBV?$kernel_bundle@$0A@@23@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@5@AEBVproperty_list@23@@Z ?cancel_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAAXXZ ?category@exception@_V1@sycl@@QEBAAEBVerror_category@std@@XZ @@ -3956,8 +3956,6 @@ ?create_image@experimental@oneapi@ext@_V1@sycl@@YA?AUunsampled_image_handle@12345@AEAVimage_mem@12345@AEBUimage_descriptor@12345@AEBVqueue@45@@Z ?create_image@experimental@oneapi@ext@_V1@sycl@@YA?AUunsampled_image_handle@12345@Uimage_mem_handle@12345@AEBUimage_descriptor@12345@AEBVdevice@45@AEBVcontext@45@@Z ?create_image@experimental@oneapi@ext@_V1@sycl@@YA?AUunsampled_image_handle@12345@Uimage_mem_handle@12345@AEBUimage_descriptor@12345@AEBVqueue@45@@Z -?create_kernel_bundle_from_source@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$02@45@AEBVcontext@45@W4source_language@12345@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z -?create_kernel_bundle_from_source@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$02@45@AEBVcontext@45@W4source_language@12345@AEBV?$vector@W4byte@std@@V?$allocator@W4byte@std@@@2@@std@@@Z ?default_selector_v@_V1@sycl@@YAHAEBVdevice@12@@Z ?deleteAccProps@buffer_plain@detail@_V1@sycl@@IEAAXAEBW4PropWithDataKind@234@@Z ?depends_on@handler@_V1@sycl@@IEAAXAEBV?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z @@ -4088,14 +4086,13 @@ ?find_device_intersection@detail@_V1@sycl@@YA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@V?$kernel_bundle@$00@_V1@sycl@@V?$allocator@V?$kernel_bundle@$00@_V1@sycl@@@std@@@5@@Z ?free@_V1@sycl@@YAXPEAXAEBVcontext@12@AEBUcode_location@detail@12@@Z ?free@_V1@sycl@@YAXPEAXAEBVqueue@12@AEBUcode_location@detail@12@@Z -?free_virtual_mem@experimental@oneapi@ext@_V1@sycl@@YAX_K0AEBVcontext@45@@Z +?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z ?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVqueue@45@@Z ?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@W4image_type@12345@AEBVdevice@45@AEBVcontext@45@@Z ?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@W4image_type@12345@AEBVqueue@45@@Z 
-?free_image_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z ?free_mipmap_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z ?free_mipmap_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVqueue@45@@Z -?free_mipmap_mem@experimental@oneapi@ext@_V1@sycl@@YAXUimage_mem_handle@12345@AEBVdevice@45@AEBVcontext@45@@Z +?free_virtual_mem@experimental@oneapi@ext@_V1@sycl@@YAX_K0AEBVcontext@45@@Z ?frexp_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@PEAH@Z ?frexp_impl@detail@_V1@sycl@@YAMMPEAH@Z ?frexp_impl@detail@_V1@sycl@@YANNPEAH@Z @@ -4210,8 +4207,8 @@ ?get_coordinate_normalization_mode@sampler@_V1@sycl@@QEBA?AW4coordinate_normalization_mode@23@XZ ?get_count@image_plain@detail@_V1@sycl@@IEBA_KXZ ?get_descriptor@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBAAEBUimage_descriptor@23456@XZ -?get_device@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ ?get_device@image_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ +?get_device@physical_mem@experimental@oneapi@ext@_V1@sycl@@QEBA?AVdevice@56@XZ ?get_device@queue@_V1@sycl@@QEBA?AVdevice@23@XZ ?get_devices@context@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ ?get_devices@device@_V1@sycl@@SA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4device_type@info@23@@Z @@ -4310,7 +4307,7 @@ ?is_host@queue@_V1@sycl@@QEBA_NXZ ?is_in_fusion_mode@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEBA_NXZ ?is_in_order@queue@_V1@sycl@@QEBA_NXZ -?is_source_kernel_bundle_supported@experimental@oneapi@ext@_V1@sycl@@YA_NW4backend@45@W4source_language@12345@@Z +?is_source_kernel_bundle_supported@detail@experimental@oneapi@ext@_V1@sycl@@YA_NW4backend@56@W4source_language@23456@@Z ?is_specialization_constant_set@kernel_bundle_plain@detail@_V1@sycl@@IEBA_NPEBD@Z ?join_impl@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@AEBV?$vector@V?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@V?$allocator@V?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@@2@@5@W4bundle_state@23@@Z ?lgamma_r_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@PEAH@Z @@ -4329,6 +4326,8 @@ ?make_kernel@detail@_V1@sycl@@YA?AVkernel@23@_KAEBVcontext@23@W4backend@23@@Z ?make_kernel_bundle@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@_KAEBVcontext@23@W4bundle_state@23@W4backend@23@@Z ?make_kernel_bundle@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@_KAEBVcontext@23@_NW4bundle_state@23@W4backend@23@@Z +?make_kernel_bundle_from_source@detail@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$02@56@AEBVcontext@56@W4source_language@23456@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$vector@U?$pair@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V12@@std@@V?$allocator@U?$pair@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V12@@std@@@2@@std@@@Z +?make_kernel_bundle_from_source@detail@experimental@oneapi@ext@_V1@sycl@@YA?AV?$kernel_bundle@$02@56@AEBVcontext@56@W4source_language@23456@AEBV?$vector@W4byte@std@@V?$allocator@W4byte@std@@@2@@std@@V?$vector@U?$pair@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V12@@std@@V?$allocator@U?$pair@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V12@@std@@@2@@std@@@Z ?make_platform@detail@_V1@sycl@@YA?AVplatform@23@_KW4backend@23@@Z 
?make_queue@detail@_V1@sycl@@YA?AVqueue@23@_KHAEBVcontext@23@PEBVdevice@23@_NAEBVproperty_list@23@AEBV?$function@$$A6AXVexception_list@_V1@sycl@@@Z@std@@W4backend@23@@Z ?malloc@_V1@sycl@@YAPEAX_KAEBVdevice@12@AEBVcontext@12@W4alloc@usm@12@AEBUcode_location@detail@12@@Z diff --git a/sycl/unittests/Extensions/CommandGraph/Barrier.cpp b/sycl/unittests/Extensions/CommandGraph/Barrier.cpp index 79f55dc226b62..790956d5aeff4 100644 --- a/sycl/unittests/Extensions/CommandGraph/Barrier.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Barrier.cpp @@ -61,8 +61,8 @@ TEST_F(CommandGraphTest, EnqueueBarrierMultipleQueues) { auto Node3Graph = Queue.submit( [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); - auto Barrier = - Queue2.submit([&](sycl::handler &cgh) { cgh.ext_oneapi_barrier(); }); + auto Barrier = Queue2.submit( + [&](sycl::handler &cgh) { cgh.ext_oneapi_barrier({Node2Graph}); }); auto Node4Graph = Queue2.submit( [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); @@ -74,21 +74,44 @@ TEST_F(CommandGraphTest, EnqueueBarrierMultipleQueues) { // Check the graph structure // (1) (2) (3) - // \ | / - // \ | / + // | // (B) // / \ // (4) (5) ASSERT_EQ(GraphImpl->MRoots.size(), 3lu); for (auto Root : GraphImpl->MRoots) { - auto Node = Root.lock(); - ASSERT_EQ(Node->MSuccessors.size(), 1lu); - auto BarrierNode = Node->MSuccessors.front().lock(); - ASSERT_EQ(BarrierNode->MCGType, sycl::detail::CG::Barrier); - ASSERT_EQ(GraphImpl->getEventForNode(BarrierNode), - sycl::detail::getSyclObjImpl(Barrier)); - ASSERT_EQ(BarrierNode->MPredecessors.size(), 3lu); - ASSERT_EQ(BarrierNode->MSuccessors.size(), 2lu); + auto RootNode = Root.lock(); + + if (GraphImpl->getEventForNode(RootNode) == + sycl::detail::getSyclObjImpl(Node2Graph)) { + + ASSERT_EQ(RootNode->MSuccessors.size(), 1lu); + auto SuccNode = RootNode->MSuccessors.front().lock(); + + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Barrier)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 2lu); + + for (auto SuccSucc : SuccNode->MSuccessors) { + auto SuccSuccNode = SuccSucc.lock(); + + if (GraphImpl->getEventForNode(SuccSuccNode) == + sycl::detail::getSyclObjImpl(Node4Graph)) { + ASSERT_EQ(SuccSuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccSuccNode->MSuccessors.size(), 0lu); + } else if (GraphImpl->getEventForNode(SuccSuccNode) == + sycl::detail::getSyclObjImpl(Node5Graph)) { + ASSERT_EQ(SuccSuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccSuccNode->MSuccessors.size(), 0lu); + } else { + ASSERT_TRUE(false && "Unexpected node"); + } + } + } else { + ASSERT_EQ(RootNode->MSuccessors.size(), 0lu); + } } } @@ -434,10 +457,8 @@ TEST_F(CommandGraphTest, InOrderQueuesWithEmptyBarrierWaitList) { // Check the graph structure // (1) (2) - // \ / | - // (B) | - // | / - // (3) + // | | + // (B) (3) auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); ASSERT_EQ(GraphImpl->MRoots.size(), 2lu); @@ -447,27 +468,28 @@ TEST_F(CommandGraphTest, InOrderQueuesWithEmptyBarrierWaitList) { if (GraphImpl->getEventForNode(RootNode) == sycl::detail::getSyclObjImpl(Node1)) { ASSERT_EQ(RootNode->MSuccessors.size(), 1lu); - } else if (GraphImpl->getEventForNode(RootNode) == - sycl::detail::getSyclObjImpl(Node2)) { - ASSERT_EQ(RootNode->MSuccessors.size(), 2lu); - } else { - ASSERT_TRUE(false && "Unexpected root node"); - } - auto SuccNode = RootNode->MSuccessors.front().lock(); + auto SuccNode = RootNode->MSuccessors.front().lock(); - ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), 
- sycl::detail::getSyclObjImpl(BarrierNode)); - ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); - ASSERT_EQ(SuccNode->MSuccessors.size(), 1lu); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(BarrierNode)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else if (GraphImpl->getEventForNode(RootNode) == + sycl::detail::getSyclObjImpl(Node2)) { + ASSERT_EQ(RootNode->MSuccessors.size(), 1lu); - auto SuccSuccNode = SuccNode->MSuccessors.front().lock(); + auto SuccNode = RootNode->MSuccessors.front().lock(); - ASSERT_EQ(SuccSuccNode->MPredecessors.size(), 2lu); - ASSERT_EQ(SuccSuccNode->MSuccessors.size(), 0lu); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Node3)); - ASSERT_EQ(GraphImpl->getEventForNode(SuccSuccNode), - sycl::detail::getSyclObjImpl(Node3)); + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else { + ASSERT_TRUE(false && "Unexpected root node"); + } } } @@ -487,7 +509,7 @@ TEST_F(CommandGraphTest, BarrierMixedQueueTypes) { auto Node2 = OutOfOrderQueue.submit( [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); - auto BarrierNode = InOrderQueue.ext_oneapi_submit_barrier(); + auto BarrierNode = InOrderQueue.ext_oneapi_submit_barrier({Node1, Node2}); auto Node3 = OutOfOrderQueue.submit([&](sycl::handler &cgh) { cgh.depends_on(Node2); @@ -500,8 +522,8 @@ TEST_F(CommandGraphTest, BarrierMixedQueueTypes) { // (1) (2) // \ /| // (B) | - // | / - // (3) + // | + // (3) auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); ASSERT_EQ(GraphImpl->MRoots.size(), 2lu); @@ -518,20 +540,21 @@ TEST_F(CommandGraphTest, BarrierMixedQueueTypes) { ASSERT_TRUE(false && "Unexpected root node"); } - auto SuccNode = RootNode->MSuccessors.front().lock(); - - ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), - sycl::detail::getSyclObjImpl(BarrierNode)); - ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); - ASSERT_EQ(SuccNode->MSuccessors.size(), 1lu); - - auto SuccSuccNode = SuccNode->MSuccessors.front().lock(); - - ASSERT_EQ(SuccSuccNode->MPredecessors.size(), 2lu); - ASSERT_EQ(SuccSuccNode->MSuccessors.size(), 0lu); - - ASSERT_EQ(GraphImpl->getEventForNode(SuccSuccNode), - sycl::detail::getSyclObjImpl(Node3)); + for (auto Succ : RootNode->MSuccessors) { + auto SuccNode = Succ.lock(); + + if (GraphImpl->getEventForNode(SuccNode) == + sycl::detail::getSyclObjImpl(BarrierNode)) { + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else if (GraphImpl->getEventForNode(SuccNode) == + sycl::detail::getSyclObjImpl(Node3)) { + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else { + ASSERT_TRUE(false && "Unexpected root node"); + } + } } } @@ -550,35 +573,370 @@ TEST_F(CommandGraphTest, BarrierBetweenExplicitNodes) { Graph.end_recording(); auto Node2 = Graph.add( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }, + {experimental::property::node::depends_on(Node1)}); + + // Check the graph structure + // (B) (1) + // | + // (2) + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + ASSERT_EQ(GraphImpl->MRoots.size(), 2lu); + + for (auto Root : GraphImpl->MRoots) { + auto RootNode = Root.lock(); + + if (GraphImpl->getEventForNode(RootNode) == + sycl::detail::getSyclObjImpl(BarrierNode)) { + ASSERT_EQ(RootNode->MSuccessors.size(), 0lu); + } else if (RootNode == sycl::detail::getSyclObjImpl(Node1)) { + 
ASSERT_EQ(RootNode->MSuccessors.size(), 1lu); + auto SuccNode = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(SuccNode, sycl::detail::getSyclObjImpl(Node2)); + } else { + ASSERT_TRUE(false); + } + } +} + +TEST_F(CommandGraphTest, BarrierMultipleOOOQueue) { + sycl::queue Queue2{Queue.get_context(), Dev}; + experimental::command_graph Graph{ + Queue}; + + Graph.begin_recording({Queue, Queue2}); + + auto Node1 = Queue.submit( [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2 = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node3 = Queue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node4 = Queue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto BarrierNode = Queue.ext_oneapi_submit_barrier(); + + auto Node5 = Queue2.submit([&](sycl::handler &cgh) { + cgh.depends_on({Node3, Node4}); + cgh.single_task>([]() {}); + }); + + auto Node6 = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + Graph.end_recording(); + // Check the graph structure - // (1) - // | - // (B) - // | - // (2) + // (1) (2) (3) (4) + // \ / \ / + // (B) (5) + // | + // (6) auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); - ASSERT_EQ(GraphImpl->MRoots.size(), 1lu); + ASSERT_EQ(GraphImpl->MRoots.size(), 4u); for (auto Root : GraphImpl->MRoots) { auto RootNode = Root.lock(); - auto Node1Impl = sycl::detail::getSyclObjImpl(Node1); - ASSERT_EQ(RootNode, Node1Impl); + auto RootNodeEvent = GraphImpl->getEventForNode(RootNode); + if ((RootNodeEvent == sycl::detail::getSyclObjImpl(Node1)) || + (RootNodeEvent == sycl::detail::getSyclObjImpl(Node2))) { - auto SuccNode = RootNode->MSuccessors.front().lock(); + auto SuccNode = RootNode->MSuccessors.front().lock(); - ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), - sycl::detail::getSyclObjImpl(BarrierNode)); - ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); - ASSERT_EQ(SuccNode->MSuccessors.size(), 1lu); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(BarrierNode)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 1lu); - auto SuccSuccNode = SuccNode->MSuccessors.front().lock(); + auto SuccSuccNode = SuccNode->MSuccessors.front().lock(); + + ASSERT_EQ(SuccSuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccSuccNode->MSuccessors.size(), 0lu); + + auto Node6Impl = sycl::detail::getSyclObjImpl(Node6); + ASSERT_EQ(GraphImpl->getEventForNode(SuccSuccNode), Node6Impl); + } else if ((RootNodeEvent == sycl::detail::getSyclObjImpl(Node3)) || + (RootNodeEvent == sycl::detail::getSyclObjImpl(Node4))) { + auto SuccNode = RootNode->MSuccessors.front().lock(); + + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Node5)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 2lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else { + ASSERT_TRUE(false); + } + } +} + +TEST_F(CommandGraphTest, BarrierMultipleInOrderQueue) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue1{Queue.get_context(), Dev, Properties}; + sycl::queue InOrderQueue2{Queue.get_context(), Dev, Properties}; + experimental::command_graph Graph{ + InOrderQueue1}; + + Graph.begin_recording({InOrderQueue1, InOrderQueue2}); + + auto Node1 = InOrderQueue1.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node2 = InOrderQueue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto BarrierNode = 
InOrderQueue1.ext_oneapi_submit_barrier(); - ASSERT_EQ(SuccSuccNode->MPredecessors.size(), 1lu); - ASSERT_EQ(SuccSuccNode->MSuccessors.size(), 0lu); + auto Node3 = InOrderQueue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + Graph.end_recording(); + + // Check the graph structure + // (1) (2) + // | | + // (B) (3) + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + ASSERT_EQ(GraphImpl->MRoots.size(), 2u); + + for (auto Root : GraphImpl->MRoots) { + auto RootNode = Root.lock(); + auto RootNodeEvent = GraphImpl->getEventForNode(RootNode); + if (RootNodeEvent == sycl::detail::getSyclObjImpl(Node1)) { + auto SuccNode = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(BarrierNode)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else if (RootNodeEvent == sycl::detail::getSyclObjImpl(Node2)) { + auto SuccNode = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Node3)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else { + ASSERT_TRUE(false); + } + } +} + +TEST_F(CommandGraphTest, BarrierMultipleMixedOrderQueues) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Queue.get_context(), Dev, Properties}; + experimental::command_graph Graph{ + Queue}; + + Graph.begin_recording({Queue, InOrderQueue}); + + auto Node1 = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto Node2 = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + auto BarrierNode = Queue.ext_oneapi_submit_barrier(); + + auto Node3 = InOrderQueue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + Graph.end_recording(); + + // Check the graph structure + // (1) (2) + // | | + // (B) (3) + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + ASSERT_EQ(GraphImpl->MRoots.size(), 2u); + + for (auto Root : GraphImpl->MRoots) { + auto RootNode = Root.lock(); + auto RootNodeEvent = GraphImpl->getEventForNode(RootNode); + if (RootNodeEvent == sycl::detail::getSyclObjImpl(Node1)) { + auto SuccNode = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(BarrierNode)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else if (RootNodeEvent == sycl::detail::getSyclObjImpl(Node2)) { + auto SuccNode = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Node3)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else { + ASSERT_TRUE(false); + } + } +} + +TEST_F(CommandGraphTest, BarrierMultipleQueuesMultipleBarriers) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue{Queue.get_context(), Dev, Properties}; + experimental::command_graph Graph{ + Queue}; + + Graph.begin_recording({Queue, InOrderQueue}); + + auto Barrier1 = Queue.ext_oneapi_submit_barrier(); + auto Barrier2 = InOrderQueue.ext_oneapi_submit_barrier(); + auto Barrier3 = InOrderQueue.ext_oneapi_submit_barrier(); + auto Barrier4 = Queue.ext_oneapi_submit_barrier(); + + Graph.end_recording(); + + // Check the graph structure + // (1) (2) + // | | + // (4) (3) + auto GraphImpl = 
sycl::detail::getSyclObjImpl(Graph); + ASSERT_EQ(GraphImpl->MRoots.size(), 2u); + + for (auto Root : GraphImpl->MRoots) { + auto RootNode = Root.lock(); + auto RootNodeEvent = GraphImpl->getEventForNode(RootNode); + if (RootNodeEvent == sycl::detail::getSyclObjImpl(Barrier1)) { + auto SuccNode = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Barrier4)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else if (RootNodeEvent == sycl::detail::getSyclObjImpl(Barrier2)) { + auto SuccNode = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(GraphImpl->getEventForNode(SuccNode), + sycl::detail::getSyclObjImpl(Barrier3)); + + ASSERT_EQ(SuccNode->MPredecessors.size(), 1lu); + ASSERT_EQ(SuccNode->MSuccessors.size(), 0lu); + } else { + ASSERT_TRUE(false); + } + } +} + +TEST_F(CommandGraphTest, BarrierWithInOrderCommands) { + sycl::property_list Properties{sycl::property::queue::in_order()}; + sycl::queue InOrderQueue1{Dev, Properties}; + sycl::queue InOrderQueue2{Dev, Properties}; + + Graph.begin_recording({InOrderQueue1, InOrderQueue2}); + auto Node1 = InOrderQueue1.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node2 = InOrderQueue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(); + + Graph.begin_recording({InOrderQueue1, InOrderQueue2}); + auto Barrier1 = InOrderQueue1.ext_oneapi_submit_barrier(); + auto Barrier2 = InOrderQueue2.ext_oneapi_submit_barrier(); + Graph.end_recording(); + + Graph.begin_recording({InOrderQueue1, InOrderQueue2}); + auto Node3 = InOrderQueue1.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node4 = InOrderQueue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(); + + Graph.begin_recording({InOrderQueue1, InOrderQueue2}); + auto Barrier3 = InOrderQueue1.ext_oneapi_submit_barrier(); + auto Barrier4 = InOrderQueue2.ext_oneapi_submit_barrier(); + Graph.end_recording(); + + Graph.begin_recording({InOrderQueue1, InOrderQueue2}); + auto Node5 = InOrderQueue1.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + auto Node6 = InOrderQueue2.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph.end_recording(); + + Graph.begin_recording({InOrderQueue1, InOrderQueue2}); + auto Barrier5 = InOrderQueue1.ext_oneapi_submit_barrier({Node5, Node6}); + Graph.end_recording(); + + // Check the graph structure + // (1) (2) + // | | + // (B1) (B2) + // | | + // (3) (4) + // | | + // (B3) (B4) + // | | + // (5) (6) + // \ / + // (B5) + auto GraphImpl = sycl::detail::getSyclObjImpl(Graph); + ASSERT_EQ(GraphImpl->MRoots.size(), 2lu); + + for (auto Root : GraphImpl->MRoots) { + auto RootNode = Root.lock(); + bool EvenPath; + + ASSERT_EQ(RootNode->MSuccessors.size(), 1lu); + if (GraphImpl->getEventForNode(RootNode) == + sycl::detail::getSyclObjImpl(Node2)) { + EvenPath = true; + } else if (GraphImpl->getEventForNode(RootNode) == + sycl::detail::getSyclObjImpl(Node1)) { + EvenPath = false; + } else { + ASSERT_TRUE(false); + } + + auto Succ1Node = RootNode->MSuccessors.front().lock(); + ASSERT_EQ(Succ1Node->MSuccessors.size(), 1lu); + if (EvenPath) { + ASSERT_EQ(GraphImpl->getEventForNode(Succ1Node), + sycl::detail::getSyclObjImpl(Barrier2)); + } else { + ASSERT_EQ(GraphImpl->getEventForNode(Succ1Node), + sycl::detail::getSyclObjImpl(Barrier1)); + } + + auto Succ2Node = Succ1Node->MSuccessors.front().lock(); +
ASSERT_EQ(Succ2Node->MSuccessors.size(), 1lu); + if (EvenPath) { + ASSERT_EQ(GraphImpl->getEventForNode(Succ2Node), + sycl::detail::getSyclObjImpl(Node4)); + } else { + ASSERT_EQ(GraphImpl->getEventForNode(Succ2Node), + sycl::detail::getSyclObjImpl(Node3)); + } + + auto Succ3Node = Succ2Node->MSuccessors.front().lock(); + ASSERT_EQ(Succ3Node->MSuccessors.size(), 1lu); + if (EvenPath) { + ASSERT_EQ(GraphImpl->getEventForNode(Succ3Node), + sycl::detail::getSyclObjImpl(Barrier4)); + } else { + ASSERT_EQ(GraphImpl->getEventForNode(Succ3Node), + sycl::detail::getSyclObjImpl(Barrier3)); + } + + auto Succ4Node = Succ3Node->MSuccessors.front().lock(); + ASSERT_EQ(Succ4Node->MSuccessors.size(), 1lu); + if (EvenPath) { + ASSERT_EQ(GraphImpl->getEventForNode(Succ4Node), + sycl::detail::getSyclObjImpl(Node6)); + } else { + ASSERT_EQ(GraphImpl->getEventForNode(Succ4Node), + sycl::detail::getSyclObjImpl(Node5)); + } - auto Node2Impl = sycl::detail::getSyclObjImpl(Node2); - ASSERT_EQ(SuccSuccNode, Node2Impl); + auto Succ5Node = Succ4Node->MSuccessors.front().lock(); + ASSERT_EQ(Succ5Node->MSuccessors.size(), 0lu); + ASSERT_EQ(Succ5Node->MPredecessors.size(), 2lu); + ASSERT_EQ(GraphImpl->getEventForNode(Succ5Node), + sycl::detail::getSyclObjImpl(Barrier5)); } } diff --git a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp index 28de1cf587504..04f306dc0cfb8 100644 --- a/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp +++ b/sycl/unittests/Extensions/CommandGraph/Exceptions.cpp @@ -237,6 +237,41 @@ TEST_F(CommandGraphTest, ExplicitBarrierException) { ASSERT_EQ(Success, false); } +TEST_F(CommandGraphTest, ExplicitBarrierDependencyException) { + + experimental::command_graph Graph2{ + Queue}; + + Graph2.begin_recording({Queue}); + + auto Node = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + Graph2.end_recording(); + + auto Event = Queue.submit( + [&](sycl::handler &cgh) { cgh.single_task>([]() {}); }); + + Graph.begin_recording(Queue); + + std::error_code ExceptionCode = make_error_code(sycl::errc::success); + try { + auto BarrierNode = Queue.ext_oneapi_submit_barrier({Node}); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + ExceptionCode = make_error_code(sycl::errc::success); + try { + auto BarrierNode = Queue.ext_oneapi_submit_barrier({Event}); + } catch (exception &Exception) { + ExceptionCode = Exception.code(); + } + ASSERT_EQ(ExceptionCode, sycl::errc::invalid); + + Graph2.end_recording(); +} + TEST_F(CommandGraphTest, FusionExtensionExceptionCheck) { device D; if (!D.get_info< diff --git a/sycl/unittests/buffer/BufferReleaseBase.hpp b/sycl/unittests/buffer/BufferReleaseBase.hpp index b35d73cb3909c..bfcc4fb8369ed 100644 --- a/sycl/unittests/buffer/BufferReleaseBase.hpp +++ b/sycl/unittests/buffer/BufferReleaseBase.hpp @@ -43,10 +43,6 @@ class BufferDestructionCheckCommon : public ::testing::Test { protected: void SetUp() override { - if (Plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - GTEST_SKIP(); - } MockSchedulerPtr = new MockScheduler(); sycl::detail::GlobalHandler::instance().attachScheduler( dynamic_cast(MockSchedulerPtr)); diff --git a/sycl/unittests/pi/PiMock.cpp b/sycl/unittests/pi/PiMock.cpp index c7014162f9cf8..02044d9631376 100644 --- a/sycl/unittests/pi/PiMock.cpp +++ b/sycl/unittests/pi/PiMock.cpp @@ -56,10 +56,6 @@ TEST(PiMockTest, ConstructFromQueue) { sycl::unittest::PiMock Mock; queue 
MockQ{Mock.getPlatform().get_devices()[0]}; queue NormalQ; - if (NormalQ.is_host()) { - std::cerr << "Not run due to host-only environment\n"; - return; - } const auto &NormalPiPlugin = detail::getSyclObjImpl(NormalQ)->getPlugin()->getPiPlugin(); diff --git a/sycl/unittests/scheduler/AllocaLinking.cpp b/sycl/unittests/scheduler/AllocaLinking.cpp index dfb51edcaf13e..9f7da55f7d8b7 100644 --- a/sycl/unittests/scheduler/AllocaLinking.cpp +++ b/sycl/unittests/scheduler/AllocaLinking.cpp @@ -47,13 +47,6 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device, TEST_F(SchedulerTest, AllocaLinking) { HostUnifiedMemory = false; - // This host device constructor should be placed before Mock.redefine - // because it overrides the real implementation of get_device_info - // which is needed when creating a host device. - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; sycl::unittest::PiMock Mock; sycl::queue Q{Mock.getPlatform().get_devices()[0]}; @@ -72,7 +65,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_FALSE(HostAllocaCmd->MLinkedAllocaCmd); EXPECT_FALSE(NonHostAllocaCmd->MLinkedAllocaCmd); @@ -88,7 +81,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(HostAllocaCmd->MLinkedAllocaCmd, NonHostAllocaCmd); EXPECT_EQ(NonHostAllocaCmd->MLinkedAllocaCmd, HostAllocaCmd); @@ -104,7 +97,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(HostAllocaCmd->MLinkedAllocaCmd, NonHostAllocaCmd); EXPECT_EQ(NonHostAllocaCmd->MLinkedAllocaCmd, HostAllocaCmd); diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp index d893c33f5cc26..daf8599947ad2 100644 --- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp +++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp @@ -163,7 +163,7 @@ TEST_F(SchedulerTest, StreamAUXCmdsWait) { auto EventImplProxy = std::static_pointer_cast(EventImpl); - ASSERT_TRUE(EventImplProxy->MPostCompleteEvents.size() == 1) + ASSERT_EQ(EventImplProxy->MPostCompleteEvents.size(), 1u) << "Expected 1 post complete event"; Q.wait(); @@ -219,13 +219,7 @@ TEST_F(SchedulerTest, CommandsWaitForEvents) { std::shared_ptr E2( new detail::event_impl(TestContext->EventCtx2, Q2.get_context())); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue(new detail::queue_impl( - detail::getSyclObjImpl(HostDevice), /*AsyncHandler=*/{}, - /*PropList=*/{})); - - MockCommand Cmd(DefaultHostQueue); + MockCommand Cmd(nullptr); std::vector> Events; Events.push_back(E1); @@ -233,7 +227,7 
@@ TEST_F(SchedulerTest, CommandsWaitForEvents) { pi_event EventResult = nullptr; - Cmd.waitForEventsCall(DefaultHostQueue, Events, EventResult); + Cmd.waitForEventsCall(nullptr, Events, EventResult); ASSERT_TRUE(TestContext->EventCtx1WasWaited && TestContext->EventCtx2WasWaited) diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index 2e54057e434d6..31d4e92bf89a8 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -26,10 +26,6 @@ constexpr auto DisableCleanupName = "SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP"; std::vector> PassedNumEvents; bool CheckTestExecutionRequirements(const platform &plt) { - if (plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - return false; - } // This test only contains device image for SPIR-V capable devices. if (plt.get_backend() != sycl::backend::opencl && plt.get_backend() != sycl::backend::ext_oneapi_level_zero) { @@ -82,10 +78,10 @@ class DependsOnTests : public ::testing::Test { std::unique_ptr CmdGroup = MockCGH.finalize(); - detail::Command *NewCmd = MS.addCG( - std::move(CmdGroup), - Type == TestCGType::HOST_TASK ? MS.getDefaultHostQueue() : QueueDevImpl, - ToEnqueue, /*EventNeeded=*/true); + detail::Command *NewCmd = + MS.addCG(std::move(CmdGroup), + Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, + ToEnqueue, /*EventNeeded=*/true); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; } @@ -174,7 +170,6 @@ TEST_F(DependsOnTests, DISABLED_EnqueueNoMemObjTwoHostTasks) { TEST_F(DependsOnTests, EnqueueNoMemObjTwoHostTasks) { #endif // Checks enqueue of two dependent host tasks - detail::QueueImplPtr QueueHostImpl = MS.getDefaultHostQueue(); std::vector Events; detail::Command *Cmd1 = diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp index 8206728b2b221..94be957e03051 100644 --- a/sycl/unittests/scheduler/GraphCleanup.cpp +++ b/sycl/unittests/scheduler/GraphCleanup.cpp @@ -172,7 +172,7 @@ static void checkCleanupOnEnqueue(MockScheduler &MS, } static void checkCleanupOnLeafUpdate( - MockScheduler &MS, detail::QueueImplPtr &QueueImpl, buffer &Buf, + MockScheduler &MS, detail::QueueImplPtr QueueImpl, buffer &Buf, detail::Requirement &MockReq, std::function SchedulerCall) { bool CommandDeleted = false; @@ -246,15 +246,11 @@ TEST_F(SchedulerTest, PostEnqueueCleanup) { checkCleanupOnLeafUpdate( MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { detail::Command *Leaf = *Record->MWriteLeaves.begin(); - MS.addEmptyCmd(Leaf, {&MockReq}, QueueImpl, - detail::Command::BlockReason::HostTask, ToEnqueue); + MS.addEmptyCmd(Leaf, {&MockReq}, detail::Command::BlockReason::HostTask, + ToEnqueue); }); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - detail::QueueImplPtr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; checkCleanupOnLeafUpdate( - MS, DefaultHostQueue, Buf, MockReq, [&](detail::MemObjRecord *Record) { + MS, nullptr, Buf, MockReq, [&](detail::MemObjRecord *Record) { MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue); }); // Check cleanup on exceeding leaf limit. 
diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index 049131d661779..9ce9a1f944349 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -77,11 +77,6 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { sycl::detail::QueueImplPtr InOrderQueueImpl = detail::getSyclObjImpl(InOrderQueue); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; - MockScheduler MS; int val; @@ -92,18 +87,17 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { MS.getOrInsertMemObjRecord(InOrderQueueImpl, &Req); std::vector AuxCmds; MS.getOrCreateAllocaForReq(Record, &Req, InOrderQueueImpl, AuxCmds); - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); // Check that sequential memory movements submitted to the same in-order // queue do not depend on each other. - detail::Command *Cmd = - MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); + detail::Command *Cmd = MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); detail::EnqueueResultT Res; auto ReadLock = MS.acquireGraphReadLock(); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); Cmd = MS.insertMemoryMove(Record, &Req, InOrderQueueImpl, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); - Cmd = MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); + Cmd = MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); } diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 8693ff5e4c52b..929f8735bc85f 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -130,10 +130,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { customextUSMEnqueueMemset); sycl::platform Plt = Mock.getPlatform(); - if (Plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - GTEST_SKIP(); - } context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; diff --git a/sycl/unittests/scheduler/KernelFusion.cpp b/sycl/unittests/scheduler/KernelFusion.cpp index 1db16cbda1493..5503749ea71d7 100644 --- a/sycl/unittests/scheduler/KernelFusion.cpp +++ b/sycl/unittests/scheduler/KernelFusion.cpp @@ -44,10 +44,6 @@ detail::Command *CreateTaskCommand(MockScheduler &MS, } bool CheckTestExecRequirements(const platform &plt) { - if (plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - return false; - } // This test only contains device image for SPIR-V capable devices. 
if (plt.get_backend() != sycl::backend::opencl && plt.get_backend() != sycl::backend::ext_oneapi_level_zero) { diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp index a2533cceda138..b8aadcfb258f0 100644 --- a/sycl/unittests/scheduler/LeafLimit.cpp +++ b/sycl/unittests/scheduler/LeafLimit.cpp @@ -36,8 +36,6 @@ TEST_F(SchedulerTest, LeafLimit) { unittest::ScopedEnvVar DisabledCleanup{ DisableCleanupName, "1", detail::SYCLConfig::reset}; - sycl::queue HQueue(detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl())); MockScheduler MS; std::vector> LeavesToAdd; std::unique_ptr MockDepCmd; diff --git a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp index 61e3de6671fb1..565c3b2a2314c 100644 --- a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp +++ b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp @@ -60,8 +60,8 @@ TEST_F(SchedulerTest, LeafLimitDiffContexts) { std::vector ToEnqueue; AllocaCmd = MS.getOrCreateAllocaForReq( Rec, &MockReq, detail::getSyclObjImpl(Queue), ToEnqueue); - std::ignore = MS.getOrCreateAllocaForReq( - Rec, &MockReq, MS.getDefaultHostQueue(), ToEnqueue); + std::ignore = + MS.getOrCreateAllocaForReq(Rec, &MockReq, nullptr, ToEnqueue); DepCmd = std::make_unique(detail::getSyclObjImpl(Queue), MockReq); } diff --git a/sycl/unittests/scheduler/LeavesCollection.cpp b/sycl/unittests/scheduler/LeavesCollection.cpp index ea883041add66..e0732926537b0 100644 --- a/sycl/unittests/scheduler/LeavesCollection.cpp +++ b/sycl/unittests/scheduler/LeavesCollection.cpp @@ -36,10 +36,8 @@ createGenericCommand(const std::shared_ptr &Q) { return std::shared_ptr{new MockCommand(Q, Command::RUN_CG)}; } -std::shared_ptr -createEmptyCommand(const std::shared_ptr &Q, - const Requirement &Req) { - EmptyCommand *Cmd = new EmptyCommand(Q); +std::shared_ptr createEmptyCommand(const Requirement &Req) { + EmptyCommand *Cmd = new EmptyCommand(); Cmd->addRequirement(/* DepCmd = */ nullptr, /* AllocaCmd = */ nullptr, &Req); Cmd->MBlockReason = Command::BlockReason::HostAccessor; return std::shared_ptr{Cmd}; @@ -97,7 +95,7 @@ TEST_F(LeavesCollectionTest, PushBack) { for (size_t Idx = 0; Idx < GenericCmdsCapacity * 4; ++Idx) { auto Cmd = Idx % 2 ? createGenericCommand(getSyclObjImpl(Q)) - : createEmptyCommand(getSyclObjImpl(Q), MockReq); + : createEmptyCommand(MockReq); Cmds.push_back(Cmd); LE.push_back(Cmds.back().get(), ToEnqueue); @@ -137,7 +135,7 @@ TEST_F(LeavesCollectionTest, Remove) { for (size_t Idx = 0; Idx < GenericCmdsCapacity * 4; ++Idx) { auto Cmd = Idx % 2 ? 
createGenericCommand(getSyclObjImpl(Q)) - : createEmptyCommand(getSyclObjImpl(Q), MockReq); + : createEmptyCommand(MockReq); Cmds.push_back(Cmd); if (LE.push_back(Cmds.back().get(), ToEnqueue)) diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp index 5ab9cfbb43f5a..b08b211d1e2dc 100644 --- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp +++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp @@ -64,28 +64,21 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { sycl::queue Queue1{Dev}; sycl::detail::QueueImplPtr Q1 = sycl::detail::getSyclObjImpl(Queue1); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue(new detail::queue_impl( - detail::getSyclObjImpl(HostDevice), /*AsyncHandler=*/{}, - /*PropList=*/{})); - auto AllocaDep = [](sycl::detail::Command *, sycl::detail::Command *, sycl::detail::MemObjRecord *, std::vector &) {}; std::shared_ptr Record{ - new sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(), 10, - AllocaDep)}; + new sycl::detail::MemObjRecord(nullptr, 10, AllocaDep)}; MemObjMock MemObj(Record); Req.MSYCLMemObj = &MemObj; - sycl::detail::AllocaCommand AllocaCmd1(DefaultHostQueue, Req, false); + sycl::detail::AllocaCommand AllocaCmd1(nullptr, Req, false); Record->MAllocaCommands.push_back(&AllocaCmd1); - MockCommand DepCmd(DefaultHostQueue, Req); - MockCommand DepDepCmd(DefaultHostQueue, Req); + MockCommand DepCmd(nullptr, Req); + MockCommand DepDepCmd(nullptr, Req); DepCmd.MDeps.push_back({&DepDepCmd, DepDepCmd.getRequirement(), &AllocaCmd1}); DepDepCmd.MUsers.insert(&DepCmd); std::vector ToEnqueue; diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp index d52a257f3603b..24a19977844fb 100644 --- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp +++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp @@ -91,11 +91,6 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { redefinedMemCreateWithNativeHandle); sycl::detail::QueueImplPtr QImpl = detail::getSyclObjImpl(Q); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; - MockScheduler MS; // Check non-host alloca with non-discard access mode { @@ -112,10 +107,10 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // order to perform a memory move. EXPECT_EQ(Record->MAllocaCommands.size(), 2U); detail::AllocaCommandBase *HostAllocaCmd = Record->MAllocaCommands[0]; - EXPECT_TRUE(HostAllocaCmd->getQueue()->is_host()); + EXPECT_TRUE(HostAllocaCmd->getQueue() == nullptr); EXPECT_TRUE(!HostAllocaCmd->MLinkedAllocaCmd); EXPECT_TRUE(!NonHostAllocaCmd->MLinkedAllocaCmd); - EXPECT_TRUE(Record->MCurContext->is_host()); + EXPECT_TRUE(Record->MCurContext == nullptr); detail::Command *MemoryMove = MS.insertMemoryMove(Record, &Req, QImpl, AuxCmds); @@ -157,11 +152,10 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // No special handling required: alloca commands are created one after // another and the transfer is done via a write operation. 
- detail::MemObjRecord *Record = - MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req); + detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(nullptr, &Req); std::vector AuxCmds; detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(Record->MAllocaCommands.size(), 1U); detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); @@ -186,14 +180,14 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req); std::vector AuxCmds; MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); // Memory movement operations should be omitted for discard access modes. detail::Command *MemoryMove = - MS.insertMemoryMove(Record, &DiscardReq, DefaultHostQueue, AuxCmds); + MS.insertMemoryMove(Record, &DiscardReq, nullptr, AuxCmds); EXPECT_TRUE(MemoryMove == nullptr); // The current context for the record should still be modified. - EXPECT_EQ(Record->MCurContext, DefaultHostQueue->getContextImplPtr()); + EXPECT_EQ(Record->MCurContext, nullptr); } // Check that interoperability memory objects are initialized. { diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index c27e4d672e0fa..4f1413fcd75a0 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -122,21 +122,15 @@ TEST_F(SchedulerTest, QueueFlushing) { QueueImplA}; testCommandEnqueue(&UnmapCmd, QueueImplB, MockReq); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - detail::QueueImplPtr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; detail::AllocaCommand HostAllocaCmd = - detail::AllocaCommand(DefaultHostQueue, MockReq); + detail::AllocaCommand(nullptr, MockReq); - detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, - MockReq, &HostAllocaCmd, - QueueImplA, DefaultHostQueue}; + detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, MockReq, + &HostAllocaCmd, QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmd, QueueImplB, MockReq); - detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, - MockReq, &MockHostPtr, - QueueImplA, DefaultHostQueue}; + detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, MockReq, + &MockHostPtr, QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmdHost, QueueImplB, MockReq); std::unique_ptr CG{ diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index b1c667c5c40ca..4974501a66581 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -188,10 +188,9 @@ class MockScheduler : public sycl::detail::Scheduler { sycl::detail::EmptyCommand * addEmptyCmd(sycl::detail::Command *Cmd, const std::vector &Reqs, - const sycl::detail::QueueImplPtr &Queue, sycl::detail::Command::BlockReason Reason, std::vector &ToEnqueue) { - return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Queue, Reason, ToEnqueue); + return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Reason, ToEnqueue); } sycl::detail::Command *addCG(std::unique_ptr CommandGroup, diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 
7e76027c05431..af11192c31851 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -11,6 +11,7 @@ #include #include +#include #include using namespace sycl; @@ -46,92 +47,3 @@ class MockHandlerStreamInit : public MockHandler { return CommandGroup; } }; - -using CmdTypeTy = sycl::detail::Command::CommandType; - -// Function recursively checks that initial command has dependency on chain of -// other commands that should have type DepCmdsTypes[Depth] (Depth is a distance -// - 1 in a command dependencies tree from initial command to a currently -// checked one) and requirement on memory object of stream's flush buffer. -static bool ValidateDepCommandsTree(const detail::Command *Cmd, - const std::vector &DepCmdsTypes, - const detail::SYCLMemObjI *MemObj, - size_t Depth = 0) { - if (!Cmd || Depth >= DepCmdsTypes.size()) - throw sycl::runtime_error("Command parameters are invalid", - PI_ERROR_INVALID_VALUE); - - for (const detail::DepDesc &Dep : Cmd->MDeps) { - if (Dep.MDepCommand && - (Dep.MDepCommand->getType() == DepCmdsTypes[Depth]) && - Dep.MDepRequirement && (Dep.MDepRequirement->MSYCLMemObj == MemObj) && - ((Depth == DepCmdsTypes.size() - 1) || - ValidateDepCommandsTree(Dep.MDepCommand, DepCmdsTypes, MemObj, - Depth + 1))) { - return true; - } - } - - return false; -} - -TEST_F(SchedulerTest, StreamInitDependencyOnHost) { - // Disable post enqueue cleanup so that it doesn't interfere with dependency - // checks. - unittest::ScopedEnvVar DisabledCleanup{ - DisableCleanupName, "1", - detail::SYCLConfig::reset}; - std::shared_ptr HQueueImpl(new detail::queue_impl( - detail::device_impl::getHostDeviceImpl(), /*AsyncHandler=*/{}, - /*PropList=*/{})); - - // Emulating processing of command group function - MockHandlerStreamInit MockCGH(HQueueImpl, true, /*CallerNeedsEvent=*/true); - MockCGH.setType(detail::CG::Kernel); - - auto EmptyKernel = [](sycl::nd_item<1>) {}; - MockCGH - .setHostKernel, 1, class Empty>( - EmptyKernel); - MockCGH.setNDRangeDesc( - sycl::nd_range<1>{sycl::range<1>{1}, sycl::range<1>{1}}); - - // Emulating construction of stream object inside command group - detail::StreamImplPtr StreamImpl = - std::make_shared(1024, 200, property_list{}); - detail::GlobalBufAccessorT FlushBufAcc = - StreamImpl->accessGlobalFlushBuf(MockCGH); - MockCGH.addStream(StreamImpl); - - detail::SYCLMemObjI *FlushBufMemObjPtr = - detail::getSyclObjImpl(FlushBufAcc)->MSYCLMemObj; - ASSERT_TRUE(!!FlushBufMemObjPtr) - << "Memory object for stream flush buffer not initialized"; - - std::unique_ptr MainCG = MockCGH.finalize(); - - // Emulate call of Scheduler::addCG - std::vector Streams = - static_cast(MainCG.get())->getStreams(); - ASSERT_EQ(Streams.size(), 1u) << "Invalid number of stream objects"; - - Streams[0]->initStreamHost(HQueueImpl); - - MockScheduler MS; - std::vector AuxCmds; - detail::Command *NewCmd = - MS.addCG(std::move(MainCG), HQueueImpl, AuxCmds, /*EventNeeded=*/true); - ASSERT_TRUE(!!NewCmd) << "Failed to add command group into scheduler"; - ASSERT_GT(NewCmd->MDeps.size(), 0u) - << "No deps appeared in the new exec kernel command"; - - // Searching in dependencies for CG execution command that initializes flush - // buffer of a stream that is supposed to be used inside NewCmd's CG. 
- // Tree of dependencies should look like: - // [MAIN_CG] -> [EMPTY_NODE {FlushBufMemObj}] -> [FILL_CG {FlushBufMemObj}] -> - // [[ALLOC_TASK {FlushBufMemObj}] - std::vector DepCmdsTypes({CmdTypeTy::RUN_CG, // FILL_CG - CmdTypeTy::ALLOCA}); - ASSERT_TRUE(ValidateDepCommandsTree(NewCmd, DepCmdsTypes, FlushBufMemObjPtr)) - << "Dependency on stream flush buffer initialization not found"; -}