diff --git a/.github/workflows/sycl_aws.yml b/.github/workflows/sycl-aws.yml similarity index 100% rename from .github/workflows/sycl_aws.yml rename to .github/workflows/sycl-aws.yml diff --git a/.github/workflows/sycl_containers.yaml b/.github/workflows/sycl-containers.yaml similarity index 98% rename from .github/workflows/sycl_containers.yaml rename to .github/workflows/sycl-containers.yaml index 011d45ee9fee5..bb2e086929847 100644 --- a/.github/workflows/sycl_containers.yaml +++ b/.github/workflows/sycl-containers.yaml @@ -12,14 +12,14 @@ on: - 'devops/dependencies.json' - 'devops/scripts/install_drivers.sh' - 'devops/scripts/install_build_tools.sh' - - '.github/workflows/sycl_containers.yaml' + - '.github/workflows/sycl-containers.yaml' pull_request: paths: - 'devops/containers/**' - 'devops/dependencies.json' - 'devops/scripts/install_drivers.sh' - 'devops/scripts/install_build_tools.sh' - - '.github/workflows/sycl_containers.yaml' + - '.github/workflows/sycl-containers.yaml' jobs: base_image_ubuntu2204: diff --git a/.github/workflows/sycl_detect_changes.yml b/.github/workflows/sycl-detect-changes.yml similarity index 100% rename from .github/workflows/sycl_detect_changes.yml rename to .github/workflows/sycl-detect-changes.yml diff --git a/.github/workflows/gh_pages.yml b/.github/workflows/sycl-docs.yml similarity index 97% rename from .github/workflows/gh_pages.yml rename to .github/workflows/sycl-docs.yml index d0d4a130d6f78..73062642535cf 100644 --- a/.github/workflows/gh_pages.yml +++ b/.github/workflows/sycl-docs.yml @@ -7,7 +7,7 @@ on: branches: - sycl paths: - - '.github/workflows/gh_pages.yml' + - '.github/workflows/sycl-docs.yml' - 'clang/docs/**' - 'sycl/doc/**' diff --git a/.github/workflows/sycl_linux_build.yml b/.github/workflows/sycl-linux-build.yml similarity index 100% rename from .github/workflows/sycl_linux_build.yml rename to .github/workflows/sycl-linux-build.yml diff --git a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml similarity index 94% rename from .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml rename to .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml index 50dc1ca700035..06a5f13674768 100644 --- a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml +++ b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml @@ -48,7 +48,7 @@ jobs: image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. 
runner }} @@ -62,7 +62,7 @@ jobs: aws_start: name: AWS Start - uses: ./.github/workflows/sycl_aws.yml + uses: ./.github/workflows/sycl-aws.yml secrets: inherit with: mode: start @@ -71,7 +71,7 @@ jobs: linux_e2e_on_nightly_aws: name: '[AWS][CUDA] E2E on Nightly' needs: [aws_start] - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: CUDA runner: '["aws-cuda_${{ github.run_id }}-${{ github.run_attempt }}"]' @@ -86,7 +86,7 @@ jobs: name: AWS Stop needs: [aws_start, linux_e2e_on_nightly_aws] if: always() - uses: ./.github/workflows/sycl_aws.yml + uses: ./.github/workflows/sycl-aws.yml secrets: inherit with: mode: stop diff --git a/.github/workflows/sycl_linux_precommit_aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml similarity index 98% rename from .github/workflows/sycl_linux_precommit_aws.yml rename to .github/workflows/sycl-linux-precommit-aws.yml index 3fce40107d90e..8ff68e725e447 100644 --- a/.github/workflows/sycl_linux_precommit_aws.yml +++ b/.github/workflows/sycl-linux-precommit-aws.yml @@ -54,7 +54,7 @@ jobs: e2e-cuda: needs: [aws-start] - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: CUDA E2E runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]' diff --git a/.github/workflows/sycl_linux_precommit.yml b/.github/workflows/sycl-linux-precommit.yml similarity index 94% rename from .github/workflows/sycl_linux_precommit.yml rename to .github/workflows/sycl-linux-precommit.yml index 8df5237d63d04..eb3f30aebcfe6 100644 --- a/.github/workflows/sycl_linux_precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -18,8 +18,8 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - - '.github/workflows/sycl_windows_*.yml' - - '.github/workflows/sycl_macos_*.yml' + - '.github/workflows/sycl-windows-*.yml' + - '.github/workflows/sycl-macos-*.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' @@ -30,12 +30,12 @@ concurrency: jobs: detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml + uses: ./.github/workflows/sycl-detect-changes.yml build: needs: [detect_changes] if: always() && success() - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml with: build_ref: ${{ github.sha }} merge_ref: '' @@ -75,7 +75,7 @@ jobs: install_drivers: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} extra_lit_opts: --param matrix-xmx8=True --param gpu-intel-dg2=True env: '{"LIT_FILTER":"Matrix/"}' - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} @@ -111,7 +111,7 @@ jobs: runner: '["Linux", "gen12"]' - name: Perf tests on Intel Arc A-Series Graphics system runner: '["Linux", "arc"]' - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. 
runner }} diff --git a/.github/workflows/sycl_linux_run_tests.yml b/.github/workflows/sycl-linux-run-tests.yml similarity index 100% rename from .github/workflows/sycl_linux_run_tests.yml rename to .github/workflows/sycl-linux-run-tests.yml diff --git a/.github/workflows/sycl_macos_build_and_test.yml b/.github/workflows/sycl-macos-build-and-test.yml similarity index 100% rename from .github/workflows/sycl_macos_build_and_test.yml rename to .github/workflows/sycl-macos-build-and-test.yml diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl-nightly.yml similarity index 96% rename from .github/workflows/sycl_nightly.yml rename to .github/workflows/sycl-nightly.yml index e802af89a29b4..28df4a8b8de13 100644 --- a/.github/workflows/sycl_nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -8,7 +8,7 @@ on: jobs: ubuntu2204_build: if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml secrets: inherit with: build_cache_root: "/__w/" @@ -59,7 +59,7 @@ jobs: image: ghcr.io/intel/llvm/ubuntu2204_build:latest image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} @@ -74,7 +74,7 @@ jobs: sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} build-win: - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml if: github.repository == 'intel/llvm' with: retention-days: 90 @@ -89,7 +89,7 @@ jobs: always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl-post-commit.yml similarity index 88% rename from .github/workflows/sycl_post_commit.yml rename to .github/workflows/sycl-post-commit.yml index 76dd3ac88e3d2..00205a52b03e1 100644 --- a/.github/workflows/sycl_post_commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -12,10 +12,10 @@ on: - sycl - sycl-devops-pr/** paths: - - .github/workflows/sycl_post_commit.yml - - .github/workflows/sycl_linux_build.yml - - .github/workflows/sycl_linux_run_tests.yml - - .github/workflows/sycl_macos_build_and_test.yml + - .github/workflows/sycl-post-commit.yml + - .github/workflows/sycl-linux-build.yml + - .github/workflows/sycl-linux-run-tests.yml + - .github/workflows/sycl-macos-build-and-test.yml - ./devops/actions/cleanup - ./devops/actions/cached_checkout @@ -23,7 +23,7 @@ jobs: build-lin: name: Linux (Self build + shared libraries + no-assertions) if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml with: build_cache_root: "/__w/llvm" build_cache_suffix: sprod_shared @@ -63,7 +63,7 @@ jobs: env: '{"LIT_FILTER":"PerformanceTests/"}' extra_lit_opts: -a -j 1 --param enable-perf-tests=True target_devices: all - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. 
runner }} @@ -87,7 +87,7 @@ jobs: always() && success() && github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml e2e-win: needs: build-win @@ -96,7 +96,7 @@ jobs: always() && !cancelled() && needs.build-win.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' @@ -105,4 +105,4 @@ jobs: macos_default: name: macOS if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_macos_build_and_test.yml + uses: ./.github/workflows/sycl-macos-build-and-test.yml diff --git a/.github/workflows/sycl_stale_issues.yml b/.github/workflows/sycl-stale-issues.yml similarity index 100% rename from .github/workflows/sycl_stale_issues.yml rename to .github/workflows/sycl-stale-issues.yml diff --git a/.github/workflows/sync-main.yml b/.github/workflows/sycl-sync-main.yml similarity index 100% rename from .github/workflows/sync-main.yml rename to .github/workflows/sycl-sync-main.yml diff --git a/.github/workflows/sycl_update_gpu_driver.yml b/.github/workflows/sycl-update-gpu-driver.yml similarity index 100% rename from .github/workflows/sycl_update_gpu_driver.yml rename to .github/workflows/sycl-update-gpu-driver.yml diff --git a/.github/workflows/sycl_windows_build.yml b/.github/workflows/sycl-windows-build.yml similarity index 100% rename from .github/workflows/sycl_windows_build.yml rename to .github/workflows/sycl-windows-build.yml diff --git a/.github/workflows/sycl_windows_precommit.yml b/.github/workflows/sycl-windows-precommit.yml similarity index 80% rename from .github/workflows/sycl_windows_precommit.yml rename to .github/workflows/sycl-windows-precommit.yml index 2b7883895d97c..3b96b463560a9 100644 --- a/.github/workflows/sycl_windows_precommit.yml +++ b/.github/workflows/sycl-windows-precommit.yml @@ -16,9 +16,9 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - - '.github/workflows/sycl_linux_*.yml' - - '.github/workflows/sycl_precommit_aws.yml' - - '.github/workflows/sycl_macos_*.yml' + - '.github/workflows/sycl-linux-*.yml' + - '.github/workflows/sycl-precommit-aws.yml' + - '.github/workflows/sycl-macos-*.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' @@ -32,14 +32,14 @@ concurrency: jobs: detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml + uses: ./.github/workflows/sycl-detect-changes.yml build: needs: [detect_changes] if: | always() && success() && github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml with: changes: ${{ needs.detect_changes.outputs.filters }} @@ -50,7 +50,7 @@ jobs: always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' diff --git a/.github/workflows/sycl_windows_run_tests.yml b/.github/workflows/sycl-windows-run-tests.yml similarity index 100% rename from .github/workflows/sycl_windows_run_tests.yml rename to .github/workflows/sycl-windows-run-tests.yml diff --git a/README.md b/README.md index bf72abab3e686..8ab5cda1621bb 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ For general contribution process see [CONTRIBUTING.md](./CONTRIBUTING.md) [![oneAPI 
logo](https://spec.oneapi.io/oneapi-logo-white-scaled.jpg)](https://www.oneapi.io/) -[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml) -[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/gh_pages.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/gh_pages.yml) +[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml) +[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml) The DPC++ is a LLVM-based compiler project that implements compiler and runtime support for the SYCL\* language. The project is hosted in the diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2d0f05f6ca35b..e32047292ab8a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8404,6 +8404,9 @@ def : CLFlag<"O1">, Alias<_SLASH_O>, AliasArgs<["1"]>, HelpText<"Optimize for size (like /Og /Os /Oy /Ob2 /GF /Gy)">; def : CLFlag<"O2">, Alias<_SLASH_O>, AliasArgs<["2"]>, HelpText<"Optimize for speed (like /Og /Oi /Ot /Oy /Ob2 /GF /Gy)">; +def : CLFlag<"O3">, Alias<_SLASH_O>, AliasArgs<["3"]>, + HelpText<"Optimize for maximum speed and enable more aggressive optimizations" + " that may not improve performance on some programs">; def : CLFlag<"Ob0">, Alias<_SLASH_O>, AliasArgs<["b0"]>, HelpText<"Disable function inlining">; def : CLFlag<"Ob1">, Alias<_SLASH_O>, AliasArgs<["b1"]>, diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ca1752b24b062..0a90073cf50d0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -23271,21 +23271,11 @@ static bool hasFuncNameRequestedFPAccuracy(StringRef Name, return (FuncMapIt != LangOpts.FPAccuracyFuncMap.end()); } -llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( +llvm::CallInst *CodeGenFunction::MaybeEmitFPBuiltinofFD( llvm::FunctionType *IRFuncTy, const SmallVectorImpl &IRArgs, - llvm::Value *FnPtr, const FunctionDecl *FD) { - llvm::Function *Func; + llvm::Value *FnPtr, StringRef Name, unsigned FDBuiltinID) { unsigned FPAccuracyIntrinsicID = 0; - StringRef Name; - if (CurrentBuiltinID == 0) { - // Even if the current function doesn't have a clang builtin, create - // an 'fpbuiltin-max-error' attribute for it; unless it's marked with - // an NoBuiltin attribute. - if (FD->hasAttr() || - !FD->getNameInfo().getName().isIdentifier()) - return nullptr; - - Name = FD->getName(); + if (FDBuiltinID == 0) { FPAccuracyIntrinsicID = llvm::StringSwitch(Name) .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd) @@ -23300,9 +23290,7 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( } else { // The function has a clang builtin. Create an attribute for it // only if it has an fpbuiltin intrinsic. 
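Note on the CGBuiltin.cpp hunk above: when the callee carries no Clang builtin ID, the renamed MaybeEmitFPBuiltinofFD maps the bare function name to an fpbuiltin intrinsic ID. A minimal sketch of that lookup, simplified from the patch (the helper name is illustrative only, not code from this change):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Intrinsics.h"

// Illustrative helper, not part of the patch: a result of 0 means "no
// fpbuiltin mapping", in which case the call is emitted normally and gets
// no 'fpbuiltin-max-error' attribute.
static unsigned mapNameToFPBuiltinIntrinsic(llvm::StringRef Name) {
  return llvm::StringSwitch<unsigned>(Name)
      .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd)
      .Case("fdiv", llvm::Intrinsic::fpbuiltin_fdiv)
      .Case("cos", llvm::Intrinsic::fpbuiltin_cos)
      .Case("exp", llvm::Intrinsic::fpbuiltin_exp)
      .Default(0);
}
```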
- unsigned BuiltinID = getCurrentBuiltinID(); - Name = CGM.getContext().BuiltinInfo.getName(BuiltinID); - switch (BuiltinID) { + switch (FDBuiltinID) { default: // If the function has a clang builtin but doesn't have an // fpbuiltin, it will be generated with no 'fpbuiltin-max-error' @@ -23384,7 +23372,8 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( const LangOptions &LangOpts = getLangOpts(); if (hasFuncNameRequestedFPAccuracy(Name, LangOpts) || !LangOpts.FPAccuracyVal.empty()) { - Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); + llvm::Function *Func = + CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs), FPAccuracyIntrinsicID); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index d55023875f2fc..442059fb03789 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5707,8 +5707,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (!getLangOpts().FPAccuracyFuncMap.empty() || !getLangOpts().FPAccuracyVal.empty()) { const auto *FD = dyn_cast_if_present(TargetDecl); - if (FD) { - CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD); + if (FD && FD->getNameInfo().getName().isIdentifier()) { + CI = MaybeEmitFPBuiltinofFD(IRFuncTy, IRCallArgs, CalleePtr, + FD->getName(), FD->getBuiltinID()); if (CI) return RValue::get(CI); } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 3ddd05cb53d8e..04c3a715da205 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4434,9 +4434,10 @@ class CodeGenFunction : public CodeGenTypeCache { RValue EmitIntelSYCLPtrAnnotationBuiltin(const CallExpr *E); llvm::CallInst * - EmitFPBuiltinIndirectCall(llvm::FunctionType *IRFuncTy, - const SmallVectorImpl &IRArgs, - llvm::Value *FnPtr, const FunctionDecl *FD); + MaybeEmitFPBuiltinofFD(llvm::FunctionType *IRFuncTy, + const SmallVectorImpl &IRArgs, + llvm::Value *FnPtr, StringRef Name, + unsigned FDBuiltinID); enum class MSVCIntrin; llvm::Value *EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID, const CallExpr *E); diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 05fc203797f25..7642d3d79a666 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -342,8 +342,8 @@ llvm::Type *CodeGenTypes::ConvertSYCLJointMatrixINTELType(RecordDecl *RD) { if (CompTy->isStructTy()) { StringRef LlvmTyName = CompTy->getStructName(); // Emit half/int16/float for sycl[::*]::{half,bfloat16,tf32} - if (LlvmTyName.startswith("class.sycl::") || - LlvmTyName.startswith("class.__sycl_internal::")) + if (LlvmTyName.starts_with("class.sycl::") || + LlvmTyName.starts_with("class.__sycl_internal::")) LlvmTyName = LlvmTyName.rsplit("::").second; if (LlvmTyName == "half") { CompTy = llvm::Type::getHalfTy(getLLVMContext()); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 21a563ce99c45..6265280877e9f 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3518,7 +3518,7 @@ getLinkerArgs(Compilation &C, DerivedArgList &Args, bool IncludeObj = false) { // manner than the OpenMP processing. 
We should try and refactor this // to use the OpenMP flow (adding -l to the llvm-link step) auto resolveStaticLib = [&](StringRef LibName, bool IsStatic) -> bool { - if (!LibName.startswith("-l")) + if (!LibName.starts_with("-l")) return false; for (auto &LPath : LibPaths) { if (!IsStatic) { @@ -3663,7 +3663,7 @@ static bool IsSYCLDeviceLibObj(std::string ObjFilePath, bool isMSVCEnv) { StringRef ObjFileName = llvm::sys::path::filename(ObjFilePath); StringRef ObjSuffix = isMSVCEnv ? ".obj" : ".o"; bool Ret = - (ObjFileName.startswith("libsycl-") && ObjFileName.endswith(ObjSuffix)) + (ObjFileName.starts_with("libsycl-") && ObjFileName.ends_with(ObjSuffix)) ? true : false; return Ret; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index ae33f79198692..01887706d8757 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -344,7 +344,7 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) // The deprecated -DLLVM_ENABLE_PROJECTS=libcxx configuration installs // libc++.so in D.Dir+"/../lib/". Detect this path. // TODO Remove once LLVM_ENABLE_PROJECTS=libcxx is unsupported. - if (StringRef(D.Dir).startswith(SysRoot) && + if (StringRef(D.Dir).starts_with(SysRoot) && (Args.hasArg(options::OPT_fsycl) || D.getVFS().exists(D.Dir + "/../lib/libsycl.so"))) addPathIfExists(D, D.Dir + "/../lib", Paths); diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index c1b912db596e8..52f811621ba82 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -930,6 +930,7 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL, break; case '1': case '2': + case '3': case 'x': case 'd': // Ignore /O[12xd] flags that aren't the last one on the command line. @@ -946,11 +947,14 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL, } else if (OptChar == '2' || OptChar == 'x') { DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin)); DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "2"); + } else if (OptChar == '3') { + DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin)); + DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "3"); } if (SupportsForcingFramePointer && !DAL.hasArgNoClaim(options::OPT_fno_omit_frame_pointer)) DAL.AddFlagArg(A, Opts.getOption(options::OPT_fomit_frame_pointer)); - if (OptChar == '1' || OptChar == '2') + if (OptChar == '1' || OptChar == '2' || OptChar == '3') DAL.AddFlagArg(A, Opts.getOption(options::OPT_ffunction_sections)); } break; @@ -1070,7 +1074,8 @@ MSVCToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, // OptChar does not expand; it's an argument to the previous char. 
continue; } - if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || OptChar == 'd') + if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || + OptChar == 'd' || OptChar == '3') ExpandChar = OptStr.data() + I; } } diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 32ad99ba384b0..ca68b97858027 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -401,8 +401,9 @@ const char *SYCL::Linker::constructLLVMLinkCommand( LibPostfix = ".cubin"; } StringRef LibSyclPrefix("libsycl-"); - if (!InputFilename.startswith(LibSyclPrefix) || - !InputFilename.endswith(LibPostfix) || (InputFilename.count('-') < 2)) + if (!InputFilename.starts_with(LibSyclPrefix) || + !InputFilename.ends_with(LibPostfix) || + (InputFilename.count('-') < 2)) return false; // Skip the prefix "libsycl-" std::string PureLibName = @@ -419,7 +420,7 @@ const char *SYCL::Linker::constructLLVMLinkCommand( PureLibName.substr(0, FinalDashPos) + PureLibName.substr(DotPos); } for (const auto &L : SYCLDeviceLibList) { - if (StringRef(PureLibName).startswith(L)) + if (StringRef(PureLibName).starts_with(L)) return true; } return false; @@ -1354,7 +1355,7 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple, for (auto *A : Args) { if (!A->getOption().matches(options::OPT_Xsycl_backend_EQ)) continue; - if (StringRef(A->getValue()).startswith("intel_gpu")) + if (StringRef(A->getValue()).starts_with("intel_gpu")) TargArgs.push_back(A->getValue(1)); } if (llvm::find_if(TargArgs, [&](auto Cur) { diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h index 8e1c97dbbb546..2e1a30be76eda 100644 --- a/clang/lib/Driver/ToolChains/SYCL.h +++ b/clang/lib/Driver/ToolChains/SYCL.h @@ -125,7 +125,7 @@ constexpr char AmdGPU[] = "amd_gpu_"; template std::optional isGPUTarget(StringRef Target) { // Handle target specifications that resemble '(intel, nvidia, amd)_gpu_*' // here. - if (Target.startswith(GPUArh)) { + if (Target.starts_with(GPUArh)) { return resolveGenDevice(Target); } return std::nullopt; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index dcd567d32b698..2c5f907b4797e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -282,11 +282,11 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, .Default(false); }; if ((getEmissionReason(FDecl) == Sema::DeviceDiagnosticReason::Sycl) && - Id && !Id->getName().startswith("__spirv_") && - !Id->getName().startswith("__sycl_") && - !Id->getName().startswith("__devicelib_ConvertBF16ToFINTEL") && - !Id->getName().startswith("__devicelib_ConvertFToBF16INTEL") && - !Id->getName().startswith("__assert_fail") && + Id && !Id->getName().starts_with("__spirv_") && + !Id->getName().starts_with("__sycl_") && + !Id->getName().starts_with("__devicelib_ConvertBF16ToFINTEL") && + !Id->getName().starts_with("__devicelib_ConvertFToBF16INTEL") && + !Id->getName().starts_with("__assert_fail") && !isMsvcMathFn(Id->getName())) { SYCLDiagIfDeviceCode( *Locs.begin(), diag::err_sycl_device_function_is_called_from_esimd, diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index e2a8d10ded1ab..d0f8ddbe7193c 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -4295,7 +4295,7 @@ static void CheckSYCL2020SubGroupSizes(Sema &S, FunctionDecl *SYCLKernel, // No need to validate __spirv routines here since they // are mapped to the equivalent SPIRV operations. 
const IdentifierInfo *II = FD->getIdentifier(); - if (II && II->getName().startswith("__spirv_")) + if (II && II->getName().starts_with("__spirv_")) return; // Else we need to figure out why they don't match. diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c index 74de6a1f72c80..3fdde4443b8c8 100644 --- a/clang/test/CodeGen/fp-accuracy.c +++ b/clang/test/CodeGen/fp-accuracy.c @@ -177,7 +177,7 @@ double rsqrt(double); // CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] -// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_F3_HIGH]] @@ -203,6 +203,19 @@ double rsqrt(double); // CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW:[0-9]+]] // CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3-LABEL: define dso_local void @f2 +// CHECK-F3: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW]] +// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F3_MEDIUM]] +// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F3_MEDIUM]] + +// CHECK-F3-LABEL: define dso_local float @fake_exp10 + +// CHECK-F3-LABEL: define dso_local void @f4 +// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] + // CHECK-F3: attributes #[[ATTR_F3_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F3: attributes #[[ATTR_F3_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F3: attributes #[[ATTR_F3_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" @@ -414,7 +427,7 @@ void f1(float a, float b) { // CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]] // CHECK-F2: call float @tanf(float {{.*}}) // -// CHECK-LABEL-F4: define dso_local void @f2 +// CHECK-F4-LABEL: define dso_local void @f2 // CHECK-F4: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] @@ -422,6 +435,17 @@ void f1(float a, float b) { // CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @tanf(float {{.*}}) // +// CHECK-F4-LABEL: define dso_local float @fake_exp10 + +// CHECK-F4-LABEL: define dso_local void @f3 +// CHECK-F4: call float @fake_exp10(float {{.*}}) + +// CHECK-F4-LABEL: define dso_local void @f4 +// CHECK-F4: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] +// CHECK-F4: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] + +// CHECK-F4: attributes #[[ATTR_F4_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" + // 
CHECK-F5-LABEL: define dso_local void @f2 // CHECK-F5: call float @llvm.cos.f32(float {{.*}}) // CHECK-F5: call float @llvm.sin.f32(float {{.*}}) @@ -430,6 +454,15 @@ void f1(float a, float b) { // CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) // CHECK-F5: call float @tanf(float {{.*}}) // +// CHECK-F5-LABEL: define dso_local float @fake_exp10 + +// CHECK-F5-LABEL: define dso_local void @f3 +// CHECK-F5: call float @fake_exp10(float {{.*}}) + +// CHECK-F5-LABEL: define dso_local void @f4 +// CHECK-F5: call double @llvm.exp.f64(double {{.*}}) +// CHECK-F5: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F5_MEDIUM]] + // CHECK-F5: attributes #[[ATTR_F5_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F5: attributes #[[ATTR_F5_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // @@ -441,6 +474,15 @@ void f1(float a, float b) { // CHECK-F6: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call float @tanf(float {{.*}}) #[[ATTR8:[0-9]+]] // +// CHECK-F6-LABEL: define dso_local float @fake_exp10 +// +// CHECK-F6-LABEL: define dso_local void @f3 +// CHECK-F6: call float @fake_exp10(float {{.*}}) + +// CHECK-F6-LABEL: define dso_local void @f4 +// CHECK-F6: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] +// CHECK-F6: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] + // CHECK-F6: attributes #[[ATTR_F6_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F6: attributes #[[ATTR_F6_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // @@ -454,12 +496,36 @@ void f1(float a, float b) { // CHECK-LABEL: define dso_local void @f3 // CHECK: call float @fake_exp10(float {{.*}}) + +// CHECK-LABEL: define dso_local void @f4 +// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]] + +// CHECK-F1-LABEL: define dso_local void @f3 // CHECK-F1: call float @fake_exp10(float {{.*}}) + +// CHECK-F1-LABEL: define dso_local void @f4 +// CHECK-F1: call double @llvm.exp.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]] + +// CHECK-F2-LABEL: define dso_local float @fake_exp10 + +// CHECK-F2-LABEL: define dso_local void @f3 // CHECK-F2: call float @fake_exp10(float {{.*}}) +// CHECK-F2-LABEL: define dso_local void @f4 +// CHECK-F2: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F2_CUDA]] + +// CHECK-SPIR-LABEL: define dso_local spir_func float @fake_exp10 +// // CHECK-SPIR-LABEL: define dso_local spir_func void @f3 // CHECK-SPIR: call spir_func float @fake_exp10(float {{.*}}) +// CHECK-SPIR-LABEL: define dso_local spir_func void @f4 +// CHECK-SPIR: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_SYCL1]] + // CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" @@ -526,6 +592,10 @@ void f1(float a, float b) { // CHECK-DEFAULT-LABEL: define dso_local void @f3 // CHECK-DEFAULT: call float @fake_exp10(float {{.*}}) +// CHECK-DEFAULT-LABEL: define dso_local void @f4 +// CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}}) +// CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}}) + void f2(float a, float b) { float sin = 0.f, cos = 0.f; @@ -541,3 
+611,16 @@ float fake_exp10(float a) __attribute__((no_builtin)){} void f3(float a, float b) { a = fake_exp10(b); } + +#define sz 2 +double in[sz]; +double out[sz]; + +double getInput(int i) { + return in[i]; +} + +void f4() { + for (int i = 0; i < sz; i++) + out[i] = cos(exp(getInput(i))); +} diff --git a/clang/test/Driver/Xarch.c b/clang/test/Driver/Xarch.c index f7693fb689d58..ec58211d6fba0 100644 --- a/clang/test/Driver/Xarch.c +++ b/clang/test/Driver/Xarch.c @@ -1,10 +1,10 @@ -// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s -// O3ONCE: "-O3" -// O3ONCE-NOT: "-O3" +// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5ONCE %s +// O5ONCE: "-O5" +// O5ONCE-NOT: "-O5" -// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3NONE %s -// O3NONE-NOT: "-O3" -// O3NONE: argument unused during compilation: '-Xarch_i386 -O3' +// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5NONE %s +// O5NONE-NOT: "-O5" +// O5NONE: argument unused during compilation: '-Xarch_i386 -O5' // RUN: not %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -o -Xarch_i386 -S %s -S -Xarch_i386 -o 2>&1 | FileCheck -check-prefix=INVALID %s // INVALID: error: invalid Xarch argument: '-Xarch_i386 -o' diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 5b6dfe308a76e..2f60637f7c010 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -207,6 +207,10 @@ // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O2 /O2 -### -- %s 2>&1 | FileCheck -check-prefix=O2O2 %s // O2O2: "-O2" +// RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O3 -### -- %s 2>&1 | FileCheck -check-prefix=O3 %s +// O3: -mframe-pointer=none +// O3: -O3 + // RUN: %clang_cl /Zs -Werror /Oy -- %s 2>&1 // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s diff --git a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp index c9fa2f87a072f..5dbf044675106 100644 --- a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp +++ b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp @@ -173,7 +173,7 @@ int main(int argc, const char **argv) { // possibly reusing ClangOffloadBundler's 'OffloadTargetInfo'. 
for (const std::string &Target : Targets) { std::string Prefix = Target + "."; - if (Symbol.startswith(Prefix)) + if (Symbol.starts_with(Prefix)) Target2Symbols[Target].insert( Symbol.substr(Prefix.size(), Len - Prefix.size())); } diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp index e104ab115e970..480ba1944ebba 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp @@ -34,13 +34,13 @@ constexpr char SLM_ALLOCATOR_DTOR_SUFFIX[] = "EED2Ev"; bool isSlmAllocatorConstructor(const Function &F) { auto Name = F.getName(); return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) && - Name.endswith(SLM_ALLOCATOR_CTOR_SUFFIX); + Name.ends_with(SLM_ALLOCATOR_CTOR_SUFFIX); } bool isSlmAllocatorDestructor(const Function &F) { auto Name = F.getName(); return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) && - Name.endswith(SLM_ALLOCATOR_DTOR_SUFFIX); + Name.ends_with(SLM_ALLOCATOR_DTOR_SUFFIX); } bool isSlmInit(const Function &F) { diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 9a3aa8437a590..6ec1102f402ba 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1596,15 +1596,15 @@ SmallPtrSet collectGenXVolatileTypes(Module &M) { // TODO FIXME relying on type name in LLVM IR is fragile, needs rework if (!GTy || !GTy->getName() .rtrim(".0123456789") - .endswith("sycl::_V1::ext::intel::esimd::simd")) + .ends_with("sycl::_V1::ext::intel::esimd::simd")) continue; assert(GTy->getNumContainedTypes() == 1); auto VTy = GTy->getContainedType(0); if ((GTy = dyn_cast(VTy))) { - assert( - GTy->getName() - .rtrim(".0123456789") - .endswith("sycl::_V1::ext::intel::esimd::detail::simd_obj_impl")); + assert(GTy->getName() + .rtrim(".0123456789") + .ends_with( + "sycl::_V1::ext::intel::esimd::detail::simd_obj_impl")); VTy = GTy->getContainedType(0); } assert(VTy->isVectorTy()); diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp index 264a8b4bc817a..2624e26116823 100644 --- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp @@ -283,7 +283,7 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) { static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) { Function *Res; const char GetPrefix[] = "__dpcpp_nativecpu_get"; - if (Name.startswith(GetPrefix)) { + if (Name.starts_with(GetPrefix)) { Res = addGetFunc(M, Name, StateType); } else if (Name == NativeCPUSetLocalId) { Res = addSetLocalIdFunc(M, Name, StateType); diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp index 2a3586ba49339..ffb6394913fc9 100644 --- a/llvm/lib/Support/PropertySetIO.cpp +++ b/llvm/lib/Support/PropertySetIO.cpp @@ -37,7 +37,7 @@ PropertySetRegistry::read(const MemoryBuffer *Buf) { for (line_iterator LI(*Buf); !LI.is_at_end(); LI++) { // see if this line starts a new property set - if (LI->startswith("[")) { + if (LI->starts_with("[")) { // yes - parse the category (property name) auto EndPos = LI->rfind(']'); if (EndPos == StringRef::npos) diff --git a/llvm/lib/Support/SimpleTable.cpp b/llvm/lib/Support/SimpleTable.cpp index 434e7d7cb7670..ec47391450100 100644 --- a/llvm/lib/Support/SimpleTable.cpp +++ b/llvm/lib/Support/SimpleTable.cpp @@ -214,8 +214,8 @@ Expected SimpleTable::read(MemoryBuffer *Buf, return std::make_unique(); UPtrTy Res; - if 
(LI->startswith(COL_TITLE_LINE_OPEN)) { - if (!LI->endswith(COL_TITLE_LINE_CLOSE)) + if (LI->starts_with(COL_TITLE_LINE_OPEN)) { + if (!LI->ends_with(COL_TITLE_LINE_CLOSE)) return createStringError(errc::invalid_argument, "malformed title line"); // column titles present StringRef L = LI->substr(1, LI->size() - 2); // trim '[' and ']' diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index f0c4cd875f6fe..1e923ebae21f9 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -726,7 +726,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { (SubArchName.ends_with("r6el") || SubArchName.ends_with("r6"))) return Triple::MipsSubArch_r6; - if (SubArchName.startswith("spir")) { + if (SubArchName.starts_with("spir")) { StringRef SA(SubArchName); if (SA.consume_front("spir64_") || SA.consume_front("spir_")) { if (SA == "fpga") diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 1891e10679a0f..f37f1fdcfc2e9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -838,7 +838,7 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { // // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0 // ``` // can't be lowered by SPIR-V translator to "standard" format. - if (StringRef(Trunc.getModule()->getTargetTriple()).startswith("spir")) + if (StringRef(Trunc.getModule()->getTargetTriple()).starts_with("spir")) return nullptr; // Whenever an element is extracted from a vector, and then truncated, diff --git a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp index b8dd7cda8883c..105643062aa90 100644 --- a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp +++ b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp @@ -243,7 +243,7 @@ bool insertAtomicInstrumentationCall(Module &M, StringRef Name, PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, ModuleAnalysisManager &MAM) { - assert(StringRef(M.getTargetTriple()).startswith("spir")); + assert(StringRef(M.getTargetTriple()).starts_with("spir")); bool IRModified = false; std::vector SPIRVCrossWGInstuctions = { SPIRV_CONTROL_BARRIER, SPIRV_GROUP_ALL, SPIRV_GROUP_ANY, @@ -299,7 +299,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, if (std::any_of(SPIRVCrossWGInstuctions.begin(), SPIRVCrossWGInstuctions.end(), [&CalleeName](StringRef Name) { - return CalleeName.startswith(Name); + return CalleeName.starts_with(Name); })) { Instruction *InstAfterBarrier = CI->getNextNode(); const DebugLoc &DL = CI->getDebugLoc(); @@ -307,7 +307,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, insertSimpleInstrumentationCall(M, ITT_ANNOTATION_WI_RESUME, InstAfterBarrier, DL); IRModified = true; - } else if (CalleeName.startswith(SPIRV_ATOMIC_INST)) { + } else if (CalleeName.starts_with(SPIRV_ATOMIC_INST)) { Instruction *InstAfterAtomic = CI->getNextNode(); IRModified |= insertAtomicInstrumentationCall( M, ITT_ANNOTATION_ATOMIC_START, CI, CI, CalleeName); diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp index a31ad613f9918..87cbf42da2df2 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp @@ -79,7 +79,7 @@ bool isSpirvSyclBuiltin(StringRef FName) { // now skip the digits FName = FName.drop_while([](char C) { 
return std::isdigit(C); }); - return FName.startswith("__spirv_") || FName.startswith("__sycl_"); + return FName.starts_with("__spirv_") || FName.starts_with("__sycl_"); } // Return true if the function is a ESIMD builtin @@ -91,12 +91,12 @@ bool isESIMDBuiltin(StringRef FName) { // now skip the digits FName = FName.drop_while([](char C) { return std::isdigit(C); }); - return FName.startswith("__esimd_"); + return FName.starts_with("__esimd_"); } // Return true if the function name starts with "__builtin_" bool isGenericBuiltin(StringRef FName) { - return FName.startswith("__builtin_"); + return FName.starts_with("__builtin_"); } bool isKernel(const Function &F) { diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 6ca581d8caf14..1b6cdefb9a541 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -696,7 +696,7 @@ uint32_t llvm::getSYCLDeviceLibReqMask(const Module &M) { return 0; uint32_t ReqMask = 0; for (const Function &SF : M) { - if (SF.getName().startswith(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) { + if (SF.getName().starts_with(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) { assert(SF.getCallingConv() == CallingConv::SPIR_FUNC); uint32_t DeviceLibBits = getDeviceLibBits(SF.getName().str()); ReqMask |= DeviceLibBits; diff --git a/llvm/tools/sycl-post-link/SpecConstants.cpp b/llvm/tools/sycl-post-link/SpecConstants.cpp index dc5f1b2fd3f2c..66c2cc5ef1818 100644 --- a/llvm/tools/sycl-post-link/SpecConstants.cpp +++ b/llvm/tools/sycl-post-link/SpecConstants.cpp @@ -801,8 +801,8 @@ PreservedAnalyses SpecConstantsPass::run(Module &M, if (!F.isDeclaration()) continue; - if (!F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) && - !F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) + if (!F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) && + !F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) continue; SmallVector SCIntrCalls; @@ -1014,8 +1014,8 @@ bool SpecConstantsPass::collectSpecConstantDefaultValuesMetadata( bool llvm::checkModuleContainsSpecConsts(const Module &M) { for (const Function &F : M.functions()) { - if (F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) || - F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) + if (F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) || + F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) return true; } diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 69f43b8b486ea..50cb225f6cb18 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -340,7 +340,7 @@ std::string makeResultFileName(Twine Ext, int I, StringRef Suffix) { : sys::path::parent_path(OutputFilename); const StringRef Sep = sys::path::get_separator(); std::string Dir = Dir0.str(); - if (!Dir0.empty() && !Dir0.endswith(Sep)) + if (!Dir0.empty() && !Dir0.ends_with(Sep)) Dir += Sep.str(); return Dir + sys::path::stem(OutputFilename).str() + Suffix.str() + "_" + std::to_string(I) + Ext.str(); diff --git a/sycl-fusion/common/lib/NDRangesHelper.cpp b/sycl-fusion/common/lib/NDRangesHelper.cpp index 96f26d96a4ea5..7c418022ef4ee 100644 --- a/sycl-fusion/common/lib/NDRangesHelper.cpp +++ b/sycl-fusion/common/lib/NDRangesHelper.cpp @@ -170,6 +170,18 @@ jit_compiler::FusedNDRange::get(ArrayRef NDRanges) { "Cannot fuse kernels whose fusion would " 
"yield non-uniform work-group sizes"); } + + // Work-items in the same work-group in the original ND-ranges must be in + // the same work-group in the fused one. + if (LocalSize && any_of(NDRanges, [&Fused](const NDRange &NDR) { + return NDR.hasSpecificLocalSize() && requireIDRemapping(Fused, NDR); + })) { + return createStringError( + inconvertibleErrorCode(), + "Cannot fuse kernels when any of the fused kernels with a specific " + "local size has different global sizes in dimensions [2, N) or " + "different number of dimensions"); + } } return FusedNDRange{Fused, IsHeterogeneousList, NDRanges}; diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index b83a8a26eeae0..224a1984d2902 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -212,7 +212,8 @@ These restrictions can be simplified to: - No two local sizes specified by the nd-ranges will be different; - No global id remapping is needed ([see](#work-item-remapping)) or all input offsets are 0; -- All the fused nd-ranges must have the same offset. +- All the fused nd-ranges must have the same offset; +- No global id remapping is needed for kernels specifying a local size. As we can see, there is no restrictions in the number of dimensions or global sizes of the input nd-ranges. diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 4d0da9995908a..31c6a86f3300d 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -149,9 +149,11 @@ // 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones. // 14.41 Added piextCommandBufferMemBufferFill & piextCommandBufferFillUSM // 14.42 Added piextCommandBufferPrefetchUSM and piextCommandBufferAdviseUSM +// 15.43 Changed the signature of piextMemGetNativeHandle to also take a +// pi_device -#define _PI_H_VERSION_MAJOR 14 -#define _PI_H_VERSION_MINOR 42 +#define _PI_H_VERSION_MAJOR 15 +#define _PI_H_VERSION_MINOR 43 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -1424,8 +1426,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition( /// Gets the native handle of a PI mem object. /// /// \param mem is the PI mem to get the native handle of. +/// \param dev is the PI device that the native allocation will be resident on /// \param nativeHandle is the native handle of mem. -__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem, +__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem, pi_device dev, pi_native_handle *nativeHandle); /// Creates PI mem object from a native handle. diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7af684ee99cfb..5d8f13ce82619 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -179,6 +179,45 @@ __ESIMD_API simd gather_impl(const T *p, simd offsets, return lsc_format_ret(Result); } +/// USM pointer scatter. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_store.ugm +/// +/// Scatters elements to specific address. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to store per address. +/// @tparam DS is the data size. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam N is the number of channels (platform dependent). +/// @param p is the base pointer. +/// @param offsets is the zero-based offsets in bytes. +/// @param vals is values to store. +/// @param pred is predicates. 
+/// +template +__ESIMD_API void scatter_impl(T *p, simd offsets, + simd vals, simd_mask pred) { + static_assert(std::is_integral_v, "Unsupported offset type"); + check_lsc_vector_size(); + check_lsc_data_size(); + check_cache_hint(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + simd addrs = reinterpret_cast(p); + addrs += convert(offsets); + simd Tmp = lsc_format_input(vals); + __esimd_lsc_store_stateless(pred.data(), addrs.data(), + Tmp.data()); +} + // Returns true iff it is Ok to use llvm.masked.gather and llvm.masked.scatter. // By default (without use specifying __ESIMD_GATHER_SCATTER_LLVM_IR) it is // not used because of an issue in GPU driver, which does not recognize @@ -616,44 +655,203 @@ gather(const Tx *p, Toffset offset, simd_mask mask = 1) { return gather(p, simd(offset), mask); } +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-1) + +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-2) + +/// The next two functions are similar to usm-sc-{1,2} with the 'byte_offsets' +/// parameter represerented as 'simd_view'. + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-3) + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-4) + +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-1) +/// /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a /// value of the corresponding element in the input offset vector. Access to /// any element's memory location can be disabled via the input mask. -/// @tparam Tx Element type, must be of size 4 or less. -/// @tparam N Number of elements to write; can be \c 1, \c 2, \c 4, \c 8, \c 16 -/// or \c 32. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. /// @param p The base address. -/// @param offsets A vector of 32-bit or 64-bit offsets in bytes. For each lane -/// \c i, ((byte*)p + offsets[i]) must be element size aligned. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. /// @param vals The vector to scatter. -/// @param mask The access mask, defaults to all 1s. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. 
+template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + static_assert(std::is_integral_v, "Unsupported offset type"); + static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); + + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); + static_assert(Alignment >= sizeof(T), + "scatter() requires at least element-size alignment"); + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + + // Use LSC lowering if L1/L2 or VS > 1. + if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) { + static_assert(VS == 1 || sizeof(T) >= 4, + "VS > 1 is supprted only for 4- and 8-byte elements"); + return detail::scatter_impl(p, byte_offsets, vals, mask); + } else { + using Tx = detail::__raw_t; + simd byte_offsets_i = convert(byte_offsets); + simd addrs(reinterpret_cast(p)); + addrs = addrs + byte_offsets_i; + if constexpr (sizeof(T) == 1) { + simd D = __esimd_wrregion( + D.data(), vals.data(), 0); + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<1>()>( + addrs.data(), D.data(), mask.data()); + } else if constexpr (sizeof(T) == 2) { + simd D = __esimd_wrregion( + D.data(), vals.data(), 0); + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<2>()>( + addrs.data(), D.data(), mask.data()); + } else + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<1>()>( + addrs.data(), vals.data(), mask.data()); + } +} + +// template +// void scatter(T *p, simd byte_offsets, simd vals, +// PropertyListT props = {}); // (usm-sc-2) /// -template -__ESIMD_API void scatter(Tx *p, simd offsets, simd vals, - simd_mask mask = 1) { - using T = detail::__raw_t; - static_assert(std::is_integral_v, "Unsupported offset type"); - static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N"); - simd offsets_i = convert(offsets); - simd addrs(reinterpret_cast(p)); - addrs = addrs + offsets_i; - if constexpr (sizeof(T) == 1) { - simd D; - D = __esimd_wrregion(D.data(), vals.data(), 0); - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<1>()>( - addrs.data(), D.data(), mask.data()); - } else if constexpr (sizeof(T) == 2) { - simd D; - D = __esimd_wrregion(D.data(), vals.data(), 0); - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<2>()>( - addrs.data(), D.data(), mask.data()); - } else - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<1>()>( - addrs.data(), vals.data(), mask.data()); +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. 
Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(p, byte_offsets, vals, Mask, props); +} + +// template +// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +// simd_mask mask, PropertyListT props = {}); // (usm-sc-3) +/// +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. Access to +/// any element's memory location can be disabled via the input mask. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + scatter(p, byte_offsets.read(), vals, mask, props); +} + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-4) +/// +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. 
+template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(p, byte_offsets.read(), vals, Mask, props); } /// A variation of \c scatter API with \c offsets represented as \c simd_view @@ -671,7 +869,7 @@ __ESIMD_API void scatter(Tx *p, simd offsets, simd vals, template __ESIMD_API void scatter(Tx *p, simd_view offsets, simd vals, simd_mask mask = 1) { - scatter(p, offsets.read(), vals, mask); + scatter(p, offsets.read(), vals, mask); } /// A variation of \c scatter API with \c offsets represented as scalar. @@ -688,7 +886,7 @@ __ESIMD_API void scatter(Tx *p, simd_view offsets, template __ESIMD_API std::enable_if_t && N == 1> scatter(Tx *p, Toffset offset, simd vals, simd_mask mask = 1) { - scatter(p, simd(offset), vals, mask); + scatter(p, simd(offset), vals, mask); } namespace detail { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 774154dfc0f97..a71981de54bec 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -102,16 +102,6 @@ template struct lsc_expand_type { using type = __ESIMD_DNS::lsc_expand_type::type; }; -template struct lsc_bitcast_type { -public: - using type = std::conditional_t< - sizeof(T) == 1, uint8_t, - std::conditional_t< - sizeof(T) == 2, uint16_t, - std::conditional_t>>>; -}; - } // namespace detail /// L1 or L3 cache hint kinds. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 069898f14e747..7fb0ae833e793 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1434,7 +1434,7 @@ __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using MsgT = typename detail::lsc_expand_type::type; - using CstT = typename detail::lsc_bitcast_type::type; + using CstT = __ESIMD_DNS::uint_type_t; __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); __esimd_lsc_store_slm( @@ -1487,25 +1487,8 @@ template offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - static_assert(std::is_integral_v, "Unsupported offset type"); - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - using _CstT = typename detail::lsc_bitcast_type::type; - __ESIMD_NS::simd Tmp = vals.template bit_cast_view<_CstT>(); - __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); - __esimd_lsc_store_stateless(pred.data(), addrs.data(), - Tmp.data()); + __ESIMD_DNS::scatter_impl(p, offsets, + vals, pred); } template ::type; - using _CstT = typename detail::lsc_bitcast_type::type; + using _CstT = __ESIMD_DNS::uint_type_t; __ESIMD_NS::simd Tmp = vals.template bit_cast_view<_CstT>(); auto si = __ESIMD_NS::get_surface_index(acc); __esimd_lsc_store_bti { Rest...); } 
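Note on the new USM scatter overloads documented above: the short sketch below illustrates how they are meant to be called. It is not taken from the patch; the function name, the pointer, the offsets, and the sizes are illustrative, and the alignment-property spelling and namespaces are assumptions that mirror the tests added later in this patch.

#include <sycl/ext/intel/esimd.hpp>

using namespace sycl::ext::intel::esimd;

// Illustrative ESIMD device function; 'Out' is assumed to be a USM pointer.
SYCL_EXTERNAL void scatter_usage_sketch(float *Out) SYCL_ESIMD_FUNCTION {
  simd<uint32_t, 8> ByteOffsets(0, sizeof(float)); // 0, 4, 8, ... bytes
  simd<float, 8> Vals = 1.0f;
  simd_mask<8> Mask = 1;

  // (usm-sc-1): explicit mask, default element-size alignment.
  scatter(Out, ByteOffsets, Vals, Mask);

  // (usm-sc-2): no mask; alignment passed via compile-time properties
  // (property spelling assumed, mirroring the new tests in this patch).
  sycl::ext::oneapi::experimental::properties Props{alignment<4>};
  scatter(Out, ByteOffsets, Vals, Props);

  // VS > 1: 16 values stored through 8 addresses, two consecutive elements
  // per address; only 4- and 8-byte element types qualify (float does).
  simd<float, 16> Vals2 = 2.0f;
  scatter<float, 16, 2>(Out, ByteOffsets, Vals2, Mask);
}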
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &getAssertHappenedBuffer(); +#endif event memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, @@ -3019,9 +3021,7 @@ class AssertInfoCopier; */ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue, const detail::code_location &CodeLoc) { - using AHBufT = buffer; - - AHBufT &Buffer = Self.getAssertHappenedBuffer(); + buffer Buffer{1}; event CopierEv, CheckerEv, PostCheckerEv; auto CopierCGF = [&](handler &CGH) { diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 4b8163b03efbd..de715de0835fd 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -17,7 +17,12 @@ #include // Forward declarations -void enableCUDATracing(); +struct cuda_tracing_context_t_; + +void enableCUDATracing(cuda_tracing_context_t_ *ctx); +void disableCUDATracing(cuda_tracing_context_t_ *ctx); +cuda_tracing_context_t_ *createCUDATracingContext(); +void freeCUDATracingContext(cuda_tracing_context_t_ *Ctx); //-- PI API implementation extern "C" { @@ -228,8 +233,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, @@ -1237,7 +1243,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { std::memset(&(PluginInit->PiFunctionTable), 0, sizeof(PluginInit->PiFunctionTable)); - enableCUDATracing(); + cuda_tracing_context_t_ *Ctx = createCUDATracingContext(); + enableCUDATracing(Ctx); // Forward calls to CUDA RT. 
#define _PI_API(api) \ diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 266c72a3b3587..126ada92348f6 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -236,8 +236,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 79e047850af88..0fc36a231be6c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -243,8 +243,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index c7e71f9791d35..48ce104a94e90 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -240,8 +240,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 3e7f3aea4dfed..c09be92f89406 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -222,8 +222,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index df841b786bfb1..c19c93a6af53a 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3081,13 +3081,14 @@ inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, return PI_SUCCESS; } -inline pi_result piextMemGetNativeHandle(pi_mem Mem, +inline pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, pi_native_handle *NativeHandle) { PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); ur_mem_handle_t UrMem = reinterpret_cast(Mem); + ur_device_handle_t UrDev = reinterpret_cast(Dev); ur_native_handle_t NativeMem{}; - HANDLE_ERRORS(urMemGetNativeHandle(UrMem, 
&NativeMem)); + HANDLE_ERRORS(urMemGetNativeHandle(UrMem, UrDev, &NativeMem)); *NativeHandle = reinterpret_cast(NativeMem); diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index adbeb652bf613..b9742b8697fa8 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -235,9 +235,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition( BufferCreateInfo, RetMem); } -__SYCL_EXPORT pi_result -piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +__SYCL_EXPORT pi_result piextMemGetNativeHandle( + pi_mem Mem, pi_device Dev, pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } __SYCL_EXPORT pi_result diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 04c055465a9cf..835c732a40bf9 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -84,7 +84,11 @@ buffer_impl::getNativeVector(backend BackendName) const { } pi_native_handle Handle; - Plugin->call(NativeMem, &Handle); + // When doing buffer interop we don't know what device the memory should be + // resident on, so pass nullptr for Device param. Buffer interop may not be + // supported by all backends. + Plugin->call(NativeMem, /*Dev*/ nullptr, + &Handle); Handles.push_back(Handle); } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 0daa53587ed4d..e59fb94a09f65 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -144,7 +144,11 @@ void memBufferCreateHelper(const PluginPtr &Plugin, pi_context Ctx, // Always use call_nocheck here, because call may throw an exception, // and this lambda will be called from destructor, which in combination // rewards us with UB. - Plugin->call_nocheck(*RetMem, &Ptr); + // When doing buffer interop we don't know what device the memory should + // be resident on, so pass nullptr for Device param. Buffer interop may + // not be supported by all backends. + Plugin->call_nocheck( + *RetMem, /*Dev*/ nullptr, &Ptr); emitMemAllocEndTrace(MemObjID, (uintptr_t)(Ptr), Size, 0 /* guard zone */, CorrID); }}; @@ -167,7 +171,11 @@ void memReleaseHelper(const PluginPtr &Plugin, pi_mem Mem) { // Do not make unnecessary PI calls without instrumentation enabled if (xptiTraceEnabled()) { pi_native_handle PtrHandle = 0; - Plugin->call(Mem, &PtrHandle); + // When doing buffer interop we don't know what device the memory should be + // resident on, so pass nullptr for Device param. Buffer interop may not be + // supported by all backends. 
+ Plugin->call(Mem, /*Dev*/ nullptr, + &PtrHandle); Ptr = (uintptr_t)(PtrHandle); } #endif diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 810f991d6667f..ddd6a71d7db80 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -108,7 +108,9 @@ class queue_impl { const async_handler &AsyncHandler, const property_list &PropList) : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), MHostQueue(MDevice->is_host()), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -283,7 +285,9 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler) : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -305,7 +309,10 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), - MHostQueue(false), MAssertHappenedBuffer(range<1>{1}), + MHostQueue(false), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -673,9 +680,11 @@ class queue_impl { /// \return a native handle. pi_native_handle getNative(int32_t &NativeHandleDesc) const; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &getAssertHappenedBuffer() { return MAssertHappenedBuffer; } +#endif void registerStreamServiceEvent(const EventImplPtr &Event) { std::lock_guard Lock(MMutex); @@ -918,8 +927,10 @@ class queue_impl { /// need to emulate it with multiple native in-order queues. 
bool MEmulateOOO = false; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // Buffer to store assert failure descriptor buffer MAssertHappenedBuffer; +#endif // This event is employed for enhanced dependency tracking with in-order queue // Access to the event should be guarded with MMutex diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index 0b0ab39199370..cd479493bbae3 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -34,8 +34,8 @@ pi_native_handle interop_handle::getNativeMem(detail::Requirement *Req) const { auto Plugin = MQueue->getPlugin(); pi_native_handle Handle; - Plugin->call(Iter->second, - &Handle); + Plugin->call( + Iter->second, MDevice->getHandleRef(), &Handle); return Handle; } diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 47b6b29b89bab..1b877a31da4e0 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -298,9 +298,11 @@ pi_native_handle queue::getNative(int32_t &NativeHandleDesc) const { return impl->getNative(NativeHandleDesc); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &queue::getAssertHappenedBuffer() { return impl->getAssertHappenedBuffer(); } +#endif event queue::memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, diff --git a/sycl/test-e2e/Assert/check_resource_leak.cpp b/sycl/test-e2e/Assert/check_resource_leak.cpp new file mode 100644 index 0000000000000..252d2ed9e0c49 --- /dev/null +++ b/sycl/test-e2e/Assert/check_resource_leak.cpp @@ -0,0 +1,32 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Device globals aren't supported on opencl:gpu yet. +// UNSUPPORTED: opencl && gpu + +// TODO: Fails at JIT compilation for some reason. +// UNSUPPORTED: hip +#define SYCL_FALLBACK_ASSERT 1 + +#include + +// DeviceGlobalUSMMem::~DeviceGlobalUSMMem() has asserts to ensure some +// resources have been cleaned up when it's executed. Those asserts used to fail +// when "AssertHappened" buffer used in fallback implementation of the device +// assert was a data member of the queue_impl. +sycl::ext::oneapi::experimental::device_global dg; + +int main() { + sycl::queue q; + q.submit([&](sycl::handler &cgh) { + sycl::range<1> R{16}; + cgh.parallel_for(sycl::nd_range<1>{R, R}, [=](sycl::nd_item<1> ndi) { + if (ndi.get_global_linear_id() == 0) + dg.get() = 42; + auto sg = sycl::ext::oneapi::experimental::this_sub_group(); + auto active = sycl::ext::oneapi::group_ballot(sg, 1); + }); + }).wait(); + + return 0; +} diff --git a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp index 77f6a6c4122d9..83bb92eca5440 100644 --- a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp +++ b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // This test checks local accessor cmpxchg atomic operations. 
//===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// REQUIRES: gpu-intel-pvc // RUN: %{build} -o %t.out // RUN: %{run} %t.out // diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp index 25bce1b2b2244..4fde6446bdff1 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // This test checks LSC SLM atomic operations. //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// REQUIRES: gpu-intel-pvc // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp new file mode 100644 index 0000000000000..270ecc4e698af --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -0,0 +1,227 @@ +//==------- scatter.hpp - DPC++ ESIMD on-device test ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------===// + +#include "common.hpp" + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +template +bool verify(const T *Out, int N, int Size, int VS, uint32_t MaskStride, + bool UseMask) { + using Tuint = esimd_test::uint_type_t; + int NumErrors = 0; + int NOffsets = N / VS; + for (uint32_t I = 0; I < Size; I += N) { // Verify by 1 vector at once + for (int VSI = 0; VSI < VS; VSI++) { + for (int OffsetI = 0; OffsetI < NOffsets; OffsetI++) { + size_t OutIndex = I + VSI * NOffsets + OffsetI; + bool IsMaskSet = UseMask ? ((OutIndex / VS) % MaskStride == 0) : true; + Tuint Expected = sycl::bit_cast((T)OutIndex); + if (!UseMask || IsMaskSet) + Expected = sycl::bit_cast((T)(OutIndex * 2)); + Tuint Computed = sycl::bit_cast(Out[OutIndex]); + if (Computed != Expected && ++NumErrors < 16) { + std::cout << "Out[" << OutIndex << "] = " << std::to_string(Computed) + << " vs " << std::to_string(Expected) << std::endl; + } + } + } + } + return NumErrors == 0; +} + +template +bool testUSM(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + uint32_t Groups = 8; + uint32_t Threads = 16; + size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. 
N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "USM case: T=" << esimd_test::type_name() << ",N=" << N + << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + simd ByteOffsets(GlobalID * N * sizeof(T), + VS * sizeof(T)); + auto ByteOffsetsView = ByteOffsets.template select(); + simd Vals = gather(Out, ByteOffsets); + Vals *= 2; + auto ValsView = Vals.template select(); + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Props); + else 
if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView); + } + } + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testUSM(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // // Test scatter() that is available on Gen12 and PVC. + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 1, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + + Passed &= testUSM(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{cache_hint_L1, + cache_hint_L2, + alignment}; + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + + Passed &= testUSM(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testUSM(Q, 2, AlignElemProps) + Passed &= + testUSM(Q, 2, AlignElemProps); + Passed &= + testUSM(Q, 2, AlignElemProps); + Passed &= + testUSM(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp new file mode 100644 index 0000000000000..929d3c6fc04f7 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp @@ -0,0 +1,37 @@ +//==------- scatter_usm.cpp - DPC++ ESIMD on-device test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use cache-hint +// properties to not impose using DG2/PVC features. 
+ +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testUSM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp new file mode 100644 index 0000000000000..aa466795e9b06 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp @@ -0,0 +1,38 @@ +//==------- scatter_usm_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test uses cache-hint +// properties and requires DG2 or PVC. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testUSM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp new file mode 100644 index 0000000000000..1f98b2da24c33 --- /dev/null +++ b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp @@ -0,0 +1,136 @@ +// FIXME: the rocm include path and link path are highly platform dependent, +// we should set this with some variable instead. 
+// RUN: %{build} -o %t.out -I/opt/rocm/include -L/opt/rocm/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: hip + +#include +#include + +#define __HIP_PLATFORM_AMD__ + +#include + +using namespace sycl; +using namespace sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +template class Modifier; + +template class Init; + +template +void checkBufferValues(BufferT Buffer, ValueT Value) { + auto Acc = Buffer.get_host_access(); + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + if (Acc[Idx] != Value) { + std::cerr << "buffer[" << Idx << "] = " << Acc[Idx] + << ", expected val = " << Value << '\n'; + exit(1); + } + } +} + +template +void copy(buffer &Src, buffer &Dst, queue &Q) { + Q.submit([&](handler &CGH) { + auto SrcA = Src.template get_access(CGH); + auto DstA = Dst.template get_access(CGH); + + auto Func = [=](interop_handle IH) { + auto HipStream = IH.get_native_queue(); + auto SrcMem = IH.get_native_mem(SrcA); + auto DstMem = IH.get_native_mem(DstA); + + if (hipMemcpyWithStream(DstMem, SrcMem, sizeof(DataT) * SrcA.get_count(), + hipMemcpyDefault, HipStream) != hipSuccess) { + throw; + } + + if (hipStreamSynchronize(HipStream) != hipSuccess) { + throw; + } + + if (Q.get_backend() != IH.get_backend()) + throw; + }; + CGH.host_task(Func); + }); +} + +template void modify(buffer &B, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc = B.template get_access(CGH); + + auto Kernel = [=](item<1> Id) { Acc[Id] += 1; }; + + CGH.parallel_for>(Acc.get_count(), Kernel); + }); +} + +template +void init(buffer &B1, buffer &B2, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc1 = B1.template get_access(CGH); + auto Acc2 = B2.template get_access(CGH); + + CGH.parallel_for>(BUFFER_SIZE, [=](item<1> Id) { + Acc1[Id] = B1Init; + Acc2[Id] = B2Init; + }); + }); +} + +// Check that a single host-interop-task with a buffer will work. +void test_ht_buffer(queue &Q) { + buffer Buffer{BUFFER_SIZE}; + + Q.submit([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + auto Func = [=](interop_handle IH) { /*A no-op */ }; + CGH.host_task(Func); + }); +} + +// A test that uses HIP interop to copy data from buffer A to buffer B, by +// getting HIP ptrs and calling the hipMemcpyWithStream. Then run a SYCL +// kernel that modifies the data in place for B, e.g. increment one, then copy +// back to buffer A. Run it on a loop, to ensure the dependencies and the +// reference counting of the objects is not leaked. +void test_ht_kernel_dependencies(queue &Q) { + static constexpr int COUNT = 4; + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + // Init the buffer with a'priori invalid data. + init(Buffer1, Buffer2, Q); + + // Repeat a couple of times. 
+ for (size_t Idx = 0; Idx < COUNT; ++Idx) { + copy(Buffer1, Buffer2, Q); + modify(Buffer2, Q); + copy(Buffer2, Buffer1, Q); + } + + checkBufferValues(Buffer1, COUNT - 1); + checkBufferValues(Buffer2, COUNT - 1); +} + +void tests(queue &Q) { + test_ht_buffer(Q); + test_ht_kernel_dependencies(Q); +} + +int main() { + queue Q([](sycl::exception_list ExceptionList) { + if (ExceptionList.size() != 1) { + std::cerr << "Should be one exception in exception list" << std::endl; + std::abort(); + } + std::rethrow_exception(*ExceptionList.begin()); + }); + tests(Q); + std::cout << "Test PASSED" << std::endl; + return 0; +} diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp index 709befa514915..930fcc12eff86 100644 --- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp +++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp @@ -15,14 +15,19 @@ enum class Internalization { None, Local, Private }; template size_t getSize(Range r); -template <> size_t getSize(range<1> r) { return r.size(); } -template <> size_t getSize(nd_range<1> r) { +template size_t getSize(range r) { + return r.size(); +} +template size_t getSize(nd_range r) { return r.get_global_range().size(); } template void performFusion(queue &q, Range1 R1, Range2 R2) { + using IndexTy1 = item; + using IndexTy2 = item; + int in[dataSize], tmp[dataSize], out[dataSize]; for (size_t i = 0; i < dataSize; ++i) { @@ -43,15 +48,19 @@ void performFusion(queue &q, Range1 R1, Range2 R2) { q.submit([&](handler &cgh) { auto accIn = bIn.get_access(cgh); auto accTmp = bTmp.get_access(cgh); - cgh.parallel_for( - R1, [=](item<1> i) { accTmp[i] = accIn[i] + 5; }); + cgh.parallel_for(R1, [=](IndexTy1 i) { + size_t j = i.get_linear_id(); + accTmp[j] = accIn[j] + 5; + }); }); q.submit([&](handler &cgh) { auto accTmp = bTmp.get_access(cgh); auto accOut = bOut.get_access(cgh); - cgh.parallel_for( - R2, [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + cgh.parallel_for(R2, [=](IndexTy2 i) { + size_t j = i.get_linear_id(); + accOut[j] = accTmp[j] * 2; + }); }); fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); @@ -117,5 +126,17 @@ int main() { // CHECK-NEXT: Cannot fuse kernels whose fusion would yield non-uniform work-group sizes // CHECK: COMPUTATION OK + // Scenario: Fusing two kernels that may lead to synchronization issues as two + // work-items in the same work-group may not be in the same work-group in the + // fused ND-range. + performFusion( + q, nd_range<2>{range<2>{2, 2}, range<2>{2, 2}}, + nd_range<2>{range<2>{4, 4}, range<2>{2, 2}}); + // CHECK: ERROR: JIT compilation for kernel fusion failed with message: + // CHECK-NEXT: Illegal ND-range combination + // CHECK-NEXT: Detailed information: + // CHECK-NEXT: Cannot fuse kernels when any of the fused kernels with a specific local size has different global sizes in dimensions [2, N) or different number of dimensions + // CHECK: COMPUTATION OK + return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp index fe5b110864e6b..9fd4f184692be 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp @@ -11,7 +11,7 @@ using namespace sycl::ext::oneapi; using namespace sycl::ext::oneapi::experimental::matrix; constexpr float bf16_eps = 0.00390625; -// Example usage of Nvidia matrix multiply. +// Example usage of joint_matrix matrix multiply. 
// Optimizations such as memory paddings for avoiding bank conflicts are not // included in this test which aids clarity for what is going on. This example // forms a "Big matrix" corresponding to a single "TILE" using cuda example @@ -30,37 +30,47 @@ constexpr float bf16_eps = 0.00390625; constexpr int N_THREADS_PER_MATRIX_OP = 32; // number of submatrices per row of accumulator ("C", "D") matrices. -constexpr int SUB_TILES_M = 3; +constexpr int SUB_TILES_M = 2; // number of submatrices per col of accumulator matrices. constexpr int SUB_TILES_N = 2; // number of submatrices per col of "A"/per row of "B", matrices. -constexpr int SUB_TILES_K = 1; +constexpr int SUB_TILES_K = 2; -template +template class TypeHelper; -template -using KernelName = class TypeHelper; +template +using KernelName = + class TypeHelper; -template +template Tc matrix_ref_mn(const int &m, const int &n, Tm *A, Tm *B, Tc *C) { Tc res = C[m * Big_N + n]; - if constexpr (std::is_same::value) { - for (int k = 0; k < Big_K; k++) - res += A[m * Big_K + k] * B[k * Big_N + n]; - } else { - for (int k = 0; k < Big_K; k++) - res += - static_cast(A[m * Big_K + k]) * static_cast(B[k * Big_N + n]); + for (int k = 0; k < Big_K; k++) { + auto index_a = + layout_A == layout::row_major ? m * Big_K + k : m + k * Big_M; + auto index_b = + layout_B == layout::row_major ? k * Big_N + n : k + n * Big_K; + + if constexpr (std::is_same::value) { + res += A[index_a] * B[index_b]; + } else { + res += static_cast(A[index_a]) * static_cast(B[index_b]); + } } return res; } -template > +template < + typename Tm, typename Tc, typename Td, size_t Sub_Tiles_M, + size_t Sub_Tiles_K, size_t Sub_Tiles_N, size_t M, size_t K, size_t N, + layout layout_A = layout::row_major, layout layout_B = layout::row_major, + layout layout_C = layout::row_major, typename T3 = std::remove_const_t> void test(queue &q) { // total number of M dimension matrix elements for the "Big matrix". constexpr auto Big_M = Sub_Tiles_M * M; @@ -97,7 +107,8 @@ void test(queue &q) { accessor accA(bufA, cgh); - cgh.parallel_for>( + cgh.parallel_for>( range<1>(Big_M * Big_K), [=](item<1> item) { auto i = item.get_linear_id(); accA[i] = 0.1f * (i % 10); @@ -107,7 +118,8 @@ void test(queue &q) { accessor accB(bufB, cgh); - cgh.parallel_for>( + cgh.parallel_for>( range<1>(Big_K * Big_N), [=](item<1> item) { auto i = item.get_linear_id(); accB[i] = 0.1f * (i % 10); @@ -130,7 +142,8 @@ void test(queue &q) { range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - cgh.parallel_for>( + cgh.parallel_for< + KernelName>( nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) { sycl::sub_group sg = item.get_sub_group(); // row id of current submatrix of BIG C matrix @@ -138,33 +151,46 @@ void test(queue &q) { // column id of current submatrix of BIG C matrix const auto n = item.get_group().get_group_id()[1]; - joint_matrix - sub_a; - joint_matrix - sub_b; + joint_matrix sub_a; + joint_matrix sub_b; joint_matrix, use::accumulator, M, N> sub_c; joint_matrix sub_d; + auto stride_C = layout_C == layout::row_major ? Big_N : Big_M; + auto load_stride_C = layout_C == layout::row_major + ? (m * M) * Big_N + n * N + : (m * M) + n * N * Big_M; joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (m * M) * Big_N + n * N, - Big_N, layout::row_major); + load_stride_C, + stride_C, layout_C); + + auto stride_A = layout_A == layout::row_major ? Big_K : Big_M; + auto stride_B = layout_B == layout::row_major ? 
Big_N : Big_K; + // k = row/col id of current submatrix of BIG A/B matrices for (int k = 0; k < Sub_Tiles_K; k++) { + auto load_stride_A = layout_A == layout::row_major + ? (k * K) + (m * M * Big_K) + : (k * K * Big_M) + (m * M); + auto load_stride_B = layout_B == layout::row_major + ? (k * K * Big_N) + (n * N) + : (k * K) + (n * N * Big_K); + joint_matrix_load( sg, sub_a, accA.template get_multi_ptr() + - (k * K) + (m * M * Big_K), - Big_K); + load_stride_A, + stride_A); joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * K * Big_N) + (n * N), - Big_N); + load_stride_B, + stride_B); // round values to correct precision if using tf32 if constexpr (std::is_same::value) { @@ -174,12 +200,13 @@ void test(queue &q) { } joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c); + joint_matrix_copy(sg, sub_d, sub_c); } joint_matrix_store( sg, sub_d, accD.template get_multi_ptr() + - (m * M) * Big_N + n * N, - Big_N, layout::row_major); + load_stride_C, + stride_C, layout_C); }); }); q.wait(); @@ -187,14 +214,18 @@ void test(queue &q) { for (int m = 0; m < Big_M; m++) { for (int n = 0; n < Big_N; n++) { + auto index_D = + layout_C == layout::row_major ? m * Big_N + n : m + n * Big_M; if constexpr (std::is_same, bfloat16>::value) { - auto res_device = matrix_ref_mn(m, n, A, B, C); - assert(fabs(2 * (D[m * Big_N + n] - res_device)) / - (D[m * Big_N + n] + res_device) < + auto res_device = + matrix_ref_mn(m, n, A, B, + C); + assert(fabs(2 * (D[index_D] - res_device)) / (D[index_D] + res_device) < bf16_eps * 2); } else { - assert( - (D[m * Big_N + n] == matrix_ref_mn(m, n, A, B, C))); + assert((D[index_D] == + matrix_ref_mn(m, n, A, + B, C))); } } } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp index f28372b6277dc..a558600ad390c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp @@ -80,12 +80,23 @@ int main() { test(Q); + // test different layout combinations for one case + + test(Q); + test(Q); + test(Q); + test(Q); + + // joint_matrix_apply tests + auto apply_add = [](auto &x) { x = x + 2; }; float D[MATRIX_M][MATRIX_N]; big_matrix MD_f((float *)&D); - // joint_matrix_apply tests - matrix_verify_lambda(Q, MD_f, 0.0, apply_add); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp index cea15392408cc..1dea8c879b5eb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp @@ -50,13 +50,28 @@ int main() { test(Q); + // test different layout combinations for one case + + test(Q); + test(Q); + test(Q); + test(Q); + + // joint_matrix_apply tests + auto apply_add = [](auto &x) { x = x + 2; }; int32_t D_i[MATRIX_M][MATRIX_N]; big_matrix MD_i((int32_t *)&D_i); - // joint_matrix_apply tests - matrix_verify_lambda(Q, MD_i, 0, apply_add); matrix_verify_lambda(Q, MD_i, 0, apply_add); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp index 2a0731d9b988e..ca823161b6197 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp @@ -43,9 +43,28 @@ int main() { // A/B tf32 test(Q); test(Q); + 16, 8, 16, layout::row_major, layout::row_major, layout::row_major, + precision::tf32>(Q); + + // test different layout combinations for one case + + test(Q); + test(Q); + 
test(Q); + test(Q); + + // joint_matrix_apply tests float D[MATRIX_M][MATRIX_N]; big_matrix MD_f((float *)&D); @@ -54,7 +73,6 @@ int main() { big_matrix MD_d((double *)&D_d); auto apply_add = [](auto &x) { x = x + 2; }; - // joint_matrix_apply tests matrix_verify_lambda(Q, MD_f, 0.0, apply_add); matrix_verify_lambda(Q, MD_d, -60.0, apply_add); diff --git a/sycl/test-e2e/USM/usm_pooling.cpp b/sycl/test-e2e/USM/usm_pooling.cpp index 4a9d16ec5a34e..2f2d4009dce1e 100644 --- a/sycl/test-e2e/USM/usm_pooling.cpp +++ b/sycl/test-e2e/USM/usm_pooling.cpp @@ -1,6 +1,9 @@ // REQUIRES: level_zero // RUN: %{build} -o %t.out +// https://github.com/intel/llvm/issues/12397 +// UNSUPPORTED: gpu-intel-dg2 + // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB. // With no pooling: 1,2,3,4,5 allocs lead to ZE call. diff --git a/sycl/test/check_device_code/fp-accuracy.cpp b/sycl/test/check_device_code/fp-accuracy.cpp new file mode 100644 index 0000000000000..f5a42c2bbc436 --- /dev/null +++ b/sycl/test/check_device_code/fp-accuracy.cpp @@ -0,0 +1,29 @@ +// DEFINE: %{common_opts} = -fsycl -fsycl-device-only -fno-math-errno \ +// DEFINE: -ffp-accuracy=high -S -emit-llvm -o - %s + +// RUN: %clangxx %{common_opts} | FileCheck %s + +// RUN: %clangxx %{common_opts} -ffp-accuracy=low:exp \ +// RUN: | FileCheck %s --check-prefix=CHECK-LOW-EXP + +#include + +SYCL_EXTERNAL auto foo(double x) { + using namespace sycl; + return cos(exp(log(x))); +} + +// CHECK-LABEL: define {{.*}}food +// CHECK: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_HIGH:[0-9]+]] +// CHECK: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]] + +// CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" + +// CHECK-LOW-EXP-LABEL: define {{.*}}food +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_F1_HIGH:[0-9]+]] +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F1_LOW:[0-9]+]] +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]] + +// CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" +// CHECK-F1: attributes #[[ATTR_F1_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 737d7b4fabfad..31dbc3e889f4b 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -969,6 +969,8 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, simd pass_thru; auto pass_thru_view = pass_thru.select<32, 1>(); + auto usm_view = usm.select<32, 1>(); + // Test USM and ACC gather using this plan: // 1) gather(usm, offsets): offsets is simd or simd_view // 2) gather(usm, offsets, mask): offsets is simd or simd_view @@ -1151,6 +1153,64 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, props_align4); acc_res = gather(acc, ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) + scatter(ptrf, ioffset_n32, usm, mask_n32); + + scatter(ptrf, ioffset_n32, usm); + + scatter(ptrf, ioffset_n32, usm, mask_n32, props_align4); + + scatter(ptrf, ioffset_n32, usm, props_align4); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v32i1.v32i64.v32i32(<32 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, 
i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n32, usm, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32, usm, props_cache_load); + + scatter(ptrf, ioffset_n32_view, usm, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32_view, usm, props_cache_load); + + scatter(ptrf, ioffset_n32, usm_view, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32, usm_view, props_cache_load); + + scatter(ptrf, ioffset_n32_view, usm_view, mask_n32, + props_cache_load); + scatter(ptrf, ioffset_n32_view, usm_view, props_cache_load); + + // VS > 1 + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n16, usm, mask_n16, props_cache_load); + + scatter(ptrf, ioffset_n16, usm, props_cache_load); + + scatter(ptrf, ioffset_n16_view, usm, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16_view, usm, props_cache_load); + + scatter(ptrf, ioffset_n16, usm_view, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16, usm_view, props_cache_load); + + scatter(ptrf, ioffset_n16_view, usm_view, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16_view, usm_view, props_cache_load); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n16, usm, mask_n16); + + scatter(ptrf, ioffset_n16, usm); + + scatter(ptrf, ioffset_n16_view, usm, mask_n16); + + scatter(ptrf, ioffset_n16_view, usm); + + scatter(ptrf, ioffset_n16, usm_view, mask_n16); + + scatter(ptrf, ioffset_n16, usm_view); + + scatter(ptrf, ioffset_n16_view, usm_view, mask_n16); + + scatter(ptrf, ioffset_n16_view, usm_view); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index c1f0a58f82274..31eac5598f588 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -613,7 +613,7 @@ mock_piMemBufferPartition(pi_mem buffer, pi_mem_flags flags, return PI_SUCCESS; } -inline pi_result mock_piextMemGetNativeHandle(pi_mem mem, +inline pi_result mock_piextMemGetNativeHandle(pi_mem mem, pi_device dev, pi_native_handle *nativeHandle) { *nativeHandle = reinterpret_cast(mem); return PI_SUCCESS;
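Closing note on the PI interface change threaded through the plugins, the pi2ur layer, and the mock above: piextMemGetNativeHandle now also takes the pi_device the memory should be resident on. The fragment below is only an illustrative sketch, assuming the PI declarations from <sycl/detail/pi.h>; the helper name is hypothetical and not part of the patch.

#include <sycl/detail/pi.h>

// Resolve the backend-native handle behind a PI memory object using the new
// three-parameter signature. Passing Dev == nullptr mirrors the runtime's
// buffer-interop path, where the resident device is unknown; as the comments
// in this patch note, not every backend supports that case.
pi_result getNativeMemHandle(pi_mem Mem, pi_device Dev,
                             pi_native_handle *Handle) {
  return piextMemGetNativeHandle(Mem, Dev, Handle);
}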