diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 526498d1675e3..7fe80879f7664 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -298,7 +298,9 @@ LANGOPT( "SYCL compiler assumes value fits within MAX_INT for member function of " "get/operator[], get_id/operator[] and get_global_id/get_global_linear_id " "in SYCL class id, iterm and nd_iterm") -LANGOPT(SYCLDisableRangeRounding, 1, 0, "Disable parallel for range rounding") +ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2, + SYCLRangeRoundingPreference::On, + "Preference for SYCL parallel_for range rounding") LANGOPT(SYCLEnableIntHeaderDiags, 1, 0, "Enable diagnostics that require the " "SYCL integration header") LANGOPT(SYCLAllowVirtualFunctions, 1, 0, diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 2c508c32674c3..c8081a77d65c9 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -151,6 +151,12 @@ class LangOptionsBase { undefined }; + enum class SYCLRangeRoundingPreference { + On, + Disable, + Force, + }; + enum HLSLLangStd { HLSL_Unset = 0, HLSL_2015 = 2015, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 0eb5701a3a455..f770a6edb66d1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3997,6 +3997,21 @@ def fsycl_host_compiler_options_EQ : Joined<["-"], "fsycl-host-compiler-options= Visibility<[ClangOption, CLOption, DXCOption]>, HelpText<"When performing the host compilation with " "-fsycl-host-compiler specified, use the given options during that compile. " "Options are expected to be a quoted list of space separated options.">; +def fsycl_range_rounding_EQ : Joined<["-"], "fsycl-range-rounding=">, + Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>, + Values<"on,disable,force">, + NormalizedValuesScope<"LangOptions::SYCLRangeRoundingPreference">, + NormalizedValues<["On", "Disable", "Force"]>, + MarshallingInfoEnum, "On">, + HelpText<"Options for range rounding of SYCL range kernels: " + "disable (do not generate range rounded kernels) " + "force (only generate range rounded kernels) " + "on (generate range rounded kernels as well as unrounded kernels). Default is 'on'">; +def fsycl_disable_range_rounding : Flag<["-"], "fsycl-disable-range-rounding">, + Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>, + Alias, AliasArgs<["disable"]>, + HelpText<"Deprecated: please use -fsycl-range-rounding=disable instead.">, + Flags<[Deprecated]>; def fno_sycl_use_footer : Flag<["-"], "fno-sycl-use-footer">, Visibility<[ClangOption, CLOption, DXCOption]>, HelpText<"Disable usage of the integration footer during SYCL enabled " "compilations.">; @@ -8256,9 +8271,6 @@ defm sycl_allow_func_ptr: BoolFOption<"sycl-allow-func-ptr", def fenable_sycl_dae : Flag<["-"], "fenable-sycl-dae">, HelpText<"Enable Dead Argument Elimination in SPIR kernels">, MarshallingInfoFlag>; -def fsycl_disable_range_rounding : Flag<["-"], "fsycl-disable-range-rounding">, - HelpText<"Disable parallel for range rounding.">, - MarshallingInfoFlag>; def fsycl_enable_int_header_diags: Flag<["-"], "fsycl-enable-int-header-diags">, HelpText<"Enable diagnostics that require the SYCL integration header.">, MarshallingInfoFlag>; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index eb9c374741d81..13dbf17fad334 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1158,6 +1158,10 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C, checkSingleArgValidity(DeviceCodeSplit, {"per_kernel", "per_source", "auto", "off"}); + Arg *RangeRoundingPreference = + C.getInputArgs().getLastArg(options::OPT_fsycl_range_rounding_EQ); + checkSingleArgValidity(RangeRoundingPreference, {"disable", "force", "on"}); + Arg *SYCLForceTarget = getArgRequiringSYCLRuntime(options::OPT_fsycl_force_target_EQ); if (SYCLForceTarget) { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9b94b761a0897..ea652823950cc 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5427,6 +5427,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, options::OPT_fno_sycl_esimd_force_stateless_mem, true)) CmdArgs.push_back("-fno-sycl-esimd-force-stateless-mem"); + if (Arg *A = Args.getLastArg(options::OPT_fsycl_range_rounding_EQ)) + A->render(Args, CmdArgs); + // Add the Unique ID prefix StringRef UniqueID = D.getSYCLUniqueID(Input.getBaseInput()); if (!UniqueID.empty()) @@ -5451,10 +5454,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, bool DisableRangeRounding = false; if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { if (A->getOption().matches(options::OPT_O0)) - DisableRangeRounding = true; + // If the user has set some range rounding preference then let that + // override not range rounding at -O0 + if (!Args.getLastArg(options::OPT_fsycl_range_rounding_EQ)) + DisableRangeRounding = true; } if (DisableRangeRounding || HasFPGA) - CmdArgs.push_back("-fsycl-disable-range-rounding"); + CmdArgs.push_back("-fsycl-range-rounding=disable"); if (HasFPGA) { // Pass -fintelfpga to both the host and device SYCL compilations if set. diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 6ea2be70b6d3d..0dde49bde21a0 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -579,8 +579,16 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI, // Set __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ macro for // both host and device compilations if -fsycl-disable-range-rounding // flag is used. - if (LangOpts.SYCLDisableRangeRounding) + switch (LangOpts.getSYCLRangeRounding()) { + case LangOptions::SYCLRangeRoundingPreference::Disable: Builder.defineMacro("__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__"); + break; + case LangOptions::SYCLRangeRoundingPreference::Force: + Builder.defineMacro("__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__"); + break; + default: + break; + } } if (LangOpts.DeclareSPIRVBuiltins) { diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 89705ca50ab83..0d07d673f3880 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -5172,10 +5172,19 @@ void SYCLIntegrationHeader::emit(raw_ostream &O) { O << "#endif //" << Macro.first << "\n\n"; } - if (S.getLangOpts().SYCLDisableRangeRounding) { + switch (S.getLangOpts().getSYCLRangeRounding()) { + case LangOptions::SYCLRangeRoundingPreference::Disable: O << "#ifndef __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ \n"; O << "#define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1\n"; O << "#endif //__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__\n\n"; + break; + case LangOptions::SYCLRangeRoundingPreference::Force: + O << "#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ \n"; + O << "#define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1\n"; + O << "#endif //__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__\n\n"; + break; + default: + break; } if (SpecConsts.size() > 0) { diff --git a/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp b/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp index 0cd39fd53fee2..752189ca53847 100644 --- a/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp +++ b/clang/test/CodeGenSYCL/integration_header_ppmacros.cpp @@ -2,8 +2,10 @@ // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-SYCL2020 // RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -sycl-std=2017 -fsycl-int-header=%t.h %s // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-SYCL2017 -// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-disable-range-rounding -fsycl-int-header=%t.h %s +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-range-rounding=disable -fsycl-int-header=%t.h %s // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-RANGE +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-range-rounding=force -fsycl-int-header=%t.h %s +// RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-FORCE-RANGE // RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-int-header=%t.h %s // RUN: FileCheck -input-file=%t.h %s --check-prefix=CHECK-NO-RANGE @@ -33,4 +35,10 @@ int main() { // CHECK-RANGE: #ifndef __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ // CHECK-RANGE-NEXT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 // CHECK-RANGE-NEXT: #endif //__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ + +// CHECK-FORCE-RANGE: #ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ +// CHECK-FORCE-RANGE-NEXT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-FORCE-RANGE-NEXT: #endif //__SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // CHECK-NO-RANGE-NOT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-NO-RANGE-NOT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 diff --git a/clang/test/Driver/sycl-offload-intelfpga.cpp b/clang/test/Driver/sycl-offload-intelfpga.cpp index 92fddf11cc877..f86f2cc7e4f7e 100644 --- a/clang/test/Driver/sycl-offload-intelfpga.cpp +++ b/clang/test/Driver/sycl-offload-intelfpga.cpp @@ -26,13 +26,13 @@ // CHK-HOST-DEVICE: clang{{.*}} "-cc1"{{.*}} "-fsycl-is-device"{{.*}} "-fintelfpga" // CHK-HOST-DEVICE: clang{{.*}} "-cc1"{{.*}} "-fintelfpga"{{.*}} "-fsycl-is-host" -/// FPGA target implies -fsycl-disable-range-rounding +/// FPGA target implies -fsycl-range-rounding=disable // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fintelfpga %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING %s // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=spir64_fpga-unknown-unknown %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING %s -// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-disable-range-rounding" -// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-disable-range-rounding"{{.*}} "-fsycl-is-host" +// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-range-rounding=disable" +// CHK-RANGE-ROUNDING: clang{{.*}} "-fsycl-range-rounding=disable"{{.*}} "-fsycl-is-host" /// FPGA target implies -emit-only-kernels-as-entry-points in sycl-post-link // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fintelfpga %s 2>&1 \ @@ -41,12 +41,12 @@ // RUN: | FileCheck -check-prefix=CHK-NON-KERNEL-ENTRY-POINTS %s // CHK-NON-KERNEL-ENTRY-POINTS: sycl-post-link{{.*}} "-emit-only-kernels-as-entry-points" -/// -fsycl-disable-range-rounding is applied to all compilations if fpga is used +/// -fsycl-range-rounding=disable is applied to all compilations if fpga is used // RUN: %clangxx -### -target x86_64-unknown-linux-gnu -fsycl -fsycl-targets=spir64_fpga-unknown-unknown,spir64_gen-unknown-unknown %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING-MULTI %s -// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_gen-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-disable-range-rounding" -// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-fsycl-disable-range-rounding"{{.*}} "-fsycl-is-host" -// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_fpga-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-disable-range-rounding" +// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_gen-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-range-rounding=disable" +// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-fsycl-range-rounding=disable"{{.*}} "-fsycl-is-host" +// CHK-RANGE-ROUNDING-MULTI: clang{{.*}} "-triple" "spir64_fpga-unknown-unknown"{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-range-rounding=disable" /// -fintelfpga with -reuse-exe= // RUN: touch %t.cpp diff --git a/clang/test/Driver/sycl-offload.c b/clang/test/Driver/sycl-offload.c index 636d9e89b8092..f67ca70bbb717 100644 --- a/clang/test/Driver/sycl-offload.c +++ b/clang/test/Driver/sycl-offload.c @@ -508,13 +508,33 @@ // RUN: | FileCheck -check-prefix=CHK-TOOLS-OPTS2 %s // CHK-TOOLS-OPTS2: clang-offload-wrapper{{.*}} "-link-opts=-DFOO1 -DFOO2" -/// -fsycl-disable-range-rounding settings +/// -fsycl-range-rounding settings +/// +/// // Check that driver flag is passed to cc1 +// RUN: %clang -### -fsycl -fsycl-range-rounding=disable %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-DRIVER-RANGE-ROUNDING-DISABLE %s +// RUN: %clang -### -fsycl -fsycl-range-rounding=force %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-DRIVER-RANGE-ROUNDING-FORCE %s +// RUN: %clang -### -fsycl -fsycl-range-rounding=on %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-DRIVER-RANGE-ROUNDING-ON %s +// CHK-DRIVER-RANGE-ROUNDING-DISABLE: "-cc1{{.*}}-fsycl-range-rounding=disable" +// CHK-DRIVER-RANGE-ROUNDING-FORCE: "-cc1{{.*}}-fsycl-range-rounding=force" +// CHK-DRIVER-RANGE-ROUNDING-ON: "-cc1{{.*}}-fsycl-range-rounding=on" +/// +/// // RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ // RUN: -fsycl-targets=spir64 -O0 %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-DISABLE-RANGE-ROUNDING %s // RUN: %clang_cl -### -fsycl -fsycl-targets=spir64 -Od %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-DISABLE-RANGE-ROUNDING %s -// CHK-DISABLE-RANGE-ROUNDING: "-fsycl-disable-range-rounding" +// RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ +// RUN: -O0 -fsycl-range-rounding=force %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-OVERRIDE-RANGE-ROUNDING %s +// RUN: %clang_cl -### -fsycl -Od %s 2>&1 -fsycl-range-rounding=force %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-OVERRIDE-RANGE-ROUNDING %s +// CHK-DISABLE-RANGE-ROUNDING: "-fsycl-range-rounding=disable" +// CHK-OVERRIDE-RANGE-ROUNDING: "-fsycl-range-rounding=force" +// CHK-OVERRIDE-RANGE-ROUNDING-NOT: "-fsycl-range-rounding=disable" // RUN: %clang -### -target x86_64-unknown-linux-gnu -fsycl \ // RUN: -fsycl-targets=spir64 -O2 %s 2>&1 \ @@ -527,6 +547,8 @@ // RUN: %clang_cl -### -fsycl -fsycl-targets=spir64 %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-RANGE-ROUNDING %s // CHK-RANGE-ROUNDING-NOT: "-fsycl-disable-range-rounding" +// CHK-RANGE-ROUNDING-NOT: "-fsycl-range-rounding=disable" +// CHK-RANGE-ROUNDING-NOT: "-fsycl-range-rounding=force" /// ########################################################################### diff --git a/clang/test/Preprocessor/predefined-macros.c b/clang/test/Preprocessor/predefined-macros.c index 119350ebf3fab..199fd03bee63b 100644 --- a/clang/test/Preprocessor/predefined-macros.c +++ b/clang/test/Preprocessor/predefined-macros.c @@ -284,32 +284,60 @@ // CHECK-RDC: #define __CLANG_RDC__ 1 // RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ -// RUN: -triple spir64-unknown-unknown -fsycl-disable-range-rounding -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RANGE +// RUN: -triple spir64-unknown-unknown -fsycl-range-rounding=disable -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ // RUN: -triple spir64_fpga-unknown-unknown -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE -// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-disable-range-rounding \ +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-range-rounding=disable \ // RUN: -triple spir64_fpga-unknown-unknown -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-device -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE // RUN: %clang_cc1 %s -E -dM -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-host \ -// RUN: -triple x86_64-unknown-linux-gnu -fsycl-disable-range-rounding -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-RANGE +// RUN: -triple x86_64-unknown-linux-gnu -fsycl-range-rounding=disable -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-RANGE // RUN: %clang_cc1 %s -E -dM -fsycl-is-host -o - \ -// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-NO-RANGE +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-DISABLE-NO-RANGE -// CHECK-RANGE: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 -// CHECK-NO-RANGE-NOT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-DISABLE-RANGE: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-DISABLE-NO-RANGE-NOT: #define __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__ 1 + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ +// RUN: -triple spir64-unknown-unknown -fsycl-range-rounding=force -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device \ +// RUN: -triple spir64_fpga-unknown-unknown -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -fsycl-range-rounding=force \ +// RUN: -triple spir64_fpga-unknown-unknown -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-device -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// RUN: %clang_cc1 %s -E -dM -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-host \ +// RUN: -triple x86_64-unknown-linux-gnu -fsycl-range-rounding=force -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-RANGE + +// RUN: %clang_cc1 %s -E -dM -fsycl-is-host -o - \ +// RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-FORCE-NO-RANGE + +// CHECK-FORCE-RANGE: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 +// CHECK-FORCE-NO-RANGE-NOT: #define __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ 1 // RUN: %clang_cc1 %s -E -dM -o - -x hip -triple x86_64-unknown-linux-gnu \ // RUN: -fgpu-default-stream=per-thread \ @@ -334,4 +362,4 @@ // RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device | FileCheck -match-full-lines \ // RUN: %s --check-prefix=CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG // CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG: #define __HIPSTDPAR__ 1 -// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1 \ No newline at end of file +// CHECK-HIPSTDPAR-INTERPOSE-DEV-NEG-NOT: #define __HIPSTDPAR_INTERPOSE_ALLOC__ 1 diff --git a/sycl/doc/design/ParallelForRangeRounding.md b/sycl/doc/design/ParallelForRangeRounding.md index a4199aed8e800..7f43cafe6e96e 100644 --- a/sycl/doc/design/ParallelForRangeRounding.md +++ b/sycl/doc/design/ParallelForRangeRounding.md @@ -42,5 +42,8 @@ rounding will only be used if the SYCL runtime X dimension exceeds some minimum value, which can be configured using the `SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS` environment variable. -Generation of range rounded kernels can be disabled by using the compiler flag -`-fsycl-disable-range-rounding`. +In order to reduce binary size, the user can tell the compiler only to generate +the range rounded kernel, `-fsycl-range-rounding=force`. The user can also tell +the SYCL implementation to only produce the unrounded kernel using the flag +`-fsycl-range-rounding=disable`. By default both kernels will be generated, +which is equivalent to `-fsycl-range-rounding=on`. diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 51e2f41de9d75..672f8da32c91c 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1181,7 +1181,6 @@ class __SYCL_EXPORT handler { // non-32-bit global range, we wrap the old kernel in a new kernel // that has each work item peform multiple invocations the old // kernel in a 32-bit global range. - auto Dev = detail::getSyclObjImpl(detail::getDeviceFromHandler(*this)); id MaxNWGs = [&] { auto [MaxWGs, HasMaxWGs] = getMaxWorkGroups_v2(); if (!HasMaxWGs) { @@ -1224,6 +1223,11 @@ class __SYCL_EXPORT handler { // will yield a rounded-up value for the total range. Adjust(0, ((RoundedRange[0] + GoodFactor - 1) / GoodFactor) * GoodFactor); } +#ifdef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // If we are forcing range rounding kernels to be used, we always want the + // rounded range kernel to be generated, even if rounding isn't needed + DidAdjust = true; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ for (int i = 0; i < Dims; ++i) if (RoundedRange[i] > MaxRange[i]) @@ -1330,6 +1334,9 @@ class __SYCL_EXPORT handler { { (void)UserRange; (void)Props; +#ifndef __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ + // If parallel_for range rounding is forced then only range rounded + // kernel is generated kernel_parallel_for_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ @@ -1340,6 +1347,9 @@ class __SYCL_EXPORT handler { std::move(KernelFunc)); setType(detail::CG::Kernel); #endif +#else + (void)KernelFunc; +#endif // __SYCL_FORCE_PARALLEL_FOR_RANGE_ROUNDING__ } } diff --git a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp index 0a456ef02ea00..9e6d74dcb3f85 100644 --- a/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp +++ b/sycl/test-e2e/Basic/parallel_for_range_roundup.cpp @@ -1,152 +1,164 @@ // REQUIRES: gpu // RUN: %{build} -o %t.out -// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s - +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT + +// RUN: %{build} -fsycl-range-rounding=force -o %t.out +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %{run} %t.out | FileCheck %s --check-prefix=CHECK-DEFAULT + +// These tests test 3 things: +// +// 1. The user range is the same as the in kernel range (using BufRange) as +// reported by get_range(). +// 2. That the effective range is the same as the reported range (using +// BufCounter). i.e. check that the mapping of effective range to user range +// is "onto". +// 3. That every index in a 1, 2, or 3 dimension range is active the execution +// (using BufIndexes). i.e. check that the mapping of effective range to user +// range is "one-to-one". +// #include #include + using namespace sycl; +constexpr size_t MagicY = 33, MagicZ = 64; + range<1> Range1 = {0}; range<2> Range2 = {0, 0}; range<3> Range3 = {0, 0, 0}; +template class Kernel1; +template class Kernel2; +template class Kernel3; + void check(const char *msg, size_t v, size_t ref) { std::cout << msg << v << std::endl; assert(v == ref); } -int try_item1(size_t size) { - range<1> Size{size}; - int Counter = 0; - { - buffer, 1> BufRange(&Range1, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](item<1> ITEM) { - AccCounter[0].fetch_add(1); - AccRange[0] = ITEM.get_range(0); - }); - }); - myQueue.wait(); - } - check("Size seen by user = ", Range1.get(0), size); - check("Counter = ", Counter, size); - return 0; +template void checkVec(vec a, vec b) { + static_assert(Dims == 1 || Dims == 2 || Dims == 3, + "Should only be use for 1, 2 or 3 dimensional vectors"); + assert(a[0] == b[0]); + if constexpr (Dims > 1) + assert(a[1] == b[1]); + if constexpr (Dims > 2) + assert(a[2] == b[2]); } -void try_item2(size_t size) { - range<2> Size{size, 10}; - int Counter = 0; - { - buffer, 1> BufRange(&Range2, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](item<2> ITEM) { - AccCounter[0].fetch_add(1); - AccRange[0][0] = ITEM.get_range(0); - }); - }); - myQueue.wait(); - } - check("Size seen by user = ", Range2.get(0), size); - check("Counter = ", Counter, size * 10); -} - -void try_item3(size_t size) { - range<3> Size{size, 10, 10}; - int Counter = 0; - { - buffer, 1> BufRange(&Range3, 1); - buffer BufCounter(&Counter, 1); - queue myQueue; - - myQueue.submit([&](handler &cgh) { - auto AccRange = BufRange.get_access(cgh); - auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](item<3> ITEM) { - AccCounter[0].fetch_add(1); - AccRange[0][0] = ITEM.get_range(0); - }); - }); - myQueue.wait(); - } - check("Size seen by user = ", Range3.get(0), size); - check("Counter = ", Counter, size * 10 * 10); -} - -void try_id1(size_t size) { +template void try_1d_range(size_t size) { + using IndexCheckT = int; range<1> Size{size}; int Counter = 0; + std::vector ItemIndexes(Size[0]); { buffer, 1> BufRange(&Range1, 1); buffer BufCounter(&Counter, 1); + buffer BufIndexes(ItemIndexes); queue myQueue; myQueue.submit([&](handler &cgh) { auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](id<1> ID) { + auto AccIndexes = BufIndexes.get_access(cgh); + cgh.parallel_for>(Size, [=](KernelIdT I) { AccCounter[0].fetch_add(1); - AccRange[0] = ID[0]; + if constexpr (std::is_same_v>) + AccRange[0] = sycl::range<1>(I.get_range(0)); + int Idx = I[0]; + AccIndexes[Idx] = IndexCheckT(I[0]); }); }); myQueue.wait(); } + if constexpr (std::is_same_v>) { + check("Size seen by user at Dim 0 = ", Range1.get(0), size); + } check("Counter = ", Counter, size); + for (auto i = 0; i < Size[0]; ++i) { + checkVec<1>(vec(ItemIndexes[i]), vec(i)); + } + std::cout << "Correct kernel indexes used\n"; } -void try_id2(size_t size) { - range<2> Size{size, 10}; +template void try_2d_range(size_t size) { + using IndexCheckT = int2; + range<2> Size{size, MagicY}; int Counter = 0; + std::vector ItemIndexes(Size[0] * Size[1]); { buffer, 1> BufRange(&Range2, 1); buffer BufCounter(&Counter, 1); + buffer BufIndexes(ItemIndexes); queue myQueue; myQueue.submit([&](handler &cgh) { auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](id<2> ID) { + auto AccIndexes = BufIndexes.get_access(cgh); + cgh.parallel_for>(Size, [=](KernelIdT I) { AccCounter[0].fetch_add(1); - AccRange[0][0] = ID[0]; + if constexpr (std::is_same_v>) + AccRange[0] = sycl::range<2>(I.get_range(0), I.get_range(1)); + int Idx = I[0] * Size[1] + I[1]; + AccIndexes[Idx] = IndexCheckT(I[0], I[1]); }); }); myQueue.wait(); } - check("Counter = ", Counter, size * 10); + if constexpr (std::is_same_v>) { + check("Size seen by user at Dim 0 = ", Range2.get(0), Size[0]); + check("Size seen by user at Dim 1 = ", Range2.get(1), Size[1]); + } + check("Counter = ", Counter, size * MagicY); + for (auto i = 0; i < Size[0]; ++i) + for (auto j = 0; j < Size[1]; ++j) + checkVec<2>(ItemIndexes[i * Size[1] + j], IndexCheckT(i, j)); + std::cout << "Correct kernel indexes used\n"; } -void try_id3(size_t size) { - range<3> Size{size, 10, 10}; +template void try_3d_range(size_t size) { + using IndexCheckT = int3; + range<3> Size{size, MagicY, MagicZ}; int Counter = 0; + std::vector ItemIndexes(Size[0] * Size[1] * Size[2]); { buffer, 1> BufRange(&Range3, 1); buffer BufCounter(&Counter, 1); + buffer BufIndexes(ItemIndexes); queue myQueue; myQueue.submit([&](handler &cgh) { auto AccRange = BufRange.get_access(cgh); auto AccCounter = BufCounter.get_access(cgh); - cgh.parallel_for(Size, [=](id<3> ID) { + auto AccIndexes = BufIndexes.get_access(cgh); + cgh.parallel_for>(Size, [=](KernelIdT I) { AccCounter[0].fetch_add(1); - AccRange[0][0] = ID[0]; + if constexpr (std::is_same_v>) + AccRange[0] = + sycl::range<3>(I.get_range(0), I.get_range(1), I.get_range(2)); + int Idx = I[0] * Size[1] * Size[2] + I[1] * Size[2] + I[2]; + AccIndexes[Idx] = IndexCheckT(I[0], I[1], I[2]); }); }); myQueue.wait(); } - check("Counter = ", Counter, size * 10 * 10); + if constexpr (std::is_same_v>) { + check("Size seen by user at Dim 0 = ", Range3.get(0), Size[0]); + check("Size seen by user at Dim 1 = ", Range3.get(1), Size[1]); + check("Size seen by user at Dim 2 = ", Range3.get(2), Size[2]); + } + check("Counter = ", Counter, size * MagicY * MagicZ); + for (auto i = 0; i < Size[0]; ++i) + for (auto j = 0; j < Size[1]; ++j) + for (auto k = 0; k < Size[2]; ++k) + checkVec<3>(ItemIndexes[i * Size[1] * Size[2] + j * Size[2] + k], + IndexCheckT(i, j, k)); + std::cout << "Correct kernel indexes used\n"; } void try_unnamed_lambda(size_t size) { - range<3> Size{size, 10, 10}; + range<3> Size{size, MagicY, MagicZ}; int Counter = 0; { buffer, 1> BufRange(&Range3, 1); @@ -163,57 +175,71 @@ void try_unnamed_lambda(size_t size) { }); myQueue.wait(); } - check("Counter = ", Counter, size * 10 * 10); + check("Counter = ", Counter, size * MagicY * MagicZ); } int main() { - int x; - - x = 1500; - try_item1(x); - try_item2(x); - try_item3(x); - try_id1(x); - try_id2(x); - try_id3(x); + int x = 1500; + try_1d_range>(x); + try_1d_range>(x); + try_2d_range>(x); + try_2d_range>(x); + try_3d_range>(x); + try_3d_range>(x); try_unnamed_lambda(x); x = 256; - try_item1(x); - try_item2(x); - try_item3(x); - try_id1(x); - try_id2(x); - try_id3(x); + try_1d_range>(x); + try_1d_range>(x); + try_2d_range>(x); + try_2d_range>(x); + try_3d_range>(x); + try_3d_range>(x); try_unnamed_lambda(x); - - return 0; } -// CHECK: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Size seen by user = 1500 -// CHECK-NEXT: Counter = 1500 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Size seen by user = 1500 -// CHECK-NEXT: Counter = 15000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Size seen by user = 1500 -// CHECK-NEXT: Counter = 150000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 1500 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 15000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 150000 -// CHECK-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 -// CHECK-NEXT: Counter = 150000 -// CHECK-NEXT: Size seen by user = 256 -// CHECK-NEXT: Counter = 256 -// CHECK-NEXT: Size seen by user = 256 -// CHECK-NEXT: Counter = 2560 -// CHECK-NEXT: Size seen by user = 256 -// CHECK-NEXT: Counter = 25600 -// CHECK-NEXT: Counter = 256 -// CHECK-NEXT: Counter = 2560 -// CHECK-NEXT: Counter = 25600 -// CHECK-NEXT: Counter = 25600 +// CHECK-DEFAULT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Counter = 1500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 1500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Counter = 49500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 49500 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 1500 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: parallel_for range adjusted at dim 0 from 1500 to 1504 +// CHECK-DEFAULT-NEXT: Counter = 3168000 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Counter = 256 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 256 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Counter = 8448 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 8448 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 0 = 256 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 1 = 33 +// CHECK-DEFAULT-NEXT: Size seen by user at Dim 2 = 64 +// CHECK-DEFAULT-NEXT: Counter = 540672 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 540672 +// CHECK-DEFAULT-NEXT: Correct kernel indexes used +// CHECK-DEFAULT-NEXT: Counter = 540672