Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NVPTX][AMD][New offload model] Add support for -fsycl-embed-ir in the new offloading model #14526

Merged
3 changes: 3 additions & 0 deletions clang/lib/Driver/ToolChains/Clang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11238,6 +11238,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
if (IsUsingLTO && LTOMode == LTOK_Thin)
CmdArgs.push_back(Args.MakeArgString("-sycl-thin-lto"));

if (Args.hasArg(options::OPT_fsycl_embed_ir))
CmdArgs.push_back(Args.MakeArgString("-sycl-embed-ir"));

Naghasan marked this conversation as resolved.
Show resolved Hide resolved
// Formulate and add any offload-wrapper and AOT specific options. These
// are additional options passed in via -Xsycl-target-linker and
// -Xsycl-target-backend.
Expand Down
49 changes: 47 additions & 2 deletions clang/test/Driver/linker-wrapper-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,10 @@
// RUN: touch %t1.devicelib.cpp
// RUN: %clang %t1.devicelib.cpp -fsycl -fsycl-targets=intel_gpu_pvc -c --offload-new-driver -o %t1.devicelib.o
//
// Run clang-linker-wrapper test
// Run clang-linker-wrapper test (with and without -sycl-embed-ir). -sycl-embed-ir should have no effect for Intel targets.
//
// RUN: clang-linker-wrapper -sycl-device-libraries=%t1.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t1.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-GEN %s
// RUN: clang-linker-wrapper -sycl-embed-ir -sycl-device-libraries=%t1.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t1.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-GEN %s
// CHK-CMDS-AOT-GEN: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts --spirv-preserve-auxdata --spirv-target-env=SPV-IR --spirv-builtin-format=global
// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings
// CHK-CMDS-AOT-GEN-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings
Expand Down Expand Up @@ -121,9 +122,53 @@
//
// RUN: clang-linker-wrapper -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t4.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-AMD %s
// CHK-CMDS-AOT-AMD: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts --spirv-preserve-auxdata --spirv-target-env=SPV-IR --spirv-builtin-format=global
// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings
// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings
// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[FIRSTLLVMLINKOUT]].bc
// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}clang"{{.*}} -o [[CLANGOUT:.*]] --target=amdgcn-amd-amdhsa -mcpu={{.*}}
// CHK-CMDS-AOT-AMD-NEXT: offload-wrapper: input: [[WRAPPERIN:.*]], output: [[WRAPPEROUT:.*]]
// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]] [[WRAPPEROUT]]
// CHK-CMDS-AOT-AMD-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o

/// Check for -sycl-embed-ir for standalone clang-linker-wrapper run for sycl (NVPTX)
// -------
// Generate .o file as linker wrapper input.
//
// RUN: %clang %s -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_50 -c -nocudalib -fno-sycl-libspirv --offload-new-driver -o %t3.o
//
// Generate .o file as SYCL device library file.
//
// RUN: touch %t3.devicelib.cpp
// RUN: %clang %t3.devicelib.cpp -fsycl -fsycl-targets=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_50 -nocudalib -fno-sycl-libspirv -c --offload-new-driver -o %t3.devicelib.o
//
// Run clang-linker-wrapper test
//
// RUN: clang-linker-wrapper -sycl-device-libraries=%t3.devicelib.o -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" -sycl-embed-ir "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t3.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-NV-EMBED-IR %s
// CHK-CMDS-AOT-NV-EMBED-IR: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts --spirv-preserve-auxdata --spirv-target-env=SPV-IR --spirv-builtin-format=global
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: offload-wrapper: input: {{.*}}.bc, output: [[WRAPPEROUT1:.*]]
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT1:.*]] [[WRAPPEROUT1]]
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}clang"{{.*}} -o [[CLANGOUT:.*]] --target=nvptx64-nvidia-cuda -march={{.*}}
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: offload-wrapper: input: [[WRAPPERIN:.*]], output: [[WRAPPEROUT:.*]]
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT2:.*]] [[WRAPPEROUT]]
// CHK-CMDS-AOT-NV-EMBED-IR-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT1]] [[LLCOUT2]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o

/// Check for -sycl-embed-ir for standalone clang-linker-wrapper run for sycl (AMD)
// -------
// Generate .o file as linker wrapper input.
//
// RUN: %clang %s -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx803 -fgpu-rdc -nogpulib -fno-sycl-libspirv -c --offload-new-driver -o %t4.o
//
// Run clang-linker-wrapper test
//
// RUN: clang-linker-wrapper -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" -sycl-embed-ir "--host-triple=x86_64-unknown-linux-gnu" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %t4.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS-AOT-AMD-EMBED-IR %s
// CHK-CMDS-AOT-AMD-EMBED-IR: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts --spirv-preserve-auxdata --spirv-target-env=SPV-IR --spirv-builtin-format=global
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[FIRSTLLVMLINKOUT]].bc
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: offload-wrapper: input: {{.*}}.bc, output: [[WRAPPEROUT1:.*]]
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT1:.*]] [[WRAPPEROUT1]]
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: "{{.*}}clang"{{.*}} -o [[CLANGOUT:.*]] --target=amdgcn-amd-amdhsa -mcpu={{.*}}
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: offload-wrapper: input: [[WRAPPERIN:.*]], output: [[WRAPPEROUT2:.*]]
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT2:.*]] [[WRAPPEROUT2]]
// CHK-CMDS-AOT-AMD-EMBED-IR-NEXT: "{{.*}}ld" -- HOST_LINKER_FLAGS -dynamic-linker HOST_DYN_LIB -o a.out [[LLCOUT1]] [[LLCOUT2]] HOST_LIB_PATH HOST_STAT_LIB {{.*}}.o
6 changes: 6 additions & 0 deletions clang/test/Driver/sycl-offload-new-driver.c
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,9 @@
// RUN: | FileCheck -check-prefix NVPTX_DEF_ARCH %s
// NVPTX_DEF_ARCH: clang-offload-packager{{.*}} "--image=file={{.*}},triple=nvptx64-nvidia-cuda,arch=sm_50,kind=sycl"

/// check for -sycl-embed-ir transmission to clang-linker-wrapper tool
// RUN: %clangxx -fsycl -### -fsycl-targets=nvptx64-nvidia-cuda \
// RUN: -fno-sycl-libspirv -nocudalib --offload-new-driver \
// RUN: -fsycl-embed-ir %s 2>&1 \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if -fsycl-embed-ir is passed for devices other than NVidia or AMD?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to our implementation, it will be ignored. I can add a test for it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that acceptable? Would a warning or error be appropriate here?

// RUN: | FileCheck -check-prefix CHECK_EMBED_IR %s
// CHECK_EMBED_IR: clang-linker-wrapper{{.*}} "-sycl-embed-ir"
33 changes: 26 additions & 7 deletions clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -938,7 +938,7 @@ static Expected<StringRef> runAOTCompile(StringRef InputFile,
/// \returns A path to the LLVM Module that contains wrapped images.
Expected<StringRef>
wrapSYCLBinariesFromFile(std::vector<module_split::SplitModule> &SplitModules,
const ArgList &Args) {
const ArgList &Args, bool IsEmbeddedIR) {
auto OutputFileOrErr = createOutputFile(
sys::path::filename(ExecutableName) + ".sycl.image.wrapper", "bc");
if (!OutputFileOrErr)
Expand Down Expand Up @@ -970,15 +970,19 @@ wrapSYCLBinariesFromFile(std::vector<module_split::SplitModule> &SplitModules,
// spir64-unknown-unknown/spirv64-unknown-unknown/spirv64.
// TODO: Fix SYCL runtime to accept other triples
llvm::Triple T(Target);
StringRef A(T.getArchName());
if(A == "spirv64")
A = "spir64";
std::string EmbeddedIRTarget("llvm_");
EmbeddedIRTarget.append(T.getArchName());
StringRef RegularTarget(T.getArchName());
if (RegularTarget == "spirv64")
RegularTarget = "spir64";

for (auto &SI : SplitModules) {
auto MBOrDesc = MemoryBuffer::getFile(SI.ModuleFilePath);
if (!MBOrDesc)
return createFileError(SI.ModuleFilePath, MBOrDesc.getError());

Images.emplace_back(std::move(*MBOrDesc), SI.Properties, SI.Symbols, A);
StringRef ImageTarget = IsEmbeddedIR ? StringRef(EmbeddedIRTarget) : StringRef(RegularTarget);
Images.emplace_back(std::move(*MBOrDesc), SI.Properties, SI.Symbols, ImageTarget);
}

LLVMContext C;
Expand Down Expand Up @@ -1059,8 +1063,8 @@ static Expected<StringRef> runCompile(StringRef &InputFile,
// Run wrapping library and llc
static Expected<StringRef>
runWrapperAndCompile(std::vector<module_split::SplitModule> &SplitModules,
const ArgList &Args) {
auto OutputFile = sycl::wrapSYCLBinariesFromFile(SplitModules, Args);
const ArgList &Args, bool IsEmbeddedIR = false) {
auto OutputFile = sycl::wrapSYCLBinariesFromFile(SplitModules, Args, IsEmbeddedIR);
if (!OutputFile)
return OutputFile.takeError();
// call to llc
Expand Down Expand Up @@ -2060,6 +2064,21 @@ Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
return SplitModulesOrErr.takeError();

auto &SplitModules = *SplitModulesOrErr;
const llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ));
if ((Triple.isNVPTX() || Triple.isAMDGCN()) &&
LinkerArgs.hasArg(OPT_sycl_embed_ir)) {
// When compiling for Nvidia/AMD devices and the user requested the
// IR to be embedded in the application (via option), run the output
// of sycl-post-link (filetable referencing LLVM Bitcode + symbols)
// through the offload wrapper and link the resulting object to the
// application.
auto OutputFile =
sycl::runWrapperAndCompile(SplitModules, LinkerArgs, /* IsEmbeddedIR */ true);
if (!OutputFile)
return OutputFile.takeError();
WrappedOutput.push_back(*OutputFile);
}

for (size_t I = 0, E = SplitModules.size(); I != E; ++I) {
SmallVector<StringRef> Files = {SplitModules[I].ModuleFilePath};
auto LinkedFileFinalOrErr =
Expand Down
3 changes: 3 additions & 0 deletions clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,6 @@ def sycl_backend_link_options_from_image_EQ : Joined<["--", "-"], "sycl-backend-

def sycl_thin_lto : Flag<["--", "-"], "sycl-thin-lto">,
Flags<[WrapperOnlyOption]>, HelpText<"Link SYCL device code using thinLTO">;

def sycl_embed_ir : Flag<["--", "-"], "sycl-embed-ir">,
Flags<[WrapperOnlyOption]>, HelpText<"Embed LLVM IR for runtime kernel fusion">;
109 changes: 109 additions & 0 deletions sycl/test-e2e/NewOffloadDriver/diamond_shape.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// REQUIRES: fusion

// RUN: %{build} %{embed-ir} -O2 --offload-new-driver -o %t.out
// RUN: %{run} %t.out

// Test complete fusion with private internalization specified on the
// accessors for a combination of four kernels, forming a diamond-like shape and
// repeating one of the kernels. This test uses the new offloading model for
// linking device objects.

#include <sycl/detail/core.hpp>
#include <sycl/ext/codeplay/experimental/fusion_wrapper.hpp>
#include <sycl/properties/all_properties.hpp>

using namespace sycl;

struct AddKernel {
accessor<int, 1> accIn1;
accessor<int, 1> accIn2;
accessor<int, 1> accOut;

void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
};

int main() {
constexpr size_t dataSize = 512;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize],
tmp2[dataSize], tmp3[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp1[i] = -1;
tmp2[i] = -1;
tmp3[i] = -1;
out[i] = -1;
}

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
maksimsab marked this conversation as resolved.
Show resolved Hide resolved

{
buffer<int> bIn1{in1, range{dataSize}};
buffer<int> bIn2{in2, range{dataSize}};
buffer<int> bIn3{in3, range{dataSize}};
buffer<int> bTmp1{tmp1, range{dataSize}};
buffer<int> bTmp2{tmp2, range{dataSize}};
buffer<int> bTmp3{tmp3, range{dataSize}};
buffer<int> bOut{out, range{dataSize}};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access(cgh);
auto accIn2 = bIn2.get_access(cgh);
auto accTmp1 = bTmp1.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
cgh.parallel_for<AddKernel>(dataSize, AddKernel{accIn1, accIn2, accTmp1});
});

q.submit([&](handler &cgh) {
auto accTmp1 = bTmp1.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
auto accIn3 = bIn3.get_access(cgh);
auto accTmp2 = bTmp2.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
cgh.parallel_for<class KernelOne>(
dataSize, [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
});

q.submit([&](handler &cgh) {
auto accTmp1 = bTmp1.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
auto accTmp3 = bTmp3.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
cgh.parallel_for<class KernelTwo>(
dataSize, [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
});

q.submit([&](handler &cgh) {
auto accTmp2 = bTmp2.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
auto accTmp3 = bTmp3.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_private{});
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<AddKernel>(dataSize,
AddKernel{accTmp2, accTmp3, accOut});
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");
}

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i + i * 25) && "Computation error");
assert(tmp1[i] == -1 && "tmp1 not internalized");
assert(tmp2[i] == -1 && "tmp2 not internalized");
assert(tmp3[i] == -1 && "tmp3 not internalized");
}

return 0;
}

7 changes: 7 additions & 0 deletions sycl/test-e2e/NewOffloadDriver/lit.local.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import platform

config.unsupported_features += ['accelerator']

config.substitutions.append(
("%{embed-ir}", "%if any-device-is-hip || any-device-is-cuda %{ -fsycl-embed-ir %}")
)
Loading