From cb2efefc65a07040ab6273d5798b738f1ae2b009 Mon Sep 17 00:00:00 2001
From: Arvind Sudarsanam
Date: Wed, 3 Jul 2024 08:36:42 -0400
Subject: [PATCH] [SYCLLowerIR][SemaSYCL] Support indirect hierarchical parallelism (#14264)

This PR adds a missing feature in SYCL hierarchical parallelism support.
Specifically, this PR adds support for the case when there are functions
between parallel_for_work_group and parallel_for_work_item in the call
stack. For example:

void foo(sycl::group<1> group, ...) {
  group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
}
// ...
cgh.parallel_for_work_group(
    range<1>(...), range<1>(...), [=](group<1> g) {
      foo(g, ...);
    });

---------

Signed-off-by: Sudarsanam, Arvind
---
 clang/lib/Sema/SemaSYCL.cpp                  | 60 ++++++++++-------
 clang/test/CodeGenSYCL/sycl-pf-work-item.cpp |  2 +
 llvm/lib/SYCLLowerIR/LowerWGScope.cpp        | 68 ++++++++++++++------
 sycl/test-e2e/HierPar/hier_par_indirect.cpp  | 30 +++++++++
 4 files changed, 118 insertions(+), 42 deletions(-)
 create mode 100644 sycl/test-e2e/HierPar/hier_par_indirect.cpp

diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp
index 27bdf6966e9c1..d08766191c0d6 100644
--- a/clang/lib/Sema/SemaSYCL.cpp
+++ b/clang/lib/Sema/SemaSYCL.cpp
@@ -752,6 +752,30 @@ static bool isDeclaredInSYCLNamespace(const Decl *D) {
   return ND && ND->getName() == "sycl";
 }
 
+static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
+  return SemaSYCL::isSyclType(VD->getType(), SYCLTypeAttr::private_memory);
+}
+
+static void addScopeAttrToLocalVars(FunctionDecl &F) {
+  for (Decl *D : F.decls()) {
+    VarDecl *VD = dyn_cast<VarDecl>(D);
+
+    if (!VD || isa<ParmVarDecl>(VD) ||
+        VD->getStorageDuration() != StorageDuration::SD_Automatic)
+      continue;
+    // Local variables of private_memory type in the WG scope still have WI
+    // scope, all the rest - WG scope. Simple logic
+    // "if no scope than it is WG scope" won't work, because compiler may add
+    // locals not declared in user code (lambda object parameter, byval
+    // arguments) which will result in alloca w/o any attribute, so need WI
+    // scope too.
+    SYCLScopeAttr::Level L = isSYCLPrivateMemoryVar(VD)
+                                 ? SYCLScopeAttr::Level::WorkItem
+                                 : SYCLScopeAttr::Level::WorkGroup;
+    VD->addAttr(SYCLScopeAttr::CreateImplicit(F.getASTContext(), L));
+  }
+}
+
 // This type does the heavy lifting for the management of device functions,
 // recursive function detection, and attribute collection for a single
 // kernel/external function. It walks the callgraph to find all functions that
@@ -801,12 +825,24 @@ class SingleDeviceFunctionTracker {
     // Note: Here, we assume that this is called from within a
     // parallel_for_work_group; it is undefined to call it otherwise.
     // We deliberately do not diagnose a violation.
+    // The following changes have also been added:
+    // 1. The function inside which the parallel_for_work_item exists is
+    // marked with WorkGroup scope attribute, if not present already.
+    // 2. The local variables inside the function are marked with appropriate
+    // scope.
     if (CurrentDecl->getIdentifier() &&
         CurrentDecl->getIdentifier()->getName() == "parallel_for_work_item" &&
         isDeclaredInSYCLNamespace(CurrentDecl) &&
         !CurrentDecl->hasAttr<SYCLScopeAttr>()) {
       CurrentDecl->addAttr(SYCLScopeAttr::CreateImplicit(
           Parent.SemaSYCLRef.getASTContext(), SYCLScopeAttr::Level::WorkItem));
+      FunctionDecl *Caller = CallStack.back();
+      if (!Caller->hasAttr<SYCLScopeAttr>()) {
+        Caller->addAttr(
+            SYCLScopeAttr::CreateImplicit(Parent.SemaSYCLRef.getASTContext(),
+                                          SYCLScopeAttr::Level::WorkGroup));
+        addScopeAttrToLocalVars(*Caller);
+      }
     }
 
     // We previously thought we could skip this function if we'd seen it before,
@@ -999,30 +1035,6 @@ class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
   ASTContext &Ctx;
 };
 
-static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
-  return SemaSYCL::isSyclType(VD->getType(), SYCLTypeAttr::private_memory);
-}
-
-static void addScopeAttrToLocalVars(CXXMethodDecl &F) {
-  for (Decl *D : F.decls()) {
-    VarDecl *VD = dyn_cast<VarDecl>(D);
-
-    if (!VD || isa<ParmVarDecl>(VD) ||
-        VD->getStorageDuration() != StorageDuration::SD_Automatic)
-      continue;
-    // Local variables of private_memory type in the WG scope still have WI
-    // scope, all the rest - WG scope. Simple logic
-    // "if no scope than it is WG scope" won't work, because compiler may add
-    // locals not declared in user code (lambda object parameter, byval
-    // arguments) which will result in alloca w/o any attribute, so need WI
-    // scope too.
-    SYCLScopeAttr::Level L = isSYCLPrivateMemoryVar(VD)
-                                 ? SYCLScopeAttr::Level::WorkItem
-                                 : SYCLScopeAttr::Level::WorkGroup;
-    VD->addAttr(SYCLScopeAttr::CreateImplicit(F.getASTContext(), L));
-  }
-}
-
 /// Return method by name
 static CXXMethodDecl *getMethodByName(const CXXRecordDecl *CRD,
                                       StringRef MethodName) {
diff --git a/clang/test/CodeGenSYCL/sycl-pf-work-item.cpp b/clang/test/CodeGenSYCL/sycl-pf-work-item.cpp
index 7aa8d250ea822..7df666cc3dcdf 100644
--- a/clang/test/CodeGenSYCL/sycl-pf-work-item.cpp
+++ b/clang/test/CodeGenSYCL/sycl-pf-work-item.cpp
@@ -1,6 +1,7 @@
 // RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -internal-isystem %S/Inputs -emit-llvm %s -o - | FileCheck %s
 // This test checks if the parallel_for_work_item called indirecly from
 // parallel_for_work_group gets the work_item_scope marker on it.
+// It also checks if the calling function gets the work_group_scope marker on it.
 #include <sycl.hpp>
 
 void foo(sycl::group<1> work_group) {
@@ -18,4 +19,5 @@ int main(int argc, char **argv) {
   return 0;
 }
 
+// CHECK: define {{.*}} void {{.*}}foo{{.*}} !work_group_scope
 // CHECK: define {{.*}} void @{{.*}}sycl{{.*}}group{{.*}}parallel_for_work_item{{.*}}(ptr addrspace(4) noundef align 1 dereferenceable_or_null(1) %this) {{.*}}!work_item_scope {{.*}}!parallel_for_work_item
diff --git a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
index 8ab5218092d83..882129b369da1 100644
--- a/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerWGScope.cpp
@@ -65,19 +65,6 @@
 // (1) - materialization of a PFWI object
 // (2) - "fixup" of the private variable address.
 //
-// TODO: add support for the case when there are other functions between
-// parallel_for_work_group and parallel_for_work_item in the call stack.
-// For example:
-//
-//   void foo(sycl::group<1> group, ...) {
-//     group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
-//   }
-//   ...
-//   cgh.parallel_for_work_group(
-//       range<1>(...), range<1>(...), [=](group<1> g) {
-//         foo(g, ...);
-//       });
-//
 // TODO The approach employed by this pass generates lots of barriers and data
 // copying between private and local memory, which might not be efficient. There
 // are optimization opportunities listed below. Also other approaches can be
@@ -209,11 +196,36 @@ static bool isCallToAFuncMarkedWithMD(const Instruction *I, const char *MD) {
   return F && F->getMetadata(MD);
 }
 
-// Checks is this is a call to parallel_for_work_item.
+// Recursively searches for a call to a function with work_group
+// metadata inside F.
+static bool hasCallToAFuncWithWGMetadata(Function &F) {
+  for (auto &BB : F)
+    for (auto &I : BB) {
+      if (isCallToAFuncMarkedWithMD(&I, WG_SCOPE_MD))
+        return true;
+      const CallInst *Call = dyn_cast<CallInst>(&I);
+      Function *F = dyn_cast_or_null<Function>(Call ? Call->getCalledFunction()
+                                                    : nullptr);
+      if (F && hasCallToAFuncWithWGMetadata(*F))
+        return true;
+    }
+  return false;
+}
+
+// Checks if this is a call to parallel_for_work_item.
 static bool isPFWICall(const Instruction *I) {
   return isCallToAFuncMarkedWithMD(I, PFWI_MD);
 }
 
+// Checks if F has any calls to a function marked with PFWI_MD metadata.
+static bool hasPFWICall(Function &F) {
+  for (auto &BB : F)
+    for (auto &I : BB)
+      if (isPFWICall(&I))
+        return true;
+  return false;
+}
+
 // Checks if given instruction must be executed by all work items.
 static bool isWIScopeInst(const Instruction *I) {
   if (I->isTerminator())
@@ -425,6 +437,17 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
   }
 }
 
+// Skip allocas, addrspacecasts associated with allocas and debug insts.
+static Instruction *getFirstInstToProcess(BasicBlock *BB) {
+  Instruction *I = &BB->front();
+  for (;
+       I->getOpcode() == Instruction::Alloca ||
+       I->getOpcode() == Instruction::AddrSpaceCast || I->isDebugOrPseudoInst();
+       I = I->getNextNode()) {
+  }
+  return I;
+}
+
 // Performs the following transformation for each basic block in the input map:
 //
 // BB:
@@ -462,7 +485,11 @@ static void materializeLocalsInWIScopeBlocksImpl(
   for (auto &P : BB2MatLocals) {
     // generate LeaderBB and private<->shadow copies in proper BBs
     BasicBlock *LeaderBB = P.first;
-    BasicBlock *BB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "LeaderMat");
+    // Skip allocas, addrspacecasts associated with allocas and debug insts.
+    // Alloca instructions and its associated instructions must be at the
+    // beginning of the function.
+    Instruction *LeaderBBFront = getFirstInstToProcess(LeaderBB);
+    BasicBlock *BB = LeaderBB->splitBasicBlock(LeaderBBFront, "LeaderMat");
     // Add a barrier to the original block:
     Instruction *At =
         spirv::genWGBarrier(*BB->getFirstNonPHI(), TT)->getNextNode();
@@ -476,7 +503,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
       // fill the leader BB:
       // fetch data from leader's private copy (which is always up to date) into
      // the corresponding shadow variable
-      Builder.SetInsertPoint(&LeaderBB->front());
+      LeaderBBFront = getFirstInstToProcess(LeaderBB);
+      Builder.SetInsertPoint(LeaderBBFront);
       copyBetweenPrivateAndShadow(L, Shadow, Builder, true /*private->shadow*/);
       // store data to the local variable - effectively "refresh" the value of
       // the local in each work item in the work group
@@ -485,8 +513,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
                                   false /*shadow->private*/);
     }
     // now generate the TestBB and the leader WI guard
-    BasicBlock *TestBB =
-        LeaderBB->splitBasicBlock(&LeaderBB->front(), "TestMat");
+    LeaderBBFront = getFirstInstToProcess(LeaderBB);
+    BasicBlock *TestBB = LeaderBB->splitBasicBlock(LeaderBBFront, "TestMat");
     std::swap(TestBB, LeaderBB);
     guardBlockWithIsLeaderCheck(TestBB, LeaderBB, BB, At->getDebugLoc(), TT);
   }
@@ -752,6 +780,10 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
                                             FunctionAnalysisManager &FAM) {
   if (!F.getMetadata(WG_SCOPE_MD))
     return PreservedAnalyses::all();
+  // If a function does not have any PFWI calls and it has calls to a function
+  // that has work_group metadata, then we do not need to lower such functions.
+  if (!hasPFWICall(F) && hasCallToAFuncWithWGMetadata(F))
+    return PreservedAnalyses::all();
   LLVM_DEBUG(llvm::dbgs() << "Function name: " << F.getName() << "\n");
   const auto &TT = llvm::Triple(F.getParent()->getTargetTriple());
   // Ranges of "side effect" instructions
diff --git a/sycl/test-e2e/HierPar/hier_par_indirect.cpp b/sycl/test-e2e/HierPar/hier_par_indirect.cpp
new file mode 100644
index 0000000000000..b0a1787368f97
--- /dev/null
+++ b/sycl/test-e2e/HierPar/hier_par_indirect.cpp
@@ -0,0 +1,30 @@
+//==- hier_par_indirect.cpp --- hierarchical parallelism test for WG scope--==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// This test checks correctness of hierarchical kernel execution when the work
+// item code is not directly inside work group scope.
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+void __attribute__((noinline)) foo(sycl::group<1> work_group) {
+  work_group.parallel_for_work_item([&](sycl::h_item<1> index) {});
+}
+
+int main(int argc, char **argv) {
+  sycl::queue q;
+  q.submit([&](sycl::handler &cgh) {
+     cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128},
+                                 ([=](sycl::group<1> wGroup) { foo(wGroup); }));
+   }).wait();
+  std::cout << "test passed" << std::endl;
+  return 0;
+}
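
For reference, the work-group/work-item scope rule that addScopeAttrToLocalVars applies can be illustrated with user code like the sketch below. This snippet is not part of the patch; the kernel name, ranges, and variable names are made up, and it only assumes the standard SYCL hierarchical-parallelism API (parallel_for_work_group, parallel_for_work_item, private_memory). Plain locals declared in the parallel_for_work_group lambda receive WorkGroup scope, while sycl::private_memory variables keep WorkItem scope even though they are declared at work-group scope.

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  q.submit([&](sycl::handler &cgh) {
     cgh.parallel_for_work_group<class scope_rule_example>(
         sycl::range<1>{4}, sycl::range<1>{16}, [=](sycl::group<1> g) {
           int wg_counter = 0;                   // plain local: WorkGroup scope
           sycl::private_memory<int, 1> priv(g); // private_memory: WorkItem scope
           g.parallel_for_work_item([&](sycl::h_item<1> it) {
             // Each work item writes its own copy of priv.
             priv(it) = 42;
           });
           wg_counter++; // work-group scope code, executed once per work group
         });
   }).wait();
  return 0;
}

Under the rule implemented by addScopeAttrToLocalVars, wg_counter would receive the WorkGroup scope attribute and priv the WorkItem one.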