Skip to content

Commit

Permalink
[SYCLLowerIR][SemaSYCL] Support indirect hierarchical parallelism (#14264)
Browse files Browse the repository at this point in the history

This PR adds a missing feature in SYCL hierarchical parallelism support.
Specifically, this PR adds support for the case when there are functions
between parallel_for_work_group and parallel_for_work_item in the call
stack.
For example:
void foo(sycl::group<1> group, ...) {
    group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
}
// ...
  cgh.parallel_for_work_group<class kernel>(
   range<1>(...), range<1>(...), [=](group<1> g) {
   foo(g, ...);
  });

---------

Signed-off-by: Sudarsanam, Arvind <arvind.sudarsanam@intel.com>
  • Loading branch information
asudarsa committed Jul 3, 2024
1 parent 2b48ab5 commit cb2efef
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 42 deletions.
60 changes: 36 additions & 24 deletions clang/lib/Sema/SemaSYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,30 @@ static bool isDeclaredInSYCLNamespace(const Decl *D) {
return ND && ND->getName() == "sycl";
}

// Returns true if the given variable is of the SYCL private_memory type.
static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
  QualType Ty = VD->getType();
  return SemaSYCL::isSyclType(Ty, SYCLTypeAttr::private_memory);
}

// Marks every automatic local variable declared directly in F with an
// implicit SYCLScopeAttr: variables of the private_memory type get
// work-item scope, all other locals get work-group scope.
static void addScopeAttrToLocalVars(FunctionDecl &F) {
  ASTContext &Ctx = F.getASTContext();
  for (Decl *D : F.decls()) {
    auto *Var = dyn_cast<VarDecl>(D);
    if (!Var)
      continue;
    if (isa<ParmVarDecl>(Var) ||
        Var->getStorageDuration() != StorageDuration::SD_Automatic)
      continue;
    // Local variables of private_memory type in the WG scope still have WI
    // scope; all the rest get WG scope. The simple rule "no scope attribute
    // means WG scope" would not work, because the compiler may add locals not
    // declared in user code (lambda object parameter, byval arguments), which
    // would result in an alloca without any attribute, so those need WI scope
    // too.
    SYCLScopeAttr::Level Level = isSYCLPrivateMemoryVar(Var)
                                     ? SYCLScopeAttr::Level::WorkItem
                                     : SYCLScopeAttr::Level::WorkGroup;
    Var->addAttr(SYCLScopeAttr::CreateImplicit(Ctx, Level));
  }
}

// This type does the heavy lifting for the management of device functions,
// recursive function detection, and attribute collection for a single
// kernel/external function. It walks the callgraph to find all functions that
Expand Down Expand Up @@ -801,12 +825,24 @@ class SingleDeviceFunctionTracker {
// Note: Here, we assume that this is called from within a
// parallel_for_work_group; it is undefined to call it otherwise.
// We deliberately do not diagnose a violation.
// The following changes have also been added:
// 1. The function inside which the parallel_for_work_item exists is
// marked with WorkGroup scope attribute, if not present already.
// 2. The local variables inside the function are marked with appropriate
// scope.
if (CurrentDecl->getIdentifier() &&
CurrentDecl->getIdentifier()->getName() == "parallel_for_work_item" &&
isDeclaredInSYCLNamespace(CurrentDecl) &&
!CurrentDecl->hasAttr<SYCLScopeAttr>()) {
CurrentDecl->addAttr(SYCLScopeAttr::CreateImplicit(
Parent.SemaSYCLRef.getASTContext(), SYCLScopeAttr::Level::WorkItem));
FunctionDecl *Caller = CallStack.back();
if (!Caller->hasAttr<SYCLScopeAttr>()) {
Caller->addAttr(
SYCLScopeAttr::CreateImplicit(Parent.SemaSYCLRef.getASTContext(),
SYCLScopeAttr::Level::WorkGroup));
addScopeAttrToLocalVars(*Caller);
}
}

// We previously thought we could skip this function if we'd seen it before,
Expand Down Expand Up @@ -999,30 +1035,6 @@ class MarkWIScopeFnVisitor : public RecursiveASTVisitor<MarkWIScopeFnVisitor> {
ASTContext &Ctx;
};

// Returns true if the given variable is of the SYCL private_memory type.
static bool isSYCLPrivateMemoryVar(VarDecl *VD) {
  return SemaSYCL::isSyclType(VD->getType(), SYCLTypeAttr::private_memory);
}

// Marks each automatic local variable declared directly in method F with an
// implicit SYCLScopeAttr: private_memory variables get work-item scope, all
// other locals get work-group scope.
static void addScopeAttrToLocalVars(CXXMethodDecl &F) {
  for (Decl *D : F.decls()) {
    VarDecl *VD = dyn_cast<VarDecl>(D);

    if (!VD || isa<ParmVarDecl>(VD) ||
        VD->getStorageDuration() != StorageDuration::SD_Automatic)
      continue;
    // Local variables of private_memory type in the WG scope still have WI
    // scope, all the rest - WG scope. Simple logic
    // "if no scope then it is WG scope" won't work, because compiler may add
    // locals not declared in user code (lambda object parameter, byval
    // arguments) which will result in alloca w/o any attribute, so need WI
    // scope too.
    SYCLScopeAttr::Level L = isSYCLPrivateMemoryVar(VD)
                                 ? SYCLScopeAttr::Level::WorkItem
                                 : SYCLScopeAttr::Level::WorkGroup;
    VD->addAttr(SYCLScopeAttr::CreateImplicit(F.getASTContext(), L));
  }
}

/// Return method by name
static CXXMethodDecl *getMethodByName(const CXXRecordDecl *CRD,
StringRef MethodName) {
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGenSYCL/sycl-pf-work-item.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -internal-isystem %S/Inputs -emit-llvm %s -o - | FileCheck %s
// This test checks if the parallel_for_work_item called indirectly from
// parallel_for_work_group gets the work_item_scope marker on it.
// It also checks if the calling function gets the work_group_scope marker on it.
#include <sycl.hpp>

void foo(sycl::group<1> work_group) {
Expand All @@ -18,4 +19,5 @@ int main(int argc, char **argv) {
return 0;
}

// CHECK: define {{.*}} void {{.*}}foo{{.*}} !work_group_scope
// CHECK: define {{.*}} void @{{.*}}sycl{{.*}}group{{.*}}parallel_for_work_item{{.*}}(ptr addrspace(4) noundef align 1 dereferenceable_or_null(1) %this) {{.*}}!work_item_scope {{.*}}!parallel_for_work_item
68 changes: 50 additions & 18 deletions llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,19 +65,6 @@
// (1) - materialization of a PFWI object
// (2) - "fixup" of the private variable address.
//
// TODO: add support for the case when there are other functions between
// parallel_for_work_group and parallel_for_work_item in the call stack.
// For example:
//
// void foo(sycl::group<1> group, ...) {
// group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
// }
// ...
// cgh.parallel_for_work_group<class kernel>(
// range<1>(...), range<1>(...), [=](group<1> g) {
// foo(g, ...);
// });
//
// TODO The approach employed by this pass generates lots of barriers and data
// copying between private and local memory, which might not be efficient. There
// are optimization opportunities listed below. Also other approaches can be
Expand Down Expand Up @@ -209,11 +196,36 @@ static bool isCallToAFuncMarkedWithMD(const Instruction *I, const char *MD) {
return F && F->getMetadata(MD);
}

// Checks if this is a call to parallel_for_work_item.
// Recursively searches F (and, transitively, its direct callees) for a call
// to a function marked with work-group (WG_SCOPE_MD) metadata.
// NOTE(review): assumes the call graph reachable from F is acyclic — a
// recursive cycle among the callees would make this recurse without bound;
// confirm recursion is diagnosed/disallowed earlier for device code.
static bool hasCallToAFuncWithWGMetadata(Function &F) {
  for (auto &BB : F)
    for (auto &I : BB) {
      if (isCallToAFuncMarkedWithMD(&I, WG_SCOPE_MD))
        return true;
      // getCalledFunction() already returns a Function* (null for indirect
      // calls), so no extra dyn_cast is needed. Use a distinct name for the
      // callee: the original code shadowed the parameter F here.
      const auto *Call = dyn_cast<CallInst>(&I);
      Function *Callee = Call ? Call->getCalledFunction() : nullptr;
      if (Callee && hasCallToAFuncWithWGMetadata(*Callee))
        return true;
    }
  return false;
}

// Checks if this is a call to parallel_for_work_item, i.e. a call to a
// function carrying the PFWI_MD metadata.
static bool isPFWICall(const Instruction *I) {
  return isCallToAFuncMarkedWithMD(I, PFWI_MD);
}

// Returns true if F directly contains a call to a function marked with the
// PFWI_MD (parallel_for_work_item) metadata.
static bool hasPFWICall(Function &F) {
  for (BasicBlock &Block : F) {
    for (Instruction &Inst : Block) {
      if (isPFWICall(&Inst))
        return true;
    }
  }
  return false;
}

// Checks if given instruction must be executed by all work items.
static bool isWIScopeInst(const Instruction *I) {
if (I->isTerminator())
Expand Down Expand Up @@ -425,6 +437,17 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
}
}

// Returns the first instruction of BB that is not an alloca, an
// addrspacecast (associated with an alloca), or a debug/pseudo instruction.
// Every block ends with a terminator, which matches none of those, so the
// scan always stops before running off the end of the block.
static Instruction *getFirstInstToProcess(BasicBlock *BB) {
  Instruction *Cur = &BB->front();
  while (Cur->getOpcode() == Instruction::Alloca ||
         Cur->getOpcode() == Instruction::AddrSpaceCast ||
         Cur->isDebugOrPseudoInst())
    Cur = Cur->getNextNode();
  return Cur;
}

// Performs the following transformation for each basic block in the input map:
//
// BB:
Expand Down Expand Up @@ -462,7 +485,11 @@ static void materializeLocalsInWIScopeBlocksImpl(
for (auto &P : BB2MatLocals) {
// generate LeaderBB and private<->shadow copies in proper BBs
BasicBlock *LeaderBB = P.first;
BasicBlock *BB = LeaderBB->splitBasicBlock(&LeaderBB->front(), "LeaderMat");
// Skip allocas, addrspacecasts associated with allocas and debug insts.
// Alloca instructions and their associated instructions must be at the
// beginning of the function.
Instruction *LeaderBBFront = getFirstInstToProcess(LeaderBB);
BasicBlock *BB = LeaderBB->splitBasicBlock(LeaderBBFront, "LeaderMat");
// Add a barrier to the original block:
Instruction *At =
spirv::genWGBarrier(*BB->getFirstNonPHI(), TT)->getNextNode();
Expand All @@ -476,7 +503,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
// fill the leader BB:
// fetch data from leader's private copy (which is always up to date) into
// the corresponding shadow variable
Builder.SetInsertPoint(&LeaderBB->front());
LeaderBBFront = getFirstInstToProcess(LeaderBB);
Builder.SetInsertPoint(LeaderBBFront);
copyBetweenPrivateAndShadow(L, Shadow, Builder, true /*private->shadow*/);
// store data to the local variable - effectively "refresh" the value of
// the local in each work item in the work group
Expand All @@ -485,8 +513,8 @@ static void materializeLocalsInWIScopeBlocksImpl(
false /*shadow->private*/);
}
// now generate the TestBB and the leader WI guard
BasicBlock *TestBB =
LeaderBB->splitBasicBlock(&LeaderBB->front(), "TestMat");
LeaderBBFront = getFirstInstToProcess(LeaderBB);
BasicBlock *TestBB = LeaderBB->splitBasicBlock(LeaderBBFront, "TestMat");
std::swap(TestBB, LeaderBB);
guardBlockWithIsLeaderCheck(TestBB, LeaderBB, BB, At->getDebugLoc(), TT);
}
Expand Down Expand Up @@ -752,6 +780,10 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
FunctionAnalysisManager &FAM) {
if (!F.getMetadata(WG_SCOPE_MD))
return PreservedAnalyses::all();
// If a function does not have any PFWI calls and it has calls to a function
// that has work_group metadata, then we do not need to lower such functions.
if (!hasPFWICall(F) && hasCallToAFuncWithWGMetadata(F))
return PreservedAnalyses::all();
LLVM_DEBUG(llvm::dbgs() << "Function name: " << F.getName() << "\n");
const auto &TT = llvm::Triple(F.getParent()->getTargetTriple());
// Ranges of "side effect" instructions
Expand Down
30 changes: 30 additions & 0 deletions sycl/test-e2e/HierPar/hier_par_indirect.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
//==- hier_par_indirect.cpp --- hierarchical parallelism test for WG scope--==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// This test checks correctness of hierarchical kernel execution when the work
// item code is not directly inside work group scope.

#include <iostream>
#include <sycl/detail/core.hpp>

// Helper that is deliberately not inlined, so parallel_for_work_item is
// reached through a real function call from the work-group lambda.
void __attribute__((noinline)) foo(sycl::group<1> work_group) {
  work_group.parallel_for_work_item([&](sycl::h_item<1> item) {});
}

int main(int argc, char **argv) {
sycl::queue q;
q.submit([&](sycl::handler &cgh) {
cgh.parallel_for_work_group(sycl::range<1>{1}, sycl::range<1>{128},
([=](sycl::group<1> wGroup) { foo(wGroup); }));
}).wait();
std::cout << "test passed" << std::endl;
return 0;
}

0 comments on commit cb2efef

Please sign in to comment.