Add support for executable duplication in encoding specialization pass.
Duplicates stream.executables based on the affinity analysis of
stream.async.dispatch ops. Some executables can be launched by different
devices, which can produce wrong codegen artifacts when binding types are
encoded (i.e., the tensor type has an encoding attribute), because the
encodings can resolve to different layouts, especially when multiple devices
are involved. E.g., say that device_a and device_b interpret a tensor type
with encodings in different layouts, and there is an executable that can be
launched with resources from either device_a or device_b. It is ambiguous
what the input layouts for the executable are, because there are two
possibilities. In this case, we have to duplicate the executable with updated
encodings and modify the dispatch ops to launch the proper executable based
on device analysis.
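
A minimal sketch of the rewrite (hypothetical IR; the "_dup0" suffix follows
the pass's naming scheme, and the full example lives in the test below):

  // Before: both devices launch the same executable.
  %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(...)
  %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(...)

  // After: each unique affinity variant gets its own executable, so the
  // encodings in each copy can later be resolved to device-specific layouts.
  %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(...)
  %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex_dup0::@dispatch(...)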

Signed-off-by: hanhanW <hanhan0912@gmail.com>
hanhanW committed Dec 19, 2024
1 parent cb19fce commit 6e605a4
Showing 2 changed files with 223 additions and 3 deletions.
@@ -16,6 +16,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/LogicalResult.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Support/LLVM.h"
@@ -52,6 +53,175 @@ SmallVector<const T *> gatherUsedDialectInterfaces(mlir::ModuleOp moduleOp) {
  return results;
}

/// Returns the affinities of the `dispatchOp`'s resource operands. An empty
/// array attribute indicates that the resource operand affinity was not
/// found; usually this happens when the affinity analysis fails.
/// Note that the size of the result might not equal the number of resource
/// operands: if a resource operand type is not an AffinityType, it is
/// skipped.
static SmallVector<Attribute>
getResourceOperandsAffinities(IREE::Stream::AffinityAnalysis &affinityAnalysis,
                              IREE::Stream::AsyncDispatchOp dispatchOp) {
  SmallVector<Attribute> result;
  Builder b(dispatchOp.getContext());
  auto emptyArray = b.getArrayAttr({});
  for (auto operand : dispatchOp.getResourceOperands()) {
    // Skip if the operand type is not AffinityType.
    if (!isa<IREE::Stream::AffinityTypeInterface>(operand.getType())) {
      continue;
    }
    SmallVector<IREE::Stream::AffinityAttr> affinities;
    if (!affinityAnalysis.tryLookupResourceAffinity(operand, affinities)) {
      result.push_back(emptyArray);
      continue;
    }
    result.push_back(b.getArrayAttr(llvm::to_vector_of<Attribute>(affinities)));
  }
  return result;
}
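
// For illustration (hypothetical IR, not part of this change): for a dispatch
// such as
//   %r = stream.async.dispatch on(#hal.device.affinity<@device_a>)
//       @ex::@dispatch(%resource, %index) : ...
// with one !stream.resource operand and one index operand, the result holds a
// single entry, [#hal.device.affinity<@device_a>]; the index operand is not
// an AffinityType and is skipped.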

/// Duplicates stream.executables based on the affinity analysis of
/// stream.async.dispatch ops. Some executables can be launched by different
/// devices, which can produce wrong codegen artifacts when binding types are
/// encoded (i.e., the tensor type has an encoding attribute), because the
/// encodings can resolve to different layouts, especially when multiple
/// devices are involved. E.g., say that device_a and device_b interpret a
/// tensor type with encodings in different layouts, and there is an
/// executable that can be launched with resources from either device_a or
/// device_b. It is ambiguous what the input layouts for the executable are,
/// because there are two possibilities. In this case, we have to duplicate
/// the executable with updated encodings and modify the dispatch ops to
/// launch the proper executable based on device analysis.
static LogicalResult duplicateExecutablesPerAffinityVariant(
    ModuleOp moduleOp, SymbolTable symbolTable, FunctionOpInterface funcOp,
    IREE::Stream::ResolveLayoutAttrFn resolveLayoutAttr) {
  MLIRContext *ctx = moduleOp.getContext();
  IRRewriter rewriter(ctx);

  // 1. Gather per-export [execution affinity -> [resource affinities]] map.
  IREE::Stream::AffinityAnalysis affinityAnalysis(moduleOp);
  if (failed(affinityAnalysis.run())) {
    return moduleOp.emitError("failed on running affinity analysis");
  }
  SmallVector<IREE::Stream::AsyncDispatchOp> candidates;
  funcOp.walk(
      [&](IREE::Stream::AsyncDispatchOp op) { candidates.push_back(op); });

  // export -> [affinity -> array per resource of affinities PVS].
  DenseMap<IREE::Stream::ExecutableExportOp,
           SetVector<std::pair<IREE::Stream::AffinityAttr, ArrayAttr>>>
      exportToDispatchSites;

  llvm::MapVector<IREE::Stream::AsyncDispatchOp, SmallVector<Attribute>>
      resourceAffinities;
  for (auto dispatchOp : candidates) {
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    if (!affinityAnalysis.tryLookupExecutionAffinity(dispatchOp,
                                                     execAffinities)) {
      return dispatchOp.emitError("failed on execution affinity lookup");
    }
    assert(execAffinities.size() == 1 &&
           "We should only have a single execution "
           "affinity when running the pass.");

    SmallVector<Attribute> operandAffinityAttrs =
        getResourceOperandsAffinities(affinityAnalysis, dispatchOp);
    resourceAffinities[dispatchOp] = operandAffinityAttrs;

    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      exportToDispatchSites[exportOp].insert(std::make_pair(
          execAffinities[0], rewriter.getArrayAttr(operandAffinityAttrs)));
    });
  }

  LLVM_DEBUG({
    llvm::dbgs() << "Dump of exportToDispatchSites\n";
    for (auto [exportOp, affinities] : exportToDispatchSites) {
      llvm::dbgs() << "  ExportOp: " << exportOp.getSymName() << "\n";
      for (auto [execAffinity, resourceAffinities] : affinities) {
        llvm::dbgs() << "    execution affinity: " << execAffinity << "\n";
        llvm::dbgs() << "    resource affinities: " << resourceAffinities
                     << "\n";
      }
    }
  });
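
  // With the multi-device test in this change, the dump would look roughly
  // like (hypothetical output):
  //   Dump of exportToDispatchSites
  //     ExportOp: dispatch
  //       execution affinity: #hal.device.affinity<@device_a>
  //       resource affinities: [[#hal.device.affinity<@device_a>]]
  //       execution affinity: #hal.device.affinity<@device_b>
  //       resource affinities: [[#hal.device.affinity<@device_b>]]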

  // 2. Duplicate executables for each unique set of resource affinities.

  // Mapping from [execution affinity, resource operands affinities, export]
  // to the executable op.
  using DispatchSiteInfo = std::tuple<IREE::Stream::AffinityAttr, ArrayAttr,
                                      IREE::Stream::ExecutableExportOp>;
  DenseMap<DispatchSiteInfo, IREE::Stream::ExecutableOp>
      dispatchSiteToExecutableOp;
  for (auto [exportOp, execAndResourceAffinities] : exportToDispatchSites) {
    auto executableOp = exportOp->getParentOfType<IREE::Stream::ExecutableOp>();
    // No need to duplicate the executable if all the uses have the same
    // affinities.
    // TODO(hanchung): Do not duplicate the executables if bindings are not
    // encoded, i.e., if none of the tensor types have encodings.
    if (execAndResourceAffinities.size() == 1) {
      auto [execAffinity, resourceAffinities] = execAndResourceAffinities[0];
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinities, exportOp)] = executableOp;
      continue;
    }

    // The first variant reuses the original executable; each subsequent
    // variant is cloned with a "_dup<N>" suffix on its symbol name.
    int64_t dupId = -1;
    for (auto [execAffinity, resourceAffinities] : execAndResourceAffinities) {
      rewriter.setInsertionPointAfter(executableOp);
      IREE::Stream::ExecutableOp dupOp = executableOp;
      if (dupId != -1) {
        auto symName = std::string(executableOp.getSymName());
        symName += "_dup" + std::to_string(dupId);
        dupOp = rewriter.cloneWithoutRegions(executableOp);
        rewriter.modifyOpInPlace(dupOp, [&] {
          dupOp.setSymName(symName);
          IRMapping mapping;
          executableOp.getRegion().cloneInto(&dupOp.getRegion(), mapping);
        });
      }
      dispatchSiteToExecutableOp[DispatchSiteInfo(
          execAffinity, resourceAffinities, exportOp)] = dupOp;
      dupId++;
    }
  }
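
  // For instance (hypothetical symbols): if @ex is dispatched from both
  // device_a and device_b, the device_a sites keep @ex while the device_b
  // sites are mapped to a new clone @ex_dup0.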

  // 3. Update dispatch sites, i.e., point dispatch entry points to the
  //    corresponding cloned executables.
  for (auto dispatchOp : candidates) {
    SmallVector<Attribute> newEntryPoints;
    SmallVector<IREE::Stream::AffinityAttr> execAffinities;
    // Sanity checks; these should already hold because they were checked in
    // step 1. The lookup must stay outside the assert: it also populates
    // `execAffinities`, which is used below even in NDEBUG builds.
    bool foundExecAffinity = affinityAnalysis.tryLookupExecutionAffinity(
        dispatchOp, execAffinities);
    (void)foundExecAffinity;
    assert(foundExecAffinity);
    assert(execAffinities.size() == 1);
    SmallVector<Attribute> operandAttrs = resourceAffinities[dispatchOp];
    dispatchOp.forEachEntryPointAttr([&](SymbolRefAttr entryPoint) {
      auto exportOp = cast<IREE::Stream::ExecutableExportOp>(
          symbolTable.lookupSymbolIn(moduleOp, entryPoint));
      auto info = DispatchSiteInfo(
          execAffinities[0], rewriter.getArrayAttr(operandAttrs), exportOp);
      assert(dispatchSiteToExecutableOp.count(info));

      auto executableOp = dispatchSiteToExecutableOp[info];
      auto newSym = SymbolRefAttr::get(executableOp->getAttrOfType<StringAttr>(
                                           SymbolTable::getSymbolAttrName()),
                                       entryPoint.getNestedReferences());
      newEntryPoints.push_back(newSym);
    });

    rewriter.modifyOpInPlace(dispatchOp, [&] {
      dispatchOp.setEntryPointsAttr(rewriter.getArrayAttr(newEntryPoints));
    });
  }

  // TODO(hanchung): Update encodings in executables.

  return success();
}

// TODO(hanchung): Add "cloneWithEncoding" method to RankedTensorType.
static RankedTensorType cloneWithEncoding(RankedTensorType type,
                                          Attribute encodingAttr) {
@@ -149,6 +319,7 @@ struct SpecializeEncodingsPass
      return signalPassFailure();
    }

    SymbolTable symbolTable(moduleOp);
    llvm::MapVector<StringRef, IREE::Stream::ExecutableOp> executableOps;
    for (auto executableOp : moduleOp.getOps<IREE::Stream::ExecutableOp>()) {
      executableOps[executableOp.getName()] = executableOp;
@@ -164,7 +335,11 @@ struct SpecializeEncodingsPass
        return signalPassFailure();
      }

      if (failed(duplicateExecutablesPerAffinityVariant(
              moduleOp, symbolTable, funcOp, resolveLayoutAttr))) {
        funcOp.emitError("failed on executable duplication");
        return signalPassFailure();
      }
    }
  }
};
@@ -6,14 +6,59 @@
module {
  util.global private @device_a = #device_target_local_0_

  util.func public @encoded_sizeof_op(%d0: index, %d1: index) -> index {
    %size = stream.tensor.sizeof on(#hal.device.affinity<@device_a>) tensor<?x?xf32, #encoding>{%d0, %d1} : index
    util.return %size : index
  }
}
// CHECK: #[[EXECUTABLE:.+]] = #hal.executable.target<"vmvx",
// CHECK: #[[$ENCODING:.+]] = #iree_encoding.encoding
// CHECK-SAME: layouts = [#[[EXECUTABLE]]]
// CHECK-LABEL: util.func public @encoded_sizeof_op
// CHECK: %[[RES:.+]] = stream.tensor.sizeof {{.+}} tensor<?x?xf32, #[[$ENCODING]]>
// CHECK: return %[[RES]]

// -----

#executable_target_vmvx_bytecode_fb = #hal.executable.target<"vmvx", "vmvx-bytecode-fb", {ukernels = "none"}>
#map = affine_map<(d0) -> (d0)>
#device_target_local_0_ = #hal.device.target<"local", {ordinal = 0 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
#device_target_local_1_ = #hal.device.target<"local", {ordinal = 1 : index}, [#executable_target_vmvx_bytecode_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@device_a>} {
  util.global private @device_a = #device_target_local_0_
  util.global private @device_b = #device_target_local_1_
  stream.executable private @ex {
    stream.executable.export public @dispatch
  }
  util.func public @multi_device(%arg0: !hal.buffer_view, %arg1: !hal.fence, %arg2: !hal.fence) -> !hal.buffer_view {
    %c16 = arith.constant 16 : index
    %c0 = arith.constant 0 : index
    %c4 = arith.constant 4 : index
    %element_type_f32 = hal.element_type<f32> : i32
    %dense_row_major = hal.encoding_type<dense_row_major> : i32
    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c4]) type(%element_type_f32) encoding(%dense_row_major)
    %0 = stream.tensor.import on(#hal.device.affinity<@device_a>) %arg0 : !hal.buffer_view -> tensor<4xf32> in !stream.resource<external>{%c16}
    %1 = stream.timepoint.import on(#hal.device.affinity<@device_a>) %arg1 : (!hal.fence) => !stream.timepoint
    %2 = stream.timepoint.await %1 => %0 : !stream.resource<external>{%c16}
    %3 = stream.async.transfer %2 : !stream.resource<external>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
    %4 = stream.async.dispatch on(#hal.device.affinity<@device_a>) @ex::@dispatch(%3[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
    %5 = stream.async.transfer %4 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_b>) !stream.resource<*>{%c16}
    %6 = stream.async.dispatch on(#hal.device.affinity<@device_b>) @ex::@dispatch(%5[%c0 to %c16 for %c16]) : (!stream.resource<*>{%c16}) -> !stream.resource<*>{%c16}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_b>) -> to(#hal.device.affinity<@device_a>) !stream.resource<*>{%c16}
    %result, %result_timepoint = stream.timepoint.barrier on(#hal.device.affinity<@device_a>) %7 : !stream.resource<*>{%c16} => !stream.timepoint
    stream.timepoint.chain_external on(#hal.device.affinity<@device_a>) %result_timepoint => (%arg2 : !hal.fence)
    %8 = stream.async.transfer %result : !stream.resource<*>{%c16} from(#hal.device.affinity<@device_a>) -> to(#hal.device.affinity<@device_a>) !stream.resource<external>{%c16}
    %9 = stream.tensor.export on(#hal.device.affinity<@device_a>) %8 : tensor<4xf32> in !stream.resource<external>{%c16} -> !hal.buffer_view
    util.return %9 : !hal.buffer_view
  }
}

// CHECK: #[[DEVICE_LOCAL_0:.+]] = #hal.device.target
// CHECK: #[[DEVICE_LOCAL_1:.+]] = #hal.device.target
// CHECK: util.global private @[[$DEVICE_A:.+]] = #[[DEVICE_LOCAL_0]]
// CHECK: util.global private @[[$DEVICE_B:.+]] = #[[DEVICE_LOCAL_1]]
// CHECK: stream.executable private @[[$EX0:.+]]
// CHECK: stream.executable private @[[$EX1:.+]]
// CHECK-LABEL: util.func public @multi_device
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_A]]>) @[[$EX0]]::@dispatch
// CHECK: stream.async.dispatch on(#hal.device.affinity<@[[$DEVICE_B]]>) @[[$EX1]]::@dispatch
