[intel] Remove nvidia_gpu usages (#998)
`nvidia_gpu` should not be used in the Intel backend.

---------

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
whitneywhtsang authored Apr 30, 2024
1 parent b24013a commit 0ddbd2f
Showing 3 changed files with 0 additions and 107 deletions.
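
The change is purely a removal. A quick way to confirm that nothing in the Intel backend still references `nvidia_gpu` is to scan the backend tree; the sketch below is illustrative and not part of this commit (the path and file extensions are assumptions based on the files touched here, run it from a repository checkout):

    # Illustrative check, not part of this commit: list remaining nvidia_gpu
    # references under third_party/intel in a local checkout.
    from pathlib import Path

    hits = []
    for path in Path("third_party/intel").rglob("*"):
        if path.suffix not in {".py", ".cc", ".cpp", ".h", ".hpp"}:
            continue
        for lineno, line in enumerate(path.read_text(errors="ignore").splitlines(), start=1):
            if "nvidia_gpu" in line:
                hits.append((str(path), lineno, line.strip()))

    print(hits if hits else "no nvidia_gpu usages found")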
6 changes: 0 additions & 6 deletions third_party/intel/backend/compiler.py
@@ -120,11 +120,6 @@ def make_ttir(mod, metadata, opt):
 
     @staticmethod
     def make_ttgir(mod, metadata, opt, device_arch):
-        cluster_info = intel.ClusterInfo()
-        if opt.cluster_dims is not None:
-            cluster_info.clusterDimX = opt.cluster_dims[0]
-            cluster_info.clusterDimY = opt.cluster_dims[1]
-            cluster_info.clusterDimZ = opt.cluster_dims[2]
         # TTIR -> TTGIR
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
@@ -148,7 +143,6 @@ def make_ttgir(mod, metadata, opt, device_arch):
         passes.common.add_symbol_dce(pm)
         passes.common.add_canonicalizer(pm)
         pm.run(mod)
-        metadata["cluster_dims"] = (cluster_info.clusterDimX, cluster_info.clusterDimY, cluster_info.clusterDimZ)
         return mod
 
     @staticmethod
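For context, the removed Python plumbing only copied `opt.cluster_dims` into an NVIDIA-specific `ClusterInfo` object and recorded it in the kernel metadata. A stand-alone sketch of that behaviour (plain Python stand-ins, not the real `intel` module):

    # Illustrative stand-in for the removed ClusterInfo/cluster_dims plumbing.
    class ClusterInfo:
        def __init__(self):
            # CTA-cluster dimensions, defaulting to a 1x1x1 cluster.
            self.clusterDimX = 1
            self.clusterDimY = 1
            self.clusterDimZ = 1

    def record_cluster_dims(metadata, cluster_dims):
        info = ClusterInfo()
        if cluster_dims is not None:
            info.clusterDimX, info.clusterDimY, info.clusterDimZ = cluster_dims
        metadata["cluster_dims"] = (info.clusterDimX, info.clusterDimY, info.clusterDimZ)
        return metadata

    print(record_cluster_dims({}, (2, 1, 1)))  # {'cluster_dims': (2, 1, 1)}

Cluster launch dimensions are an NVIDIA CGA concept with no Intel counterpart, which is presumably why the metadata entry is dropped here along with the `ClusterInfo` usage.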
@@ -369,89 +369,6 @@ struct ConvertLayoutOpConversion
     }
   }
 
-  LogicalResult
-  lowerDistToDistWithDistSmem(triton::gpu::ConvertLayoutOp op,
-                              OpAdaptor adaptor,
-                              ConversionPatternRewriter &rewriter) const {
-    auto loc = op.getLoc();
-    auto typeConverter = getTypeConverter();
-    auto srcTy = op.getSrc().getType();
-    auto dstTy = op.getType();
-    auto srcLayout = srcTy.getEncoding();
-    auto dstLayout = dstTy.getEncoding();
-    auto srcShapePerCTA = getShapePerCTA(srcTy);
-    auto srcCTAsPerCGA = triton::gpu::getCTAsPerCGA(srcLayout);
-    auto srcCTAOrder = triton::gpu::getCTAOrder(srcLayout);
-    unsigned rank = srcShapePerCTA.size();
-
-    auto llvmElemTy = getTypeConverter()->convertType(dstTy.getElementType());
-    auto elemPtrTy = ptr_ty(rewriter.getContext(), 3);
-
-    Value smemBase =
-        LLVM::intel::getSharedMemoryBase(loc, rewriter, op.getOperation());
-    smemBase = bitcast(smemBase, elemPtrTy);
-    auto smemShape = convertType<unsigned, int64_t>(srcShapePerCTA);
-
-    // Store to local shared memory
-    {
-      auto inVals = unpackLLElements(loc, adaptor.getSrc(), rewriter);
-      auto inIndices = ::intel::emitIndices(loc, rewriter, srcLayout, srcTy,
-                                            /*withCTAOffset*/ false);
-
-      assert(inIndices.size() == inVals.size() &&
-             "Unexpected number of indices emitted");
-
-      for (unsigned i = 0; i < inIndices.size(); ++i) {
-        Value offset = linearize(rewriter, loc, inIndices[i], smemShape);
-        Value ptr = gep(elemPtrTy, llvmElemTy, smemBase, offset);
-        store(inVals[i], ptr);
-      }
-    }
-
-    // Cluster barrier
-    rewriter.create<triton::nvidia_gpu::ClusterArriveOp>(loc, false);
-    rewriter.create<triton::nvidia_gpu::ClusterWaitOp>(loc);
-
-    // Load from remote shared memory
-    {
-      SmallVector<Value> srcShapePerCTACache;
-      for (unsigned i = 0; i < rank; ++i)
-        srcShapePerCTACache.push_back(i32_val(srcShapePerCTA[i]));
-
-      SmallVector<Value> outVals;
-      auto outIndices = ::intel::emitIndices(loc, rewriter, dstLayout, dstTy,
-                                             /*withCTAOffset*/ true);
-
-      for (unsigned i = 0; i < outIndices.size(); ++i) {
-        auto coord = outIndices[i];
-        assert(coord.size() == rank && "Unexpected rank of index emitted");
-
-        SmallVector<Value> multiDimCTAId, localCoord;
-        for (unsigned d = 0; d < rank; ++d) {
-          multiDimCTAId.push_back(udiv(coord[d], srcShapePerCTACache[d]));
-          localCoord.push_back(urem(coord[d], srcShapePerCTACache[d]));
-        }
-
-        Value remoteCTAId =
-            linearize(rewriter, loc, multiDimCTAId, srcCTAsPerCGA, srcCTAOrder);
-        Value localOffset = linearize(rewriter, loc, localCoord, smemShape);
-
-        Value ptr = gep(elemPtrTy, llvmElemTy, smemBase, localOffset);
-        outVals.push_back(load_dsmem(ptr, remoteCTAId, llvmElemTy));
-      }
-
-      Value result =
-          packLLElements(loc, getTypeConverter(), outVals, rewriter, dstTy);
-      rewriter.replaceOp(op, result);
-    }
-
-    // Cluster barrier
-    rewriter.create<triton::nvidia_gpu::ClusterArriveOp>(loc, false);
-    rewriter.create<triton::nvidia_gpu::ClusterWaitOp>(loc);
-
-    return success();
-  }
-
   // blocked/dpas -> blocked/dpas.
   // Data padding in shared memory to avoid bank conflict.
   LogicalResult
@@ -465,8 +382,6 @@ struct ConvertLayoutOpConversion
     Attribute srcLayout = srcTy.getEncoding();
     Attribute dstLayout = dstTy.getEncoding();
 
-    if (shouldUseDistSmem(srcLayout, dstLayout))
-      return lowerDistToDistWithDistSmem(op, adaptor, rewriter);
     Value smemBase =
        LLVM::intel::getSharedMemoryBase(loc, rewriter, op.getOperation());
     auto elemPtrTy = ptr_ty(rewriter.getContext(), 3);
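The deleted `lowerDistToDistWithDistSmem` path moved data through distributed shared memory across the CTAs of a cluster; its core index math splits every destination coordinate into a CTA id and a coordinate local to that CTA, then linearizes both. A small Python sketch of that arithmetic (plain integers rather than the MLIR `udiv`/`urem`/`linearize` builders, and with a simplified row-major ordering):

    # Illustrative sketch of the index math in the removed lowering (not MLIR code).
    def linearize(coords, shape):
        # Simplified row-major linearization; the real helper also takes an order.
        offset = 0
        for c, s in zip(coords, shape):
            offset = offset * s + c
        return offset

    def split_coord(coord, shape_per_cta):
        # Which CTA owns this element, and where it lives inside that CTA's tile.
        cta_id = [c // s for c, s in zip(coord, shape_per_cta)]
        local = [c % s for c, s in zip(coord, shape_per_cta)]
        return cta_id, local

    # Example: 64x64 elements per CTA; global coordinate (70, 10) lives in CTA (1, 0).
    cta_id, local = split_coord([70, 10], [64, 64])
    print(cta_id, local)               # [1, 0] [6, 10]
    print(linearize(local, [64, 64]))  # shared-memory offset inside the remote CTA

With the `shouldUseDistSmem` guard in the second hunk removed, the conversion always takes the regular shared-memory path on the Intel backend.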
16 changes: 0 additions & 16 deletions third_party/intel/triton_xpu.cc
@@ -62,22 +62,6 @@ void init_triton_intel(py::module &&m) {
   auto passes = m.def_submodule("passes");
   init_triton_intel_passes_ttgpuir(passes.def_submodule("ttgpuir"));
 
-  // cluster info
-  py::class_<mlir::triton::nvidia_gpu::ClusterInfo>(m, "ClusterInfo")
-      .def(py::init<>())
-      .def_readwrite("clusterDimX",
-                     &mlir::triton::nvidia_gpu::ClusterInfo::clusterDimX)
-      .def_readwrite("clusterDimY",
-                     &mlir::triton::nvidia_gpu::ClusterInfo::clusterDimY)
-      .def_readwrite("clusterDimZ",
-                     &mlir::triton::nvidia_gpu::ClusterInfo::clusterDimZ)
-      .def("__repr__", [](mlir::triton::nvidia_gpu::ClusterInfo &self) {
-        std::ostringstream oss;
-        oss << "(" << self.clusterDimX << ", " << self.clusterDimY << ", "
-            << self.clusterDimZ << ")";
-        return oss.str();
-      });
-
   // load dialects
   m.def("load_dialects", [](mlir::MLIRContext &context) {
     mlir::DialectRegistry registry;
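After this change the Intel Python extension no longer exposes a `ClusterInfo` class, which is what the `intel.ClusterInfo()` call removed from compiler.py relied on. A hedged illustration of the user-visible effect (the import path is an assumption based on how the backend bindings are typically loaded):

    # Hypothetical check of the binding removal; the import path is an assumption.
    from triton._C.libtriton import intel

    assert not hasattr(intel, "ClusterInfo"), "ClusterInfo is no longer bound on the Intel backend"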
