From fbb37e960616efcf7cd5c1ebbe95f75c65d565dc Mon Sep 17 00:00:00 2001
From: Graham Hunter
Date: Mon, 13 May 2024 11:35:28 +0100
Subject: [PATCH] [AArch64] Add an all-in-one histogram intrinsic

Based on discussion from
https://discourse.llvm.org/t/rfc-vectorization-support-for-histogram-count-operations/74788

Current interface is:

llvm.experimental.histogram(<vecty> ptrs, <intty> inc_amount, <mask_vecty> mask)

The integer type used by 'inc_amount' needs to match the type of the buckets
in memory.

The intrinsic covers the following operations:
 * Gather load
 * histogram on the elements of 'ptrs'
 * multiply the histogram results by 'inc_amount'
 * add the result of the multiply to the values loaded by the gather
 * scatter store the results of the add

Supports lowering to histcnt instructions for AArch64 targets, and
scalarization for all others at present.
---
 llvm/docs/LangRef.rst                         |  54 +++++++++
 .../llvm/Analysis/TargetTransformInfo.h       |   7 ++
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   4 +
 llvm/include/llvm/CodeGen/ISDOpcodes.h        |   5 +
 llvm/include/llvm/CodeGen/SelectionDAG.h      |   3 +
 llvm/include/llvm/CodeGen/SelectionDAGNodes.h |  33 +++++
 llvm/include/llvm/IR/Intrinsics.td            |   7 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   5 +
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  38 ++++++
 .../SelectionDAG/SelectionDAGBuilder.cpp      |  63 ++++++++++
 .../SelectionDAG/SelectionDAGBuilder.h        |   1 +
 .../SelectionDAG/SelectionDAGDumper.cpp       |   3 +
 .../Target/AArch64/AArch64ISelLowering.cpp    |  63 ++++++++++
 llvm/lib/Target/AArch64/AArch64ISelLowering.h |   1 +
 .../Scalar/ScalarizeMaskedMemIntrin.cpp       |  69 +++++++++++
 .../AArch64/neon-scalarize-histogram.ll       | 114 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/sve2-histcnt.ll     |  53 ++++++++
 17 files changed, 523 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
 create mode 100644 llvm/test/CodeGen/AArch64/sve2-histcnt.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index d0515876f9e4f6..06809f8bf445d8 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19143,6 +19143,60 @@ will be on any later loop iteration.
 This intrinsic will only return 0 if the input count is also 0. A non-zero input
 count will produce a non-zero result.
 
+'``llvm.experimental.vector.histogram.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These intrinsics are overloaded.
+
+These intrinsics represent histogram-like operations; that is, updating values
+in memory that may not be contiguous, and where multiple elements within a
+single vector may be updating the same value in memory.
+
+The update operation must be specified as part of the intrinsic name. For a
+simple histogram like the following, the ``add`` operation would be used.
+
+.. code-block:: c
+
+    void simple_histogram(int *restrict buckets, unsigned *indices, int N, int inc) {
+      for (int i = 0; i < N; ++i)
+        buckets[indices[i]] += inc;
+    }
+
+More update operation types may be added in the future.
+
+::
+
+    declare void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask)
+    declare void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %ptrs, i64 %inc, <vscale x 2 x i1> %mask)
+
+Arguments:
+""""""""""
+
+The first argument is a vector of pointers to the memory locations to be
+updated. The second argument is a scalar used to update the value from
+memory; it must match the type of value to be updated. The final argument
+is a mask value to exclude locations from being modified.
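+
+For example, one vectorized iteration of the ``simple_histogram`` loop above,
+with a vector width of 8, could call the intrinsic as sketched below. This is
+an illustrative sketch only; ``%indices`` and ``%mask`` are hypothetical
+values standing in for the vectorizer's vector of bucket indices and its
+active-lane mask.
+
+.. code-block:: llvm
+
+    %bucket.ptrs = getelementptr inbounds i32, ptr %buckets, <8 x i64> %indices
+    call void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %bucket.ptrs, i32 %inc, <8 x i1> %mask)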
+
+Semantics:
+""""""""""
+
+The '``llvm.experimental.vector.histogram.*``' intrinsics are used to perform
+updates on potentially overlapping values in memory. The intrinsics represent
+the following sequence of operations:
+
+1. Gather load from the ``ptrs`` operand, with element type matching that of
+   the ``inc`` operand.
+2. Update of the values loaded from memory. In the case of the ``add``
+   update operation, this means:
+
+   1. Perform a cross-vector histogram operation on the ``ptrs`` operand.
+   2. Multiply the result by the ``inc`` operand.
+   3. Add the result to the values loaded from memory.
+3. Scatter the result of the update operation to the memory locations from
+   the ``ptrs`` operand.
+
+The ``mask`` operand will apply to at least the gather and scatter operations.
+
 Matrix Intrinsics
 -----------------
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f0eb83c143e2c8..0c3a6b3742c735 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -797,6 +797,9 @@ class TargetTransformInfo {
   /// Return true if the target supports strided load.
   bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const;
 
+  // Return true if the target supports masked vector histograms.
+  bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const;
+
   /// Return true if this is an alternating opcode pattern that can be lowered
   /// to a single instruction on the target. In X86 this is for the addsub
   /// instruction which corrsponds to a Shuffle + Fadd + FSub pattern in IR.
@@ -1883,6 +1886,7 @@ class TargetTransformInfo::Concept {
   virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0;
   virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0;
+  virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0;
   virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
                                unsigned Opcode1,
                                const SmallBitVector &OpcodeMask) const = 0;
@@ -2386,6 +2390,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override {
     return Impl.isLegalStridedLoadStore(DataType, Alignment);
   }
+  bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override {
+    return Impl.isLegalMaskedVectorHistogram(AddrType, DataType);
+  }
   bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
                        const SmallBitVector &OpcodeMask) const override {
     return Impl.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 262ebdb3cbef99..9a57331d281db3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -316,6 +316,10 @@ class TargetTransformInfoImplBase {
     return false;
   }
 
+  bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const {
+    return false;
+  }
+
   bool enableOrderedReductions() const { return false; }
 
   bool hasDivRemOp(Type *DataType, bool IsSigned) const { return false; }
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index 6429947958ee91..d8af97957e48ec 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1402,6 +1402,11 @@ enum NodeType {
  // which is later translated to an implicit use in the MIR.
  CONVERGENCECTRL_GLUE,

+  // Experimental vector histogram intrinsic
+  // Operands: Input Chain, Inc, Mask, Base, Index, Scale, ID
+  // Output: Output Chain
+  EXPERIMENTAL_VECTOR_HISTOGRAM,
+
   /// BUILTIN_OP_END - This must be the last enum value in this list.
   /// The target-specific pre-isel opcode values start here.
   BUILTIN_OP_END
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index c08e57ba3f6783..979ef8033eb5e7 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1526,6 +1526,9 @@ class SelectionDAG {
                             ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
                             ISD::MemIndexType IndexType,
                             bool IsTruncating = false);
+  SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl,
+                             ArrayRef<SDValue> Ops, MachineMemOperand *MMO,
+                             ISD::MemIndexType IndexType);
 
   SDValue getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr, EVT MemVT,
                       MachineMemOperand *MMO);
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index e7c71041454557..ac94c6099d0802 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -542,6 +542,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadStoreSDNode;
     friend class MaskedGatherScatterSDNode;
     friend class VPGatherScatterSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumMemSDNodeBits;
 
@@ -552,6 +553,7 @@ BEGIN_TWO_BYTE_PACK()
     // MaskedLoadStoreBaseSDNode => enum ISD::MemIndexedMode
     // VPGatherScatterSDNode => enum ISD::MemIndexType
     // MaskedGatherScatterSDNode => enum ISD::MemIndexType
+    // MaskedHistogramSDNode => enum ISD::MemIndexType
     uint16_t AddressingMode : 3;
   };
   enum { NumLSBaseSDNodeBits = NumMemSDNodeBits + 3 };
@@ -564,6 +566,7 @@ BEGIN_TWO_BYTE_PACK()
     friend class MaskedLoadSDNode;
     friend class MaskedGatherSDNode;
     friend class VPGatherSDNode;
+    friend class MaskedHistogramSDNode;
 
     uint16_t : NumLSBaseSDNodeBits;
 
@@ -1420,6 +1423,7 @@ class MemSDNode : public SDNode {
       return getOperand(2);
     case ISD::MGATHER:
     case ISD::MSCATTER:
+    case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
       return getOperand(3);
     default:
       return getOperand(1);
@@ -1468,6 +1472,7 @@ class MemSDNode : public SDNode {
     case ISD::EXPERIMENTAL_VP_STRIDED_STORE:
     case ISD::GET_FPENV_MEM:
     case ISD::SET_FPENV_MEM:
+    case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
       return true;
     default:
       return N->isMemIntrinsic() || N->isTargetMemoryOpcode();
@@ -2953,6 +2958,34 @@ class MaskedScatterSDNode : public MaskedGatherScatterSDNode {
   }
 };
 
+class MaskedHistogramSDNode : public MemSDNode {
+public:
+  friend class SelectionDAG;
+
+  MaskedHistogramSDNode(unsigned Order, const DebugLoc &DL, SDVTList VTs,
+                        EVT MemVT, MachineMemOperand *MMO,
+                        ISD::MemIndexType IndexType)
+      : MemSDNode(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, Order, DL, VTs, MemVT,
+                  MMO) {
+    LSBaseSDNodeBits.AddressingMode = IndexType;
+  }
+
+  ISD::MemIndexType getIndexType() const {
+    return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode);
+  }
+
+  const SDValue &getBasePtr() const { return getOperand(3); }
+  const SDValue &getIndex() const { return getOperand(4); }
+  const SDValue &getMask() const { return getOperand(2); }
+  const SDValue &getScale() const { return getOperand(5); }
+  const SDValue &getInc() const { return getOperand(1); }
+  const SDValue &getIntID() const { return getOperand(6); }
+
+  static bool classof(const SDNode *N) {
+    return N->getOpcode() == ISD::EXPERIMENTAL_VECTOR_HISTOGRAM;
+  }
+};
+
 class FPStateAccessSDNode : public MemSDNode {
 public:
   friend class SelectionDAG;
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 42192d472ba6ec..f1c7d950f92755 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1856,6 +1856,13 @@ def int_experimental_vp_strided_load : DefaultAttrsIntrinsic<[llvm_anyvector_ty
                                            llvm_i32_ty],
                                          [ NoCapture<ArgIndex<0>>, IntrNoSync,
                                            IntrReadMem, IntrWillReturn, IntrArgMemOnly ]>;
 
+// Experimental histogram
+def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[],
+                             [ llvm_anyvector_ty, // Vector of pointers
+                               llvm_anyint_ty,    // Increment
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>],  // Mask
+                             [ IntrArgMemOnly ]>;
+
 // Operators
 let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
   // Integer arithmetic
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 00443ace46f745..f6a458f7ded466 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -513,6 +513,11 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType,
   return TTIImpl->isLegalStridedLoadStore(DataType, Alignment);
 }
 
+bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType,
+                                                       Type *DataType) const {
+  return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType);
+}
+
 bool TargetTransformInfo::enableOrderedReductions() const {
   return TTIImpl->enableOrderedReductions();
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 0a258350c68a58..247f52370e4c11 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -9633,6 +9633,44 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl,
   return V;
 }
 
+SDValue SelectionDAG::getMaskedHistogram(SDVTList VTs, EVT MemVT,
+                                         const SDLoc &dl, ArrayRef<SDValue> Ops,
+                                         MachineMemOperand *MMO,
+                                         ISD::MemIndexType IndexType) {
+  assert(Ops.size() == 7 && "Incompatible number of operands");
+
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, VTs, Ops);
+  ID.AddInteger(MemVT.getRawBits());
+  ID.AddInteger(getSyntheticNodeSubclassData<MaskedHistogramSDNode>(
+      dl.getIROrder(), VTs, MemVT, MMO, IndexType));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  ID.AddInteger(MMO->getFlags());
+  void *IP = nullptr;
+  if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+    cast<MaskedHistogramSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+
+  auto *N = newSDNode<MaskedHistogramSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+                                             VTs, MemVT, MMO, IndexType);
+  createOperands(N, Ops);
+
+  assert(N->getMask().getValueType().getVectorElementCount() ==
+             N->getIndex().getValueType().getVectorElementCount() &&
+         "Vector width mismatch between mask and data");
+  assert(isa<ConstantSDNode>(N->getScale()) &&
+         N->getScale()->getAsAPIntVal().isPowerOf2() &&
+         "Scale should be a constant power of 2");
+  assert(N->getInc().getValueType().isInteger() && "Non integer update value");
+
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  SDValue V(N, 0);
+  NewSDValueDbgMsg(V, "Creating new node: ", this);
+  return V;
+}
+
 SDValue SelectionDAG::getGetFPEnv(SDValue Chain, const SDLoc &dl, SDValue Ptr,
                                   EVT MemVT, MachineMemOperand *MMO) {
   assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b76036a22992db..ca352da5d36eb4 100644
---
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6281,6 +6281,64 @@ void SelectionDAGBuilder::visitConvergenceControl(const CallInst &I, } } +void SelectionDAGBuilder::visitVectorHistogram(const CallInst &I, + unsigned IntrinsicID) { + // For now, we're only lowering an 'add' histogram. + // We can add others later, e.g. saturating adds, min/max. + assert(IntrinsicID == Intrinsic::experimental_vector_histogram_add && + "Tried to lower unsupported histogram type"); + SDLoc sdl = getCurSDLoc(); + Value *Ptr = I.getOperand(0); + SDValue Inc = getValue(I.getOperand(1)); + SDValue Mask = getValue(I.getOperand(2)); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + DataLayout TargetDL = DAG.getDataLayout(); + EVT VT = Inc.getValueType(); + Align Alignment = DAG.getEVTAlign(VT); + + const MDNode *Ranges = getRangeMetadata(I); + + SDValue Root = DAG.getRoot(); + SDValue Base; + SDValue Index; + ISD::MemIndexType IndexType; + SDValue Scale; + bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this, + I.getParent(), VT.getScalarStoreSize()); + + unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace(); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(AS), + MachineMemOperand::MOLoad | MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, Alignment, I.getAAMetadata(), Ranges); + + if (!UniformBase) { + Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); + Index = getValue(Ptr); + IndexType = ISD::SIGNED_SCALED; + Scale = + DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); + } + + EVT IdxVT = Index.getValueType(); + EVT EltTy = IdxVT.getVectorElementType(); + if (TLI.shouldExtendGSIndex(IdxVT, EltTy)) { + EVT NewIdxVT = IdxVT.changeVectorElementType(EltTy); + Index = DAG.getNode(ISD::SIGN_EXTEND, sdl, NewIdxVT, Index); + } + + SDValue ID = DAG.getTargetConstant(IntrinsicID, sdl, MVT::i32); + + SDValue Ops[] = {Root, Inc, Mask, Base, Index, Scale, ID}; + SDValue Histogram = DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), VT, sdl, + Ops, MMO, IndexType); + + setValue(&I, Histogram); + DAG.setRoot(Histogram); +} + /// Lower the call to the specified intrinsic function. 
 void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                                              unsigned Intrinsic) {
@@ -7948,6 +8006,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::experimental_convergence_entry:
   case Intrinsic::experimental_convergence_loop:
     visitConvergenceControl(I, Intrinsic);
+    return;
+  case Intrinsic::experimental_vector_histogram_add: {
+    visitVectorHistogram(I, Intrinsic);
+    return;
+  }
   }
 }
 
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 211e1653de560a..ae361f8c500a08 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -624,6 +624,7 @@ class SelectionDAGBuilder {
   void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
   void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
   void visitConvergenceControl(const CallInst &I, unsigned Intrinsic);
+  void visitVectorHistogram(const CallInst &I, unsigned IntrinsicID);
   void visitVPLoad(const VPIntrinsic &VPIntrin, EVT VT,
                    const SmallVectorImpl<SDValue> &OpValues);
   void visitVPStore(const VPIntrinsic &VPIntrin,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 4ad4a938ca97f2..59742e90c6791c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -529,6 +529,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::PATCHPOINT:
     return "patchpoint";
 
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    return "histogram";
+
     // Vector Predication
 #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...)                    \
   case ISD::SDID:                                                              \
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 1e0071fffe6665..0f1db3cb17aa69 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1618,6 +1618,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
     }
 
+    // Histcnt is SVE2 only
+    if (Subtarget->hasSVE2() && Subtarget->isSVEAvailable())
+      setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::Other,
+                         Custom);
+
     // NOTE: Currently this has to happen after computeRegisterProperties rather
     // than the preferred option of combining it with the addRegisterClass call.
     if (Subtarget->useSVEForFixedLengthVectors()) {
@@ -6775,6 +6780,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
     return LowerFunnelShift(Op, DAG);
   case ISD::FLDEXP:
     return LowerFLDEXP(Op, DAG);
+  case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
+    return LowerVECTOR_HISTOGRAM(Op, DAG);
   }
 }
 
@@ -27355,6 +27362,62 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
   return DAG.getMergeValues({Lo, Hi}, DL);
 }
 
+SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  // FIXME: Maybe share some code with LowerMGather/Scatter?
+  MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
+  SDLoc DL(HG);
+  SDValue Chain = HG->getChain();
+  SDValue Inc = HG->getInc();
+  SDValue Mask = HG->getMask();
+  SDValue Ptr = HG->getBasePtr();
+  SDValue Index = HG->getIndex();
+  SDValue Scale = HG->getScale();
+  SDValue IntID = HG->getIntID();
+
+  // The Intrinsic ID determines the type of update operation.
+  ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
+  // Right now, we only support 'add' as an update.
+ assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add && + "Unexpected histogram update operation"); + + EVT IncVT = Inc.getValueType(); + EVT IndexVT = Index.getValueType(); + EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT, + IndexVT.getVectorElementCount()); + SDValue Zero = DAG.getConstant(0, DL, MVT::i64); + SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero); + SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc); + SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale}; + + // Set the MMO to load only, rather than load|store. + MachineMemOperand *GMMO = HG->getMemOperand(); + GMMO->setFlags(MachineMemOperand::MOLoad); + ISD::MemIndexType IndexType = HG->getIndexType(); + SDValue Gather = + DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops, + GMMO, IndexType, ISD::NON_EXTLOAD); + + SDValue GChain = Gather.getValue(1); + + // Perform the histcnt, multiply by inc, add to bucket data. + SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT); + SDValue HistCnt = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index); + SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat); + SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul); + + // Create a new MMO for the scatter. + MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand( + GMMO->getPointerInfo(), MachineMemOperand::MOStore, GMMO->getSize(), + GMMO->getAlign(), GMMO->getAAInfo()); + + SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale}; + SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL, + ScatterOps, SMMO, IndexType, false); + return Scatter; +} + SDValue AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index b3e282a0406038..a44a3d35d2f9c8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1149,6 +1149,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerVECTOR_HISTOGRAM(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index a4111fad5d9f2e..de80fa2c05023c 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -862,6 +862,69 @@ static void scalarizeMaskedCompressStore(const DataLayout &DL, CallInst *CI, ModifiedDT = true; } +static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, + DomTreeUpdater *DTU, + bool &ModifiedDT) { + // If we extend histogram to return a result someday (like the updated vector) + // then we'll need to support it here. 
+  assert(CI->getType()->isVoidTy() && "Histogram with non-void return.");
+  Value *Ptrs = CI->getArgOperand(0);
+  Value *Inc = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+
+  auto *AddrType = cast<FixedVectorType>(Ptrs->getType());
+  Type *EltTy = Inc->getType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // FIXME: Do we need to add an alignment parameter to the intrinsic?
+  unsigned VectorWidth = AddrType->getNumElements();
+
+  // Shorten the way if the mask is a vector of constants.
+  if (isConstantIntVector(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+        continue;
+      Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+      LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
+      Value *Add = Builder.CreateAdd(Load, Inc);
+      Builder.CreateStore(Add, Ptr);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+
+    Instruction *ThenTerm =
+        SplitBlockAndInsertIfThen(Predicate, InsertPt, /*Unreachable=*/false,
+                                  /*BranchWeights=*/nullptr, DTU);
+
+    BasicBlock *CondBlock = ThenTerm->getParent();
+    CondBlock->setName("cond.histogram.update");
+
+    Builder.SetInsertPoint(CondBlock->getTerminator());
+    Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx));
+    LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx));
+    Value *Add = Builder.CreateAdd(Load, Inc);
+    Builder.CreateStore(Add, Ptr);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0);
+    NewIfBlock->setName("else");
+    Builder.SetInsertPoint(NewIfBlock, NewIfBlock->begin());
+  }
+
+  CI->eraseFromParent();
+  ModifiedDT = true;
+}
+
 static bool runImpl(Function &F, const TargetTransformInfo &TTI,
                     DominatorTree *DT) {
   std::optional<DomTreeUpdater> DTU;
@@ -938,6 +1001,12 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT,
     switch (II->getIntrinsicID()) {
     default:
       break;
+    case Intrinsic::experimental_vector_histogram_add:
+      if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(),
+                                           CI->getArgOperand(1)->getType()))
+        return false;
+      scalarizeMaskedVectorHistogram(DL, CI, DTU, ModifiedDT);
+      break;
     case Intrinsic::masked_load:
       // Scalarize unsupported vector masked load
       if (TTI.isLegalMaskedLoad(
diff --git a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
new file mode 100644
index 00000000000000..45f1429a810a0c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+;; This test exercises the default lowering of the histogram to scalarized code.
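+;; For each lane, ScalarizeMaskedMemIntrin checks the mask bit and, when it is
+;; set, performs a scalar load/add/store on that lane's bucket pointer. For
+;; lane 0 the expanded IR looks roughly like this (illustrative sketch; value
+;; names follow the pass's "Mask"/"Ptr"/"Load" plus lane-index convention):
+;;   %Mask0 = extractelement <2 x i1> %mask, i64 0
+;;   br i1 %Mask0, label %cond.histogram.update, label %else
+;; cond.histogram.update:
+;;   %Ptr0 = extractelement <2 x ptr> %buckets, i64 0
+;;   %Load0 = load i64, ptr %Ptr0
+;;   %0 = add i64 %Load0, %inc
+;;   store i64 %0, ptr %Ptr0
+;;   br label %else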
+ +define void @histogram_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) { +; CHECK-LABEL: histogram_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: tbnz w8, #0, .LBB0_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbnz w8, #0, .LBB0_4 +; CHECK-NEXT: .LBB0_2: // %else2 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB0_3: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: add x9, x9, x0 +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbz w8, #0, .LBB0_2 +; CHECK-NEXT: .LBB0_4: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: add x9, x9, x0 +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: ret + call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) + ret void +} + +define void @histogram_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) { +; CHECK-LABEL: histogram_i32_literal: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: sshll v3.2d, v0.2s, #2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: add v3.2d, v2.2d, v3.2d +; CHECK-NEXT: tbz w8, #0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB1_2: // %else +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: tbz w8, #0, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB1_4: // %else2 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: tbnz w8, #0, .LBB1_7 +; CHECK-NEXT: // %bb.5: // %else4 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbnz w8, #0, .LBB1_8 +; CHECK-NEXT: .LBB1_6: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_7: // %cond.histogram.update3 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbz w8, #0, .LBB1_6 +; CHECK-NEXT: .LBB1_8: // %cond.histogram.update5 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: add w9, w9, #1 +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ret + + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask) + ret void +} + +define void @histogram_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { +; CHECK-LABEL: histogram_i32_literal_alltruemask: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #2 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: add v2.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: ldr w10, [x8] +; CHECK-NEXT: add w10, w10, #1 +; CHECK-NEXT: str w10, [x8] +; CHECK-NEXT: ldr w8, [x9] +; CHECK-NEXT: add w8, w8, #1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: ldr w10, [x8] +; CHECK-NEXT: add w10, w10, #1 +; CHECK-NEXT: str w10, [x8] +; CHECK-NEXT: ldr w8, [x9] +; CHECK-NEXT: add w8, w8, #1 +; CHECK-NEXT: str w8, [x9] +; CHECK-NEXT: ret + + %buckets = getelementptr i32, ptr %base, <4 x 
i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2-histcnt.ll b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
new file mode 100644
index 00000000000000..557a42116cdb00
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-histcnt.ll
@@ -0,0 +1,53 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=aarch64 < %s -o - | FileCheck %s
+
+define void @histogram_i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
+; CHECK-NEXT:    mov z3.d, x0
+; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ptrue p1.d
+; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
+; CHECK-NEXT:    st1d { z1.d }, p0, [z0.d]
+; CHECK-NEXT:    ret
+  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+;; FIXME: We may need some dagcombines here? We're multiplying the output of the histcnt
+;; by 1, so we should be able to remove that and directly add the histcnt to the
+;; current bucket data.
+define void @histogram_i32_literal(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_literal:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z3.s, #1 // =0x1
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ret
+
+  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+define void @histogram_i32_literal_noscale(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
+; CHECK-LABEL: histogram_i32_literal_noscale:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
+; CHECK-NEXT:    mov z3.s, #1 // =0x1
+; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ptrue p1.s
+; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
+; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+
+  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i32> %indices
+  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }
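For reference, the DAG that LowerVECTOR_HISTOGRAM builds for the histogram_i64
test above corresponds roughly to the following IR-level sketch. This is
hand-written for illustration only; the lowering operates on SelectionDAG
nodes rather than IR, and %ids is a hypothetical ptrtoint of the bucket
pointers standing in for the index operand fed to histcnt:

  %ids = ptrtoint <vscale x 2 x ptr> %buckets to <vscale x 2 x i64>
  ; Masked gather of the current bucket values (zero passthru for inactive lanes).
  %loaded = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> %buckets, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> zeroinitializer)
  ; Per-lane count of matching bucket addresses.
  %counts = call <vscale x 2 x i64> @llvm.aarch64.sve.histcnt.nxv2i64(<vscale x 2 x i1> %mask, <vscale x 2 x i64> %ids, <vscale x 2 x i64> %ids)
  ; Splat the scalar increment, scale the counts, and add to the loaded values.
  %inc.ins = insertelement <vscale x 2 x i64> poison, i64 %inc, i64 0
  %inc.splat = shufflevector <vscale x 2 x i64> %inc.ins, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
  %scaled = mul <vscale x 2 x i64> %counts, %inc.splat
  %updated = add <vscale x 2 x i64> %loaded, %scaled
  ; Masked scatter of the updated values back to the buckets.
  call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> %updated, <vscale x 2 x ptr> %buckets, i32 8, <vscale x 2 x i1> %mask)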