Updates after review
JonPsson1 committed Jan 10, 2024
1 parent 76bf0f9 commit 27f7015
Showing 7 changed files with 63 additions and 30 deletions.
13 changes: 10 additions & 3 deletions llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -343,12 +343,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
if (N->getOpcode() == ISD::ATOMIC_LOAD) {
ISD::LoadExtType ETy = cast<AtomicSDNode>(N)->getExtensionType();
if (ETy == ISD::NON_EXTLOAD) {
- if (TLI.getExtendForAtomicOps() == ISD::SIGN_EXTEND)
+ switch (TLI.getExtendForAtomicOps()) {
+ case ISD::SIGN_EXTEND:
ETy = ISD::SEXTLOAD;
- else if (TLI.getExtendForAtomicOps() == ISD::ZERO_EXTEND)
+ break;
+ case ISD::ZERO_EXTEND:
ETy = ISD::ZEXTLOAD;
- else
+ break;
+ case ISD::ANY_EXTEND:
ETy = ISD::EXTLOAD;
+ break;
+ default:
+ llvm_unreachable("Invalid atomic op extension");
+ }
}
cast<AtomicSDNode>(Res)->setExtensionType(ETy);
}
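
This hunk makes PromoteIntRes_Atomic0 tag the promoted ATOMIC_LOAD with the extension kind matching the target's getExtendForAtomicOps(), and asserts on anything else. A minimal IR sketch of the situation, illustrative only and not taken from the commit (the function name is made up): on a target whose getExtendForAtomicOps() returns ISD::SIGN_EXTEND the promoted load would be tagged SEXTLOAD, ZERO_EXTEND maps to ZEXTLOAD, and ANY_EXTEND to EXTLOAD.

; Illustrative reproducer (not from this commit): an i8 atomic load that the
; type legalizer promotes to a wider integer type.
define i32 @promoted_atomic_load(ptr %p) {
  %v = load atomic i8, ptr %p seq_cst, align 1   ; promoted ATOMIC_LOAD gets an extension type
  %e = sext i8 %v to i32
  ret i32 %e
}
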
7 changes: 2 additions & 5 deletions llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1516,13 +1516,10 @@ bool SystemZDAGToDAGISel::storeLoadIsAligned(SDNode *N) const {
MachineMemOperand *MMO = MemAccess->getMemOperand();
assert(MMO && "Expected a memory operand.");

- // These instructions are not atomic.
- if (MMO->isAtomic())
- return false;
-
// The memory access must have a proper alignment and no index register.
+ // ATOMIC_LOADs do not have the offset operand.
if (MemAccess->getAlign().value() < StoreSize ||
- !MemAccess->getOffset().isUndef())
+ (!MMO->isAtomic() && !MemAccess->getOffset().isUndef()))
return false;

// The MMO must not have an unaligned offset.
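
With this change storeLoadIsAligned() no longer rejects atomic accesses up front: alignment is still required, but the offset-operand check is skipped for ATOMIC_LOADs, which do not have that operand. Illustrative only, not from the commit (the function name is hypothetical): a naturally aligned atomic load of the sort the predicate can now accept.

define i64 @aligned_atomic_load(ptr %p) {
  ; align 8 satisfies the StoreSize check; an ATOMIC_LOAD has no offset operand to test
  %v = load atomic i64, ptr %p seq_cst, align 8
  ret i64 %v
}
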
15 changes: 7 additions & 8 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4507,7 +4507,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op,
return DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
}

- SDValue SystemZTargetLowering::lowerATOMIC_I128_LDST(SDValue Op,
+ SDValue SystemZTargetLowering::lowerATOMIC_LDST_I128(SDValue Op,
SelectionDAG &DAG) const {
auto *Node = cast<AtomicSDNode>(Op.getNode());
assert(Node->getMemoryVT() == MVT::i128 && "Only custom lowering i128.");
@@ -5637,12 +5637,11 @@ static SDValue tryBuildVectorShuffle(SelectionDAG &DAG,
return GS.getNode(DAG, SDLoc(BVN));
}

- bool SystemZTargetLowering::isVectorElementLoad(SDValue Op, EVT VecVT) const {
+ bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
if (Op.getOpcode() == ISD::LOAD && cast<LoadSDNode>(Op)->isUnindexed())
return true;
if (auto *AL = dyn_cast<AtomicSDNode>(Op))
- if (AL->getOpcode() == ISD::ATOMIC_LOAD && SDValue(AL, 0).hasOneUse() &&
- AL->getMemoryVT() == VecVT.getScalarType())
+ if (AL->getOpcode() == ISD::ATOMIC_LOAD)
return true;
if (Subtarget.hasVectorEnhancements2() && Op.getOpcode() == SystemZISD::LRV)
return true;
@@ -5681,13 +5680,13 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// we would need 2 instructions to replicate it: VLVGP followed by VREPx.
// This is only a win if the single defined element is used more than once.
// In other cases we're better off using a single VLVGx.
- if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single, VT)))
+ if (Single.getNode() && (Count > 1 || isVectorElementLoad(Single)))
return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);

// If all elements are loads, use VLREP/VLEs (below).
bool AllLoads = true;
for (auto Elem : Elems)
- if (!isVectorElementLoad(Elem, VT)) {
+ if (!isVectorElementLoad(Elem)) {
AllLoads = false;
break;
}
@@ -5759,7 +5758,7 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
std::map<const SDNode*, unsigned> UseCounts;
SDNode *LoadMaxUses = nullptr;
for (unsigned I = 0; I < NumElements; ++I)
- if (isVectorElementLoad(Elems[I], VT)) {
+ if (isVectorElementLoad(Elems[I])) {
SDNode *Ld = Elems[I].getNode();
UseCounts[Ld]++;
if (LoadMaxUses == nullptr || UseCounts[LoadMaxUses] < UseCounts[Ld])
@@ -6122,7 +6121,7 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_SWAPW);
case ISD::ATOMIC_STORE:
case ISD::ATOMIC_LOAD:
- return lowerATOMIC_I128_LDST(Op, DAG);
+ return lowerATOMIC_LDST_I128(Op, DAG);
case ISD::ATOMIC_LOAD_ADD:
return lowerATOMIC_LOAD_OP(Op, DAG, SystemZISD::ATOMIC_LOADW_ADD);
case ISD::ATOMIC_LOAD_SUB:
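
Besides the rename to lowerATOMIC_LDST_I128, this file relaxes isVectorElementLoad(): any ATOMIC_LOAD now counts as a vector-element load when building vectors, and the one-use and element-type restrictions are dropped here (the new f25_c/f25_d tests below cover the extending-load and extra-use cases). A sketch of the splat-from-atomic-load pattern this predicate feeds into, illustrative only and not from the commit; whether VLREP is actually selected still depends on the other checks in buildVector.

define <4 x i32> @replicate_atomic(ptr %p) {
  ; single, non-extending atomic element load replicated into all lanes
  %v = load atomic i32, ptr %p seq_cst, align 4
  %ins = insertelement <4 x i32> undef, i32 %v, i32 0
  %rep = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %rep
}
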
4 changes: 2 additions & 2 deletions llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -694,7 +694,7 @@ class SystemZTargetLowering : public TargetLowering {
SDValue lowerOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerATOMIC_I128_LDST(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerATOMIC_LDST_I128(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerATOMIC_LOAD_OP(SDValue Op, SelectionDAG &DAG,
unsigned Opcode) const;
SDValue lowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
@@ -704,7 +704,7 @@
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- bool isVectorElementLoad(SDValue Op, EVT VecVT) const;
+ bool isVectorElementLoad(SDValue Op) const;
SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
SmallVectorImpl<SDValue> &Elems) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
8 changes: 4 additions & 4 deletions llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -504,17 +504,17 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;

- defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, nonatomic_ld, 4>;
- defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, nonatomic_ld, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
}

// Fused multiply-subtract.
let Uses = [FPC], mayRaiseFPException = 1 in {
def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;

- defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, nonatomic_ld, 4>;
- defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, nonatomic_ld, 8>;
+ defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
+ defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
}

// Division.
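
The MAEB/MADB and MSEB/MSDB patterns now use the plain load fragment, so an atomic FP load can fold into the fused multiply-add/subtract memory forms (the nonatomic_ld PatFrag they used is deleted in the next file), as the updated f15 test shows for f32. A hypothetical f64 analogue, illustrative only and not part of the commit; the expectation, not verified here, is that madb can now fold the atomic load.

declare double @llvm.fma.f64(double, double, double)

define double @fold_atomic_into_madb(double %f1, ptr %ptr, double %acc) {
  ; seq_cst atomic load used directly as an FMA multiplicand
  %f2 = load atomic double, ptr %ptr seq_cst, align 8
  %res = call double @llvm.fma.f64(double %f1, double %f2, double %acc)
  ret double %res
}
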
4 changes: 0 additions & 4 deletions llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -607,10 +607,6 @@ def nonvolatile_anyextloadi8 : NonvolatileLoad<anyextloadi8>;
def nonvolatile_anyextloadi16 : NonvolatileLoad<anyextloadi16>;
def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;

- def nonatomic_ld : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return !cast<LoadSDNode>(N)->isAtomic();
- }]>;
-
// Non-volatile stores.
class NonvolatileStore<SDPatternOperator store>
: PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
42 changes: 38 additions & 4 deletions llvm/test/CodeGen/SystemZ/atomic-memofolds.ll
@@ -170,12 +170,11 @@ define i64 @f14(i64 %a, ptr %src) {
ret i64 %sub
}

- ; Check that maeb (reg/mem) is *not* used for an atomic load.
define float @f15(float %f1, ptr %ptr, float %acc) {
; CHECK-LABEL: f15:
; CHECK: # %bb.0:
- ; CHECK-NEXT: lde %f1, 0(%r2)
- ; CHECK-NEXT: wfmasb %f0, %f0, %f1, %f2
+ ; CHECK-NEXT: maeb %f2, %f0, 0(%r2)
+ ; CHECK-NEXT: ldr %f0, %f2
; CHECK-NEXT: br %r14
%f2 = load atomic float, ptr %ptr seq_cst, align 4
%res = call float @llvm.fma.f32 (float %f1, float %f2, float %acc)
@@ -387,6 +386,39 @@ define void @f25_b(ptr %src, ptr %dst) {
ret void
}

+ ; Do *not* use vlrep for an extending load.
+ define <4 x i32> @f25_c(ptr %ptr) {
+ ; CHECK-LABEL: f25_c:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: lb %r0, 0(%r2)
+ ; CHECK-NEXT: vlvgp %v0, %r0, %r0
+ ; CHECK-NEXT: vrepf %v24, %v0, 1
+ ; CHECK-NEXT: br %r14
+ %L = load atomic i8, ptr %ptr seq_cst, align 4
+ %S = sext i8 %L to i32
+ %val = insertelement <4 x i32> undef, i32 %S, i32 0
+ %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+ <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+ }
+
+ ; Do *not* use vlrep if there is another scalar use.
+ define <4 x i32> @f25_d(ptr %ptr, ptr %dst) {
+ ; CHECK-LABEL: f25_d:
+ ; CHECK: # %bb.0:
+ ; CHECK-NEXT: l %r0, 0(%r2)
+ ; CHECK-NEXT: vlvgp %v0, %r0, %r0
+ ; CHECK-NEXT: vrepf %v24, %v0, 1
+ ; CHECK-NEXT: st %r0, 0(%r3)
+ ; CHECK-NEXT: br %r14
+ %L = load atomic i32, ptr %ptr seq_cst, align 4
+ store i32 %L, ptr %dst, align 4
+ %val = insertelement <4 x i32> undef, i32 %L, i32 0
+ %ret = shufflevector <4 x i32> %val, <4 x i32> undef,
+ <4 x i32> zeroinitializer
+ ret <4 x i32> %ret
+ }

define void @f26(ptr %src, ptr %dst) {
; CHECK-LABEL: f26:
; CHECK: # %bb.0:
@@ -412,6 +444,8 @@ define void @f26_b(ptr %src, ptr %dst) {
ret void
}



; Vector Load logical element and zero.
define <16 x i8> @f27(ptr %ptr) {
; CHECK-LABEL: f27:
@@ -607,7 +641,7 @@ define void @f43(ptr %ptr) {
define void @f44(ptr %ptr) {
; CHECK-LABEL: f44:
; CHECK: # %bb.0:
- ; CHECK-NEXT: larl %r1, .LCPI48_0
+ ; CHECK-NEXT: larl %r1, .LCPI50_0
; CHECK-NEXT: ld %f0, 0(%r1)
; CHECK-NEXT: std %f0, 0(%r2)
; CHECK-NEXT: bcr 14, %r0