From 05f987743170cbd9fc97699c7a9b352055de7300 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Fri, 12 Jul 2024 13:47:10 +0100
Subject: [PATCH] [X86] Add handling for
 shift_logical(select(icmp_uge(amt,BW),0,x),amt) -> avx2 shift(x,amt)

We need to catch this otherwise pre-AVX512 targets will fold this to
shift_logical(and(icmp_ult(amt,BW),x),amt)
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 16 ++++++++++++++++
 llvm/test/CodeGen/X86/combine-shl.ll    | 19 +++++--------------
 llvm/test/CodeGen/X86/combine-srl.ll    | 19 +++++--------------
 3 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0a11e2134c7088..a731541ca7778e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48042,6 +48042,14 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG,
         SV == VT.getScalarSizeInBits()) {
       return DAG.getNode(X86ISD::VSHLV, DL, VT, N00, N1);
     }
+    // fold shl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psllv(x,amt)
+    if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 &&
+        cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUGE &&
+        ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) &&
+        ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
+        SV == VT.getScalarSizeInBits()) {
+      return DAG.getNode(X86ISD::VSHLV, DL, VT, N01, N1);
+    }
   }
 
   // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
@@ -48176,6 +48184,14 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
         SV == VT.getScalarSizeInBits()) {
       return DAG.getNode(X86ISD::VSRLV, DL, VT, N00, N1);
     }
+    // fold srl(select(icmp_uge(amt,BW),0,x),amt) -> avx2 psrlv(x,amt)
+    if (Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == N1 &&
+        cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUGE &&
+        ISD::isConstantSplatVector(Cond.getOperand(1).getNode(), SV) &&
+        ISD::isConstantSplatVectorAllZeros(N00.getNode()) &&
+        SV == VT.getScalarSizeInBits()) {
+      return DAG.getNode(X86ISD::VSRLV, DL, VT, N01, N1);
+    }
   }
 
   // Only do this on the last DAG combine as it can interfere with other
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index c5ce1e0046ad09..8d8c1d26fc5cac 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -1044,19 +1044,10 @@ define <4 x i32> @combine_vec_shl_commuted_clamped(<4 x i32> %sh, <4 x i32> %amt
 ; SSE41-NEXT:    pmulld %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
-; AVX2-LABEL: combine_vec_shl_commuted_clamped:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
-; AVX2-NEXT:    vpminud %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: combine_vec_shl_commuted_clamped:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: combine_vec_shl_commuted_clamped:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
   %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
   %shl = shl <4 x i32> %1, %amt
@@ -1112,4 +1103,4 @@ define <4 x i32> @combine_vec_shl_commuted_clamped1(<4 x i32> %sh, <4 x i32> %am
   %shl = shl <4 x i32> %sh, %amt
   %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shl
   ret <4 x i32> %1
-}
\ No newline at end of file
+}
diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll
index 78dcf6e7434007..f2a9aa217f7ec6 100644
--- a/llvm/test/CodeGen/X86/combine-srl.ll
+++ b/llvm/test/CodeGen/X86/combine-srl.ll
@@ -771,19 +771,10 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped(<4 x i32> %sh, <4 x i32> %am
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX2-LABEL: combine_vec_lshr_commuted_clamped:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
-; AVX2-NEXT:    vpminud %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
-; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: combine_vec_lshr_commuted_clamped:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: combine_vec_lshr_commuted_clamped:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
   %cmp.i = icmp uge <4 x i32> %amt, <i32 32, i32 32, i32 32, i32 32>
   %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %sh
   %shr = lshr <4 x i32> %1, %amt
@@ -854,4 +845,4 @@ define <4 x i32> @combine_vec_lshr_commuted_clamped1(<4 x i32> %sh, <4 x i32> %a
   %shr = lshr <4 x i32> %sh, %amt
   %1 = select <4 x i1> %cmp.i, <4 x i32> zeroinitializer, <4 x i32> %shr
   ret <4 x i32> %1
-}
\ No newline at end of file
+}