diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 6e45084dc4b80a..08e751edbb3287 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -1,98 +1,1443 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s - -; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f32: -; GCN: buffer_load_b32 [[REGC:v[0-9]+]] -; GCN: buffer_load_b32 [[REGB:v[0-9]+]] -; GCN: buffer_load_b32 [[REGA:v[0-9]+]] -; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_b32 [[RESULT]], -define amdgpu_kernel void @test_fmaximum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile float, ptr addrspace(1) %aptr, align 4 - %b = load volatile float, ptr addrspace(1) %bptr, align 4 - %c = load volatile float, ptr addrspace(1) %cptr, align 4 - %f0 = call float @llvm.maximum.f32(float %a, float %b) - %f1 = call float @llvm.maximum.f32(float %f0, float %c) - store float %f1, ptr addrspace(1) %out, align 4 - ret void -} - -; Commute operand of second fmaximum -; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f32: -; GCN: buffer_load_b32 [[REGB:v[0-9]+]] -; GCN: buffer_load_b32 [[REGA:v[0-9]+]] -; GCN: buffer_load_b32 [[REGC:v[0-9]+]] -; GCN: v_maximum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_b32 [[RESULT]], -define amdgpu_kernel void @test_fmaximum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile float, ptr addrspace(1) %aptr, align 4 - %b = load volatile float, ptr addrspace(1) %bptr, align 4 - %c = load volatile float, ptr addrspace(1) %cptr, align 4 - %f0 = call float @llvm.maximum.f32(float %a, float %b) - %f1 = call float @llvm.maximum.f32(float %c, float %f0) - store float %f1, ptr addrspace(1) %out, align 4 - ret void 
-} - -; GCN-LABEL: {{^}}test_fmaximum3_olt_0_f16: -; GCN: buffer_load_u16 [[REGC:v[0-9]+]] -; GCN: buffer_load_u16 [[REGB:v[0-9]+]] -; GCN: buffer_load_u16 [[REGA:v[0-9]+]] -; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_b16 [[RESULT]], -define amdgpu_kernel void @test_fmaximum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile half, ptr addrspace(1) %aptr, align 2 - %b = load volatile half, ptr addrspace(1) %bptr, align 2 - %c = load volatile half, ptr addrspace(1) %cptr, align 2 - %f0 = call half @llvm.maximum.f16(half %a, half %b) - %f1 = call half @llvm.maximum.f16(half %f0, half %c) - store half %f1, ptr addrspace(1) %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}test_fmaximum3_olt_1_f16: -; GCN: buffer_load_u16 [[REGA:v[0-9]+]] -; GCN: buffer_load_u16 [[REGB:v[0-9]+]] -; GCN: buffer_load_u16 [[REGC:v[0-9]+]] -; GCN: v_maximum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] -; GCN: buffer_store_b16 [[RESULT]], -define amdgpu_kernel void @test_fmaximum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile half, ptr addrspace(1) %aptr, align 2 - %b = load volatile half, ptr addrspace(1) %bptr, align 2 - %c = load volatile half, ptr addrspace(1) %cptr, align 2 - %f0 = call half @llvm.maximum.f16(half %a, half %b) - %f1 = call half @llvm.maximum.f16(half %c, half %f0) - store half %f1, ptr addrspace(1) %out, align 2 - ret void -} - -; Checks whether the test passes; performMinMaxCombine() should not optimize vector patterns of maximum3 -; since there are no pack instructions for fmaximum3. 
-; GCN-LABEL: {{^}}no_fmaximum3_v2f16: -; GCN: v_pk_maximum_f16 v0, v0, v1 -; GCN: v_pk_maximum_f16 v0, v2, v0 -; GCN: v_pk_maximum_f16 v0, v0, v3 -; GCN-NEXT: s_setpc_b64 -define <2 x half> @no_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) { -entry: - %max = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) - %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max) - %res = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max1, <2 x half> %d) - ret <2 x half> %res -} - -; GCN-LABEL: {{^}}no_fmaximum3_olt_0_f64: -; GCN-COUNT-2: v_maximum_f64 -define amdgpu_kernel void @no_fmaximum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile double, ptr addrspace(1) %aptr, align 4 - %b = load volatile double, ptr addrspace(1) %bptr, align 4 - %c = load volatile double, ptr addrspace(1) %cptr, align 4 - %f0 = call double @llvm.maximum.f64(double %a, double %b) - %f1 = call double @llvm.maximum.f64(double %f0, double %c) - store double %f1, ptr addrspace(1) %out, align 4 - ret void -} - -declare double @llvm.maximum.f64(double, double) -declare float @llvm.maximum.f32(float, float) -declare half @llvm.maximum.f16(half, half) -declare <2 x half> @llvm.maximum.v2f16(<2 x half>, <2 x half>) +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define float @v_fmaximum3_f32(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float 
%max0, float %c) + ret float %max1 +} + +define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v2, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %c, float %max0) + ret float %max1 +} + +define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inreg %c) { +; GFX12-LABEL: s_fmaximum3_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_maximum3_f32 v0, s0, s1, v0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: ; return to shader part epilog + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + %cast = bitcast float %max1 to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) + ret i32 %readfirstlane +} + +define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fabs0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, |v0|, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call float @llvm.fabs.f32(float %a) + %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fabs1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 
0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, |v1|, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fabs = call float @llvm.fabs.f32(float %b) + %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fabs2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fabs = call float @llvm.fabs.f32(float %c) + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs) + ret float %max1 +} + +define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v1|, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call float @llvm.fabs.f32(float %a) + %b.fabs = call float @llvm.fabs.f32(float %b) + %c.fabs = call float @llvm.fabs.f32(float %c) + %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b.fabs) + %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs) + ret float %max1 +} + +define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 
v0, -v0, -v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg float %a + %b.fneg = fneg float %b + %c.fneg = fneg float %c + %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b.fneg) + %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg) + ret float %max1 +} + +define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fneg_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, -|v0|, -|v1|, -|v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call float @llvm.fabs.f32(float %a) + %b.fabs = call float @llvm.fabs.f32(float %b) + %c.fabs = call float @llvm.fabs.f32(float %c) + %a.fneg.fabs = fneg float %a.fabs + %b.fneg.fabs = fneg float %b.fabs + %c.fneg.fabs = fneg float %c.fabs + %max0 = call float @llvm.maximum.f32(float %a.fneg.fabs, float %b.fneg.fabs) + %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg.fabs) + ret float %max1 +} + +define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fneg0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, -v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg float %a + %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fneg1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
v_maximum3_f32 v0, v0, -v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fneg = fneg float %b + %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_fneg2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fneg = fneg float %c + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg) + ret float %max1 +} + +define float @v_fmaximum3_f32_const0(float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_const0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, 0x41000000, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float 8.0, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fmaximum3_f32__const2(float %a, float %b) { +; GFX12-LABEL: v_fmaximum3_f32__const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 0x41000000 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float 8.0) + ret float %max1 +} + +define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) { +; GFX12-LABEL: v_fmaximum3_f32_inlineimm0: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, 4.0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float 4.0, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fmaximum3_f32__inlineimm(float %a, float %b) { +; GFX12-LABEL: v_fmaximum3_f32__inlineimm: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v1, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float %a, float %b) + %max1 = call float @llvm.maximum.f32(float %max0, float 4.0) + ret float %max1 +} + +define float @v_fmaximum3_f32_const1_const2(float %a) { +; GFX12-LABEL: v_fmaximum3_f32_const1_const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s0, 0x41000000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_maximum3_f32 v0, v0, s0, 0x41800000 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.maximum.f32(float %a, float 8.0) + %max1 = call float @llvm.maximum.f32(float %max0, float 16.0) + ret float %max1 +} + +define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v2f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v4, v0, v2 +; GFX12-NEXT: v_maximum3_f32 v1, v5, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x 
float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) + %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0) + ret <2 x float> %max1 +} + +define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v2f32_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, v4 +; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) + %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c) + ret <2 x float> %max1 +} + +define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v2f32__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v2|, |v4| +; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v3|, |v5| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) + %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b) + %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c) + %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs) + %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fabs) + ret <2 x float> %max1 +} + +define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v2f32__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v2, -v4 +; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v3, -v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <2 x float> %a + %b.fneg = fneg <2 x float> %b + %c.fneg = fneg <2 x float> %c + %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg) + %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c.fneg) + ret <2 x float> %max1 +} + +define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v2 +; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>) + %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c) + ret <2 x float> %max1 +} + +define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b) { +; GFX12-LABEL: v_fmaximum3_v2f32__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v2, 4.0 +; GFX12-NEXT: v_maximum3_f32 v1, v1, v3, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) + %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>) + ret <2 x float> %max1 +} + +define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v3f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +;
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v6, v0, v3 +; GFX12-NEXT: v_maximum3_f32 v1, v7, v1, v4 +; GFX12-NEXT: v_maximum3_f32 v2, v8, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) + %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> %max0) + ret <3 x float> %max1 +} + +define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v3f32_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, v6 +; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, v7 +; GFX12-NEXT: v_maximum3_f32 v2, v2, v5, v8 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) + %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c) + ret <3 x float> %max1 +} + +define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v3f32__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, |v0|, |v3|, |v6| +; GFX12-NEXT: v_maximum3_f32 v1, |v1|, |v4|, |v7| +; GFX12-NEXT: v_maximum3_f32 v2, |v2|, |v5|, |v8| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) + %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b) + %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c) + %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs) + %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> 
%c.fabs) + ret <3 x float> %max1 +} + +define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v3f32__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, -v0, -v3, -v6 +; GFX12-NEXT: v_maximum3_f32 v1, -v1, -v4, -v7 +; GFX12-NEXT: v_maximum3_f32 v2, -v2, -v5, -v8 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <3 x float> %a + %b.fneg = fneg <3 x float> %b + %c.fneg = fneg <3 x float> %c + %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg) + %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c.fneg) + ret <3 x float> %max1 +} + +define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) { +; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, 2.0, v3 +; GFX12-NEXT: v_maximum3_f32 v1, v1, 2.0, v4 +; GFX12-NEXT: v_maximum3_f32 v2, v2, 2.0, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>) + %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c) + ret <3 x float> %max1 +} + +define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b) { +; GFX12-LABEL: v_fmaximum3_v3f32__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f32 v0, v0, v3, 4.0 +; GFX12-NEXT: v_maximum3_f32 v1, v1, v4, 4.0 +; GFX12-NEXT: v_maximum3_f32 v2, v2,
v5, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) + %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>) + ret <3 x float> %max1 +} + + +define half @v_fmaximum3_f16(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fmaximum3_f16_commute(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v2, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %c, half %max0) + ret half %max1 +} + +define amdgpu_ps i32 @s_fmaximum3_f16(half inreg %a, half inreg %b, half inreg %c) { +; GFX12-LABEL: s_fmaximum3_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_maximum3_f16 v0, s0, s1, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: ; return to shader part epilog + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + %cast = bitcast half %max1 to i16 + %zext = zext i16 %cast to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + ret i32
%readfirstlane +} + +define half @v_fmaximum3_f16_fabs0(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fabs0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, |v0|, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call half @llvm.fabs.f16(half %a) + %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fmaximum3_f16_fabs1(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fabs1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, |v1|, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fabs = call half @llvm.fabs.f16(half %b) + %max0 = call half @llvm.maximum.f16(half %a, half %b.fabs) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fmaximum3_f16_fabs2(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fabs2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fabs = call half @llvm.fabs.f16(half %c) + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs) + ret half %max1 +} + +define half @v_fmaximum3_f16_fabs_all(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; 
GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, |v0|, |v1|, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call half @llvm.fabs.f16(half %a) + %b.fabs = call half @llvm.fabs.f16(half %b) + %c.fabs = call half @llvm.fabs.f16(half %c) + %max0 = call half @llvm.maximum.f16(half %a.fabs, half %b.fabs) + %max1 = call half @llvm.maximum.f16(half %max0, half %c.fabs) + ret half %max1 +} + +define half @v_fmaximum3_f16_fneg_all(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, -v0, -v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg half %a + %b.fneg = fneg half %b + %c.fneg = fneg half %c + %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b.fneg) + %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg) + ret half %max1 +} + +define half @v_fmaximum3_f16_fneg_fabs_all(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fneg_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, -|v0|, -|v1|, -|v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call half @llvm.fabs.f16(half %a) + %b.fabs = call half @llvm.fabs.f16(half %b) + %c.fabs = call half @llvm.fabs.f16(half %c) + %a.fneg.fabs = fneg half %a.fabs + %b.fneg.fabs = fneg half %b.fabs + %c.fneg.fabs = fneg half %c.fabs + %max0 = call half @llvm.maximum.f16(half %a.fneg.fabs, half %b.fneg.fabs) + %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg.fabs) + ret half %max1 +} + +define half @v_fmaximum3_f16_fneg0(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fneg0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, -v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg half %a + %max0 = call half @llvm.maximum.f16(half %a.fneg, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fmaximum3_f16_fneg1(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fneg1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, -v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fneg = fneg half %b + %max0 = call half @llvm.maximum.f16(half %a, half %b.fneg) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fmaximum3_f16_fneg2(half %a, half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_fneg2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fneg = fneg half %c + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c.fneg) + ret half %max1 +} + +define half @v_fmaximum3_f16_const0(half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_const0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, 0x4800, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half 8.0, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + ret half %max1 +} + +define half 
@v_fmaximum3_f16__const2(half %a, half %b) { +; GFX12-LABEL: v_fmaximum3_f16__const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 0x4800 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half 8.0) + ret half %max1 +} + +define half @v_fmaximum3_f16_inlineimm0(half %b, half %c) { +; GFX12-LABEL: v_fmaximum3_f16_inlineimm0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, 4.0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half 4.0, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fmaximum3_f16__inlineimm(half %a, half %b) { +; GFX12-LABEL: v_fmaximum3_f16__inlineimm: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum3_f16 v0, v0, v1, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.maximum.f16(half %a, half %b) + %max1 = call half @llvm.maximum.f16(half %max0, half 4.0) + ret half %max1 +} + +define half @v_fmaximum3_f16_const1_const2(half %a) { +; GFX12-LABEL: v_fmaximum3_f16_const1_const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_movk_i32 s0, 0x4800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_maximum3_f16 v0, v0, s0, 0x4c00 +; GFX12-NEXT: s_setpc_b64 
s[30:31] + %max0 = call half @llvm.maximum.f16(half %a, half 8.0) + %max1 = call half @llvm.maximum.f16(half %max0, half 16.0) + ret half %max1 +} + +define <2 x half> @v_fmaximum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v0, v2, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %c, <2 x half> %max0) + ret <2 x half> %max1 +} + +define <2 x half> @v_fmaximum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v2f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) + ret <2 x half> %max1 +} + +define <2 x half> @v_fmaximum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v2f16__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) + %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) + %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c) + %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs) + %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fabs) + ret <2 x half> %max1 +} + +define <2 x half> @v_fmaximum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v2f16__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <2 x half> %a + %b.fneg = fneg <2 x half> %b + %c.fneg = fneg <2 x half> %c + %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg) + %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c.fneg) + ret <2 x half> %max1 +} + +define <2 x half> @v_fmaximum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>) + %max1 = 
call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> %c) + ret <2 x half> %max1 +} + +define <2 x half> @v_fmaximum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { +; GFX12-LABEL: v_fmaximum3_v2f16__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.maximum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>) + ret <2 x half> %max1 +} + +define <3 x half> @v_fmaximum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v3f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0 +; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) + %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %c, <3 x half> %max0) + ret <3 x half> %max1 +} + +define <3 x half> @v_fmaximum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v3f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) + %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) + ret <3 x half> %max1 +} + +define <3 x half> @v_fmaximum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v3f16__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) + %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) + %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c) + %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs) + %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fabs) + ret <3 x half> %max1 +} + +define <3 x half> @v_fmaximum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v3f16__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 
+; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <3 x half> %a + %b.fneg = fneg <3 x half> %b + %c.fneg = fneg <3 x half> %c + %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg) + %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c.fneg) + ret <3 x half> %max1 +} + +define <3 x half> @v_fmaximum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 2.0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>) + %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> %c) + ret <3 x half> %max1 +} + +define <3 x half> @v_fmaximum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { +; GFX12-LABEL: v_fmaximum3_v3f16__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: 
v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %a, <3 x half> %b) + %max1 = call <3 x half> @llvm.maximum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>) + ret <3 x half> %max1 +} + +define <4 x half> @v_fmaximum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v4f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v4, v0 +; GFX12-NEXT: v_pk_maximum_f16 v1, v5, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) + %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %c, <4 x half> %max0) + ret <4 x half> %max1 +} + +define <4 x half> @v_fmaximum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v4f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) + %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> 
%c) + ret <4 x half> %max1 +} + +define <4 x half> @v_fmaximum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v4f16__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) + %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) + %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c) + %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a.fabs, <4 x half> %b.fabs) + %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c.fabs) + ret <4 x half> %max1 +} + +define <4 x half> @v_fmaximum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v4f16__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 
v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <4 x half> %a + %b.fneg = fneg <4 x half> %b + %c.fneg = fneg <4 x half> %c + %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a.fneg, <4 x half> %b.fneg) + %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c.fneg) + ret <4 x half> %max1 +} + +define <4 x half> @v_fmaximum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { +; GFX12-LABEL: v_fmaximum3_v4f16__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>) + %max1 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> %c) + ret <4 x half> %max1 +} + +define <4 x half> @v_fmaximum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { +; GFX12-LABEL: v_fmaximum3_v4f16__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_maximum_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_maximum_f16 v1, v1, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <4 x half> @llvm.maximum.v4f16(<4 x half> %a, <4 x half> %b) + %max1 = 
call <4 x half> @llvm.maximum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>) + ret <4 x half> %max1 +} + +define double @v_fmaximum3_f64(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fmaximum3_f64_commute(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[4:5], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %c, double %max0) + ret double %max1 +} + +define amdgpu_ps <2 x i32> @s_fmaximum3_f64(double inreg %a, double inreg %b, double inreg %c) { +; GFX12-LABEL: s_fmaximum3_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_maximum_f64 v[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], s[4:5] +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: ; return to shader part epilog + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double 
%max0, double %c) + %cast = bitcast double %max1 to <2 x i32> + %elt0 = extractelement <2 x i32> %cast, i32 0 + %elt1 = extractelement <2 x i32> %cast, i32 1 + %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) + %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) + %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define double @v_fmaximum3_f64_fabs0(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fabs0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call double @llvm.fabs.f64(double %a) + %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fmaximum3_f64_fabs1(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fabs1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[2:3]| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fabs = call double @llvm.fabs.f64(double %b) + %max0 = call double @llvm.maximum.f64(double %a, double %b.fabs) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fmaximum3_f64_fabs2(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fabs2: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fabs = call double @llvm.fabs.f64(double %c) + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs) + ret double %max1 +} + +define double @v_fmaximum3_f64_fabs_all(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], |v[0:1]|, |v[2:3]| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], |v[4:5]| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call double @llvm.fabs.f64(double %a) + %b.fabs = call double @llvm.fabs.f64(double %b) + %c.fabs = call double @llvm.fabs.f64(double %c) + %max0 = call double @llvm.maximum.f64(double %a.fabs, double %b.fabs) + %max1 = call double @llvm.maximum.f64(double %max0, double %c.fabs) + ret double %max1 +} + +define double @v_fmaximum3_f64_fneg_all(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], -v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg double %a + %b.fneg = fneg double %b + %c.fneg = fneg double %c + %max0 = call double 
@llvm.maximum.f64(double %a.fneg, double %b.fneg) + %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg) + ret double %max1 +} + +define double @v_fmaximum3_f64_fneg_fabs_all(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fneg_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], -|v[0:1]|, -|v[2:3]| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -|v[4:5]| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call double @llvm.fabs.f64(double %a) + %b.fabs = call double @llvm.fabs.f64(double %b) + %c.fabs = call double @llvm.fabs.f64(double %c) + %a.fneg.fabs = fneg double %a.fabs + %b.fneg.fabs = fneg double %b.fabs + %c.fneg.fabs = fneg double %c.fabs + %max0 = call double @llvm.maximum.f64(double %a.fneg.fabs, double %b.fneg.fabs) + %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg.fabs) + ret double %max1 +} + +define double @v_fmaximum3_f64_fneg0(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fneg0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], -v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg double %a + %max0 = call double @llvm.maximum.f64(double %a.fneg, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fmaximum3_f64_fneg1(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fneg1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fneg = fneg double %b + %max0 = call double @llvm.maximum.f64(double %a, double %b.fneg) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fmaximum3_f64_fneg2(double %a, double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_fneg2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], -v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fneg = fneg double %c + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c.fneg) + ret double %max1 +} + +define double @v_fmaximum3_f64_const0(double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_const0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double 8.0, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fmaximum3_f64__const2(double %a, double %b) { +; GFX12-LABEL: v_fmaximum3_f64__const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double 8.0) + ret double %max1 +} + +define double @v_fmaximum3_f64_inlineimm0(double %b, double %c) { +; GFX12-LABEL: v_fmaximum3_f64_inlineimm0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double 4.0, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fmaximum3_f64__inlineimm(double %a, double %b) { +; GFX12-LABEL: v_fmaximum3_f64__inlineimm: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], v[0:1], 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double %a, double %b) + %max1 = call double @llvm.maximum.f64(double %max0, double 4.0) + ret double %max1 +} + +define double @v_fmaximum3_f64_const1_const2(double %a) { +; GFX12-LABEL: v_fmaximum3_f64_const1_const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40200000, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_maximum_f64 v[0:1], 0x40300000, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.maximum.f64(double %a, double 8.0) + %max1 = call double @llvm.maximum.f64(double %max0, double 16.0) + ret double %max1 +} diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index eef271e69a384d..3cfc5c8da4a207 100644 --- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -1,98 +1,1443 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s - -; GCN-LABEL: {{^}}test_fminimum3_olt_0_f32: -; GCN: buffer_load_b32 [[REGC:v[0-9]+]] -; GCN: buffer_load_b32 [[REGB:v[0-9]+]] -; GCN: buffer_load_b32 [[REGA:v[0-9]+]] -; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_b32 [[RESULT]], -define amdgpu_kernel void @test_fminimum3_olt_0_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile float, ptr addrspace(1) %aptr, align 4 - %b = load volatile float, ptr addrspace(1) %bptr, align 4 - %c = load volatile float, ptr addrspace(1) %cptr, align 4 - %f0 = call float @llvm.minimum.f32(float %a, float %b) - %f1 = call float @llvm.minimum.f32(float %f0, float %c) - store float %f1, ptr addrspace(1) %out, align 4 - ret void -} - -; Commute operand of second fminimum -; GCN-LABEL: {{^}}test_fminimum3_olt_1_f32: -; GCN: buffer_load_b32 [[REGB:v[0-9]+]] -; GCN: buffer_load_b32 [[REGA:v[0-9]+]] -; GCN: buffer_load_b32 [[REGC:v[0-9]+]] -; GCN: v_minimum3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_b32 [[RESULT]], -define amdgpu_kernel void @test_fminimum3_olt_1_f32(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) 
%cptr) { - %a = load volatile float, ptr addrspace(1) %aptr, align 4 - %b = load volatile float, ptr addrspace(1) %bptr, align 4 - %c = load volatile float, ptr addrspace(1) %cptr, align 4 - %f0 = call float @llvm.minimum.f32(float %a, float %b) - %f1 = call float @llvm.minimum.f32(float %c, float %f0) - store float %f1, ptr addrspace(1) %out, align 4 - ret void -} - -; GCN-LABEL: {{^}}test_fminimum3_olt_0_f16: -; GCN: buffer_load_u16 [[REGC:v[0-9]+]] -; GCN: buffer_load_u16 [[REGB:v[0-9]+]] -; GCN: buffer_load_u16 [[REGA:v[0-9]+]] -; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] -; GCN: buffer_store_b16 [[RESULT]], -define amdgpu_kernel void @test_fminimum3_olt_0_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile half, ptr addrspace(1) %aptr, align 2 - %b = load volatile half, ptr addrspace(1) %bptr, align 2 - %c = load volatile half, ptr addrspace(1) %cptr, align 2 - %f0 = call half @llvm.minimum.f16(half %a, half %b) - %f1 = call half @llvm.minimum.f16(half %f0, half %c) - store half %f1, ptr addrspace(1) %out, align 2 - ret void -} - -; GCN-LABEL: {{^}}test_fminimum3_olt_1_f16: -; GCN: buffer_load_u16 [[REGA:v[0-9]+]] -; GCN: buffer_load_u16 [[REGB:v[0-9]+]] -; GCN: buffer_load_u16 [[REGC:v[0-9]+]] -; GCN: v_minimum3_f16 [[RESULT:v[0-9]+]], [[REGC]], [[REGA]], [[REGB]] -; GCN: buffer_store_b16 [[RESULT]], -define amdgpu_kernel void @test_fminimum3_olt_1_f16(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile half, ptr addrspace(1) %aptr, align 2 - %b = load volatile half, ptr addrspace(1) %bptr, align 2 - %c = load volatile half, ptr addrspace(1) %cptr, align 2 - %f0 = call half @llvm.minimum.f16(half %a, half %b) - %f1 = call half @llvm.minimum.f16(half %c, half %f0) - store half %f1, ptr addrspace(1) %out, align 2 - ret void -} - -; Checks whether the test passes; performMinMaxCombine() should not 
optimize vector patterns of minimum3 -; since there are no pack instructions for fminimum3. -; GCN-LABEL: {{^}}no_fminimum3_v2f16: -; GCN: v_pk_minimum_f16 v0, v0, v1 -; GCN: v_pk_minimum_f16 v0, v2, v0 -; GCN: v_pk_minimum_f16 v0, v0, v3 -; GCN-NEXT: s_setpc_b64 -define <2 x half> @no_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) { -entry: - %min = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) - %min1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %min) - %res = call <2 x half> @llvm.minimum.v2f16(<2 x half> %min1, <2 x half> %d) - ret <2 x half> %res -} - -; GCN-LABEL: {{^}}no_fminimum3_olt_0_f64: -; GCN-COUNT-2: v_minimum_f64 -define amdgpu_kernel void @no_fminimum3_olt_0_f64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr, ptr addrspace(1) %cptr) { - %a = load volatile double, ptr addrspace(1) %aptr, align 4 - %b = load volatile double, ptr addrspace(1) %bptr, align 4 - %c = load volatile double, ptr addrspace(1) %cptr, align 4 - %f0 = call double @llvm.minimum.f64(double %a, double %b) - %f1 = call double @llvm.minimum.f64(double %f0, double %c) - store double %f1, ptr addrspace(1) %out, align 4 - ret void -} - -declare double @llvm.minimum.f64(double, double) -declare float @llvm.minimum.f32(float, float) -declare half @llvm.minimum.f16(half, half) -declare <2 x half> @llvm.minimum.v2f16(<2 x half>, <2 x half>) +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define float @v_fminimum3_f32(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call 
float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fminimum3_f32_commute(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v2, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %c, float %max0) + ret float %max1 +} + +define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inreg %c) { +; GFX12-LABEL: s_fminimum3_f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_minimum3_f32 v0, s0, s1, v0 +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: ; return to shader part epilog + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + %cast = bitcast float %max1 to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast) + ret i32 %readfirstlane +} + +define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fabs0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, |v0|, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call float @llvm.fabs.f32(float %a) + %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fabs1: 
+; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, |v1|, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fabs = call float @llvm.fabs.f32(float %b) + %max0 = call float @llvm.minimum.f32(float %a, float %b.fabs) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fabs2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fabs = call float @llvm.fabs.f32(float %c) + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs) + ret float %max1 +} + +define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v1|, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call float @llvm.fabs.f32(float %a) + %b.fabs = call float @llvm.fabs.f32(float %b) + %c.fabs = call float @llvm.fabs.f32(float %c) + %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b.fabs) + %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs) + ret float %max1 +} + +define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg float %a + %b.fneg = fneg float %b + %c.fneg = fneg float %c + %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b.fneg) + %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg) + ret float %max1 +} + +define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fneg_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, -|v0|, -|v1|, -|v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call float @llvm.fabs.f32(float %a) + %b.fabs = call float @llvm.fabs.f32(float %b) + %c.fabs = call float @llvm.fabs.f32(float %c) + %a.fneg.fabs = fneg float %a.fabs + %b.fneg.fabs = fneg float %b.fabs + %c.fneg.fabs = fneg float %c.fabs + %max0 = call float @llvm.minimum.f32(float %a.fneg.fabs, float %b.fneg.fabs) + %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg.fabs) + ret float %max1 +} + +define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fneg0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, -v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg float %a + %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fneg1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, -v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fneg = fneg float %b + %max0 = call float @llvm.minimum.f32(float %a, float %b.fneg) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_fneg2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fneg = fneg float %c + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg) + ret float %max1 +} + +define float @v_fminimum3_f32_const0(float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_const0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, 0x41000000, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float 8.0, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fminimum3_f32__const2(float %a, float %b) { +; GFX12-LABEL: v_fminimum3_f32__const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 0x41000000 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float 8.0) + ret float %max1 +} + +define float 
@v_fminimum3_f32_inlineimm0(float %b, float %c) { +; GFX12-LABEL: v_fminimum3_f32_inlineimm0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, 4.0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float 4.0, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float %c) + ret float %max1 +} + +define float @v_fminimum3_f32__inlineimm(float %a, float %b) { +; GFX12-LABEL: v_fminimum3_f32__inlineimm: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v1, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float %a, float %b) + %max1 = call float @llvm.minimum.f32(float %max0, float 4.0) + ret float %max1 +} + +define float @v_fminimum3_f32_const1_const2(float %a) { +; GFX12-LABEL: v_fminimum3_f32_const1_const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 s0, 0x41000000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_minimum3_f32 v0, v0, s0, 0x41800000 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call float @llvm.minimum.f32(float %a, float 8.0) + %max1 = call float @llvm.minimum.f32(float %max0, float 16.0) + ret float %max1 +} + +define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fminimum3_v2f32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
v_minimum3_f32 v0, v4, v0, v2 +; GFX12-NEXT: v_minimum3_f32 v1, v5, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) + %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> %max0) + ret <2 x float> %max1 +} + +define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fminimum3_v2f32_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, v4 +; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) + %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c) + ret <2 x float> %max1 +} + +define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fminimum3_v2f32__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v2|, |v4| +; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v3|, |v5| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) + %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b) + %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c) + %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fabs, <2 x float> %b.fabs) + %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fabs) + ret <2 x float> %max1 +} + +define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, <2 x float> %c) { +; GFX12-LABEL: v_fminimum3_v2f32__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v2, -v4 +; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v3, -v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <2 x float> %a + %b.fneg = fneg <2 x float> %b + %c.fneg = fneg <2 x float> %c + %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a.fneg, <2 x float> %b.fneg) + %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c.fneg) + ret <2 x float> %max1 +} + +define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c) { +; GFX12-LABEL: v_fminimum3_v2f32__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v2 +; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>) + %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c) + ret <2 x float> %max1 +} + +define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b) { +; GFX12-LABEL: v_fminimum3_v2f32__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v2, 4.0 +; GFX12-NEXT: v_minimum3_f32 v1, v1, v3, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) + %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>) + ret <2 x float> %max1 +} + +define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fminimum3_v3f32: 
+; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v6, v0, v3 +; GFX12-NEXT: v_minimum3_f32 v1, v7, v1, v4 +; GFX12-NEXT: v_minimum3_f32 v2, v8, v2, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) + %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> %max0) + ret <3 x float> %max1 +} + +define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fminimum3_v3f32_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, v6 +; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, v7 +; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, v8 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) + %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c) + ret <3 x float> %max1 +} + +define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fminimum3_v3f32__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, |v0|, |v3|, |v6| +; GFX12-NEXT: v_minimum3_f32 v1, |v1|, |v4|, |v7| +; GFX12-NEXT: v_minimum3_f32 v2, |v2|, |v5|, |v8| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) + %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b) + %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c) + %max0 = call <3 x float> 
@llvm.minimum.v3f32(<3 x float> %a.fabs, <3 x float> %b.fabs) + %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fabs) + ret <3 x float> %max1 +} + +define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, <3 x float> %c) { +; GFX12-LABEL: v_fminimum3_v3f32__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, -v0, -v3, -v6 +; GFX12-NEXT: v_minimum3_f32 v1, -v1, -v4, -v7 +; GFX12-NEXT: v_minimum3_f32 v2, -v2, -v5, -v8 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <3 x float> %a + %b.fneg = fneg <3 x float> %b + %c.fneg = fneg <3 x float> %c + %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a.fneg, <3 x float> %b.fneg) + %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c.fneg) + ret <3 x float> %max1 +} + +define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c) { +; GFX12-LABEL: v_fminimum3_v3f32__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, 2.0, v3 +; GFX12-NEXT: v_minimum3_f32 v1, v1, 2.0, v4 +; GFX12-NEXT: v_minimum3_f32 v2, v2, 2.0, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>) + %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c) + ret <3 x float> %max1 +} + +define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b) { +; GFX12-LABEL: v_fminimum3_v3f32__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: 
s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f32 v0, v0, v3, 4.0 +; GFX12-NEXT: v_minimum3_f32 v1, v1, v4, 4.0 +; GFX12-NEXT: v_minimum3_f32 v2, v2, v5, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) + %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>) + ret <3 x float> %max1 +} + + +define half @v_fminimum3_f16(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fminimum3_f16_commute(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v2, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %c, half %max0) + ret half %max1 +} + +define amdgpu_ps i32 @s_fminimum3_f16(half inreg %a, half inreg %b, half inreg %c) { +; GFX12-LABEL: s_fminimum3_f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_e32 v0, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_minimum3_f16 v0, s0, s1, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: ; return to shader part epilog + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half 
%c) + %cast = bitcast half %max1 to i16 + %zext = zext i16 %cast to i32 + %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %zext) + ret i32 %readfirstlane +} + +define half @v_fminimum3_f16_fabs0(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fabs0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, |v0|, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call half @llvm.fabs.f16(half %a) + %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fminimum3_f16_fabs1(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fabs1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, |v1|, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fabs = call half @llvm.fabs.f16(half %b) + %max0 = call half @llvm.minimum.f16(half %a, half %b.fabs) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fminimum3_f16_fabs2(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fabs2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fabs = call half @llvm.fabs.f16(half %c) + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs) + ret half %max1 +} + +define half @v_fminimum3_f16_fabs_all(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fabs_all: +; GFX12: ; 
%bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, |v0|, |v1|, |v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call half @llvm.fabs.f16(half %a) + %b.fabs = call half @llvm.fabs.f16(half %b) + %c.fabs = call half @llvm.fabs.f16(half %c) + %max0 = call half @llvm.minimum.f16(half %a.fabs, half %b.fabs) + %max1 = call half @llvm.minimum.f16(half %max0, half %c.fabs) + ret half %max1 +} + +define half @v_fminimum3_f16_fneg_all(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, -v0, -v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg half %a + %b.fneg = fneg half %b + %c.fneg = fneg half %c + %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b.fneg) + %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg) + ret half %max1 +} + +define half @v_fminimum3_f16_fneg_fabs_all(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fneg_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, -|v0|, -|v1|, -|v2| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call half @llvm.fabs.f16(half %a) + %b.fabs = call half @llvm.fabs.f16(half %b) + %c.fabs = call half @llvm.fabs.f16(half %c) + %a.fneg.fabs = fneg half %a.fabs + %b.fneg.fabs = fneg half %b.fabs + %c.fneg.fabs = fneg half %c.fabs + %max0 = call half @llvm.minimum.f16(half %a.fneg.fabs, half %b.fneg.fabs) + %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg.fabs) + ret half %max1 +} + +define half 
@v_fminimum3_f16_fneg0(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fneg0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, -v0, v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg half %a + %max0 = call half @llvm.minimum.f16(half %a.fneg, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fminimum3_f16_fneg1(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fneg1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, -v1, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fneg = fneg half %b + %max0 = call half @llvm.minimum.f16(half %a, half %b.fneg) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fminimum3_f16_fneg2(half %a, half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_fneg2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, -v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fneg = fneg half %c + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c.fneg) + ret half %max1 +} + +define half @v_fminimum3_f16_const0(half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_const0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, 0x4800, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] 
+ %max0 = call half @llvm.minimum.f16(half 8.0, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fminimum3_f16__const2(half %a, half %b) { +; GFX12-LABEL: v_fminimum3_f16__const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 0x4800 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half 8.0) + ret half %max1 +} + +define half @v_fminimum3_f16_inlineimm0(half %b, half %c) { +; GFX12-LABEL: v_fminimum3_f16_inlineimm0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, 4.0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half 4.0, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half %c) + ret half %max1 +} + +define half @v_fminimum3_f16__inlineimm(half %a, half %b) { +; GFX12-LABEL: v_fminimum3_f16__inlineimm: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum3_f16 v0, v0, v1, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half %a, half %b) + %max1 = call half @llvm.minimum.f16(half %max0, half 4.0) + ret half %max1 +} + +define half @v_fminimum3_f16_const1_const2(half %a) { +; GFX12-LABEL: v_fminimum3_f16_const1_const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
s_movk_i32 s0, 0x4800 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_minimum3_f16 v0, v0, s0, 0x4c00 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call half @llvm.minimum.f16(half %a, half 8.0) + %max1 = call half @llvm.minimum.f16(half %max0, half 16.0) + ret half %max1 +} + +define <2 x half> @v_fminimum3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fminimum3_v2f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_minimum_f16 v0, v2, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %c, <2 x half> %max0) + ret <2 x half> %max1 +} + +define <2 x half> @v_fminimum3_v2f16_commute(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fminimum3_v2f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) + ret <2 x half> %max1 +} + +define <2 x half> @v_fminimum3_v2f16__fabs_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fminimum3_v2f16__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: 
v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %a) + %b.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %b) + %c.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %c) + %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fabs, <2 x half> %b.fabs) + %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fabs) + ret <2 x half> %max1 +} + +define <2 x half> @v_fminimum3_v2f16__fneg_all(<2 x half> %a, <2 x half> %b, <2 x half> %c) { +; GFX12-LABEL: v_fminimum3_v2f16__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <2 x half> %a + %b.fneg = fneg <2 x half> %b + %c.fneg = fneg <2 x half> %c + %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a.fneg, <2 x half> %b.fneg) + %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c.fneg) + ret <2 x half> %max1 +} + +define <2 x half> @v_fminimum3_v2f16__inlineimm1(<2 x half> %a, <2 x half> %c) { +; GFX12-LABEL: v_fminimum3_v2f16__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> <half 2.0, half 2.0>) + %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> %c) + ret <2 x half> %max1 +} + +define <2 x half> @v_fminimum3_v2f16__inlineimm2(<2 x half> %a, <2 x half> %b) { +; GFX12-LABEL: v_fminimum3_v2f16__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %a, <2 x half> %b) + %max1 = call <2 x half> @llvm.minimum.v2f16(<2 x half> %max0, <2 x half> <half 4.0, half 4.0>) + ret <2 x half> %max1 +} + +define <3 x half> @v_fminimum3_v3f16(<3 x half> %a, <3 x half> %b, <3 x half> %c) { +; GFX12-LABEL: v_fminimum3_v3f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0 +; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) + %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %c, <3 x half> %max0) + ret <3 x half> %max1 +} + +define <3 x half> @v_fminimum3_v3f16_commute(<3 x half> %a, <3 x half> %b, <3 x half> %c) { +; GFX12-LABEL: v_fminimum3_v3f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 
+; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) + %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) + ret <3 x half> %max1 +} + +define <3 x half> @v_fminimum3_v3f16__fabs_all(<3 x half> %a, <3 x half> %b, <3 x half> %c) { +; GFX12-LABEL: v_fminimum3_v3f16__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %a) + %b.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %b) + %c.fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %c) + %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fabs, <3 x half> %b.fabs) + %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fabs) + ret <3 x half> %max1 +} + +define <3 x half> @v_fminimum3_v3f16__fneg_all(<3 x half> %a, <3 x half> %b, 
<3 x half> %c) { +; GFX12-LABEL: v_fminimum3_v3f16__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <3 x half> %a + %b.fneg = fneg <3 x half> %b + %c.fneg = fneg <3 x half> %c + %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a.fneg, <3 x half> %b.fneg) + %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c.fneg) + ret <3 x half> %max1 +} + +define <3 x half> @v_fminimum3_v3f16__inlineimm1(<3 x half> %a, <3 x half> %c) { +; GFX12-LABEL: v_fminimum3_v3f16__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 2.0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> <half 2.0, half 2.0, half 2.0>) + %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> %c) + ret <3 x half> %max1 +} + +define <3 x half> @v_fminimum3_v3f16__inlineimm2(<3 x half> %a, <3 x half> %b) { +; GFX12-LABEL: v_fminimum3_v3f16__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %a, <3 x half> %b) + %max1 = call <3 x half> @llvm.minimum.v3f16(<3 x half> %max0, <3 x half> <half 4.0, half 4.0, half 4.0>) + ret <3 x half> %max1 +} + +define <4 x half> @v_fminimum3_v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fminimum3_v4f16: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v4, v0 +; GFX12-NEXT: v_pk_minimum_f16 v1, v5, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) + %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %c, <4 x half> %max0) + ret <4 x half> %max1 +} + +define <4 x half> @v_fminimum3_v4f16_commute(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fminimum3_v4f16_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 
= call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) + %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) + ret <4 x half> %max1 +} + +define <4 x half> @v_fminimum3_v4f16__fabs_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fminimum3_v4f16__fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX12-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX12-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX12-NEXT: v_and_b32_e32 v5, 0x7fff7fff, v5 +; GFX12-NEXT: v_and_b32_e32 v4, 0x7fff7fff, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %a) + %b.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %b) + %c.fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %c) + %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a.fabs, <4 x half> %b.fabs) + %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c.fabs) + ret <4 x half> %max1 +} + +define <4 x half> @v_fminimum3_v4f16__fneg_all(<4 x half> %a, <4 x half> %b, <4 x half> %c) { +; GFX12-LABEL: v_fminimum3_v4f16__fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 
neg_lo:[1,1] neg_hi:[1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v4 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v5 neg_lo:[0,1] neg_hi:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg <4 x half> %a + %b.fneg = fneg <4 x half> %b + %c.fneg = fneg <4 x half> %c + %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a.fneg, <4 x half> %b.fneg) + %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c.fneg) + ret <4 x half> %max1 +} + +define <4 x half> @v_fminimum3_v4f16__inlineimm1(<4 x half> %a, <4 x half> %c) { +; GFX12-LABEL: v_fminimum3_v4f16__inlineimm1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, 2.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> <half 2.0, half 2.0, half 2.0, half 2.0>) + %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> %c) + ret <4 x half> %max1 +} + +define <4 x half> @v_fminimum3_v4f16__inlineimm2(<4 x half> %a, <4 x half> %b) { +; GFX12-LABEL: v_fminimum3_v4f16__inlineimm2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, v2 +; GFX12-NEXT: v_pk_minimum_f16 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_pk_minimum_f16 v0, v0, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: v_pk_minimum_f16 v1, 
v1, 4.0 op_sel_hi:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %a, <4 x half> %b) + %max1 = call <4 x half> @llvm.minimum.v4f16(<4 x half> %max0, <4 x half> <half 4.0, half 4.0, half 4.0, half 4.0>) + ret <4 x half> %max1 +} + +define double @v_fminimum3_f64(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fminimum3_f64_commute(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[4:5], v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %c, double %max0) + ret double %max1 +} + +define amdgpu_ps <2 x i32> @s_fminimum3_f64(double inreg %a, double inreg %b, double inreg %c) { +; GFX12-LABEL: s_fminimum3_f64: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_minimum_f64 v[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], s[4:5] +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; 
GFX12-NEXT: ; return to shader part epilog + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + %cast = bitcast double %max1 to <2 x i32> + %elt0 = extractelement <2 x i32> %cast, i32 0 + %elt1 = extractelement <2 x i32> %cast, i32 1 + %readlane0 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt0) + %readlane1 = call i32 @llvm.amdgcn.readfirstlane(i32 %elt1) + %insert.0 = insertelement <2 x i32> poison, i32 %readlane0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %readlane1, i32 1 + ret <2 x i32> %insert.1 +} + +define double @v_fminimum3_f64_fabs0(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fabs0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call double @llvm.fabs.f64(double %a) + %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fminimum3_f64_fabs1(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fabs1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[2:3]| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fabs = call double @llvm.fabs.f64(double %b) + %max0 = call double @llvm.minimum.f64(double %a, double %b.fabs) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + ret double %max1 
+} + +define double @v_fminimum3_f64_fabs2(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fabs2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fabs = call double @llvm.fabs.f64(double %c) + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs) + ret double %max1 +} + +define double @v_fminimum3_f64_fabs_all(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], |v[0:1]|, |v[2:3]| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], |v[4:5]| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call double @llvm.fabs.f64(double %a) + %b.fabs = call double @llvm.fabs.f64(double %b) + %c.fabs = call double @llvm.fabs.f64(double %c) + %max0 = call double @llvm.minimum.f64(double %a.fabs, double %b.fabs) + %max1 = call double @llvm.minimum.f64(double %max0, double %c.fabs) + ret double %max1 +} + +define double @v_fminimum3_f64_fneg_all(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fneg_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], -v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5] +; 
GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg double %a + %b.fneg = fneg double %b + %c.fneg = fneg double %c + %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b.fneg) + %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg) + ret double %max1 +} + +define double @v_fminimum3_f64_fneg_fabs_all(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fneg_fabs_all: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], -|v[0:1]|, -|v[2:3]| +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -|v[4:5]| +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fabs = call double @llvm.fabs.f64(double %a) + %b.fabs = call double @llvm.fabs.f64(double %b) + %c.fabs = call double @llvm.fabs.f64(double %c) + %a.fneg.fabs = fneg double %a.fabs + %b.fneg.fabs = fneg double %b.fabs + %c.fneg.fabs = fneg double %c.fabs + %max0 = call double @llvm.minimum.f64(double %a.fneg.fabs, double %b.fneg.fabs) + %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg.fabs) + ret double %max1 +} + +define double @v_fminimum3_f64_fneg0(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fneg0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], -v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %a.fneg = fneg double %a + %max0 = call double @llvm.minimum.f64(double %a.fneg, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fminimum3_f64_fneg1(double %a, double %b, double %c) { +; 
GFX12-LABEL: v_fminimum3_f64_fneg1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %b.fneg = fneg double %b + %max0 = call double @llvm.minimum.f64(double %a, double %b.fneg) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fminimum3_f64_fneg2(double %a, double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_fneg2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], -v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %c.fneg = fneg double %c + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c.fneg) + ret double %max1 +} + +define double @v_fminimum3_f64_const0(double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_const0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double 8.0, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fminimum3_f64__const2(double %a, double %b) { +; GFX12-LABEL: 
v_fminimum3_f64__const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double 8.0) + ret double %max1 +} + +define double @v_fminimum3_f64_inlineimm0(double %b, double %c) { +; GFX12-LABEL: v_fminimum3_f64_inlineimm0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double 4.0, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double %c) + ret double %max1 +} + +define double @v_fminimum3_f64__inlineimm(double %a, double %b) { +; GFX12-LABEL: v_fminimum3_f64__inlineimm: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], v[0:1], 4.0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double %a, double %b) + %max1 = call double @llvm.minimum.f64(double %max0, double 4.0) + ret double %max1 +} + +define double @v_fminimum3_f64_const1_const2(double %a) { +; GFX12-LABEL: v_fminimum3_f64_const1_const2: +; GFX12: ; %bb.0: +; GFX12-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40200000, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_minimum_f64 v[0:1], 0x40300000, v[0:1] +; GFX12-NEXT: s_setpc_b64 s[30:31] + %max0 = call double @llvm.minimum.f64(double %a, double 8.0) + %max1 = call double @llvm.minimum.f64(double %max0, double 16.0) + ret double %max1 +}