Skip to content

Commit

Permalink
8337062: x86_64: Unordered add/mul reduction support for vector api
Browse files Browse the repository at this point in the history
Reviewed-by: jbhateja, sgibbons
  • Loading branch information
Sandhya Viswanathan committed Aug 1, 2024
1 parent 21e86d1 commit dc35f3e
Show file tree
Hide file tree
Showing 17 changed files with 416 additions and 127 deletions.
119 changes: 119 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1786,6 +1786,16 @@ void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegis
}
}

// Emits a single packed two-operand add or multiply on (dst, src) for one step
// of an unordered floating-point reduction.
//   opcode   - Op_AddReductionVF/VD or Op_MulReductionVF/VD; selects the instruction
//   dst, src - operands handed straight to the selected instruction
// 'typ' is not consulted here; the opcode alone determines the element type.
// Any other opcode trips the assert and emits nothing.
void C2_MacroAssembler::unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_AddReductionVF) {
    addps(dst, src);
  } else if (opcode == Op_AddReductionVD) {
    addpd(dst, src);
  } else if (opcode == Op_MulReductionVF) {
    mulps(dst, src);
  } else if (opcode == Op_MulReductionVD) {
    mulpd(dst, src);
  } else {
    assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
int vector_len = Assembler::AVX_256bit;

Expand Down Expand Up @@ -1834,6 +1844,18 @@ void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegis
}
}

// Emits a single packed three-operand add or multiply on (dst, src1, src2),
// always with the AVX_256bit vector length, for one step of an unordered
// floating-point reduction.
//   opcode          - Op_AddReductionVF/VD or Op_MulReductionVF/VD; selects the instruction
//   dst, src1, src2 - operands handed straight to the selected instruction
// 'typ' is not consulted here; the opcode alone determines the element type.
// Any other opcode trips the assert and emits nothing.
void C2_MacroAssembler::unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  const int vlen_enc = Assembler::AVX_256bit;

  if (opcode == Op_AddReductionVF) {
    vaddps(dst, src1, src2, vlen_enc);
  } else if (opcode == Op_AddReductionVD) {
    vaddpd(dst, src1, src2, vlen_enc);
  } else if (opcode == Op_MulReductionVF) {
    vmulps(dst, src1, src2, vlen_enc);
  } else if (opcode == Op_MulReductionVD) {
    vmulpd(dst, src1, src2, vlen_enc);
  } else {
    assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1, XMMRegister vtmp2) {
Expand All @@ -1852,6 +1874,24 @@ void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
}
}

// Entry point for unordered floating-point add/mul reductions.
// Routes float opcodes (Op_AddReductionVF, Op_MulReductionVF) to unorderedReduceF
// and double opcodes (Op_AddReductionVD, Op_MulReductionVD) to unorderedReduceD,
// forwarding every argument unchanged.
//   opcode       - ideal reduction opcode
//   vlen         - number of vector elements; interpreted by the callee
//   dst, src     - result register and vector input
//   vtmp1, vtmp2 - temporaries; how many are needed depends on vlen
// Any other opcode trips the assert and emits nothing.
void C2_MacroAssembler::unordered_reduce_fp(int opcode, int vlen,
                                            XMMRegister dst, XMMRegister src,
                                            XMMRegister vtmp1, XMMRegister vtmp2) {
  const bool float_reduction  = (opcode == Op_AddReductionVF) || (opcode == Op_MulReductionVF);
  const bool double_reduction = (opcode == Op_AddReductionVD) || (opcode == Op_MulReductionVD);

  if (float_reduction) {
    unorderedReduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
  } else if (double_reduction) {
    unorderedReduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
  } else {
    assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::reduceB(int opcode, int vlen,
Register dst, Register src1, XMMRegister src2,
XMMRegister vtmp1, XMMRegister vtmp2) {
Expand Down Expand Up @@ -1954,6 +1994,45 @@ void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegist
}
}

// Picks the unordered float reduction helper that matches the element count.
//   vlen == 2  : no temporaries (both must be xnoreg)
//   vlen == 4  : one temporary (vtmp2 must be xnoreg)
//   vlen == 8  : two temporaries
//   vlen == 16 : two temporaries
// Any other vlen trips the assert and emits nothing.
void C2_MacroAssembler::unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (vlen == 2) {
    assert(vtmp1 == xnoreg, "");
    assert(vtmp2 == xnoreg, "");
    unorderedReduce2F(opcode, dst, src);
  } else if (vlen == 4) {
    assert(vtmp2 == xnoreg, "");
    unorderedReduce4F(opcode, dst, src, vtmp1);
  } else if (vlen == 8) {
    unorderedReduce8F(opcode, dst, src, vtmp1, vtmp2);
  } else if (vlen == 16) {
    unorderedReduce16F(opcode, dst, src, vtmp1, vtmp2);
  } else {
    assert(false, "wrong vector length");
  }
}

// Picks the unordered double reduction helper that matches the element count.
//   vlen == 2 : no temporaries (both must be xnoreg)
//   vlen == 4 : one temporary (vtmp2 must be xnoreg)
//   vlen == 8 : two temporaries
// Any other vlen trips the assert and emits nothing.
void C2_MacroAssembler::unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (vlen == 2) {
    assert(vtmp1 == xnoreg, "");
    assert(vtmp2 == xnoreg, "");
    unorderedReduce2D(opcode, dst, src);
  } else if (vlen == 4) {
    assert(vtmp2 == xnoreg, "");
    unorderedReduce4D(opcode, dst, src, vtmp1);
  } else if (vlen == 8) {
    unorderedReduce8D(opcode, dst, src, vtmp1, vtmp2);
  } else {
    assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
if (opcode == Op_AddReductionVI) {
if (vtmp1 != src2) {
Expand Down Expand Up @@ -2181,6 +2260,29 @@ void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src,
reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// Final step of the unordered float reduction: combines the two low floats of
// src and leaves the result in dst.
// dst is written before src is read for the last time, so if dst aliased src
// the original element 0 would be lost.
void C2_MacroAssembler::unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src) {
// imm8 0x1 selects source dword 1 for lane 0, i.e. dst[0] = src[1].
pshufd(dst, src, 0x1);
// Combine dst (element 1 in lane 0) with src (element 0 in lane 0).
// NOTE(review): this calls reduce_operation_128, not the packed
// unordered_reduce_operation_128; its body is not visible here - presumably it
// operates on the low lane only. Confirm.
reduce_operation_128(T_FLOAT, opcode, dst, src);
}

// Unordered reduction of four floats in src: folds the upper pair onto the
// lower pair with one packed op, then finishes with unorderedReduce2F.
// vtmp is clobbered; src is only read.
void C2_MacroAssembler::unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
// imm8 0xE selects source dwords 2 and 3 for lanes 0 and 1 of vtmp.
pshufd(vtmp, src, 0xE);
// Packed add/mul: lanes 0 and 1 of vtmp now hold (src[2] op src[0]) and (src[3] op src[1]).
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp, src);
// Reduce the remaining two partial results into dst.
unorderedReduce2F(opcode, dst, vtmp);
}

// Unordered reduction of eight floats in src: folds one half of the vector
// onto the other with a packed 128-bit op, then continues with unorderedReduce4F.
// vtmp1 and vtmp2 are clobbered; src is only read.
void C2_MacroAssembler::unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
// Presumably copies the upper 128 bits of src into vtmp1 (helper not visible here - confirm).
vextractf128_high(vtmp1, src);
// Packed add/mul of vtmp1 with src; the four partial results stay in vtmp1.
unordered_reduce_operation_128(T_FLOAT, opcode, vtmp1, src);
// vtmp2 serves as the scratch register for the 4-element step.
unorderedReduce4F(opcode, dst, vtmp1, vtmp2);
}

// Unordered reduction of sixteen floats in src: folds one half of the vector
// onto the other with a packed 256-bit op, then continues with unorderedReduce8F.
// vtmp1 and vtmp2 are clobbered; src is only read.
void C2_MacroAssembler::unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
// Presumably copies the upper 256 bits of src into vtmp2 (helper not visible here - confirm).
vextractf64x4_high(vtmp2, src);
// vtmp2 = vtmp2 op src, using the AVX_256bit vector length.
unordered_reduce_operation_256(T_FLOAT, opcode, vtmp2, vtmp2, src);
// vtmp2 is passed both as the 8-element input and as the second temporary.
// unorderedReduce8F reads its input in its first two steps and only afterwards
// hands vtmp2 to unorderedReduce4F as scratch, so the reuse does not lose data.
unorderedReduce8F(opcode, dst, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
reduce_operation_128(T_DOUBLE, opcode, dst, src);
pshufd(vtmp, src, 0xE);
Expand All @@ -2199,6 +2301,23 @@ void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, X
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// Final step of the unordered double reduction: combines the two doubles of
// src and leaves the result in dst.
// dst is written before src is read for the last time, so if dst aliased src
// the original element 0 would be lost.
void C2_MacroAssembler::unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src) {
// imm8 0xE selects source dwords 2 and 3 for lanes 0 and 1, i.e. the upper
// double of src lands in the low 64 bits of dst.
pshufd(dst, src, 0xE);
// Combine dst (element 1 in the low double) with src (element 0 in the low double).
// NOTE(review): this calls reduce_operation_128, not the packed
// unordered_reduce_operation_128; its body is not visible here - presumably it
// operates on the low double only. Confirm.
reduce_operation_128(T_DOUBLE, opcode, dst, src);
}

// Unordered reduction of four doubles in src: folds one half of the vector
// onto the other with a packed 128-bit op, then finishes with unorderedReduce2D.
// vtmp is clobbered; src is only read.
void C2_MacroAssembler::unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
// Presumably copies the upper 128 bits of src into vtmp (helper not visible here - confirm).
vextractf128_high(vtmp, src);
// Packed add/mul of vtmp with src; the two partial results stay in vtmp.
unordered_reduce_operation_128(T_DOUBLE, opcode, vtmp, src);
// Reduce the remaining two partial results into dst.
unorderedReduce2D(opcode, dst, vtmp);
}

// Unordered reduction of eight doubles in src: folds one half of the vector
// onto the other with a packed 256-bit op, then continues with unorderedReduce4D.
// vtmp1 and vtmp2 are clobbered; src is only read.
void C2_MacroAssembler::unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
// Presumably copies the upper 256 bits of src into vtmp2 (helper not visible here - confirm).
vextractf64x4_high(vtmp2, src);
// vtmp2 = vtmp2 op src, using the AVX_256bit vector length.
unordered_reduce_operation_256(T_DOUBLE, opcode, vtmp2, vtmp2, src);
// vtmp2 now holds the four partial results; vtmp1 is the scratch register for the next step.
unorderedReduce4D(opcode, dst, vtmp2, vtmp1);
}

// Forwards to MacroAssembler::evmovdqu with all arguments unchanged; no
// additional logic is added at the C2_MacroAssembler level.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, bool merge, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, merge, vector_len);
}
Expand Down
18 changes: 18 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@
void reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1, XMMRegister vtmp2 = xnoreg);
// Unordered (non-strictly ordered) float/double add/mul reduction of the
// vlen-element vector in src into dst. Both temporaries default to xnoreg,
// so callers pass only as many as the vector length requires.
void unordered_reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg);
void reduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void mulreduceB(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void reduceS(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
Expand All @@ -161,6 +164,8 @@
private:
void reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void unorderedReduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void unorderedReduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

// Int Reduction
void reduce2I (int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
Expand Down Expand Up @@ -197,14 +202,27 @@
void reduce8F (int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

// Unordered Float Reduction
// One helper per element count (2, 4, 8, 16); the number of temporaries
// grows with the vector length.
void unorderedReduce2F(int opcode, XMMRegister dst, XMMRegister src);
void unorderedReduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void unorderedReduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void unorderedReduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

// Double Reduction
void reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);
void reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

// Unordered Double Reduction
// One helper per element count (2, 4, 8); the number of temporaries
// grows with the vector length.
void unorderedReduce2D(int opcode, XMMRegister dst, XMMRegister src);
void unorderedReduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp);
void unorderedReduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2);

// Base reduction instruction
void reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src);
void reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void unordered_reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src);
void unordered_reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2);

public:
#ifdef _LP64
Expand Down
132 changes: 126 additions & 6 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -5109,7 +5109,7 @@ instruct reductionL_avx512dq(rRegL dst, rRegL src1, vec src2, vec vtmp1, vec vtm
// =======================Float Reduction==========================================

instruct reductionF128(regF dst, vec src, vec vtmp) %{
predicate(Matcher::vector_length(n->in(2)) <= 4); // src
predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) <= 4); // src
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp);
Expand All @@ -5123,7 +5123,7 @@ instruct reductionF128(regF dst, vec src, vec vtmp) %{
%}

instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
predicate(Matcher::vector_length(n->in(2)) == 8); // src
predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
Expand All @@ -5137,7 +5137,7 @@ instruct reduction8F(regF dst, vec src, vec vtmp1, vec vtmp2) %{
%}

instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
predicate(Matcher::vector_length(n->in(2)) == 16); // src
predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src
match(Set dst (AddReductionVF dst src));
match(Set dst (MulReductionVF dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
Expand All @@ -5150,10 +5150,79 @@ instruct reduction16F(regF dst, legVec src, legVec vtmp1, legVec vtmp2) %{
ins_pipe( pipe_slow );
%}


instruct unordered_reduction2F(regF dst, regF src1, vec src2) %{
// Non-strictly ordered floating-point add/mul reduction for floats. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
// src1 contains the reduction identity; the encoding below does not read it.
// Applies only when the node does not require strict ordering and the vector
// input has exactly 2 elements.
predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
match(Set dst (AddReductionVF src1 src2));
match(Set dst (MulReductionVF src1 src2));
// NOTE(review): the emitted sequence writes dst before its last read of src2;
// TEMP dst presumably keeps the two in different registers - confirm.
effect(TEMP dst);
format %{ "vector_reduction_float $dst,$src1,$src2 ;" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vlen = Matcher::vector_length(this, $src2);
// No temporaries are passed for the 2-element case.
__ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

instruct unordered_reduction4F(regF dst, regF src1, vec src2, vec vtmp) %{
// Non-strictly ordered floating-point add/mul reduction for floats. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
// src1 contains the reduction identity; the encoding below does not read it.
// Applies only when the node does not require strict ordering and the vector
// input has exactly 4 elements.
predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
match(Set dst (AddReductionVF src1 src2));
match(Set dst (MulReductionVF src1 src2));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vlen = Matcher::vector_length(this, $src2);
// One temporary is passed for the 4-element case.
__ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

instruct unordered_reduction8F(regF dst, regF src1, vec src2, vec vtmp1, vec vtmp2) %{
// Non-strictly ordered floating-point add/mul reduction for floats. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
// src1 contains the reduction identity; the encoding below does not read it.
// Applies only when the node does not require strict ordering and the vector
// input has exactly 8 elements.
predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
match(Set dst (AddReductionVF src1 src2));
match(Set dst (MulReductionVF src1 src2));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vlen = Matcher::vector_length(this, $src2);
// Two temporaries are passed for the 8-element case.
__ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

instruct unordered_reduction16F(regF dst, regF src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
// Non-strictly ordered floating-point add/mul reduction for floats. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
// src1 contains the reduction identity; the encoding below does not read it.
// Applies only when the node does not require strict ordering and the vector
// input has exactly 16 elements.
predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 16); // src2
match(Set dst (AddReductionVF src1 src2));
match(Set dst (MulReductionVF src1 src2));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_float $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vlen = Matcher::vector_length(this, $src2);
// Two temporaries are passed for the 16-element case.
__ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

// =======================Double Reduction==========================================

instruct reduction2D(regD dst, vec src, vec vtmp) %{
predicate(Matcher::vector_length(n->in(2)) == 2); // src
predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp);
Expand All @@ -5167,7 +5236,7 @@ instruct reduction2D(regD dst, vec src, vec vtmp) %{
%}

instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
predicate(Matcher::vector_length(n->in(2)) == 4); // src
predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
Expand All @@ -5181,7 +5250,7 @@ instruct reduction4D(regD dst, vec src, vec vtmp1, vec vtmp2) %{
%}

instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
predicate(Matcher::vector_length(n->in(2)) == 8); // src
predicate(n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src
match(Set dst (AddReductionVD dst src));
match(Set dst (MulReductionVD dst src));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
Expand All @@ -5194,6 +5263,57 @@ instruct reduction8D(regD dst, legVec src, legVec vtmp1, legVec vtmp2) %{
ins_pipe( pipe_slow );
%}

instruct unordered_reduction2D(regD dst, regD src1, vec src2) %{
// Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
// src1 contains the reduction identity; the encoding below does not read it.
// Applies only when the node does not require strict ordering and the vector
// input has exactly 2 elements.
predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 2); // src2
match(Set dst (AddReductionVD src1 src2));
match(Set dst (MulReductionVD src1 src2));
// NOTE(review): the emitted sequence writes dst before its last read of src2;
// TEMP dst presumably keeps the two in different registers - confirm.
effect(TEMP dst);
format %{ "vector_reduction_double $dst,$src1,$src2 ;" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vlen = Matcher::vector_length(this, $src2);
// No temporaries are passed for the 2-element case.
__ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

instruct unordered_reduction4D(regD dst, regD src1, vec src2, vec vtmp) %{
// Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
// src1 contains the reduction identity; the encoding below does not read it.
// Applies only when the node does not require strict ordering and the vector
// input has exactly 4 elements.
predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 4); // src2
match(Set dst (AddReductionVD src1 src2));
match(Set dst (MulReductionVD src1 src2));
effect(TEMP dst, TEMP vtmp);
format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vlen = Matcher::vector_length(this, $src2);
// One temporary is passed for the 4-element case.
__ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

instruct unordered_reduction8D(regD dst, regD src1, legVec src2, legVec vtmp1, legVec vtmp2) %{
// Non-strictly ordered floating-point add/mul reduction for doubles. This rule is
// intended for the VectorAPI (which allows for non-strictly ordered add/mul reduction).
// src1 contains the reduction identity; the encoding below does not read it.
// Applies only when the node does not require strict ordering and the vector
// input has exactly 8 elements.
predicate(!n->as_Reduction()->requires_strict_order() && Matcher::vector_length(n->in(2)) == 8); // src2
match(Set dst (AddReductionVD src1 src2));
match(Set dst (MulReductionVD src1 src2));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2);
format %{ "vector_reduction_double $dst,$src1,$src2 ; using $vtmp1, $vtmp2 as TEMP" %}
ins_encode %{
int opcode = this->ideal_Opcode();
int vlen = Matcher::vector_length(this, $src2);
// Two temporaries are passed for the 8-element case.
__ unordered_reduce_fp(opcode, vlen, $dst$$XMMRegister, $src2$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

// =======================Byte Reduction==========================================

#ifdef _LP64
Expand Down
Loading

0 comments on commit dc35f3e

Please sign in to comment.