Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure the scalar Reciprocal*Estimate APIs use AVX512 where possible #101800

Merged
merged 8 commits into from
May 4, 2024
7 changes: 6 additions & 1 deletion src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -1594,7 +1594,7 @@ enum class ProfileChecks : unsigned int
{
CHECK_NONE = 0,
CHECK_HASLIKELIHOOD = 1 << 0, // check all FlowEdges for hasLikelihood
CHECK_LIKELIHOODSUM = 1 << 1, // check block succesor likelihoods sum to 1
CHECK_LIKELIHOODSUM = 1 << 1, // check block succesor likelihoods sum to 1
CHECK_LIKELY = 1 << 2, // fully check likelihood based weights
RAISE_ASSERT = 1 << 3, // assert on check failure
CHECK_ALL_BLOCKS = 1 << 4, // check blocks even if bbHasProfileWeight is false
Expand Down Expand Up @@ -4525,6 +4525,11 @@ class Compiler
CORINFO_THIS_TRANSFORM constraintCallThisTransform,
NamedIntrinsic* pIntrinsicName,
bool* isSpecialIntrinsic = nullptr);
GenTree* impEstimateIntrinsic(CORINFO_METHOD_HANDLE method,
CORINFO_SIG_INFO* sig,
CorInfoType callJitType,
NamedIntrinsic intrinsicName,
bool tailCall);
GenTree* impMathIntrinsic(CORINFO_METHOD_HANDLE method,
CORINFO_SIG_INFO* sig,
var_types callType,
Expand Down
164 changes: 156 additions & 8 deletions src/coreclr/jit/importercalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3125,7 +3125,15 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
// To be fixed in https://github.com/dotnet/runtime/pull/77465
const bool tier0opts = !opts.compDbgCode && !opts.jitFlags->IsSet(JitFlags::JIT_FLAG_MIN_OPT);

if (!mustExpand && tier0opts)
if (tier0opts)
{
// The *Estimate APIs are allowed to differ in behavior across hardware
// so ensure we treat them as "betterToExpand" to get deterministic behavior

betterToExpand |= (ni == NI_System_Math_ReciprocalEstimate);
betterToExpand |= (ni == NI_System_Math_ReciprocalSqrtEstimate);
}
else if (!mustExpand)
{
switch (ni)
{
Expand Down Expand Up @@ -3189,9 +3197,9 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
break;

default:
// Unsafe.* are all small enough to prefer expansions.
// Various intrinsics are all small enough to prefer expansions.
betterToExpand |= ni >= NI_SYSTEM_MATH_START && ni <= NI_SYSTEM_MATH_END;
betterToExpand |= ni >= NI_SRCS_UNSAFE_START && ni <= NI_SRCS_UNSAFE_END;
// Same for these
betterToExpand |= ni >= NI_PRIMITIVE_START && ni <= NI_PRIMITIVE_END;
break;
}
Expand Down Expand Up @@ -4146,6 +4154,13 @@ GenTree* Compiler::impIntrinsic(GenTree* newobjThis,
break;
}

case NI_System_Math_ReciprocalEstimate:
case NI_System_Math_ReciprocalSqrtEstimate:
{
retNode = impEstimateIntrinsic(method, sig, callJitType, ni, tailCall);
break;
}

case NI_System_Array_Clone:
case NI_System_Collections_Generic_Comparer_get_Default:
case NI_System_Collections_Generic_EqualityComparer_get_Default:
Expand Down Expand Up @@ -7413,13 +7428,15 @@ bool Compiler::IsTargetIntrinsic(NamedIntrinsic intrinsicName)
// instructions to directly compute round/ceiling/floor/truncate.

case NI_System_Math_Abs:
case NI_System_Math_ReciprocalEstimate:
case NI_System_Math_ReciprocalSqrtEstimate:
case NI_System_Math_Sqrt:
return true;

case NI_System_Math_Ceiling:
case NI_System_Math_Floor:
case NI_System_Math_Truncate:
case NI_System_Math_Round:
case NI_System_Math_Truncate:
return compOpportunisticallyDependsOn(InstructionSet_SSE41);

case NI_System_Math_FusedMultiplyAdd:
Expand All @@ -7434,11 +7451,13 @@ bool Compiler::IsTargetIntrinsic(NamedIntrinsic intrinsicName)
case NI_System_Math_Abs:
case NI_System_Math_Ceiling:
case NI_System_Math_Floor:
case NI_System_Math_Truncate:
case NI_System_Math_Round:
case NI_System_Math_Sqrt:
case NI_System_Math_Max:
case NI_System_Math_Min:
case NI_System_Math_ReciprocalEstimate:
case NI_System_Math_ReciprocalSqrtEstimate:
case NI_System_Math_Round:
case NI_System_Math_Sqrt:
case NI_System_Math_Truncate:
return true;

case NI_System_Math_FusedMultiplyAdd:
Expand Down Expand Up @@ -7513,6 +7532,8 @@ bool Compiler::IsMathIntrinsic(NamedIntrinsic intrinsicName)
case NI_System_Math_MinMagnitudeNumber:
case NI_System_Math_MinNumber:
case NI_System_Math_Pow:
case NI_System_Math_ReciprocalEstimate:
case NI_System_Math_ReciprocalSqrtEstimate:
case NI_System_Math_Round:
case NI_System_Math_Sin:
case NI_System_Math_Sinh:
Expand Down Expand Up @@ -8730,6 +8751,120 @@ void Compiler::impCheckCanInline(GenTreeCall* call,
}
}

//------------------------------------------------------------------------
// impEstimateIntrinsic: Imports a ReciprocalEstimate or ReciprocalSqrtEstimate intrinsic
//
// Arguments:
// method - The handle of the method being imported
// sig - The signature of the method being imported
// callJitType - The underlying JIT type for the call
// intrinsicName - The intrinsic being imported
// tailCall - true if the method is a tail call; otherwise false
//
// Return Value:
// The hardware intrinsic tree that computes the estimate, or nullptr if no
// matching hardware intrinsic is available for the target
tannergooding marked this conversation as resolved.
Show resolved Hide resolved
//
GenTree* Compiler::impEstimateIntrinsic(CORINFO_METHOD_HANDLE method,
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CC. @dotnet/jit-contrib, this did end up being a runtime issue of sorts.

There's a general assumption made by R2R that the managed implementation (for Corelib in particular) will be consistent across all paths. That was strictly not the case for the *Estimate APIs where they are non-deterministic across hardware by design. -- Put another way, R2R essentially assumes that Isa.IsSupported is equivalent to compOpportunisticallyDependsOn(Isa) for corelib, but it assumes it's equivalent to compExactlyDependsOn(Isa) for other libraries.

As such, the only fix we have today is to move it into the JIT where we can explicitly use compExactlyDependsOn(Isa) instead and force things to be annotated correctly. This has a minor added bonus in that it improves codegen around the method slightly for more complex scenarios and makes it more consistent with the handling of other math APIs.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can it be solved via ExactlyDependsOn attributes on C# side?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not from what I saw. The analyzer will actually warn in the case that you try to do something like that, because of the expectation that the paths are semantically equivalent.

It's detailed more in depth here with some explanations of what is safe vs not safe for corelib:

- Any use of a platform intrinsic in the codebase MUST be wrapped with a call to an associated IsSupported property. This wrapping MUST be done within the same function that uses the hardware intrinsic, OR the function which uses the platform intrinsic must have the `CompExactlyDependsOn` attribute used to indicate that this function will unconditionally call platform intrinsics from some type.

Doing it in the JIT, which is how all the other APIs that map down to 1 instruction are handled, ended up being the simpler approach.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@tannergooding so does this PR solve the non-determinism? R2R still may prejit functions with these math functions, right?

Copy link
Member Author

@tannergooding tannergooding May 2, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so does this PR solve the non-determinism

No, the non-determinism is intentional for these APIs. They are explicitly estimating APIs that are allowed to differ in result based on the hardware they're running on. What this does fix is the non-determinism within a single process.

R2R still may prejit functions with these math functions, right?

The use of compExactlyDependsOn basically tells R2R that this method requires Avx512F and if it isn't present the functionality may differ. That will mark methods that call it as requiring themselves to be jitted.

If we made these recursive and therefore mustExpand, then R2R could leave a call to the method and only have the method jitted instead. But, that's a more complicated change since ReciprocalSqrtEstimate requires us to emit a call to System.Math.Sqrt in the worst case and requires adding handling for that to platforms that don't currently have hardware intrinsic support (like Arm32).

This is a relatively uncommon API (basically just used in specialized high-perf scenarios), so fixing the failing CI legs seemed more important than worrying about the handful of APIs that would be disqualified from R2R (none of which are in corelib or other core libraries, System.Numerics.Tensors is the only place in our own code).

CORINFO_SIG_INFO* sig,
CorInfoType callJitType,
NamedIntrinsic intrinsicName,
bool tailCall)
{
var_types callType = JITtype2varType(callJitType);

assert(varTypeIsFloating(callType));
assert(sig->numArgs == 1);

#if defined(FEATURE_HW_INTRINSICS)
// We use compExactlyDependsOn since these are estimate APIs where
// the behavior is explicitly allowed to differ across machines and
// we want to ensure that it gets marked as such in R2R.

var_types simdType = TYP_UNKNOWN;
NamedIntrinsic intrinsicId = NI_Illegal;

switch (intrinsicName)
{
case NI_System_Math_ReciprocalEstimate:
{
#if defined(TARGET_XARCH)
if (compExactlyDependsOn(InstructionSet_AVX512F))
{
simdType = TYP_SIMD16;
intrinsicId = NI_AVX512F_Reciprocal14Scalar;
}
else if ((callType == TYP_FLOAT) && compExactlyDependsOn(InstructionSet_SSE))
{
simdType = TYP_SIMD16;
intrinsicId = NI_SSE_ReciprocalScalar;
}
#elif defined(TARGET_ARM64)
if (compExactlyDependsOn(InstructionSet_AdvSimd_Arm64))
{
simdType = TYP_SIMD8;
intrinsicId = NI_AdvSimd_Arm64_ReciprocalEstimateScalar;
}
#endif // TARGET_ARM64
break;
}

case NI_System_Math_ReciprocalSqrtEstimate:
{
#if defined(TARGET_XARCH)
if (compExactlyDependsOn(InstructionSet_AVX512F))
{
simdType = TYP_SIMD16;
intrinsicId = NI_AVX512F_ReciprocalSqrt14Scalar;
}
else if ((callType == TYP_FLOAT) && compExactlyDependsOn(InstructionSet_SSE))
{
simdType = TYP_SIMD16;
intrinsicId = NI_SSE_ReciprocalSqrtScalar;
}
#elif defined(TARGET_ARM64)
if (compExactlyDependsOn(InstructionSet_AdvSimd_Arm64))
{
simdType = TYP_SIMD8;
intrinsicId = NI_AdvSimd_Arm64_ReciprocalSquareRootEstimateScalar;
}
#endif // TARGET_ARM64
break;
}

default:
{
unreached();
}
}

if (intrinsicId != NI_Illegal)
{
unsigned simdSize = 0;

if (simdType == TYP_SIMD8)
{
simdSize = 8;
}
else
{
assert(simdType == TYP_SIMD16);
simdSize = 16;
}

GenTree* op1 = impPopStack().val;

op1 = gtNewSimdCreateScalarUnsafeNode(simdType, op1, callJitType, simdSize);
op1 = gtNewSimdHWIntrinsicNode(simdType, op1, intrinsicId, callJitType, simdSize);

return gtNewSimdToScalarNode(callType, op1, callJitType, simdSize);
}
#endif // FEATURE_HW_INTRINSICS

// TODO-CQ: Returning this as an intrinsic blocks inlining and is undesirable
// return impMathIntrinsic(method, sig, callType, intrinsicName, tailCall);

return nullptr;
}

GenTree* Compiler::impMathIntrinsic(CORINFO_METHOD_HANDLE method,
CORINFO_SIG_INFO* sig,
var_types callType,
Expand Down Expand Up @@ -10339,7 +10474,20 @@ NamedIntrinsic Compiler::lookupPrimitiveFloatNamedIntrinsic(CORINFO_METHOD_HANDL

case 'R':
{
if (strcmp(methodName, "Round") == 0)
if (strncmp(methodName, "Reciprocal", 10) == 0)
{
methodName += 10;

if (strcmp(methodName, "Estimate") == 0)
{
result = NI_System_Math_ReciprocalEstimate;
}
else if (strcmp(methodName, "SqrtEstimate") == 0)
{
result = NI_System_Math_ReciprocalSqrtEstimate;
}
}
else if (strcmp(methodName, "Round") == 0)
{
result = NI_System_Math_Round;
}
Expand Down
2 changes: 2 additions & 0 deletions src/coreclr/jit/namedintrinsiclist.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ enum NamedIntrinsic : unsigned short
NI_System_Math_MinMagnitudeNumber,
NI_System_Math_MinNumber,
NI_System_Math_Pow,
NI_System_Math_ReciprocalEstimate,
NI_System_Math_ReciprocalSqrtEstimate,
NI_System_Math_Round,
NI_System_Math_Sin,
NI_System_Math_Sinh,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,12 @@ public static void ReciprocalSqrtEstimate<T>(ReadOnlySpan<T> x, Span<T> destinat

public static Vector128<T> Invoke(Vector128<T> x)
{
if (Avx512F.VL.IsSupported)
{
if (typeof(T) == typeof(float)) return Avx512F.VL.Reciprocal14(x.AsSingle()).As<float, T>();
if (typeof(T) == typeof(double)) return Avx512F.VL.Reciprocal14(x.AsDouble()).As<double, T>();
}

if (Sse.IsSupported)
{
if (typeof(T) == typeof(float)) return Sse.Reciprocal(x.AsSingle()).As<float, T>();
Expand All @@ -115,6 +121,12 @@ public static Vector128<T> Invoke(Vector128<T> x)

public static Vector256<T> Invoke(Vector256<T> x)
{
if (Avx512F.VL.IsSupported)
{
if (typeof(T) == typeof(float)) return Avx512F.VL.Reciprocal14(x.AsSingle()).As<float, T>();
if (typeof(T) == typeof(double)) return Avx512F.VL.Reciprocal14(x.AsDouble()).As<double, T>();
}

if (Avx.IsSupported)
{
if (typeof(T) == typeof(float)) return Avx.Reciprocal(x.AsSingle()).As<float, T>();
Expand Down Expand Up @@ -143,6 +155,12 @@ public static Vector512<T> Invoke(Vector512<T> x)

public static Vector128<T> Invoke(Vector128<T> x)
{
if (Avx512F.VL.IsSupported)
{
if (typeof(T) == typeof(float)) return Avx512F.VL.ReciprocalSqrt14(x.AsSingle()).As<float, T>();
if (typeof(T) == typeof(double)) return Avx512F.VL.ReciprocalSqrt14(x.AsDouble()).As<double, T>();
}

if (Sse.IsSupported)
{
if (typeof(T) == typeof(float)) return Sse.ReciprocalSqrt(x.AsSingle()).As<float, T>();
Expand All @@ -163,6 +181,12 @@ public static Vector128<T> Invoke(Vector128<T> x)

public static Vector256<T> Invoke(Vector256<T> x)
{
if (Avx512F.VL.IsSupported)
{
if (typeof(T) == typeof(float)) return Avx512F.VL.ReciprocalSqrt14(x.AsSingle()).As<float, T>();
if (typeof(T) == typeof(double)) return Avx512F.VL.ReciprocalSqrt14(x.AsDouble()).As<double, T>();
}

if (Avx.IsSupported)
{
if (typeof(T) == typeof(float)) return Avx.ReciprocalSqrt(x.AsSingle()).As<float, T>();
Expand Down
2 changes: 2 additions & 0 deletions src/libraries/System.Private.CoreLib/src/System/Double.cs
Original file line number Diff line number Diff line change
Expand Up @@ -865,9 +865,11 @@ bool IFloatingPoint<double>.TryWriteSignificandLittleEndian(Span<byte> destinati
public static double Lerp(double value1, double value2, double amount) => (value1 * (1.0 - amount)) + (value2 * amount);

/// <inheritdoc cref="IFloatingPointIeee754{TSelf}.ReciprocalEstimate(TSelf)" />
[Intrinsic]
public static double ReciprocalEstimate(double x) => Math.ReciprocalEstimate(x);

/// <inheritdoc cref="IFloatingPointIeee754{TSelf}.ReciprocalSqrtEstimate(TSelf)" />
[Intrinsic]
public static double ReciprocalSqrtEstimate(double x) => Math.ReciprocalSqrtEstimate(x);

/// <inheritdoc cref="IFloatingPointIeee754{TSelf}.ScaleB(TSelf, int)" />
Expand Down
24 changes: 4 additions & 20 deletions src/libraries/System.Private.CoreLib/src/System/Math.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1195,19 +1195,11 @@ public static double MinMagnitude(double x, double y)
/// <para>On ARM64 hardware this may use the <c>FRECPE</c> instruction which performs a single Newton-Raphson iteration.</para>
/// <para>On hardware without specialized support, this may just return <c>1.0 / d</c>.</para>
/// </remarks>
[Intrinsic]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static double ReciprocalEstimate(double d)
{
// x86 doesn't provide an estimate instruction for double-precision reciprocal

if (AdvSimd.Arm64.IsSupported)
{
return AdvSimd.Arm64.ReciprocalEstimateScalar(Vector64.CreateScalar(d)).ToScalar();
}
else
{
return 1.0 / d;
}
return 1.0 / d;
}

/// <summary>Returns an estimate of the reciprocal square root of a specified number.</summary>
Expand All @@ -1217,19 +1209,11 @@ public static double ReciprocalEstimate(double d)
/// <para>On ARM64 hardware this may use the <c>FRSQRTE</c> instruction which performs a single Newton-Raphson iteration.</para>
/// <para>On hardware without specialized support, this may just return <c>1.0 / Sqrt(d)</c>.</para>
/// </remarks>
[Intrinsic]
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static double ReciprocalSqrtEstimate(double d)
{
// x86 doesn't provide an estimate instruction for double-precision reciprocal square root

if (AdvSimd.Arm64.IsSupported)
{
return AdvSimd.Arm64.ReciprocalSquareRootEstimateScalar(Vector64.CreateScalar(d)).ToScalar();
}
else
{
return 1.0 / Sqrt(d);
}
return 1.0 / Sqrt(d);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand Down
Loading
Loading