Skip to content

Commit

Permalink
[APFloat] Add APFloat support for E8M0 type (llvm#107127)
Browse files Browse the repository at this point in the history
This patch adds an APFloat type for unsigned E8M0 format. This format is
used for representing the "scale-format" in the MX specification:
(section 5.4)

https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf

This format does not support {Inf, denorms, zeroes}. Like FP32, this
format's exponents are 8-bits (all bits here) and the bias value is 127.
However, it differs from IEEE-FP32 in that the minExponent is -127
(instead of -126). There are updates done in the APFloat utility
functions to handle these constraints for this format.

* The bias calculation is different and convertIEEE* APIs are updated to
handle this.
* Since there are no significand bits, the isSignificandAll{Zeroes/Ones}
methods are updated accordingly.
* Although the format does not have any precision, the precision bit in
the fltSemantics is set to 1 for consistency with APFloat's internal
representation.
* Many utility functions are updated to handle the fact that this format
does not support Zero.
* Provide a separate initFromAPInt() implementation to handle the quirks
of the format.
* Add specific tests to verify the range of values for this format.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
  • Loading branch information
durga4github authored and Sterling-Augustine committed Oct 3, 2024
1 parent da9f84a commit f017956
Show file tree
Hide file tree
Showing 3 changed files with 653 additions and 30 deletions.
24 changes: 23 additions & 1 deletion llvm/include/llvm/ADT/APFloat.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,13 @@ struct APFloatBase {
// improved range compared to half (16-bit) formats, at (potentially)
// greater throughput than single precision (32-bit) formats.
S_FloatTF32,
// 8-bit floating point number with (all the) 8 bits for the exponent
// like in FP32. There are no zeroes, no infinities, and no denormal values.
// This format has unsigned representation only. (U -> Unsigned only).
// NaN is represented with all bits set to 1. Bias is 127.
// This format represents the scale data type in the MX specification from:
// https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
S_Float8E8M0FNU,
// 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
// types, there are no infinity or NaN values. The format is detailed in
// https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
Expand Down Expand Up @@ -229,6 +236,7 @@ struct APFloatBase {
static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
static const fltSemantics &Float8E3M4() LLVM_READNONE;
static const fltSemantics &FloatTF32() LLVM_READNONE;
static const fltSemantics &Float8E8M0FNU() LLVM_READNONE;
static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
static const fltSemantics &Float4E2M1FN() LLVM_READNONE;
Expand Down Expand Up @@ -581,7 +589,8 @@ class IEEEFloat final : public APFloatBase {
integerPart addSignificand(const IEEEFloat &);
integerPart subtractSignificand(const IEEEFloat &, integerPart);
lostFraction addOrSubtractSignificand(const IEEEFloat &, bool subtract);
lostFraction multiplySignificand(const IEEEFloat &, IEEEFloat);
lostFraction multiplySignificand(const IEEEFloat &, IEEEFloat,
bool ignoreAddend = false);
lostFraction multiplySignificand(const IEEEFloat&);
lostFraction divideSignificand(const IEEEFloat &);
void incrementSignificand();
Expand All @@ -591,6 +600,7 @@ class IEEEFloat final : public APFloatBase {
unsigned int significandLSB() const;
unsigned int significandMSB() const;
void zeroSignificand();
unsigned int getNumHighBits() const;
/// Return true if the significand excluding the integral bit is all ones.
bool isSignificandAllOnes() const;
bool isSignificandAllOnesExceptLSB() const;
Expand Down Expand Up @@ -652,6 +662,7 @@ class IEEEFloat final : public APFloatBase {
APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
APInt convertFloat8E3M4APFloatToAPInt() const;
APInt convertFloatTF32APFloatToAPInt() const;
APInt convertFloat8E8M0FNUAPFloatToAPInt() const;
APInt convertFloat6E3M2FNAPFloatToAPInt() const;
APInt convertFloat6E2M3FNAPFloatToAPInt() const;
APInt convertFloat4E2M1FNAPFloatToAPInt() const;
Expand All @@ -672,6 +683,7 @@ class IEEEFloat final : public APFloatBase {
void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
void initFromFloat8E3M4APInt(const APInt &api);
void initFromFloatTF32APInt(const APInt &api);
void initFromFloat8E8M0FNUAPInt(const APInt &api);
void initFromFloat6E3M2FNAPInt(const APInt &api);
void initFromFloat6E2M3FNAPInt(const APInt &api);
void initFromFloat4E2M1FNAPInt(const APInt &api);
Expand Down Expand Up @@ -1079,6 +1091,9 @@ class APFloat : public APFloatBase {
/// \param Semantics - type float semantics
static APFloat getAllOnesValue(const fltSemantics &Semantics);

/// Returns true if the given semantics supports either NaN or Infinity.
///
/// \param Sem - type float semantics
static bool hasNanOrInf(const fltSemantics &Sem) {
switch (SemanticsToEnum(Sem)) {
default:
Expand All @@ -1091,6 +1106,13 @@ class APFloat : public APFloatBase {
}
}

/// Returns true if the given semantics has actual significand.
///
/// \param Sem - type float semantics
static bool hasSignificand(const fltSemantics &Sem) {
return &Sem != &Float8E8M0FNU();
}

/// Used to insert APFloat objects, or objects that contain APFloat objects,
/// into FoldingSets.
void Profile(FoldingSetNodeID &NID) const;
Expand Down
Loading

0 comments on commit f017956

Please sign in to comment.