Implement P0952R2: 'A new specification for std::generate_canonical' (

#4740) Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
microsoft · Jun 27, 2024 · f9b0af8 · f9b0af8
1 parent 14b54ca
commit f9b0af8
Show file tree

Hide file tree

Showing 8 changed files with 366 additions and 107 deletions.
diff --git a/stl/inc/random b/stl/inc/random
@@ -261,30 +261,37 @@ private:
     vector<result_type> _Myvec;
 };
 
-_NODISCARD constexpr int _Generate_canonical_iterations(const int _Bits, const uint64_t _Gmin, const uint64_t _Gmax) {
-    // For a URBG type `G` with range == `(G::max() - G::min()) + 1`, returns the number of calls to generate at least
-    // _Bits bits of entropy. Specifically, max(1, ceil(_Bits / log2(range))).
-
-    _STL_INTERNAL_CHECK(0 <= _Bits && _Bits <= 64);
+struct _Urbg_gen_can_params {
+    bool _Rx_is_pow2 = false; // $R$ (in N4981 [rand.util.canonical]/1.2) is a power of 2
+    int _Kx          = 0; // $k$ in N4981 [rand.util.canonical]/1.4
+    uint64_t _Xx     = 0; // $x$ in N4981 [rand.util.canonical]/1.5
+    float _Scale     = 0.0f; // $r^{-d}$, always representable as a float
+    int _Smax_bits   = 0; // number of bits in $R^k$
+};
 
-    if (_Bits == 0 || (_Gmax == UINT64_MAX && _Gmin == 0)) {
-        return 1;
+_NODISCARD constexpr _Urbg_gen_can_params _Generate_canonical_params(const size_t _Bits, const uint64_t _Rm1) {
+    const auto _Rx        = _Unsigned128{_Rm1} + 1; // $R$ in N4981 [rand.util.canonical]/1.2
+    const auto _Target    = _Unsigned128{1} << _Bits; // $r^d$
+    _Unsigned128 _Product = 1;
+    _Urbg_gen_can_params _Result;
+    _Result._Rx_is_pow2 = (_Rx & (_Rx - 1)) == 0;
+    _Result._Kx         = 0;
+    while (_Product < _Target) {
+        _Product *= _Rx;
+        ++_Result._Kx;
     }
 
-    const auto _Range  = (_Gmax - _Gmin) + 1;
-    const auto _Target = ~uint64_t{0} >> (64 - _Bits);
-    uint64_t _Prod     = 1;
-    int _Ceil          = 0;
-    while (_Prod <= _Target) {
-        ++_Ceil;
-        if (_Prod > UINT64_MAX / _Range) {
-            break;
-        }
-
-        _Prod *= _Range;
+    _Result._Xx        = static_cast<uint64_t>(_Product / _Target);
+    _Result._Scale     = 1.0f / static_cast<float>(1ULL << _Bits);
+    _Result._Smax_bits = 0;
+    // $S \in [0, _Product)$
+    --_Product;
+    while (_Product > 0) {
+        ++_Result._Smax_bits;
+        _Product >>= 1;
     }
 
-    return _Ceil;
+    return _Result;
 }
 
 _EXPORT_STD template <class _Real, size_t _Bits, class _Gen>
@@ -294,27 +301,110 @@ _NODISCARD _Real generate_canonical(_Gen& _Gx) { // build a floating-point value
     constexpr auto _Digits  = static_cast<size_t>(numeric_limits<_Real>::digits);
     constexpr auto _Minbits = static_cast<int>(_Digits < _Bits ? _Digits : _Bits);
 
-    constexpr auto _Gxmin = static_cast<_Real>((_Gen::min)());
-    constexpr auto _Gxmax = static_cast<_Real>((_Gen::max)());
-    constexpr auto _Rx    = (_Gxmax - _Gxmin) + _Real{1};
+    if constexpr (_Minbits == 0) {
+        return _Real{0};
+    } else {
+        using _Result_uint_type = conditional_t<_Minbits <= 32, uint32_t, uint64_t>;
+
+        constexpr auto _Gxmin  = (_Gen::min)();
+        constexpr auto _Gxmax  = (_Gen::max)();
+        constexpr auto _Params = _Generate_canonical_params(_Minbits, _Gxmax - _Gxmin);
+
+        using _Sx_type = conditional_t<_Params._Smax_bits <= 32, uint32_t,
+            conditional_t<_Params._Smax_bits <= 64, uint64_t, _Unsigned128>>;
+        _Sx_type _Sx;
+
+        if constexpr (_Params._Rx_is_pow2) {
+            // Always needs only one attempt. Multiplications can be replaced with shift/add. Optimize k=1 case.
+            if constexpr (_Params._Kx == 1) {
+                _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
+            } else if constexpr (_Params._Kx > 1) {
+                constexpr int _Rx_bits = _Params._Smax_bits / _Params._Kx;
+                _Sx                    = 0;
+                int _Shift             = 0;
+                for (int _Idx = 0; _Idx < _Params._Kx; ++_Idx) {
+                    _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) << _Shift;
+                    _Shift += _Rx_bits;
+                }
+            } else {
+                _STL_INTERNAL_STATIC_ASSERT(false); // unexpected k
+            }
 
-    constexpr int _Kx = _Generate_canonical_iterations(_Minbits, (_Gen::min)(), (_Gen::max)());
+            _Sx >>= _Params._Smax_bits - _Minbits;
+            return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx)) * static_cast<_Real>(_Params._Scale);
+        } else {
+            constexpr auto _Rx        = _Sx_type{_Gxmax - _Gxmin} + 1u;
+            constexpr _Sx_type _Limit = static_cast<_Sx_type>(_Params._Xx) * (_Sx_type{1} << _Minbits);
+
+            do {
+                // unroll first two iterations to avoid unnecessary multiplications
+                _Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
+                if constexpr (_Params._Kx == 2) {
+                    _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
+                } else if constexpr (_Params._Kx > 2) {
+                    _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
+                    _Sx_type _Factor = _Rx;
+                    for (int _Idx = 2; _Idx < _Params._Kx; ++_Idx) {
+                        _Factor *= _Rx;
+                        _Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Factor;
+                    }
+                }
+            } while (_Sx >= _Limit);
 
-    _Real _Ans{0};
-    _Real _Factor{1};
+            return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx / _Params._Xx))
+                 * static_cast<_Real>(_Params._Scale);
+        }
+    }
+}
 
-    for (int _Idx = 0; _Idx < _Kx; ++_Idx) { // add in another set of bits
-        _Ans += (static_cast<_Real>(_Gx()) - _Gxmin) * _Factor;
-        _Factor *= _Rx;
+template <class _Uint_type, class _Gen, class _Real>
+bool _Nrand_for_tr1(
+    const uint64_t _Rd, const uint64_t _Rx, _Gen& _Gx, _Real& _Val, uint64_t& _Sx_init, uint64_t& _Factor_init) {
+    // Minimally-constexpr generate_canonical algorithm. Will save work and exit if _Factor would overflow.
+    _Uint_type _Sx             = _Sx_init;
+    _Uint_type _Factor         = _Factor_init;
+    const auto _Gxmin          = (_Gx.min)();
+    const auto _Overflow_limit = UINT64_MAX / _Rx;
+
+    _Uint_type _Xx    = 0;
+    _Uint_type _Limit = 0;
+    bool _Init        = false;
+    for (;;) {
+        while (_Factor < _Rd) {
+            if constexpr (is_same_v<_Uint_type, uint64_t>) {
+                if (_Factor > _Overflow_limit) {
+                    _Sx_init     = _Sx;
+                    _Factor_init = _Factor;
+                    return true;
+                }
+            }
+
+            _Sx += (_Gx() - _Gxmin) * _Factor;
+            _Factor *= _Rx;
+        }
+
+        if (!_Init) {
+            _Init  = true;
+            _Xx    = _Factor / _Rd;
+            _Limit = _Xx * _Rd;
+        }
+
+        if (_Sx < _Limit) {
+            break;
+        } else {
+            _Sx     = 0;
+            _Factor = 1;
+        }
     }
 
-    return _Ans / _Factor;
+    _Val = static_cast<_Real>(static_cast<uint64_t>(_Sx / _Xx));
+    return false;
 }
 
 template <class _Gen, class = void>
 struct _Has_static_min_max : false_type {};
 
-// This checks a requirement of N4950 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
+// This checks a requirement of N4981 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
 // to implement the whole concept - we just need to distinguish Standard machinery from tr1 machinery.
 template <class _Gen>
 struct _Has_static_min_max<_Gen, void_t<decltype(bool_constant<(_Gen::min)() < (_Gen::max)()>::value)>> : true_type {};
@@ -323,18 +413,40 @@ template <class _Real, class _Gen>
 _NODISCARD _Real _Nrand_impl(_Gen& _Gx) { // build a floating-point value from random sequence
     _RNG_REQUIRE_REALTYPE(_Nrand_impl, _Real);
 
-    constexpr auto _Digits  = static_cast<size_t>(numeric_limits<_Real>::digits);
-    constexpr auto _Bits    = ~size_t{0};
-    constexpr auto _Minbits = _Digits < _Bits ? _Digits : _Bits;
+    constexpr auto _Digits = static_cast<size_t>(numeric_limits<_Real>::digits);
+
+    if constexpr (_Has_static_min_max<_Gen>::value) {
+        return _STD generate_canonical<_Real, _Digits>(_Gx);
+    } else if constexpr (is_integral_v<typename _Gen::result_type>) {
+        // TRANSITION, for integral tr1 machinery only; Standard machinery can call generate_canonical directly
+        const auto _Gxmax = (_Gx.max)();
+        const auto _Gxmin = (_Gx.min)();
+        _Real _Val{0};
+
+        if (_Gxmax == UINT64_MAX && _Gxmin == 0) {
+            // special case when uint64_t can't hold the full URBG range
+            _Val = static_cast<_Real>(static_cast<uint64_t>(_Gx()) >> (64 - _Digits));
+        } else {
+            constexpr auto _Rd = 1ULL << _Digits;
+            const auto _Rx     = static_cast<uint64_t>(_Gxmax - _Gxmin) + 1;
+            uint64_t _Sx       = 0;
+            uint64_t _Factor   = 1;
+
+            // Try with 64 bits first, upgrade to 128 if necessary.
+            const bool _Would_overflow = _Nrand_for_tr1<uint64_t>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
+            if (_Would_overflow) {
+                _Nrand_for_tr1<_Unsigned128>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
+            }
+        }
 
-    if constexpr (_Has_static_min_max<_Gen>::value && _Minbits <= 64) {
-        return _STD generate_canonical<_Real, _Minbits>(_Gx);
-    } else { // TRANSITION, for tr1 machinery only; Standard machinery can call generate_canonical directly
+        return _STD ldexp(_Val, -static_cast<int>(_Digits));
+    } else {
+        // TRANSITION, the pre-P0952R2 algorithm, for non-integral tr1 machinery only (e.g. subtract_with_carry_01)
         const _Real _Gxmin = static_cast<_Real>((_Gx.min)());
         const _Real _Gxmax = static_cast<_Real>((_Gx.max)());
         const _Real _Rx    = (_Gxmax - _Gxmin) + _Real{1};
 
-        const int _Ceil = static_cast<int>(_STD ceil(static_cast<_Real>(_Minbits) / _STD log2(_Rx)));
+        const int _Ceil = static_cast<int>(_STD ceil(static_cast<_Real>(_Digits) / _STD log2(_Rx)));
         const int _Kx   = _Ceil < 1 ? 1 : _Ceil;
 
         _Real _Ans{0};
@@ -1968,8 +2080,8 @@ public:
     explicit _Rng_from_urng_v2(_Urng& _Func) noexcept : _Ref(_Func) {}
 
     _Diff operator()(_Diff _Index) { // adapt _Urng closed range to [0, _Index)
-        // From Daniel Lemire, "Fast Random Integer Generation in an Interval", ACM Trans. Model. Comput. Simul. 29 (1),
-        // 2019.
+        // From Daniel Lemire, "Fast Random Integer Generation in an Interval",
+        // ACM Trans. Model. Comput. Simul. 29 (1), 2019.
         //
         // Algorithm 5 <-> This Code:
         //   m <-> _Product
@@ -2925,8 +3037,8 @@ public:
 
 private:
     template <class _Engine>
-    result_type _Eval(
-        _Engine& _Eng, const param_type& _Par0) const { // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
+    result_type _Eval(_Engine& _Eng, const param_type& _Par0) const {
+        // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
         _Ty _Res;
         if (_Par0._Tx < 25) { // generate directly
             _Res = 0;

diff --git a/stl/inc/yvals_core.h b/stl/inc/yvals_core.h
@@ -62,6 +62,7 @@
 // P0883R2 Fixing Atomic Initialization
 // P0935R0 Eradicating Unnecessarily Explicit Default Constructors
 // P0941R2 Feature-Test Macros
+// P0952R2 A New Specification For generate_canonical()
 // P0972R0 noexcept For <chrono> zero(), min(), max()
 // P1065R2 constexpr INVOKE
 //     (the std::invoke function only; other components like bind and reference_wrapper are C++20 only)

diff --git a/tests/libcxx/expected_results.txt b/tests/libcxx/expected_results.txt
@@ -73,6 +73,9 @@ std/containers/unord/unord.set/insert_and_emplace_allocator_requirements.pass.cp
 # Bogus test believes that copyability of array<T, 0> must be the same as array<T, 1>
 std/containers/sequences/array/array.cons/implicit_copy.pass.cpp FAIL
 
+# libc++ hasn't implemented P0952R2, which changes the generate_canonical algorithm.
+std/numerics/rand/rand.util/rand.util.canonical/generate_canonical.pass.cpp FAIL
+
 # Test expects __cpp_lib_chrono to have the old value 201611L for P0505R0; we define the C++20 value 201907L for P1466R3.
 std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp FAIL
 

diff --git a/tests/std/test.lst b/tests/std/test.lst
@@ -198,7 +198,6 @@ tests\GH_001850_clog_tied_to_cout
 tests\GH_001858_iostream_exception
 tests\GH_001912_random_distribution_operator_const
 tests\GH_001914_cached_position
-tests\GH_001964_constexpr_generate_canonical
 tests\GH_002030_asan_annotate_string
 tests\GH_002030_asan_annotate_vector
 tests\GH_002039_byte_is_not_trivially_swappable
@@ -512,6 +511,7 @@ tests\P0898R3_identity
 tests\P0912R5_coroutine
 tests\P0919R3_heterogeneous_unordered_lookup
 tests\P0943R6_stdatomic_h
+tests\P0952R2_new_generate_canonical
 tests\P0966R1_string_reserve_should_not_shrink
 tests\P0980R1_constexpr_strings
 tests\P1004R2_constexpr_vector

diff --git a/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst b/tests/std/tests/GH_001017_discrete_distribution_out_of_range/env.lst
@@ -1,4 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-RUNALL_INCLUDE ..\usual_matrix.lst
+# Skip /clr:pure configurations; they pass but take an extremely long time to run, risking test timeouts.
+RUNALL_INCLUDE ..\impure_matrix.lst
diff --git a/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp b/tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp
diff --git a/...1964_constexpr_generate_canonical/env.lst → ...ts/P0952R2_new_generate_canonical/env.lst b/...1964_constexpr_generate_canonical/env.lst → ...ts/P0952R2_new_generate_canonical/env.lst