Skip to content

Commit

Permalink
Implement P0952R2: 'A new specification for std::generate_canonical' (
Browse files Browse the repository at this point in the history
#4740)

Co-authored-by: Stephan T. Lavavej <stl@nuwen.net>
  • Loading branch information
MattStephanson and StephanTLavavej authored Jun 27, 2024
1 parent 14b54ca commit f9b0af8
Show file tree
Hide file tree
Showing 8 changed files with 366 additions and 107 deletions.
194 changes: 153 additions & 41 deletions stl/inc/random
Original file line number Diff line number Diff line change
Expand Up @@ -261,30 +261,37 @@ private:
vector<result_type> _Myvec;
};

_NODISCARD constexpr int _Generate_canonical_iterations(const int _Bits, const uint64_t _Gmin, const uint64_t _Gmax) {
// For a URBG type `G` with range == `(G::max() - G::min()) + 1`, returns the number of calls to generate at least
// _Bits bits of entropy. Specifically, max(1, ceil(_Bits / log2(range))).

_STL_INTERNAL_CHECK(0 <= _Bits && _Bits <= 64);
struct _Urbg_gen_can_params {
bool _Rx_is_pow2 = false; // $R$ (in N4981 [rand.util.canonical]/1.2) is a power of 2
int _Kx = 0; // $k$ in N4981 [rand.util.canonical]/1.4
uint64_t _Xx = 0; // $x$ in N4981 [rand.util.canonical]/1.5
float _Scale = 0.0f; // $r^{-d}$, always representable as a float
int _Smax_bits = 0; // number of bits in $R^k$
};

if (_Bits == 0 || (_Gmax == UINT64_MAX && _Gmin == 0)) {
return 1;
_NODISCARD constexpr _Urbg_gen_can_params _Generate_canonical_params(const size_t _Bits, const uint64_t _Rm1) {
const auto _Rx = _Unsigned128{_Rm1} + 1; // $R$ in N4981 [rand.util.canonical]/1.2
const auto _Target = _Unsigned128{1} << _Bits; // $r^d$
_Unsigned128 _Product = 1;
_Urbg_gen_can_params _Result;
_Result._Rx_is_pow2 = (_Rx & (_Rx - 1)) == 0;
_Result._Kx = 0;
while (_Product < _Target) {
_Product *= _Rx;
++_Result._Kx;
}

const auto _Range = (_Gmax - _Gmin) + 1;
const auto _Target = ~uint64_t{0} >> (64 - _Bits);
uint64_t _Prod = 1;
int _Ceil = 0;
while (_Prod <= _Target) {
++_Ceil;
if (_Prod > UINT64_MAX / _Range) {
break;
}

_Prod *= _Range;
_Result._Xx = static_cast<uint64_t>(_Product / _Target);
_Result._Scale = 1.0f / static_cast<float>(1ULL << _Bits);
_Result._Smax_bits = 0;
// $S \in [0, _Product)$
--_Product;
while (_Product > 0) {
++_Result._Smax_bits;
_Product >>= 1;
}

return _Ceil;
return _Result;
}

_EXPORT_STD template <class _Real, size_t _Bits, class _Gen>
Expand All @@ -294,27 +301,110 @@ _NODISCARD _Real generate_canonical(_Gen& _Gx) { // build a floating-point value
constexpr auto _Digits = static_cast<size_t>(numeric_limits<_Real>::digits);
constexpr auto _Minbits = static_cast<int>(_Digits < _Bits ? _Digits : _Bits);

constexpr auto _Gxmin = static_cast<_Real>((_Gen::min)());
constexpr auto _Gxmax = static_cast<_Real>((_Gen::max)());
constexpr auto _Rx = (_Gxmax - _Gxmin) + _Real{1};
if constexpr (_Minbits == 0) {
return _Real{0};
} else {
using _Result_uint_type = conditional_t<_Minbits <= 32, uint32_t, uint64_t>;

constexpr auto _Gxmin = (_Gen::min)();
constexpr auto _Gxmax = (_Gen::max)();
constexpr auto _Params = _Generate_canonical_params(_Minbits, _Gxmax - _Gxmin);

using _Sx_type = conditional_t<_Params._Smax_bits <= 32, uint32_t,
conditional_t<_Params._Smax_bits <= 64, uint64_t, _Unsigned128>>;
_Sx_type _Sx;

if constexpr (_Params._Rx_is_pow2) {
// Always needs only one attempt. Multiplications can be replaced with shift/add. Optimize k=1 case.
if constexpr (_Params._Kx == 1) {
_Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
} else if constexpr (_Params._Kx > 1) {
constexpr int _Rx_bits = _Params._Smax_bits / _Params._Kx;
_Sx = 0;
int _Shift = 0;
for (int _Idx = 0; _Idx < _Params._Kx; ++_Idx) {
_Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) << _Shift;
_Shift += _Rx_bits;
}
} else {
_STL_INTERNAL_STATIC_ASSERT(false); // unexpected k
}

constexpr int _Kx = _Generate_canonical_iterations(_Minbits, (_Gen::min)(), (_Gen::max)());
_Sx >>= _Params._Smax_bits - _Minbits;
return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx)) * static_cast<_Real>(_Params._Scale);
} else {
constexpr auto _Rx = _Sx_type{_Gxmax - _Gxmin} + 1u;
constexpr _Sx_type _Limit = static_cast<_Sx_type>(_Params._Xx) * (_Sx_type{1} << _Minbits);

do {
// unroll first two iterations to avoid unnecessary multiplications
_Sx = static_cast<_Sx_type>(_Gx() - _Gxmin);
if constexpr (_Params._Kx == 2) {
_Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
} else if constexpr (_Params._Kx > 2) {
_Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Rx;
_Sx_type _Factor = _Rx;
for (int _Idx = 2; _Idx < _Params._Kx; ++_Idx) {
_Factor *= _Rx;
_Sx += static_cast<_Sx_type>(_Gx() - _Gxmin) * _Factor;
}
}
} while (_Sx >= _Limit);

_Real _Ans{0};
_Real _Factor{1};
return static_cast<_Real>(static_cast<_Result_uint_type>(_Sx / _Params._Xx))
* static_cast<_Real>(_Params._Scale);
}
}
}

for (int _Idx = 0; _Idx < _Kx; ++_Idx) { // add in another set of bits
_Ans += (static_cast<_Real>(_Gx()) - _Gxmin) * _Factor;
_Factor *= _Rx;
template <class _Uint_type, class _Gen, class _Real>
bool _Nrand_for_tr1(
const uint64_t _Rd, const uint64_t _Rx, _Gen& _Gx, _Real& _Val, uint64_t& _Sx_init, uint64_t& _Factor_init) {
// Minimally-constexpr generate_canonical algorithm. Will save work and exit if _Factor would overflow.
_Uint_type _Sx = _Sx_init;
_Uint_type _Factor = _Factor_init;
const auto _Gxmin = (_Gx.min)();
const auto _Overflow_limit = UINT64_MAX / _Rx;

_Uint_type _Xx = 0;
_Uint_type _Limit = 0;
bool _Init = false;
for (;;) {
while (_Factor < _Rd) {
if constexpr (is_same_v<_Uint_type, uint64_t>) {
if (_Factor > _Overflow_limit) {
_Sx_init = _Sx;
_Factor_init = _Factor;
return true;
}
}

_Sx += (_Gx() - _Gxmin) * _Factor;
_Factor *= _Rx;
}

if (!_Init) {
_Init = true;
_Xx = _Factor / _Rd;
_Limit = _Xx * _Rd;
}

if (_Sx < _Limit) {
break;
} else {
_Sx = 0;
_Factor = 1;
}
}

return _Ans / _Factor;
_Val = static_cast<_Real>(static_cast<uint64_t>(_Sx / _Xx));
return false;
}

template <class _Gen, class = void>
struct _Has_static_min_max : false_type {};

// This checks a requirement of N4950 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
// This checks a requirement of N4981 [rand.req.urng] `concept uniform_random_bit_generator` but doesn't attempt
// to implement the whole concept - we just need to distinguish Standard machinery from tr1 machinery.
template <class _Gen>
struct _Has_static_min_max<_Gen, void_t<decltype(bool_constant<(_Gen::min)() < (_Gen::max)()>::value)>> : true_type {};
Expand All @@ -323,18 +413,40 @@ template <class _Real, class _Gen>
_NODISCARD _Real _Nrand_impl(_Gen& _Gx) { // build a floating-point value from random sequence
_RNG_REQUIRE_REALTYPE(_Nrand_impl, _Real);

constexpr auto _Digits = static_cast<size_t>(numeric_limits<_Real>::digits);
constexpr auto _Bits = ~size_t{0};
constexpr auto _Minbits = _Digits < _Bits ? _Digits : _Bits;
constexpr auto _Digits = static_cast<size_t>(numeric_limits<_Real>::digits);

if constexpr (_Has_static_min_max<_Gen>::value) {
return _STD generate_canonical<_Real, _Digits>(_Gx);
} else if constexpr (is_integral_v<typename _Gen::result_type>) {
// TRANSITION, for integral tr1 machinery only; Standard machinery can call generate_canonical directly
const auto _Gxmax = (_Gx.max)();
const auto _Gxmin = (_Gx.min)();
_Real _Val{0};

if (_Gxmax == UINT64_MAX && _Gxmin == 0) {
// special case when uint64_t can't hold the full URBG range
_Val = static_cast<_Real>(static_cast<uint64_t>(_Gx()) >> (64 - _Digits));
} else {
constexpr auto _Rd = 1ULL << _Digits;
const auto _Rx = static_cast<uint64_t>(_Gxmax - _Gxmin) + 1;
uint64_t _Sx = 0;
uint64_t _Factor = 1;

// Try with 64 bits first, upgrade to 128 if necessary.
const bool _Would_overflow = _Nrand_for_tr1<uint64_t>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
if (_Would_overflow) {
_Nrand_for_tr1<_Unsigned128>(_Rd, _Rx, _Gx, _Val, _Sx, _Factor);
}
}

if constexpr (_Has_static_min_max<_Gen>::value && _Minbits <= 64) {
return _STD generate_canonical<_Real, _Minbits>(_Gx);
} else { // TRANSITION, for tr1 machinery only; Standard machinery can call generate_canonical directly
return _STD ldexp(_Val, -static_cast<int>(_Digits));
} else {
// TRANSITION, the pre-P0952R2 algorithm, for non-integral tr1 machinery only (e.g. subtract_with_carry_01)
const _Real _Gxmin = static_cast<_Real>((_Gx.min)());
const _Real _Gxmax = static_cast<_Real>((_Gx.max)());
const _Real _Rx = (_Gxmax - _Gxmin) + _Real{1};

const int _Ceil = static_cast<int>(_STD ceil(static_cast<_Real>(_Minbits) / _STD log2(_Rx)));
const int _Ceil = static_cast<int>(_STD ceil(static_cast<_Real>(_Digits) / _STD log2(_Rx)));
const int _Kx = _Ceil < 1 ? 1 : _Ceil;

_Real _Ans{0};
Expand Down Expand Up @@ -1968,8 +2080,8 @@ public:
explicit _Rng_from_urng_v2(_Urng& _Func) noexcept : _Ref(_Func) {}

_Diff operator()(_Diff _Index) { // adapt _Urng closed range to [0, _Index)
// From Daniel Lemire, "Fast Random Integer Generation in an Interval", ACM Trans. Model. Comput. Simul. 29 (1),
// 2019.
// From Daniel Lemire, "Fast Random Integer Generation in an Interval",
// ACM Trans. Model. Comput. Simul. 29 (1), 2019.
//
// Algorithm 5 <-> This Code:
// m <-> _Product
Expand Down Expand Up @@ -2925,8 +3037,8 @@ public:

private:
template <class _Engine>
result_type _Eval(
_Engine& _Eng, const param_type& _Par0) const { // Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
result_type _Eval(_Engine& _Eng, const param_type& _Par0) const {
// Press et al., Numerical Recipes in C, 2nd ed., pp 295-296.
_Ty _Res;
if (_Par0._Tx < 25) { // generate directly
_Res = 0;
Expand Down
1 change: 1 addition & 0 deletions stl/inc/yvals_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
// P0883R2 Fixing Atomic Initialization
// P0935R0 Eradicating Unnecessarily Explicit Default Constructors
// P0941R2 Feature-Test Macros
// P0952R2 A New Specification For generate_canonical()
// P0972R0 noexcept For <chrono> zero(), min(), max()
// P1065R2 constexpr INVOKE
// (the std::invoke function only; other components like bind and reference_wrapper are C++20 only)
Expand Down
3 changes: 3 additions & 0 deletions tests/libcxx/expected_results.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ std/containers/unord/unord.set/insert_and_emplace_allocator_requirements.pass.cp
# Bogus test believes that copyability of array<T, 0> must be the same as array<T, 1>
std/containers/sequences/array/array.cons/implicit_copy.pass.cpp FAIL

# libc++ hasn't implemented P0952R2, which changes the generate_canonical algorithm.
std/numerics/rand/rand.util/rand.util.canonical/generate_canonical.pass.cpp FAIL

# Test expects __cpp_lib_chrono to have the old value 201611L for P0505R0; we define the C++20 value 201907L for P1466R3.
std/language.support/support.limits/support.limits.general/chrono.version.compile.pass.cpp FAIL

Expand Down
2 changes: 1 addition & 1 deletion tests/std/test.lst
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,6 @@ tests\GH_001850_clog_tied_to_cout
tests\GH_001858_iostream_exception
tests\GH_001912_random_distribution_operator_const
tests\GH_001914_cached_position
tests\GH_001964_constexpr_generate_canonical
tests\GH_002030_asan_annotate_string
tests\GH_002030_asan_annotate_vector
tests\GH_002039_byte_is_not_trivially_swappable
Expand Down Expand Up @@ -512,6 +511,7 @@ tests\P0898R3_identity
tests\P0912R5_coroutine
tests\P0919R3_heterogeneous_unordered_lookup
tests\P0943R6_stdatomic_h
tests\P0952R2_new_generate_canonical
tests\P0966R1_string_reserve_should_not_shrink
tests\P0980R1_constexpr_strings
tests\P1004R2_constexpr_vector
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

RUNALL_INCLUDE ..\usual_matrix.lst
# Skip /clr:pure configurations; they pass but take an extremely long time to run, risking test timeouts.
RUNALL_INCLUDE ..\impure_matrix.lst
64 changes: 0 additions & 64 deletions tests/std/tests/GH_001964_constexpr_generate_canonical/test.cpp

This file was deleted.

Loading

0 comments on commit f9b0af8

Please sign in to comment.