Skip to content

Commit

Permalink
Add native sha compression.
Browse files Browse the repository at this point in the history
  • Loading branch information
evoskuil committed Nov 26, 2024
1 parent fddf944 commit 0d700f1
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 19 deletions.
13 changes: 13 additions & 0 deletions include/bitcoin/system/hash/sha/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,19 @@ class algorithm
INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;

// Apply the native round indexed by Round to state, consuming wk[Round].
// Only the sha256/sha-ni path has a body, other configurations are no-ops.
template<size_t Round, size_t Lane>
INLINE static void round_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wk) NOEXCEPT;

// shuffle converts wstate from normal form to the word order expected by
// mm_sha256rnds2_epu32, unshuffle restores normal form.
INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;

// Vectorized summarize(): adds the 32 bit words of in[0] and in[1] into
// out[0] and out[1] (sha256/sha-ni path only).
INLINE static void summarize_native(wstate_t<xint128_t>& out,
const wstate_t<xint128_t>& in) NOEXCEPT;

// Compress one scheduled block into state: shuffle, apply the unrolled
// native rounds, add the starting state, then unshuffle.
template <size_t Lane>
INLINE static void compress_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord, size_t Lane>
INLINE static void compress_native(xstate_t<xWord>& xstate,
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ template <size_t Lane>
constexpr void CLASS::
compress_(auto& state, const auto& buffer) NOEXCEPT
{
// SHA-NI/256: 64/4 = 16 quad rounds, 8/4 = 2 state elements.
// This is a copy (state type varies due to vectorization).
const auto start = state;

Expand Down
176 changes: 158 additions & 18 deletions include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@
namespace libbitcoin {
namespace system {
namespace sha {

// schedule
// ----------------------------------------------------------------------------
// protected

TEMPLATE
template<size_t Round>
Expand All @@ -43,19 +47,21 @@ prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
{
if constexpr (SHA::strength == 160)
{
////static_assert(false, "sha160 not implemented");
}
else if constexpr (use_neon)
{
static_assert(SHA::strength == 256);

////static_assert(false, "neon not implemented");
if constexpr (use_neon)
{
}
else if constexpr (use_shani)
{
}
}
else if constexpr (use_shani)
else if constexpr (SHA::strength == 256)
{
static_assert(SHA::strength == 256);

wbuffer[Round] = mm_sha256msg2_epu32
if constexpr (use_neon)
{
}
else if constexpr (use_shani)
{
wbuffer[Round] = mm_sha256msg2_epu32
(
mm_add_epi32
(
Expand All @@ -70,6 +76,7 @@ prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
),
wbuffer[Round - 1]
);
}
}
}

Expand Down Expand Up @@ -101,16 +108,12 @@ schedule(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
konstant(array_cast<word_t>(wbuffer));
}

// schedule
// ----------------------------------------------------------------------------
// protected

TEMPLATE
INLINE void CLASS::
schedule_native(buffer_t& buffer) NOEXCEPT
{
// neon and sha160 not yet implemented, sha512 is not native.
if constexpr (SHA::strength != 160 && SHA::strength != 512 && !use_neon)
if constexpr (SHA::strength == 256 && !use_neon)
{
schedule(array_cast<xint128_t>(buffer));
}
Expand All @@ -133,6 +136,134 @@ schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT
// ----------------------------------------------------------------------------
// protected

TEMPLATE
template<size_t Round, size_t Lane>
INLINE void CLASS::
round_native(wstate_t<xint128_t>& state,
    const wbuffer_t<xint128_t>& wk) NOEXCEPT
{
    // Only sha256 over sha-ni performs work here. The sha160 and neon
    // configurations are not implemented and compile to nothing.
    if constexpr (SHA::strength == 256)
    {
        if constexpr (use_shani && !use_neon)
        {
            // First half: the low two words of wk[Round] update state[1],
            // with state[0] supplying the other half of the state.
            state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]);

            // Second half: 0x0e moves the high two words of wk[Round] into
            // the low two positions, and the roles of the halves swap.
            state[0] = mm_sha256rnds2_epu32(state[0], state[1],
                mm_shuffle_epi32(wk[Round], 0x0e));
        }
    }
}

TEMPLATE
INLINE void CLASS::
summarize_native(wstate_t<xint128_t>& out,
    const wstate_t<xint128_t>& in) NOEXCEPT
{
    // Only sha256 over sha-ni performs work here. The sha160 and neon
    // configurations are not implemented and compile to nothing.
    if constexpr (SHA::strength == 256)
    {
        if constexpr (use_shani && !use_neon)
        {
            // Accumulate in into out, one 32 bit word per lane. Addition is
            // lane-wise, so both arguments must share the same word order.
            out[0] = mm_add_epi32(out[0], in[0]);
            out[1] = mm_add_epi32(out[1], in[1]);
        }
    }
}

TEMPLATE
INLINE void CLASS::
shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT
{
    // Change wstate to mm_sha256rnds2_epu32 expected form:
    // [ABCD][EFGH] -> [FEBA][HGDC] (ordered low to high).

    // t1 = [BADC]: swap the two words within each 64 bit half of [ABCD].
    const auto t1 = mm_shuffle_epi32(wstate[0], 0xb1);

    // t2 = [HGFE]: reverse the four words of [EFGH].
    const auto t2 = mm_shuffle_epi32(wstate[1], 0x1b);

    // [FEBA]: high half of t2 followed by low half of t1.
    wstate[0] = mm_alignr_epi8(t1, t2, 8);

    // [HGDC]: low half from t2, high half from t1. blend_epi16 takes each
    // 16 bit lane from its second operand where the mask bit is set, so the
    // high four lanes (0xf0) must be selected. A mask of 0x0f would instead
    // produce [BAFE], which is not a valid sha256rnds2 state half.
    wstate[1] = mm_blend_epi16(t2, t1, 0xf0);
}

TEMPLATE
INLINE void CLASS::
unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT
{
    // Restore wstate to normal form:
    // [FEBA][HGDC] -> [ABCD][EFGH] (ordered low to high).

    // t1 = [ABEF]: reverse the four words of [FEBA].
    const auto t1 = mm_shuffle_epi32(wstate[0], 0x1b);

    // t2 = [GHCD]: swap the two words within each 64 bit half of [HGDC].
    const auto t2 = mm_shuffle_epi32(wstate[1], 0xb1);

    // [ABCD]: low half from t1, high half from t2. blend_epi16 takes each
    // 16 bit lane from its second operand where the mask bit is set, so the
    // high four lanes (0xf0) must be selected. A mask of 0x0f would instead
    // produce [GHEF].
    wstate[0] = mm_blend_epi16(t1, t2, 0xf0);

    // [EFGH]: high half of t1 followed by low half of t2.
    wstate[1] = mm_alignr_epi8(t2, t1, 8);
}

// Compress a single scheduled block (wbuffer) into wstate using the native
// round function. wstate is in normal form on entry and on return.
// Round work is performed by round_native<Round, Lane>, which presently only
// has a body for sha256 over sha-ni (other configurations are no-ops there).
TEMPLATE
template <size_t Lane>
INLINE void CLASS::
compress_native(wstate_t<xint128_t>& wstate,
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
{
// Shuffle and unshuffle can be done outside of all blocks, but this would
// leave state in a non-normal form, so presently absorbing that cost.
shuffle(wstate);

// This is a copy.
// It is taken after shuffle so that it shares word order with the state
// that it is later added to in summarize_native.
const auto start = wstate;

// Each call consumes one element of wbuffer (wbuffer[Round]), and each
// depends on the state produced by the previous one, so order is fixed.
round_native< 0, Lane>(wstate, wbuffer);
round_native< 1, Lane>(wstate, wbuffer);
round_native< 2, Lane>(wstate, wbuffer);
round_native< 3, Lane>(wstate, wbuffer);
round_native< 4, Lane>(wstate, wbuffer);
round_native< 5, Lane>(wstate, wbuffer);
round_native< 6, Lane>(wstate, wbuffer);
round_native< 7, Lane>(wstate, wbuffer);
round_native< 8, Lane>(wstate, wbuffer);
round_native< 9, Lane>(wstate, wbuffer);
round_native<10, Lane>(wstate, wbuffer);
round_native<11, Lane>(wstate, wbuffer);
round_native<12, Lane>(wstate, wbuffer);
round_native<13, Lane>(wstate, wbuffer);
round_native<14, Lane>(wstate, wbuffer);
round_native<15, Lane>(wstate, wbuffer);

// Algorithms with 80 rounds require four additional calls (16..19).
if constexpr (SHA::rounds == 80)
{
round_native<16, Lane>(wstate, wbuffer);
round_native<17, Lane>(wstate, wbuffer);
round_native<18, Lane>(wstate, wbuffer);
round_native<19, Lane>(wstate, wbuffer);
}

// This is just a vectorized version of summarize().
summarize_native(wstate, start);

// See above comments on shuffle().
unshuffle(wstate);
}

TEMPLATE
template <typename xWord, size_t Lane>
INLINE void CLASS::
Expand All @@ -157,8 +288,17 @@ template <size_t Lane>
INLINE void CLASS::
compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
{
// TODO: Single block compression.
compress_<Lane>(state, buffer);
// TODO: sha160 state is too small to array cast into two xwords.
// neon and sha160 not yet implemented, sha512 is not native.
if constexpr (SHA::strength == 256 && !use_neon)
{
compress_native<Lane>(array_cast<xint128_t>(state),
array_cast<xint128_t>(buffer));
}
else
{
compress_<Lane>(state, buffer);
}
}

} // namespace sha
Expand Down
4 changes: 4 additions & 0 deletions include/bitcoin/system/intrinsics/xcpu/defines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,8 @@ BC_POP_WARNING()
#define mm_extract_epi32(a, Lane) {}
#define mm_extract_epi64(a, Lane) {}
#define mm_shuffle_epi8(a, mask) (a)
#define mm_shuffle_epi32(a, mask) (a)
#define mm_blend_epi16(a, b, mask) (a)
#define mm_load_si128(a) {}
#define mm_loadu_si128(a) {}
#define mm_store_si128(memory, a)
Expand Down Expand Up @@ -167,6 +169,8 @@ BC_POP_WARNING()
#define mm_extract_epi32(a, Lane) _mm_extract_epi32(a, Lane)
#define mm_extract_epi64(a, Lane) _mm_extract_epi64(a, Lane) // undefined for X32
#define mm_shuffle_epi8(a, mask) _mm_shuffle_epi8(a, mask)
#define mm_shuffle_epi32(a, mask) _mm_shuffle_epi32(a, mask)
#define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask)
#define mm_load_si128(a) _mm_load_si128(a)
#define mm_loadu_si128(a) _mm_loadu_si128(a)
#define mm_store_si128(memory, a) _mm_store_si128(memory, a)
Expand Down

0 comments on commit 0d700f1

Please sign in to comment.