From 5c4c98f9d22cac51b077209d0ac9280d1ff13b61 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 12:04:04 -0500 Subject: [PATCH 01/10] Fix sha256 merkle test vector. --- test/hash/sha/vector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/hash/sha/vector.cpp b/test/hash/sha/vector.cpp index ddc248fdac..3619629711 100644 --- a/test/hash/sha/vector.cpp +++ b/test/hash/sha/vector.cpp @@ -27,7 +27,7 @@ BOOST_AUTO_TEST_SUITE(vector_tests) BOOST_AUTO_TEST_CASE(vector__sha256__merkle_root__expected) { - using sha_256 = sha::algorithm, true, true, true>; + using sha_256 = sha::algorithm, true, true, true>; // First round // AVX2 From 60bca2b377438eb7c0db46c778008272d077a19e Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 12:34:06 -0500 Subject: [PATCH 02/10] Update include order documentation. --- src/define.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/define.cpp b/src/define.cpp index 05c055d798..5d32dd90b7 100644 --- a/src/define.cpp +++ b/src/define.cpp @@ -53,7 +53,7 @@ // /words : /data // /radix : /words // /serial : /radix -// /hash : /radix +// /hash : /radix /endian // /crypto : /hash // /stream : /crypto /endian /error // /chain : /stream forks [forward: settings] From 9b7e02796710c98ea83b18a8e5bd803bc142cc43 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 13:06:48 -0500 Subject: [PATCH 03/10] Style. --- .../bitcoin/system/impl/hash/sha/algorithm_iterate.ipp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index 7dd70bb98e..e643e8ca01 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -38,17 +38,17 @@ template INLINE auto CLASS:: pack(const xblock_t& xblock) NOEXCEPT { - using xword = to_extended; + using xword_t = to_extended; if constexpr (Lanes == 2) { - return byteswap(set( + return byteswap(set( xblock[0][Word], xblock[1][Word])); } else if constexpr (Lanes == 4) { - return byteswap(set( + return byteswap(set( xblock[0][Word], xblock[1][Word], xblock[2][Word], @@ -56,7 +56,7 @@ pack(const xblock_t& xblock) NOEXCEPT } else if constexpr (Lanes == 8) { - return byteswap(set( + return byteswap(set( xblock[0][Word], xblock[1][Word], xblock[2][Word], @@ -68,7 +68,7 @@ pack(const xblock_t& xblock) NOEXCEPT } else if constexpr (Lanes == 16) { - return byteswap(set( + return byteswap(set( xblock[ 0][Word], xblock[ 1][Word], xblock[ 2][Word], From 941a5fcf0802eac7684bb410701bec400847ae2c Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 17:41:22 -0500 Subject: [PATCH 04/10] Make have<> and have_lanes<> constants (vs. constexpr). --- .../impl/hash/sha/algorithm_iterate.ipp | 2 +- .../system/impl/hash/sha/algorithm_merkle.ipp | 2 +- .../impl/hash/sha/algorithm_parsing.ipp | 185 +++++++++++++----- .../system/impl/hash/sha/algorithm_sigma.ipp | 2 +- include/bitcoin/system/intrinsics/haves.hpp | 18 +- .../system/intrinsics/xcpu/defines.hpp | 12 +- test/intrinsics/haves.cpp | 93 +++------ 7 files changed, 176 insertions(+), 138 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp index e643e8ca01..ee9abbe756 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp @@ -181,7 +181,7 @@ vector_schedule_sequential_compress(state_t& state, iblocks_t& blocks) NOEXCEPT constexpr auto lanes = capacity; static_assert(is_valid_lanes); - if constexpr (have()) + if constexpr (have) { if (blocks.size() >= lanes) { diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp index 9e04c545e9..d0ab65e166 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp @@ -351,7 +351,7 @@ merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT constexpr auto lanes = capacity; static_assert(is_valid_lanes); - if constexpr (have()) + if constexpr (have) { if (blocks.size() >= lanes) { diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp index b40f3d6290..03104ac9de 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp @@ -58,24 +58,51 @@ input(buffer_t& buffer, const block_t& block) NOEXCEPT } else if constexpr (bc::is_little_endian) { - // TODO: evaluate 4/8/16 lane optimization using byteswap. - const auto& in = array_cast(block); - buffer[0] = native_from_big_end(in[0]); - buffer[1] = native_from_big_end(in[1]); - buffer[2] = native_from_big_end(in[2]); - buffer[3] = native_from_big_end(in[3]); - buffer[4] = native_from_big_end(in[4]); - buffer[5] = native_from_big_end(in[5]); - buffer[6] = native_from_big_end(in[6]); - buffer[7] = native_from_big_end(in[7]); - buffer[8] = native_from_big_end(in[8]); - buffer[9] = native_from_big_end(in[9]); - buffer[10] = native_from_big_end(in[10]); - buffer[11] = native_from_big_end(in[11]); - buffer[12] = native_from_big_end(in[12]); - buffer[13] = native_from_big_end(in[13]); - buffer[14] = native_from_big_end(in[14]); - buffer[15] = native_from_big_end(in[15]); + ////if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(block); + //// auto& out = array_cast(buffer); + //// out[0] = byteswap(in[0]); + ////} + ////else if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(block); + //// auto& out = array_cast(buffer); + //// out[0] = byteswap(in[0]); + //// out[1] = byteswap(in[1]); + ////} + ////else if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(block); + //// auto& out = array_cast(buffer); + //// out[0] = byteswap(in[0]); + //// out[1] = byteswap(in[1]); + //// out[2] = byteswap(in[2]); + //// out[3] = byteswap(in[3]); + ////} + ////else + ////{ + const auto& in = array_cast(block); + buffer[0] = native_from_big_end(in[0]); + buffer[1] = native_from_big_end(in[1]); + buffer[2] = native_from_big_end(in[2]); + buffer[3] = native_from_big_end(in[3]); + buffer[4] = native_from_big_end(in[4]); + buffer[5] = native_from_big_end(in[5]); + buffer[6] = native_from_big_end(in[6]); + buffer[7] = native_from_big_end(in[7]); + buffer[8] = native_from_big_end(in[8]); + buffer[9] = native_from_big_end(in[9]); + buffer[10] = native_from_big_end(in[10]); + buffer[11] = native_from_big_end(in[11]); + buffer[12] = native_from_big_end(in[12]); + buffer[13] = native_from_big_end(in[13]); + buffer[14] = native_from_big_end(in[14]); + buffer[15] = native_from_big_end(in[15]); + ////} } else { @@ -105,15 +132,33 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT } else if constexpr (bc::is_little_endian) { - const auto& in = array_cast(half); - buffer[0] = native_from_big_end(in[0]); - buffer[1] = native_from_big_end(in[1]); - buffer[2] = native_from_big_end(in[2]); - buffer[3] = native_from_big_end(in[3]); - buffer[4] = native_from_big_end(in[4]); - buffer[5] = native_from_big_end(in[5]); - buffer[6] = native_from_big_end(in[6]); - buffer[7] = native_from_big_end(in[7]); + ////if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(half); + //// auto& out = array_cast(buffer); + //// out[0] = byteswap(in[0]); + ////} + ////else if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(half); + //// auto& out = array_cast(buffer); + //// out[0] = byteswap(in[0]); + //// out[1] = byteswap(in[1]); + ////} + ////else + ////{ + const auto& in = array_cast(half); + buffer[0] = native_from_big_end(in[0]); + buffer[1] = native_from_big_end(in[1]); + buffer[2] = native_from_big_end(in[2]); + buffer[3] = native_from_big_end(in[3]); + buffer[4] = native_from_big_end(in[4]); + buffer[5] = native_from_big_end(in[5]); + buffer[6] = native_from_big_end(in[6]); + buffer[7] = native_from_big_end(in[7]); + ////} } else { @@ -141,16 +186,33 @@ input_right(buffer_t& buffer, const half_t& half) NOEXCEPT } else if constexpr (bc::is_little_endian) { - // TODO: evaluate 4/8 lane optimization using byteswap. - const auto& in = array_cast(half); - buffer[8] = native_from_big_end(in[0]); - buffer[9] = native_from_big_end(in[1]); - buffer[10] = native_from_big_end(in[2]); - buffer[11] = native_from_big_end(in[3]); - buffer[12] = native_from_big_end(in[4]); - buffer[13] = native_from_big_end(in[5]); - buffer[14] = native_from_big_end(in[6]); - buffer[15] = native_from_big_end(in[7]); + ////if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(half); + //// auto& out = array_cast(buffer); + //// out[1] = byteswap(in[0]); + ////} + ////else if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(half); + //// auto& out = array_cast(buffer); + //// out[2] = byteswap(in[0]); + //// out[3] = byteswap(in[1]); + ////} + ////else + ////{ + const auto& in = array_cast(half); + buffer[8] = native_from_big_end(in[0]); + buffer[9] = native_from_big_end(in[1]); + buffer[10] = native_from_big_end(in[2]); + buffer[11] = native_from_big_end(in[3]); + buffer[12] = native_from_big_end(in[4]); + buffer[13] = native_from_big_end(in[5]); + buffer[14] = native_from_big_end(in[6]); + buffer[15] = native_from_big_end(in[7]); + ////} } else { @@ -185,30 +247,51 @@ output(const state_t& state) NOEXCEPT } else if constexpr (bc::is_little_endian) { - if constexpr (SHA::strength == 160) + if constexpr (SHA::strength != 160) { - return array_cast(state_t - { - native_to_big_end(state[0]), - native_to_big_end(state[1]), - native_to_big_end(state[2]), - native_to_big_end(state[3]), - native_to_big_end(state[4]) - }); + ////if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(state); + //// return array_cast(wstate_t + //// { + //// byteswap(in[0]) + //// }); + ////} + ////else if constexpr (have_lanes) + ////{ + //// using xword_t = to_extended; + //// const auto& in = array_cast(state); + //// return array_cast(wstate_t + //// { + //// byteswap(in[0]), + //// byteswap(in[1]) + //// }); + ////} + ////else + ////{ + return array_cast(state_t + { + native_to_big_end(state[0]), + native_to_big_end(state[1]), + native_to_big_end(state[2]), + native_to_big_end(state[3]), + native_to_big_end(state[4]), + native_to_big_end(state[5]), + native_to_big_end(state[6]), + native_to_big_end(state[7]) + }); + ////} } else { - // TODO: evaluate 4/8 lane optimization using byteswap. return array_cast(state_t { native_to_big_end(state[0]), native_to_big_end(state[1]), native_to_big_end(state[2]), native_to_big_end(state[3]), - native_to_big_end(state[4]), - native_to_big_end(state[5]), - native_to_big_end(state[6]), - native_to_big_end(state[7]) + native_to_big_end(state[4]) }); } } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp index e6f880dd64..866fac88b8 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp @@ -96,7 +96,7 @@ TEMPLATE void CLASS:: schedule_sigma(buffer_t& buffer) NOEXCEPT { - if constexpr (SHA::strength != 160 && have_lanes()) + if constexpr (SHA::strength != 160 && have_lanes) { prepare8<16>(buffer); prepare8<24>(buffer); diff --git a/include/bitcoin/system/intrinsics/haves.hpp b/include/bitcoin/system/intrinsics/haves.hpp index 5c93d71268..ebfe9b2fdf 100644 --- a/include/bitcoin/system/intrinsics/haves.hpp +++ b/include/bitcoin/system/intrinsics/haves.hpp @@ -181,9 +181,8 @@ using to_extended = iif), xint256_t, xint512_t>>>>>>; -/// Availability of extended integer intrinsics. template = true> -constexpr bool have() NOEXCEPT +constexpr bool have_() NOEXCEPT { if constexpr (is_same_type) return with_avx512; @@ -195,10 +194,13 @@ constexpr bool have() NOEXCEPT return false; } -/// Availability of extended integer filled by Lanes Integrals. +/// Availability of extended integer intrinsics. +template = true> +constexpr bool have = have_(); + template = true> -constexpr bool have_lanes() NOEXCEPT +constexpr bool have_lanes_() NOEXCEPT { if constexpr (capacity == Lanes) return with_avx512; @@ -210,14 +212,18 @@ constexpr bool have_lanes() NOEXCEPT return false; } -/// Availability of extended integer, override for non-integral word. template = true> -constexpr bool have_lanes() NOEXCEPT +constexpr bool have_lanes_() NOEXCEPT { return false; } +/// Availability of extended integer filled by Lanes Integrals. +template = true> +constexpr bool have_lanes = have_lanes_(); + BC_POP_WARNING() } // namespace system diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp index 7594327053..7d51f85a0f 100644 --- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp @@ -42,16 +42,8 @@ // **************************************************************************** // The xint extended integer symbols are always defined for XCPU builds, but // these intrinsics are dependent upon the underlying platform and will cause -// a build failure on CLANG/GCC if not configured into the build (after being -// detected on the build platform). On all platforms these also require runtime -// detection. This creates three layers of conditionality, exploding complexity. -// These macros suppress the build failure, allowing code to operation over the -// necessary runtime detection, with runtime detection layered over build -// configuration symbols. Consequently there is no need for preprocessor -// statements within application code, just a runtime test for availability. -// These symbols will be defined where availability is possible (compiled). -// with() is defined for constexpr conditionality and have() is -// defined for runtime conditionality. The latter is false if is the former. +// a build failure on CLANG/GCC if not configured into the build. These macros +// suppress the build failure when symbols are not defined, without #ifdef. // **************************************************************************** // These are not defined for 32 bit bit builds. diff --git a/test/intrinsics/haves.cpp b/test/intrinsics/haves.cpp index 8ea188d475..0710c2b196 100644 --- a/test/intrinsics/haves.cpp +++ b/test/intrinsics/haves.cpp @@ -114,74 +114,31 @@ BOOST_AUTO_TEST_CASE(intrinsics_haves__try_shani__always__match) // have_lanes // ---------------------------------------------------------------------------- -BOOST_AUTO_TEST_CASE(intrinsics__have_lanes__avx512__expected) -{ - auto have = false; - - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx512); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx512); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx512); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx512); - - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); -} - -BOOST_AUTO_TEST_CASE(intrinsics__have_lanes__avx2__expected) -{ - auto have = false; - - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx2); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx2); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx2); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_avx2); - - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); -} - -BOOST_AUTO_TEST_CASE(intrinsics__have_lanes__sse41__expected) -{ - auto have = false; - - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_sse41); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_sse41); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_sse41); - have = have_lanes(); - BOOST_CHECK_EQUAL(have, with_sse41); - - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); - have = have_lanes(); - BOOST_CHECK(!have); -} +static_assert(have_lanes == with_avx512); +static_assert(have_lanes == with_avx512); +static_assert(have_lanes == with_avx512); +static_assert(!have_lanes); +static_assert(!have_lanes); +static_assert(!have_lanes); +static_assert(!have_lanes); + +static_assert(have_lanes == with_avx2); +static_assert(have_lanes == with_avx2); +static_assert(have_lanes == with_avx2); +static_assert(have_lanes == with_avx2); +static_assert(!have_lanes); +static_assert(!have_lanes); +static_assert(!have_lanes); +static_assert(!have_lanes); + +static_assert(have_lanes == with_sse41); +static_assert(have_lanes == with_sse41); +static_assert(have_lanes == with_sse41); +static_assert(have_lanes == with_sse41); +static_assert(!have_lanes); +static_assert(!have_lanes); +static_assert(!have_lanes); +static_assert(!have_lanes); // is_extended // ---------------------------------------------------------------------------- From 5f686e651555125bebda02e9dc61e6d5e75e4a76 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 20:24:32 -0500 Subject: [PATCH 05/10] Change vector load/store signatures. --- .../system/intrinsics/xcpu/functional_128.hpp | 19 +++++++++---------- .../system/intrinsics/xcpu/functional_256.hpp | 19 +++++++++---------- .../system/intrinsics/xcpu/functional_512.hpp | 19 +++++++++---------- 3 files changed, 27 insertions(+), 30 deletions(-) diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp index 1d17c99466..68a227ceb2 100644 --- a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp @@ -255,28 +255,27 @@ INLINE xint128_t byteswap(xint128_t a) NOEXCEPT 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8)); } -/// load/store (element sizes are actually irrelevant) +/// load/store (from casted to loaded/stored) /// --------------------------------------------------------------------------- -using data128 = std_array; -INLINE xint128_t load_aligned(const data128& bytes) NOEXCEPT +INLINE xint128_t load_aligned(const xint128_t& bytes) NOEXCEPT { - return mm_load_si128(pointer_cast(bytes.data())); + return mm_load_si128(&bytes); } -INLINE xint128_t load(const data128& bytes) NOEXCEPT +INLINE xint128_t load(const xint128_t& bytes) NOEXCEPT { - return mm_loadu_si128(pointer_cast(bytes.data())); + return mm_loadu_si128(&bytes); } -INLINE void store_aligned(data128& bytes, xint128_t a) NOEXCEPT +INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT { - mm_store_si128(pointer_cast(bytes.data()), a); + mm_store_si128(&bytes, a); } -INLINE void store(data128& bytes, xint128_t a) NOEXCEPT +INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT { - mm_storeu_si128(pointer_cast(bytes.data()), a); + mm_storeu_si128(&bytes, a); } #else diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp index 7ecdf07cdf..e2c3711cb4 100644 --- a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp @@ -268,28 +268,27 @@ INLINE xint256_t byteswap(xint256_t a) NOEXCEPT 23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24)); } -/// load/store (element sizes are actually irrelevant) +/// load/store (from casted to loaded/stored) /// --------------------------------------------------------------------------- -using data256 = std_array; -INLINE xint256_t load_aligned(const data256& bytes) NOEXCEPT +INLINE xint256_t load_aligned(const xint256_t& bytes) NOEXCEPT { - return mm256_load_si256(pointer_cast(bytes.data())); + return mm256_load_si256(&bytes); } -INLINE xint256_t load(const data256& bytes) NOEXCEPT +INLINE xint256_t load(const xint256_t& bytes) NOEXCEPT { - return mm256_loadu_si256(pointer_cast(bytes.data())); + return mm256_loadu_si256(&bytes); } -INLINE void store_aligned(data256& bytes, xint256_t a) NOEXCEPT +INLINE void store_aligned(xint256_t& bytes, xint256_t a) NOEXCEPT { - mm256_store_si256(pointer_cast(bytes.data()), a); + mm256_store_si256(&bytes, a); } -INLINE void store(data256& bytes, xint256_t a) NOEXCEPT +INLINE void store(xint256_t& bytes, xint256_t a) NOEXCEPT { - mm256_storeu_si256(pointer_cast(bytes.data()), a); + mm256_storeu_si256(&bytes, a); } #else diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp index 4a30a57b93..3308bd0362 100644 --- a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp @@ -308,28 +308,27 @@ INLINE xint512_t byteswap(xint512_t a) NOEXCEPT 55, 54, 53, 52, 51, 50, 49, 48, 63, 62, 61, 60, 59, 58, 57, 56)); } -/// load/store (element sizes are actually irrelevant) +/// load/store (from casted to loaded/stored) /// --------------------------------------------------------------------------- -using data512 = std_array; -INLINE xint512_t load_aligned(const data512& bytes) NOEXCEPT +INLINE xint512_t load_aligned(const xint512_t& bytes) NOEXCEPT { - return mm512_load_si512(pointer_cast(bytes.data())); + return mm512_load_si512(&bytes); } -INLINE xint512_t load(const data512& bytes) NOEXCEPT +INLINE xint512_t load(const xint512_t& bytes) NOEXCEPT { - return mm512_loadu_si512(pointer_cast(bytes.data())); + return mm512_loadu_si512(&bytes); } -INLINE void store_aligned(data512& bytes, xint512_t a) NOEXCEPT +INLINE void store_aligned(xint512_t& bytes, xint512_t a) NOEXCEPT { - mm512_store_si512(pointer_cast(bytes.data()), a); + mm512_store_si512(&bytes, a); } -INLINE void store(data512& bytes, xint512_t a) NOEXCEPT +INLINE void store(xint512_t& bytes, xint512_t a) NOEXCEPT { - mm512_storeu_si512(pointer_cast(bytes.data()), a); + mm512_storeu_si512(&bytes, a); } #else From 8964165b6fd3214714595981479052eba9afecd8 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 20:25:20 -0500 Subject: [PATCH 06/10] Enable vector_konstant. --- .../impl/hash/sha/algorithm_konstant.ipp | 45 +++++++++---------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp index 32afdfaec4..2873e729f2 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp @@ -50,33 +50,28 @@ template INLINE void CLASS:: vector_konstant(wbuffer_t& wbuffer) NOEXCEPT { - constexpr auto r = Round; constexpr auto s = SHA::word_bits; constexpr auto lanes = capacity; + constexpr auto r = Round * lanes; - if constexpr (lanes == 2) + if constexpr (lanes == 16) { wbuffer[Round] = f::add(wbuffer[Round], set( - K::get[r + 1], K::get[r + 0])); - } - else if constexpr (lanes == 4) - { - wbuffer[Round] = f::add(wbuffer[Round], set( - K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0])); + K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3], + K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7], + K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11], + K::get[r + 12], K::get[r + 13], K::get[r + 14], K::get[r + 15])); } else if constexpr (lanes == 8) { wbuffer[Round] = f::add(wbuffer[Round], set( - K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4], - K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0])); + K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3], + K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7])); } - else if constexpr (lanes == 16) + else if constexpr (lanes == 4) { wbuffer[Round] = f::add(wbuffer[Round], set( - K::get[r + 15], K::get[r + 14], K::get[r + 13], K::get[r + 12], - K::get[r + 11], K::get[r + 10], K::get[r + 9], K::get[r + 8], - K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4], - K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0])); + K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3])); } } @@ -84,9 +79,9 @@ TEMPLATE void CLASS:: vector_konstant(buffer_t& buffer) NOEXCEPT { - if constexpr (use_x512) + if constexpr (have_lanes) { - auto& wbuffer = array_cast(buffer); + auto& wbuffer = array_cast>(buffer); vector_konstant<0>(wbuffer); vector_konstant<1>(wbuffer); vector_konstant<2>(wbuffer); @@ -97,9 +92,9 @@ vector_konstant(buffer_t& buffer) NOEXCEPT vector_konstant<4>(wbuffer); } } - else if constexpr (use_x256) + else if constexpr (have_lanes) { - auto& wbuffer = array_cast(buffer); + auto& wbuffer = array_cast>(buffer); vector_konstant<0>(wbuffer); vector_konstant<1>(wbuffer); vector_konstant<2>(wbuffer); @@ -115,9 +110,9 @@ vector_konstant(buffer_t& buffer) NOEXCEPT vector_konstant<9>(wbuffer); } } - else if constexpr (use_x128) + else if constexpr (have_lanes) { - auto& wbuffer = array_cast(buffer); + auto& wbuffer = array_cast>(buffer); vector_konstant<0>(wbuffer); vector_konstant<1>(wbuffer); vector_konstant<2>(wbuffer); @@ -262,10 +257,10 @@ konstant(buffer_t& buffer) NOEXCEPT { konstant_(buffer); } - ////else if constexpr (vector) - ////{ - //// vector_konstant(buffer); - ////} + else if constexpr (vector) + { + vector_konstant(buffer); + } else { konstant_(buffer); From adf935f9ab93275068e84871262a389bfbbb9689 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 20:25:48 -0500 Subject: [PATCH 07/10] Enable vector parse. --- .../impl/hash/sha/algorithm_parsing.ipp | 173 +++++++++--------- 1 file changed, 86 insertions(+), 87 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp index 03104ac9de..4275ae9b97 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp @@ -58,33 +58,33 @@ input(buffer_t& buffer, const block_t& block) NOEXCEPT } else if constexpr (bc::is_little_endian) { - ////if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(block); - //// auto& out = array_cast(buffer); - //// out[0] = byteswap(in[0]); - ////} - ////else if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(block); - //// auto& out = array_cast(buffer); - //// out[0] = byteswap(in[0]); - //// out[1] = byteswap(in[1]); - ////} - ////else if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(block); - //// auto& out = array_cast(buffer); - //// out[0] = byteswap(in[0]); - //// out[1] = byteswap(in[1]); - //// out[2] = byteswap(in[2]); - //// out[3] = byteswap(in[3]); - ////} - ////else - ////{ + if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(block); + auto& out = array_cast(buffer); + out[0] = byteswap(in[0]); + } + else if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(block); + auto& out = array_cast(buffer); + out[0] = byteswap(in[0]); + out[1] = byteswap(in[1]); + } + else if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(block); + auto& out = array_cast(buffer); + out[0] = byteswap(in[0]); + out[1] = byteswap(in[1]); + out[2] = byteswap(in[2]); + out[3] = byteswap(in[3]); + } + else + { const auto& in = array_cast(block); buffer[0] = native_from_big_end(in[0]); buffer[1] = native_from_big_end(in[1]); @@ -102,7 +102,7 @@ input(buffer_t& buffer, const block_t& block) NOEXCEPT buffer[13] = native_from_big_end(in[13]); buffer[14] = native_from_big_end(in[14]); buffer[15] = native_from_big_end(in[15]); - ////} + } } else { @@ -119,7 +119,6 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT if (std::is_constant_evaluated()) { - // TODO: evaluate 4/8 lane optimization using byteswap. constexpr auto size = SHA::word_bytes; from_big<0 * size>(buffer.at(0), half); from_big<1 * size>(buffer.at(1), half); @@ -132,23 +131,23 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT } else if constexpr (bc::is_little_endian) { - ////if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(half); - //// auto& out = array_cast(buffer); - //// out[0] = byteswap(in[0]); - ////} - ////else if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(half); - //// auto& out = array_cast(buffer); - //// out[0] = byteswap(in[0]); - //// out[1] = byteswap(in[1]); - ////} - ////else - ////{ + if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(half); + auto& out = array_cast(buffer); + out[0] = byteswap(in[0]); + } + else if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(half); + auto& out = array_cast(buffer); + out[0] = byteswap(in[0]); + out[1] = byteswap(in[1]); + } + else + { const auto& in = array_cast(half); buffer[0] = native_from_big_end(in[0]); buffer[1] = native_from_big_end(in[1]); @@ -158,7 +157,7 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT buffer[5] = native_from_big_end(in[5]); buffer[6] = native_from_big_end(in[6]); buffer[7] = native_from_big_end(in[7]); - ////} + } } else { @@ -186,23 +185,23 @@ input_right(buffer_t& buffer, const half_t& half) NOEXCEPT } else if constexpr (bc::is_little_endian) { - ////if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(half); - //// auto& out = array_cast(buffer); - //// out[1] = byteswap(in[0]); - ////} - ////else if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(half); - //// auto& out = array_cast(buffer); - //// out[2] = byteswap(in[0]); - //// out[3] = byteswap(in[1]); - ////} - ////else - ////{ + if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(half); + auto& out = array_cast(buffer); + out[1] = byteswap(in[0]); + } + else if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(half); + auto& out = array_cast(buffer); + out[2] = byteswap(in[0]); + out[3] = byteswap(in[1]); + } + else + { const auto& in = array_cast(half); buffer[8] = native_from_big_end(in[0]); buffer[9] = native_from_big_end(in[1]); @@ -212,7 +211,7 @@ input_right(buffer_t& buffer, const half_t& half) NOEXCEPT buffer[13] = native_from_big_end(in[5]); buffer[14] = native_from_big_end(in[6]); buffer[15] = native_from_big_end(in[7]); - ////} + } } else { @@ -249,27 +248,27 @@ output(const state_t& state) NOEXCEPT { if constexpr (SHA::strength != 160) { - ////if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(state); - //// return array_cast(wstate_t - //// { - //// byteswap(in[0]) - //// }); - ////} - ////else if constexpr (have_lanes) - ////{ - //// using xword_t = to_extended; - //// const auto& in = array_cast(state); - //// return array_cast(wstate_t - //// { - //// byteswap(in[0]), - //// byteswap(in[1]) - //// }); - ////} - ////else - ////{ + if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(state); + return array_cast(wstate_t + { + byteswap(in[0]) + }); + } + else if constexpr (have_lanes) + { + using xword_t = to_extended; + const auto& in = array_cast(state); + return array_cast(wstate_t + { + byteswap(in[0]), + byteswap(in[1]) + }); + } + else + { return array_cast(state_t { native_to_big_end(state[0]), @@ -281,7 +280,7 @@ output(const state_t& state) NOEXCEPT native_to_big_end(state[6]), native_to_big_end(state[7]) }); - ////} + } } else { From cf1904cbf9a17ec0b2580fa11858d2beaed639f8 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 21:20:26 -0500 Subject: [PATCH 08/10] Disable certain vectorizations failing on non-xcode clang. --- .../impl/hash/sha/algorithm_konstant.ipp | 2 +- .../system/impl/hash/sha/algorithm_parsing.ipp | 18 +++++++++--------- include/bitcoin/system/intrinsics/haves.hpp | 7 +++++++ 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp index 2873e729f2..e62127a10c 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp @@ -257,7 +257,7 @@ konstant(buffer_t& buffer) NOEXCEPT { konstant_(buffer); } - else if constexpr (vector) + else if constexpr (vector && !with_clang) { vector_konstant(buffer); } diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp index 4275ae9b97..4f0ae899a6 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp @@ -58,14 +58,14 @@ input(buffer_t& buffer, const block_t& block) NOEXCEPT } else if constexpr (bc::is_little_endian) { - if constexpr (have_lanes) + if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(block); auto& out = array_cast(buffer); out[0] = byteswap(in[0]); } - else if constexpr (have_lanes) + else if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(block); @@ -73,7 +73,7 @@ input(buffer_t& buffer, const block_t& block) NOEXCEPT out[0] = byteswap(in[0]); out[1] = byteswap(in[1]); } - else if constexpr (have_lanes) + else if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(block); @@ -131,14 +131,14 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT } else if constexpr (bc::is_little_endian) { - if constexpr (have_lanes) + if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(half); auto& out = array_cast(buffer); out[0] = byteswap(in[0]); } - else if constexpr (have_lanes) + else if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(half); @@ -185,14 +185,14 @@ input_right(buffer_t& buffer, const half_t& half) NOEXCEPT } else if constexpr (bc::is_little_endian) { - if constexpr (have_lanes) + if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(half); auto& out = array_cast(buffer); out[1] = byteswap(in[0]); } - else if constexpr (have_lanes) + else if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(half); @@ -248,7 +248,7 @@ output(const state_t& state) NOEXCEPT { if constexpr (SHA::strength != 160) { - if constexpr (have_lanes) + if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(state); @@ -257,7 +257,7 @@ output(const state_t& state) NOEXCEPT byteswap(in[0]) }); } - else if constexpr (have_lanes) + else if constexpr (have_lanes && !with_clang) { using xword_t = to_extended; const auto& in = array_cast(state); diff --git a/include/bitcoin/system/intrinsics/haves.hpp b/include/bitcoin/system/intrinsics/haves.hpp index ebfe9b2fdf..af4bc1c459 100644 --- a/include/bitcoin/system/intrinsics/haves.hpp +++ b/include/bitcoin/system/intrinsics/haves.hpp @@ -30,6 +30,13 @@ namespace libbitcoin { namespace system { +// Hack +#if defined(HAVE_CLANG) && !defined(HAVE_XCODE) + constexpr auto with_clang = true; +#else + constexpr auto with_clang = false; +#endif + // Functions may only be constexpr conditionally. BC_PUSH_WARNING(USE_CONSTEXPR_FOR_FUNCTION) From a305e35419fb5831fa19831afb2294ea3400a8d3 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 21:22:36 -0500 Subject: [PATCH 09/10] Comments. --- include/bitcoin/system/intrinsics/haves.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/bitcoin/system/intrinsics/haves.hpp b/include/bitcoin/system/intrinsics/haves.hpp index af4bc1c459..1bbcb9f915 100644 --- a/include/bitcoin/system/intrinsics/haves.hpp +++ b/include/bitcoin/system/intrinsics/haves.hpp @@ -30,7 +30,7 @@ namespace libbitcoin { namespace system { -// Hack +// HACK: work around vectorizations failing on non-xcode clang. #if defined(HAVE_CLANG) && !defined(HAVE_XCODE) constexpr auto with_clang = true; #else @@ -42,7 +42,6 @@ BC_PUSH_WARNING(USE_CONSTEXPR_FOR_FUNCTION) /// Constant symbols for compiled intrinsics interfaces. /// --------------------------------------------------------------------------- -// sse41a (assembly) optimization is implemented without assembly. #if defined(HAVE_SSE41) constexpr auto with_sse41 = true; From 9aa6b0d71b58d2f45a5b69b537c75ca1d9d490b7 Mon Sep 17 00:00:00 2001 From: evoskuil Date: Tue, 3 Dec 2024 21:26:43 -0500 Subject: [PATCH 10/10] Delint intrinsic macro. --- include/bitcoin/system/intrinsics/xcpu/defines.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp index 7d51f85a0f..000f655459 100644 --- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp @@ -331,7 +331,7 @@ BC_POP_WARNING() #define mm_sha1nexte_epu32(a, b) {} #define mm_sha256msg1_epu32(a, b) {} #define mm_sha256msg2_epu32(a, b) {} - #define mm_sha256rnds2_epu32(a, b, k) {} + #define mm_sha256rnds2_epu32(a, b, k) (k) #else #define mm_sha1msg1_epu32(a, b) _mm_sha1msg1_epu32(a, b) #define mm_sha1msg2_epu32(a, b) _mm_sha1msg2_epu32(a, b)