diff --git a/include/bitcoin/system/data/iterable.hpp b/include/bitcoin/system/data/iterable.hpp
index 3a2daeaace..08c1b2d924 100644
--- a/include/bitcoin/system/data/iterable.hpp
+++ b/include/bitcoin/system/data/iterable.hpp
@@ -175,7 +175,7 @@ class iterable
         return begin_;
     }
 
-    template
+    template
    inline iterable& advance() NOEXCEPT
     {
         // This is safe for overflow, will advance to end.
         return *this;
     }
 
-    template
+    template
     inline const std_array& to_array() const NOEXCEPT
     {
         return unsafe_array_cast(begin_);
diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
index 6201230bcf..5b25bfac1c 100644
--- a/include/bitcoin/system/hash/sha/algorithm.hpp
+++ b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -281,6 +281,11 @@ class algorithm
     INLINE static void iterate_vector(state_t& state,
         iblocks_t& blocks) NOEXCEPT;
 
+    template
+    INLINE static void iterate_native(state_t& state,
+        const ablocks_t& blocks) NOEXCEPT;
+    INLINE static void iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT;
+
     template
     INLINE static constexpr void iterate_(state_t& state,
         const ablocks_t& blocks) NOEXCEPT;
@@ -317,7 +322,8 @@ class algorithm
         const xstate_t& xstate) NOEXCEPT;
 
     template = true>
-    INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT;
+    INLINE static void merkle_hash_vector(idigests_t& digests,
+        iblocks_t& blocks) NOEXCEPT;
     INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT;
     VCONSTEXPR static void merkle_hash_(digests_t& digests,
         size_t offset=zero) NOEXCEPT;
@@ -330,10 +336,10 @@ class algorithm
         auto x6, auto x7, auto x8) NOEXCEPT;
 
     template
-    INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
+    INLINE static void prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
 
     template
-    INLINE static void prepare8(buffer_t& buffer) NOEXCEPT;
+    INLINE static void prepare_8(buffer_t& buffer) NOEXCEPT;
 
     template
     INLINE static void schedule_sigma(xbuffer_t& xbuffer) NOEXCEPT;
@@ -357,45 +363,24 @@ class algorithm
     /// Native SHA optimizations (single blocks).
     /// -----------------------------------------------------------------------
 
-    template
-    INLINE static void prepare_native(wbuffer_t& wbuffer) NOEXCEPT;
-    static void schedule_native(wbuffer_t& wbuffer) NOEXCEPT;
-
-    template
-    INLINE static void schedule_native(xbuffer_t& xbuffer) NOEXCEPT;
-    INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;
-
-    template
-    INLINE static void round_native(wstate_t& state,
-        const wbuffer_t& wk) NOEXCEPT;
-
-    INLINE static void shuffle(wstate_t& wstate) NOEXCEPT;
-    INLINE static void unshuffle(wstate_t& wstate) NOEXCEPT;
-    INLINE static void summarize_native(wstate_t& out,
-        const wstate_t& in) NOEXCEPT;
+    INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
+    INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
+    INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT;
+    INLINE static void prepare(xint128_t& message0, xint128_t message1,
+        xint128_t& message2) NOEXCEPT;
 
-    template
-    static void compress_native(wstate_t& state,
-        const wbuffer_t& wbuffer) NOEXCEPT;
-
-    template
-    INLINE static void compress_native(xstate_t& xstate,
-        const xbuffer_t& xbuffer) NOEXCEPT;
-
-    template
-    INLINE static void compress_native(state_t& state,
-        const xbuffer_t& xbuffer) NOEXCEPT;
+    template
+    INLINE static void round_4(xint128_t& state0, xint128_t& state1,
+        xint128_t message) NOEXCEPT;
 
-    template
-    INLINE static void compress_native(state_t& state,
-        const buffer_t& buffer) NOEXCEPT;
+    static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT;
 
 public:
     /// Summary public values.
     /// -----------------------------------------------------------------------
     static constexpr auto caching = Cached;
-    static constexpr auto native = (use_shani || use_neon) &&
-        !is_same_size;
+    static constexpr auto native = (use_shani || use_neon)
+        && (SHA::strength == 256 || SHA::strength == 160);
     static constexpr auto vector = (use_x128 || use_x256 || use_x512) &&
         !(build_x32 && is_same_size);
 };
diff --git a/include/bitcoin/system/have.hpp b/include/bitcoin/system/have.hpp
index b9897f0fb9..79da053996 100644
--- a/include/bitcoin/system/have.hpp
+++ b/include/bitcoin/system/have.hpp
@@ -110,12 +110,13 @@
     #define HAVE_XASSEMBLY
 #endif
 
+/// DISABLED
 /// ARM Neon intrinsics.
 #if defined(HAVE_ARM)
     // -march=armv8-a+crc+crypto [all]
     // -arch arm64 [apple] (also -isysroot to phone sdk)
     #if defined(HAVE_GNUC) || defined(__ARM_NEON) || defined(HAVE_MSC)
-        #define HAVE_NEON
+        ////#define HAVE_NEON
     #endif
 #endif
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
index f9da36fd0f..6b9db05926 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
@@ -71,17 +71,6 @@ round(auto a, auto& b, auto c, auto d, auto& e, auto wk) NOEXCEPT
     e = /*a =*/ f::add(f::add(f::add(f::rol<5, s>(a), fn(b, c, d)), e), wk);
     b = /*c =*/ f::rol<30, s>(b);
-
-    // SHA-NI
-    // Four rounds (total rounds 80/4).
-    // First round is add(e, w), then sha1nexte(e, w).
-    // fk is round-based enumeration implying f selection and k value.
-    // e1 = sha1nexte(e0, w);
-    // abcd = sha1rnds4(abcd, e0, fk);
-    // NEON
-    // f is implied by k in wk.
-    // e1 = vsha1h(vgetq_lane(abcd, 0);
-    // vsha1cq(abcd, e0, vaddq(w, k));
 }
 
 TEMPLATE
@@ -97,16 +86,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h,
     const auto t = f::add(f::add(f::add(Sigma1(e), choice(e, f, g)), h), wk);
     d = /*e =*/ f::add(d, t);
     h = /*a =*/ f::add(f::add(Sigma0(a), majority(a, b, c)), t);
-
-    // Each call is 2 rounds, s, w and k are 128 (4 words each, s1/s2 is 8 word state).
-    // SHA-NI
-    // const auto value = add(w, k);
-    // abcd = sha256rnds2(abcd, efgh, value);
-    // efgh = sha256rnds2(efgh, abcd, shuffle(value));
-    // NEON
-    // const auto value = vaddq(w, k);
-    // abcd = vsha256hq(abcd, efgh, value);
-    // efgh = vsha256h2q(efgh, abcd, value);
 }
 
 TEMPLATE
@@ -125,10 +104,6 @@ round(auto& state, const auto& wk) NOEXCEPT
             state[(SHA::rounds + 3 - Round) % SHA::state_words],
             state[(SHA::rounds + 4 - Round) % SHA::state_words], // a->e
             extract(wk[Round]));
-
-        // SHA-NI/NEON
-        // State packs in 128 (one state variable), reduces above to 1 out[].
-        // Input value is 128 (w). Constants (k) statically initialized as 128.
     }
     else
     {
@@ -142,10 +117,6 @@ round(auto& state, const auto& wk) NOEXCEPT
             state[(SHA::rounds + 6 - Round) % SHA::state_words],
             state[(SHA::rounds + 7 - Round) % SHA::state_words], // a->h
             extract(wk[Round]));
-
-        // SHA-NI/NEON
-        // Each element is 128 (vs. 32), reduces above to 2 out[] (s0/s1).
-        // Input value is 128 (w). Constants (k) statically initialized as 128.
     }
 }
 
@@ -276,11 +247,11 @@ compress(state_t& state, const buffer_t& buffer) NOEXCEPT
     {
         compress_(state, buffer);
     }
-    else if constexpr (native)
-    {
-        // Single block shani compression optimization.
-        compress_native(state, buffer);
-    }
+    ////else if constexpr (native)
+    ////{
+    ////    // Single block shani compression optimization.
+    ////    compress_native(state, buffer);
+    ////}
     ////else if constexpr (vector)
     ////{
     ////    // Compression is not vectorized within a block, however this is
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
index ee9abbe756..bb51e8e9ce 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
@@ -228,7 +228,7 @@ iterate_vector(state_t& state, const ablocks_t& blocks) NOEXCEPT
 {
     if (blocks.size() >= min_lanes)
     {
-        auto iblocks = iblocks_t{ array_cast(blocks) };
+        iblocks_t iblocks{ array_cast(blocks) };
         iterate_vector(state, iblocks);
     }
     else
@@ -237,6 +237,31 @@ iterate_vector(state_t& state, const ablocks_t& blocks) NOEXCEPT
     }
 }
 
+// Native SHA
+// ============================================================================
+// www.intel.com/content/dam/develop/external/us/en/documents/
+// intel-sha-extensions-white-paper-402097.pdf
+
+TEMPLATE
+INLINE void CLASS::
+iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT
+{
+    native_rounds(state, blocks);
+}
+
+TEMPLATE
+template
+INLINE void CLASS::
+iterate_native(state_t& state, const ablocks_t& blocks) NOEXCEPT
+{
+    iblocks_t iblocks{ array_cast(blocks) };
+    native_rounds(state, iblocks);
+}
+
+// Dispatch and normal forms.
+// ============================================================================
+// protected
+
 TEMPLATE
 template
 INLINE constexpr void CLASS::
@@ -273,11 +298,9 @@ iterate(state_t& state, const ablocks_t& blocks) NOEXCEPT
     {
         iterate_(state, blocks);
     }
-    else if constexpr (native)
+    else if constexpr (native && SHA::strength == 256)
     {
-        // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
-        // Multiple block shani message scheduling and compression optimization.
-        iterate_(state, blocks);
+        iterate_native(state, blocks);
     }
     else if constexpr (vector)
     {
@@ -294,11 +317,9 @@
 TEMPLATE
 INLINE void CLASS::
 iterate(state_t& state, iblocks_t& blocks) NOEXCEPT
 {
-    if constexpr (native)
+    if constexpr (native && SHA::strength == 256)
     {
-        // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
-        // Multiple block shani message scheduling and compression optimization.
-        iterate_(state, blocks);
+        iterate_native(state, blocks);
     }
     else if constexpr (vector)
     {
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
index e62127a10c..26cf225d14 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
@@ -50,13 +50,12 @@ template
 INLINE void CLASS::
 vector_konstant(wbuffer_t& wbuffer) NOEXCEPT
 {
-    constexpr auto s = SHA::word_bits;
     constexpr auto lanes = capacity;
     constexpr auto r = Round * lanes;
 
     if constexpr (lanes == 16)
     {
-        wbuffer[Round] = f::add(wbuffer[Round], set(
+        wbuffer[Round] = add(wbuffer[Round], set(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
             K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7],
             K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11],
@@ -64,13 +63,13 @@ vector_konstant(wbuffer_t& wbuffer) NOEXCEPT
     }
     else if constexpr (lanes == 8)
     {
-        wbuffer[Round] = f::add(wbuffer[Round], set(
+        wbuffer[Round] = add(wbuffer[Round], set(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
             K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7]));
     }
     else if constexpr (lanes == 4)
     {
-        wbuffer[Round] = f::add(wbuffer[Round], set(
+        wbuffer[Round] = add(wbuffer[Round], set(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
     }
 }
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
index d0ab65e166..72f3853beb 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
@@ -421,6 +421,10 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
 // ----------------------------------------------------------------------------
 // public
 
+// TODO: consider eliminating endianness conversions internal to the root
+// computation, instead converting on the way in and way out only, and using
+// non-converting input/output (nop) functions.
+
 TEMPLATE
 VCONSTEXPR typename CLASS::digest_t CLASS::
 merkle_root(digests_t&& digests) NOEXCEPT
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
index b09941ea35..79f7a34898 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
@@ -21,285 +21,156 @@
 // Native (SHA-NI or NEON)
 // ============================================================================
 
-// Native does not change the buffer size (not expanded), just its "shape".
-// Four words are buffered into one xint128_t, resulting in 1/4 the buffer
-// array size and number of rounds. Four state words are packed into each of
-// two state variables. This applies to sha160 and sha256, but sha512 native
-// is not supported.
-
-// The base buffer is already populated with proper endianness.
-// Input could be optimized using intrinsics (see comments in parse).
-// The unextended state vector is already output with proper endianness.
-// Output could also be optimized using intrinsics (see comments in parse).
+// The iterative method is used for native SHA as it is an order of magnitude
+// more efficient and cannot benefit from vectorization.
 
 namespace libbitcoin {
 namespace system {
 namespace sha {
-
-// schedule
+
+// TODO: intel sha160, arm sha160, arm sha256
+
+// intel sha256
 // ----------------------------------------------------------------------------
 // protected
 
 TEMPLATE
-template
 INLINE void CLASS::
-prepare_native(wbuffer_t& wbuffer) NOEXCEPT
-{
-    if constexpr (SHA::strength == 160)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-        }
-    }
-    else if constexpr (SHA::strength == 256)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-            wbuffer[Round] = mm_sha256msg2_epu32
-            (
-                mm_add_epi32
-                (
-                    mm_alignr_epi8
-                    (
-                        wbuffer[Round - 1], wbuffer[Round - 2], SHA::word_bytes
-                    ),
-                    mm_sha256msg1_epu32
-                    (
-                        wbuffer[Round - 4], wbuffer[Round - 3]
-                    )
-                ),
-                wbuffer[Round - 1]
-            );
-        }
-    }
-}
-
-TEMPLATE
-void CLASS::
-schedule_native(wbuffer_t& wbuffer) NOEXCEPT
+shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT
 {
-    prepare_native<4>(wbuffer);
-    prepare_native<5>(wbuffer);
-    prepare_native<6>(wbuffer);
-    prepare_native<7>(wbuffer);
-    prepare_native<8>(wbuffer);
-    prepare_native<9>(wbuffer);
-    prepare_native<10>(wbuffer);
-    prepare_native<11>(wbuffer);
-    prepare_native<12>(wbuffer);
-    prepare_native<13>(wbuffer);
-    prepare_native<14>(wbuffer);
-    prepare_native<15>(wbuffer);
-
-    if constexpr (SHA::rounds == 80)
-    {
-        prepare_native<16>(wbuffer);
-        prepare_native<17>(wbuffer);
-        prepare_native<18>(wbuffer);
-        prepare_native<19>(wbuffer);
-    }
-
-    konstant(array_cast(wbuffer));
+    const auto shuffle0 = mm_shuffle_epi32(state0, 0xb1);
+    const auto shuffle1 = mm_shuffle_epi32(state1, 0x1b);
+    state0 = mm_alignr_epi8(shuffle0, shuffle1, 0x08);
+    state1 = mm_blend_epi16(shuffle1, shuffle0, 0xf0);
 }
 
 TEMPLATE
 INLINE void CLASS::
-schedule_native(buffer_t& buffer) NOEXCEPT
+unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT
 {
-    // neon and sha160 not yet implemented, sha512 is not native.
-    if constexpr (SHA::strength == 256 && !use_neon)
-    {
-        schedule_native(array_cast(buffer));
-    }
-    else
-    {
-        schedule_(buffer);
-    }
+    const auto shuffle0 = mm_shuffle_epi32(state0, 0x1b);
+    const auto shuffle1 = mm_shuffle_epi32(state1, 0xb1);
+    state0 = mm_blend_epi16(shuffle0, shuffle1, 0xf0);
+    state1 = mm_alignr_epi8(shuffle1, shuffle0, 0x08);
 }
 
 TEMPLATE
-template
 INLINE void CLASS::
-schedule_native(xbuffer_t& xbuffer) NOEXCEPT
+prepare(xint128_t& message0, xint128_t message1) NOEXCEPT
 {
-    // Merkle extended buffer is not native dispatched.
-    schedule_(xbuffer);
+    message0 = mm_sha256msg1_epu32(message0, message1);
 }
 
-// compression
-// ----------------------------------------------------------------------------
-// protected
-
 TEMPLATE
-template
 INLINE void CLASS::
-round_native(wstate_t& state,
-    const wbuffer_t& wk) NOEXCEPT
+prepare(xint128_t& SHANI_ONLY(message0), xint128_t message1,
+    xint128_t& message2) NOEXCEPT
 {
-    if constexpr (SHA::strength == 160)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-        }
-    }
-    else if constexpr (SHA::strength == 256)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-            // Process wk[Round][0..1], [HGDC][FEBA] (initial state)
-            state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]);
-
-            // Process wk[Round][2..3] (shifted down)
-            state[0] = mm_sha256rnds2_epu32(state[0], state[1],
-                mm_shuffle_epi32(wk[Round], 0x0e));
-        }
-    }
+    message2 = mm_sha256msg2_epu32(mm_add_epi32(message2,
+        mm_alignr_epi8(message1, message0, 4)), message1);
 }
 
 TEMPLATE
+template
 INLINE void CLASS::
-summarize_native(wstate_t& out,
-    const wstate_t& in) NOEXCEPT
+round_4(xint128_t& state0, xint128_t& state1, xint128_t message) NOEXCEPT
 {
-    if constexpr (SHA::strength == 160)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-        }
-    }
-    else if constexpr (SHA::strength == 256)
-    {
-        if constexpr (use_neon)
-        {
-        }
-        else if constexpr (use_shani)
-        {
-            out[0] = mm_add_epi32(out[0], in[0]);
-            out[1] = mm_add_epi32(out[1], in[1]);
-        }
-    }
-}
+    constexpr auto r = Round * 4;
+    const auto wk = add(message, set(
+        K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
 
-TEMPLATE
-INLINE void CLASS::
-shuffle(wstate_t& wstate) NOEXCEPT
-{
-    // Change wstate to mm_sha256rnds2_epu32 expected form:
-    // [ABCD][EFGH] -> [FEBA][HGDC] (ordered low to high).
-    const auto t1 = mm_shuffle_epi32(wstate[0], 0xb1);
-    const auto t2 = mm_shuffle_epi32(wstate[1], 0x1b);
-    wstate[0] = mm_alignr_epi8(t1, t2, 0x08);
-    wstate[1] = mm_blend_epi16(t2, t1, 0xf0);
+    state1 = mm_sha256rnds2_epu32(state1, state0, wk);
+    state0 = mm_sha256rnds2_epu32(state0, state1, mm_shuffle_epi32(wk, 0x0e));
 }
 
 TEMPLATE
-INLINE void CLASS::
-unshuffle(wstate_t& wstate) NOEXCEPT
+void CLASS::
+native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT
 {
-    // Restore wstate to normal form:
-    // [FEBA][HGDC] -> [ABCD][EFGH] (ordered low to high).
-    const auto t1 = mm_shuffle_epi32(wstate[0], 0x1b);
-    const auto t2 = mm_shuffle_epi32(wstate[1], 0xb1);
-    wstate[0] = mm_blend_epi16(t1, t2, 0xf0);
-    wstate[1] = mm_alignr_epi8(t2, t1, 0x08);
-}
+    // Individual state vars are used vs. array to ensure register persistence.
+    auto& wstate = array_cast(state);
+    auto lo = load(wstate[0]);
+    auto hi = load(wstate[1]);
 
-TEMPLATE
-template
-void CLASS::
-compress_native(wstate_t& wstate,
-    const wbuffer_t& wbuffer) NOEXCEPT
-{
-    // Shuffle and unshuffle can be done outside of all blocks, but this would
-    // leave state in a non-normal form, so presently absorbing that cost.
-    shuffle(wstate);
-
-    // This is a copy.
-    const auto start = wstate;
-
-    round_native< 0, Lane>(wstate, wbuffer);
-    round_native< 1, Lane>(wstate, wbuffer);
-    round_native< 2, Lane>(wstate, wbuffer);
-    round_native< 3, Lane>(wstate, wbuffer);
-    round_native< 4, Lane>(wstate, wbuffer);
-    round_native< 5, Lane>(wstate, wbuffer);
-    round_native< 6, Lane>(wstate, wbuffer);
-    round_native< 7, Lane>(wstate, wbuffer);
-    round_native< 8, Lane>(wstate, wbuffer);
-    round_native< 9, Lane>(wstate, wbuffer);
-    round_native<10, Lane>(wstate, wbuffer);
-    round_native<11, Lane>(wstate, wbuffer);
-    round_native<12, Lane>(wstate, wbuffer);
-    round_native<13, Lane>(wstate, wbuffer);
-    round_native<14, Lane>(wstate, wbuffer);
-    round_native<15, Lane>(wstate, wbuffer);
-
-    if constexpr (SHA::rounds == 80)
+    // shuffle organizes state as expected by sha256rnds2.
+    shuffle(lo, hi);
+
+    while (!blocks.empty())
     {
-        round_native<16, Lane>(wstate, wbuffer);
-        round_native<17, Lane>(wstate, wbuffer);
-        round_native<18, Lane>(wstate, wbuffer);
-        round_native<19, Lane>(wstate, wbuffer);
-    }
+        const auto start_lo = lo;
+        const auto start_hi = hi;
+        const auto& wblock = array_cast(blocks.to_array());
 
-    // This is just a vectorized version of summarize().
-    summarize_native(wstate, start);
+        auto message0 = byteswap(load(wblock[0]));
+        round_4<0>(lo, hi, message0);
 
-    // See above comments on shuffle().
-    unshuffle(wstate);
-}
+        auto message1 = byteswap(load(wblock[1]));
+        round_4<1>(lo, hi, message1);
 
-TEMPLATE
-template
-INLINE void CLASS::
-compress_native(xstate_t& xstate,
-    const xbuffer_t& xbuffer) NOEXCEPT
-{
-    // Merkle extended state/buffer is not native dispatched.
-    compress_(xstate, xbuffer);
-}
+        prepare(message0, message1);
+        auto message2 = byteswap(load(wblock[2]));
+        round_4<2>(lo, hi, message2);
 
-TEMPLATE
-template
-INLINE void CLASS::
-compress_native(state_t& state, const xbuffer_t& xbuffer) NOEXCEPT
-{
-    // Iterate extended buffer is not native dispatched.
-    compress_(state, xbuffer);
-}
+        prepare(message1, message2);
+        auto message3 = byteswap(load(wblock[3]));
+        round_4<3>(lo, hi, message3);
 
-TEMPLATE
-template
-INLINE void CLASS::
-compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
-{
-    // TODO: debug.
-    // TODO: sha160 state is too small to array cast into two xwords.
-    // neon and sha160 not yet implemented, sha512 is not native.
-    if constexpr (SHA::strength == 256 && !use_neon)
-    {
-        compress_native(array_cast(state),
-            array_cast(buffer));
-    }
-    else
-    {
-        compress_(state, buffer);
+        prepare(message2, message3, message0);
+        prepare(message2, message3);
+        round_4<4>(lo, hi, message0);
+
+        prepare(message3, message0, message1);
+        prepare(message3, message0);
+        round_4<5>(lo, hi, message1);
+
+        prepare(message0, message1, message2);
+        prepare(message0, message1);
+        round_4<6>(lo, hi, message2);
+
+        prepare(message1, message2, message3);
+        prepare(message1, message2);
+        round_4<7>(lo, hi, message3);
+
+        prepare(message2, message3, message0);
+        prepare(message2, message3);
+        round_4<8>(lo, hi, message0);
+
+        prepare(message3, message0, message1);
+        prepare(message3, message0);
+        round_4<9>(lo, hi, message1);
+
+        prepare(message0, message1, message2);
+        prepare(message0, message1);
+        round_4<10>(lo, hi, message2);
+
+        prepare(message1, message2, message3);
+        prepare(message1, message2);
+        round_4<11>(lo, hi, message3);
+
+        prepare(message2, message3, message0);
+        prepare(message2, message3);
+        round_4<12>(lo, hi, message0);
+
+        prepare(message3, message0, message1);
+        prepare(message3, message0);
+        round_4<13>(lo, hi, message1);
+
+        prepare(message0, message1, message2);
+        round_4<14>(lo, hi, message2);
+
+        prepare(message1, message2, message3);
+        round_4<15>(lo, hi, message3);
+
+        lo = add(lo, start_lo);
+        hi = add(hi, start_hi);
+        blocks.advance();
     }
+
+    // unshuffle restores state to normal form.
+    unshuffle(lo, hi);
+
+    store(wstate[0], lo);
+    store(wstate[1], hi);
 }
 
 } // namespace sha
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
index f2796d4b5c..b65704e27b 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
@@ -138,11 +138,11 @@ schedule(buffer_t& buffer) NOEXCEPT
     {
         schedule_(buffer);
     }
-    else if constexpr (native)
-    {
-        // Single block (with shani) message scheduling optimization.
-        schedule_native(buffer);
-    }
+    ////else if constexpr (native)
+    ////{
+    ////    // Single block (with shani) message scheduling optimization.
+    ////    schedule_native(buffer);
+    ////}
     else if constexpr (vector)
     {
         // Single block (without shani) message scheduling optimization.
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
index 866fac88b8..4425eea8eb 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
@@ -41,24 +41,23 @@ sigma0_8(auto x1, auto x2, auto x3, auto x4, auto x5, auto x6, auto x7,
 
 TEMPLATE
 template
 INLINE void CLASS::
-prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT
+prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT
 {
     static_assert(Round >= 16);
     constexpr auto r02 = Round - 2;
     constexpr auto r07 = Round - 7;
     constexpr auto r16 = Round - 16;
-    constexpr auto s = SHA::word_bits;
 
     // buffer[r07 + 7] is buffer[Round + 0], so sigma0 is limited to 8 lanes.
-    buffer[Round + Offset] = f::add(
-        f::add(buffer[r16 + Offset], get(xsigma0)),
-        f::add(buffer[r07 + Offset], sigma1(buffer[r02 + Offset])));
+    buffer[Round + Offset] = add(
+        add(buffer[r16 + Offset], get(xsigma0)),
+        add(buffer[r07 + Offset], sigma1(buffer[r02 + Offset])));
 }
 
 TEMPLATE
 template
 INLINE void CLASS::
-prepare8(buffer_t& buffer) NOEXCEPT
+prepare_8(buffer_t& buffer) NOEXCEPT
 {
     // Requires avx512 for sha512 and avx2 for sha256.
     // The simplicity of sha160 message prepare precludes this optimization.
@@ -73,14 +72,14 @@ prepare8(buffer_t& buffer) NOEXCEPT
         buffer[r15 + 0], buffer[r15 + 1], buffer[r15 + 2], buffer[r15 + 3],
         buffer[r15 + 4], buffer[r15 + 5], buffer[r15 + 6], buffer[r15 + 7]);
 
-    prepare1(buffer, xsigma0);
-    prepare1(buffer, xsigma0);
-    prepare1(buffer, xsigma0);
-    prepare1(buffer, xsigma0);
-    prepare1(buffer, xsigma0);
-    prepare1(buffer, xsigma0);
-    prepare1(buffer, xsigma0);
-    prepare1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
+    prepare_1(buffer, xsigma0);
 }
 
 TEMPLATE
@@ -98,17 +97,17 @@ schedule_sigma(buffer_t& buffer) NOEXCEPT
 {
     if constexpr (SHA::strength != 160 && have_lanes)
     {
-        prepare8<16>(buffer);
-        prepare8<24>(buffer);
-        prepare8<32>(buffer);
-        prepare8<40>(buffer);
-        prepare8<48>(buffer);
-        prepare8<56>(buffer);
+        prepare_8<16>(buffer);
+        prepare_8<24>(buffer);
+        prepare_8<32>(buffer);
+        prepare_8<40>(buffer);
+        prepare_8<48>(buffer);
+        prepare_8<56>(buffer);
 
         if constexpr (SHA::rounds == 80)
         {
-            prepare8<64>(buffer);
-            prepare8<72>(buffer);
+            prepare_8<64>(buffer);
+            prepare_8<72>(buffer);
         }
 
         konstant(buffer);
diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
index 000f655459..0d997d0960 100644
--- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
@@ -104,12 +104,11 @@ BC_POP_WARNING()
 #endif
 
 #if !defined(HAVE_SSE41)
-    #define mm_alignr_epi8(a, b, c) {}
     #define mm_and_si128(a, b) (a)
     #define mm_or_si128(a, b) (a)
     #define mm_xor_si128(a, b) (a)
@@ -126,9 +125,8 @@ BC_POP_WARNING()
     #define mm_extract_epi64(a, Lane) {}
     #define mm_shuffle_epi8(a, mask) (a)
     #define mm_shuffle_epi32(a, mask) (a)
-    #define mm_blend_epi16(a, b, mask) (a)
-    #define mm_load_si128(a) {}
-    #define mm_loadu_si128(a) {}
+    #define mm_load_si128(a) (a)
+    #define mm_loadu_si128(a) (a)
     #define mm_store_si128(memory, a)
     #define mm_storeu_si128(memory, a)
     #define mm_set1_epi8(K)
@@ -140,7 +138,6 @@
     #define mm_set_epi16(x08, x07, x06, x05, x04, x03, x02, x01)
     #define mm_set_epi8(x16, x15, x14, x13, x12, x11, x10, x09, x08, x07, x06, x05, x04, x03, x02, x01)
 #else
-    #define mm_alignr_epi8(a, b, c) _mm_alignr_epi8(a, b, c) // for native sha (128 only)
     #define mm_and_si128(a, b) _mm_and_si128(a, b)
     #define mm_or_si128(a, b) _mm_or_si128(a, b)
     #define mm_xor_si128(a, b) _mm_xor_si128(a, b)
@@ -162,7 +159,6 @@
     #define mm_extract_epi64(a, Lane) _mm_extract_epi64(a, Lane) // undefined for X32
     #define mm_shuffle_epi8(a, mask) _mm_shuffle_epi8(a, mask)
    #define mm_shuffle_epi32(a, mask) _mm_shuffle_epi32(a, mask)
-    #define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask)
     #define mm_load_si128(a) _mm_load_si128(a)
     #define mm_loadu_si128(a) _mm_loadu_si128(a)
     #define mm_store_si128(memory, a) _mm_store_si128(memory, a)
@@ -201,8 +197,8 @@
     #define mm256_extract_epi32(a, Lane) {}
     #define mm256_extract_epi64(a, Lane) {}
     #define mm256_shuffle_epi8(a, mask) (a)
-    #define mm256_load_si256(a) {}
-    #define mm256_loadu_si256(a) {}
+    #define mm256_load_si256(a) (a)
+    #define mm256_loadu_si256(a) (a)
     #define mm256_store_si256(memory, a) {}
     #define mm256_storeu_si256(memory, a) {}
     #define mm256_set1_epi8(K)
@@ -273,8 +269,8 @@
     #define mm512_extract_epi32(a, Lane) {}
     #define mm512_extract_epi64(a, Lane) {}
     #define mm512_shuffle_epi8(a, mask) (a)
-    #define mm512_load_si512(a) {}
-    #define mm512_loadu_si512(a) {}
+    #define mm512_load_si512(a) (a)
+    #define mm512_loadu_si512(a) (a)
     #define mm512_store_si512(memory, a)
     #define mm512_storeu_si512(memory, a)
     #define mm512_set1_epi8(K)
@@ -325,21 +321,31 @@
 #endif
 
 #if !defined(HAVE_SHANI)
-    #define mm_sha1msg1_epu32(a, b) {}
-    #define mm_sha1msg2_epu32(a, b) {}
-    #define mm_sha1rnds4_epu32(a, b, functor) {}
-    #define mm_sha1nexte_epu32(a, b) {}
-    #define mm_sha256msg1_epu32(a, b) {}
-    #define mm_sha256msg2_epu32(a, b) {}
-    #define mm_sha256rnds2_epu32(a, b, k) (k)
+    #define mm_sha1msg1_epu32(a, b) {}
+    #define mm_sha1msg2_epu32(a, b) {}
+    #define mm_sha1rnds4_epu32(a, b, f) {}
+    #define mm_sha1nexte_epu32(a, b) {}
+    #define mm_sha256msg1_epu32(a, b) (b)
+    #define mm_sha256msg2_epu32(a, b) (b)
+    #define mm_sha256rnds2_epu32(a, b, k) (k)
+    #define mm_alignr_epi8(a, b, c) (a)
+    #define mm_blend_epi16(a, b, mask) (a)
+    #define SHANI_ONLY(a)
#else
-    #define mm_sha1msg1_epu32(a, b) _mm_sha1msg1_epu32(a, b)
-    #define mm_sha1msg2_epu32(a, b) _mm_sha1msg2_epu32(a, b)
-    #define mm_sha1rnds4_epu32(a, b, functor) _mm_sha1rnds4_epu32(a, b, functor)
-    #define mm_sha1nexte_epu32(a, b) _mm_sha1nexte_epu32(a, b)
-    #define mm_sha256msg1_epu32(a, b) _mm_sha256msg1_epu32(a, b)
-    #define mm_sha256msg2_epu32(a, b) _mm_sha256msg2_epu32(a, b)
-    #define mm_sha256rnds2_epu32(a, b, k) _mm_sha256rnds2_epu32(a, b, k)
+    #define mm_sha1msg1_epu32(a, b) _mm_sha1msg1_epu32(a, b)
+    #define mm_sha1msg2_epu32(a, b) _mm_sha1msg2_epu32(a, b)
+    #define mm_sha1rnds4_epu32(a, b, f) _mm_sha1rnds4_epu32(a, b, f)
+    #define mm_sha1nexte_epu32(a, b) _mm_sha1nexte_epu32(a, b)
+    #define mm_sha256msg1_epu32(a, b) _mm_sha256msg1_epu32(a, b)
+    #define mm_sha256msg2_epu32(a, b) _mm_sha256msg2_epu32(a, b)
+    #define mm_sha256rnds2_epu32(a, b, k) _mm_sha256rnds2_epu32(a, b, k)
+
+    // supporting
+    #define mm_alignr_epi8(a, b, c) _mm_alignr_epi8(a, b, c)
+    #define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask)
+
+    // unused argument suppression
+    #define SHANI_ONLY(a) a
 #endif
 #endif
diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
index 68a227ceb2..0f841da915 100644
--- a/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/functional_128.hpp
@@ -76,9 +76,9 @@ INLINE xint128_t shr(xint128_t a) NOEXCEPT
     if constexpr (S == bits<uint16_t>)
         return mm_srli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm_srli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm_srli_epi64(a, B);
 }
 
@@ -93,9 +93,9 @@ INLINE xint128_t shl(xint128_t a) NOEXCEPT
     if constexpr (S == bits<uint16_t>)
         return mm_slli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm_slli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm_slli_epi64(a, B);
 }
 
@@ -117,11 +117,11 @@ INLINE xint128_t add(xint128_t a, xint128_t b) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return mm_add_epi8(a, b);
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return mm_add_epi16(a, b);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm_add_epi32(a, b);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm_add_epi64(a, b);
 }
 
@@ -131,22 +131,36 @@ INLINE xint128_t addc(xint128_t a) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return add(a, mm_set1_epi8(K));
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return add(a, mm_set1_epi16(K));
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return add(a, mm_set1_epi32(K));
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return add(a, mm_set1_epi64x(K));
 }
 
 } // namespace f
 
-/// broadcast/get/set
+/// add/broadcast/get/set
 /// ---------------------------------------------------------------------------
 
 // SSE2
-template = true, if_integral_integer = true>
+template = true>
+INLINE xint128_t add(xint128_t a, xint128_t b) NOEXCEPT
+{
+    if constexpr (is_same_type<Word, uint8_t>)
+        return mm_add_epi8(a, b);
+    if constexpr (is_same_type<Word, uint16_t>)
+        return mm_add_epi16(a, b);
+    if constexpr (is_same_type<Word, uint32_t>)
+        return mm_add_epi32(a, b);
+    if constexpr (is_same_type<Word, uint64_t>)
+        return mm_add_epi64(a, b);
+}
+
+// SSE2
+template = true,
+    if_same = true>
 INLINE xint128_t broadcast(Word a) NOEXCEPT
 {
     // set1 broadcasts integer to all elements.
@@ -172,13 +186,13 @@ INLINE Word get(xint128_t a) NOEXCEPT
         return mm_extract_epi8(a, Lane);
 
     // SSE2
-    else if constexpr (is_same_type<Word, uint16_t>)
+    if constexpr (is_same_type<Word, uint16_t>)
         return mm_extract_epi16(a, Lane);
 
     // SSE4.1
-    else if constexpr (is_same_type<Word, uint32_t>)
+    if constexpr (is_same_type<Word, uint32_t>)
         return mm_extract_epi32(a, Lane);
-    else if constexpr (is_same_type<Word, uint64_t>)
+    if constexpr (is_same_type<Word, uint64_t>)
         return mm_extract_epi64(a, Lane);
 }
 
@@ -257,25 +271,26 @@ INLINE xint128_t byteswap(xint128_t a) NOEXCEPT
 
 /// load/store (from casted to loaded/stored)
 /// ---------------------------------------------------------------------------
+/// These have defined overrides for !HAVE_SSE41
 
-INLINE xint128_t load_aligned(const xint128_t& bytes) NOEXCEPT
+INLINE xint128_t load(const xint128_t& bytes) NOEXCEPT
 {
-    return mm_load_si128(&bytes);
+    return mm_loadu_si128(&bytes);
 }
 
-INLINE xint128_t load(const xint128_t& bytes) NOEXCEPT
+INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT
 {
-    return mm_loadu_si128(&bytes);
+    mm_storeu_si128(&bytes, a);
 }
 
-INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT
+INLINE xint128_t load_aligned(const xint128_t& bytes) NOEXCEPT
 {
-    mm_store_si128(&bytes, a);
+    return mm_load_si128(&bytes);
 }
 
-INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT
+INLINE void store_aligned(xint128_t& bytes, xint128_t a) NOEXCEPT
 {
-    mm_storeu_si128(&bytes, a);
+    mm_store_si128(&bytes, a);
 }
 
 #else
 
@@ -283,6 +298,33 @@ INLINE void store(xint128_t& bytes, xint128_t a) NOEXCEPT
 // Symbol is defined but not usable as an integer.
 using xint128_t = std_array>;
 
+template = true>
+INLINE xint128_t add(xint128_t, xint128_t b) NOEXCEPT
+{
+    return b;
+}
+
+template = true>
+INLINE xint128_t set(uint32_t, uint32_t, uint32_t, uint32_t) NOEXCEPT
+{
+    return {};
+}
+
+template = true>
+INLINE xint128_t byteswap(xint128_t a) NOEXCEPT
+{
+    return a;
+}
+
+INLINE xint128_t load(const xint128_t& a) NOEXCEPT
+{
+    return a;
+}
+
+INLINE void store(xint128_t&, xint128_t) NOEXCEPT
+{
+}
+
 #endif // HAVE_SSE41
 
 } // namespace system
diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp
index e2c3711cb4..f385863ac7 100644
--- a/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/functional_256.hpp
@@ -75,9 +75,9 @@ INLINE xint256_t shr(xint256_t a) NOEXCEPT
     // AVX2
     if constexpr (S == bits<uint16_t>)
         return mm256_srli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm256_srli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm256_srli_epi64(a, B);
 }
 
@@ -92,9 +92,9 @@ INLINE xint256_t shl(xint256_t a) NOEXCEPT
     // AVX2
     if constexpr (S == bits<uint16_t>)
         return mm256_slli_epi16(a, B);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm256_slli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm256_slli_epi64(a, B);
 }
 
@@ -116,11 +116,11 @@ INLINE xint256_t add(xint256_t a, xint256_t b) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return mm256_add_epi8(a, b);
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return mm256_add_epi16(a, b);
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm256_add_epi32(a, b);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm256_add_epi64(a, b);
 }
 
@@ -130,22 +130,36 @@ INLINE xint256_t addc(xint256_t a) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return add(a, mm256_set1_epi8(K));
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return add(a, mm256_set1_epi16(K));
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return add(a, mm256_set1_epi32(K));
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return add(a, mm256_set1_epi64x(K));
 }
 
 } // namespace f
 
-/// broadcast/get/set
+/// add/broadcast/get/set
 /// ---------------------------------------------------------------------------
 
 // AVX
-template = true, if_integral_integer = true>
+template = true>
+INLINE xint256_t add(xint256_t a, xint256_t b) NOEXCEPT
+{
+    if constexpr (is_same_type<Word, uint8_t>)
+        return mm256_add_epi8(a, b);
+    if constexpr (is_same_type<Word, uint16_t>)
+        return mm256_add_epi16(a, b);
+    if constexpr (is_same_type<Word, uint32_t>)
+        return mm256_add_epi32(a, b);
+    if constexpr (is_same_type<Word, uint64_t>)
+        return mm256_add_epi64(a, b);
+}
+
+// AVX
+template = true,
+    if_same = true>
 INLINE xint256_t broadcast(Word a) NOEXCEPT
 {
     // set1 broadcasts integer to all elements.
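[Editorial aside, not part of the patch: the typed add overloads introduced in these hunks (mirrored at 128/256/512 lanes) dispatch on the word type rather than on a lane-bit-width constant, so call sites such as konstant() and round_4() can write add(wk, set(...)) directly. A minimal standalone C++ sketch of the pattern follows; the name add128 and the use of std::is_same_v in place of the library's is_same_type are illustrative assumptions.]

#include <immintrin.h>
#include <cstdint>
#include <type_traits>

// Word selects the intrinsic at compile time; only the four fixed-width
// unsigned types are meaningful instantiations (the library constrains this
// with its own if_same/if_integral_integer guards).
template <typename Word>
__m128i add128(__m128i a, __m128i b) noexcept
{
    if constexpr (std::is_same_v<Word, std::uint8_t>)
        return _mm_add_epi8(a, b);
    if constexpr (std::is_same_v<Word, std::uint16_t>)
        return _mm_add_epi16(a, b);
    if constexpr (std::is_same_v<Word, std::uint32_t>)
        return _mm_add_epi32(a, b);
    if constexpr (std::is_same_v<Word, std::uint64_t>)
        return _mm_add_epi64(a, b);
}

// Usage: per-lane 32-bit sums, as in the w + k step of a SHA-256 round:
// const auto wk = add128<std::uint32_t>(w, k);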
@@ -169,13 +183,13 @@ INLINE Word get(xint256_t a) NOEXCEPT
     // AVX2
     if constexpr (is_same_type<Word, uint8_t>)
         return mm256_extract_epi8(a, Lane);
-    else if constexpr (is_same_type<Word, uint16_t>)
+    if constexpr (is_same_type<Word, uint16_t>)
         return mm256_extract_epi16(a, Lane);
 
     // AVX
-    else if constexpr (is_same_type<Word, uint32_t>)
+    if constexpr (is_same_type<Word, uint32_t>)
         return mm256_extract_epi32(a, Lane);
-    else if constexpr (is_same_type<Word, uint64_t>)
+    if constexpr (is_same_type<Word, uint64_t>)
         return mm256_extract_epi64(a, Lane);
 }
 
@@ -270,25 +284,26 @@ INLINE xint256_t byteswap(xint256_t a) NOEXCEPT
 
 /// load/store (from casted to loaded/stored)
 /// ---------------------------------------------------------------------------
+/// These have defined overrides for !HAVE_AVX2
 
-INLINE xint256_t load_aligned(const xint256_t& bytes) NOEXCEPT
+INLINE xint256_t load(const xint256_t& bytes) NOEXCEPT
 {
-    return mm256_load_si256(&bytes);
+    return mm256_loadu_si256(&bytes);
 }
 
-INLINE xint256_t load(const xint256_t& bytes) NOEXCEPT
+INLINE void store(xint256_t& bytes, xint256_t a) NOEXCEPT
 {
-    return mm256_loadu_si256(&bytes);
+    mm256_storeu_si256(&bytes, a);
 }
 
-INLINE void store_aligned(xint256_t& bytes, xint256_t a) NOEXCEPT
+INLINE xint256_t load_aligned(const xint256_t& bytes) NOEXCEPT
 {
-    mm256_store_si256(&bytes, a);
+    return mm256_load_si256(&bytes);
 }
 
-INLINE void store(xint256_t& bytes, xint256_t a) NOEXCEPT
+INLINE void store_aligned(xint256_t& bytes, xint256_t a) NOEXCEPT
 {
-    mm256_storeu_si256(&bytes, a);
+    mm256_store_si256(&bytes, a);
 }
 
 #else
diff --git a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp
index 3308bd0362..53724cdbbd 100644
--- a/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/functional_512.hpp
@@ -83,9 +83,9 @@ INLINE xint512_t shr(xint512_t a) NOEXCEPT
         return mm512_srli_epi16(a, B);
 
     // AVX512F
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm512_srli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm512_srli_epi64(a, B);
 }
 
@@ -102,9 +102,9 @@ INLINE xint512_t shl(xint512_t a) NOEXCEPT
         return mm512_slli_epi16(a, B);
 
     // AVX512F
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm512_slli_epi32(a, B);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm512_slli_epi64(a, B);
 }
 
@@ -126,13 +126,13 @@ INLINE xint512_t add(xint512_t a, xint512_t b) NOEXCEPT
     // AVX512BW
     if constexpr (S == bits<uint8_t>)
         return mm512_add_epi8(a, b);
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return mm512_add_epi16(a, b);
 
     // AVX512F
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return mm512_add_epi32(a, b);
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return mm512_add_epi64(a, b);
 }
 
@@ -142,11 +142,11 @@ INLINE xint512_t addc(xint512_t a) NOEXCEPT
 {
     if constexpr (S == bits<uint8_t>)
         return add(a, mm512_set1_epi8(K));
-    else if constexpr (S == bits<uint16_t>)
+    if constexpr (S == bits<uint16_t>)
         return add(a, mm512_set1_epi16(K));
-    else if constexpr (S == bits<uint32_t>)
+    if constexpr (S == bits<uint32_t>)
         return add(a, mm512_set1_epi32(K));
-    else if constexpr (S == bits<uint64_t>)
+    if constexpr (S == bits<uint64_t>)
         return add(a, mm512_set1_epi64(K));
 }
 
@@ -156,8 +156,22 @@ INLINE xint512_t addc(xint512_t a) NOEXCEPT
 /// ---------------------------------------------------------------------------
 
 // AVX512F
-template = true, if_integral_integer = true>
+template = true>
+INLINE xint512_t add(xint512_t a, xint512_t b) NOEXCEPT
+{
+    if constexpr (is_same_type<Word, uint8_t>)
+        return mm512_add_epi8(a, b);
+    if constexpr (is_same_type<Word, uint16_t>)
+        return mm512_add_epi16(a, b);
+    if constexpr (is_same_type<Word, uint32_t>)
+        return mm512_add_epi32(a, b);
+    if constexpr (is_same_type<Word, uint64_t>)
+        return mm512_add_epi64(a, b);
+}
+
+// AVX512F
+template = true,
+    if_same = true>
 INLINE xint512_t broadcast(Word a) NOEXCEPT
 {
     // set1 broadcasts integer to all elements.
@@ -310,25 +324,26 @@ INLINE xint512_t byteswap(xint512_t a) NOEXCEPT
 
 /// load/store (from casted to loaded/stored)
 /// ---------------------------------------------------------------------------
+/// These have defined overrides for !HAVE_AVX512
 
-INLINE xint512_t load_aligned(const xint512_t& bytes) NOEXCEPT
+INLINE xint512_t load(const xint512_t& bytes) NOEXCEPT
 {
-    return mm512_load_si512(&bytes);
+    return mm512_loadu_si512(&bytes);
 }
 
-INLINE xint512_t load(const xint512_t& bytes) NOEXCEPT
+INLINE void store(xint512_t& bytes, xint512_t a) NOEXCEPT
 {
-    return mm512_loadu_si512(&bytes);
+    mm512_storeu_si512(&bytes, a);
 }
 
-INLINE void store_aligned(xint512_t& bytes, xint512_t a) NOEXCEPT
+INLINE xint512_t load_aligned(const xint512_t& bytes) NOEXCEPT
 {
-    mm512_store_si512(&bytes, a);
+    return mm512_load_si512(&bytes);
 }
 
-INLINE void store(xint512_t& bytes, xint512_t a) NOEXCEPT
+INLINE void store_aligned(xint512_t& bytes, xint512_t a) NOEXCEPT
 {
-    mm512_storeu_si512(&bytes, a);
+    mm512_store_si512(&bytes, a);
 }
 
 #else
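
[Editorial aside, not part of the patch: a self-contained C++ sketch of the SHA-NI flow that native_rounds() unrolls above, following the Intel white paper cited in algorithm_iterate.ipp. All names are illustrative; the library's round_4()/prepare() pair corresponds to the rnds2/msg1/msg2 calls here, with a rolling four-quad message window and a loop where the library unrolls. Build with SSE4.1 and SHA extensions enabled (e.g. -msse4.1 -msha).]

#include <immintrin.h>
#include <cstdint>

// SHA-256 round constants, grouped four per 16-byte lane.
alignas(16) static const std::uint32_t K256[64] =
{
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};

void sha256_shani_one_block(std::uint32_t state[8],
    const std::uint8_t block[64]) noexcept
{
    // Byte-reversal mask for the big-endian 32-bit message words.
    const auto flip = _mm_set_epi64x(0x0c0d0e0f08090a0bULL,
        0x0405060700010203ULL);

    // shuffle: [ABCD][EFGH] -> the [FEBA][HGDC] form sha256rnds2 expects.
    auto t0 = _mm_shuffle_epi32(
        _mm_loadu_si128((const __m128i*)&state[0]), 0xb1);
    auto hi = _mm_shuffle_epi32(
        _mm_loadu_si128((const __m128i*)&state[4]), 0x1b);
    auto lo = _mm_alignr_epi8(t0, hi, 8);
    hi = _mm_blend_epi16(hi, t0, 0xf0);

    // Load the sixteen message words as four big-endian quads.
    __m128i w[4];
    for (int i = 0; i < 4; ++i)
        w[i] = _mm_shuffle_epi8(
            _mm_loadu_si128((const __m128i*)(block + 16 * i)), flip);

    const auto start_lo = lo;
    const auto start_hi = hi;

    // Sixteen groups of four rounds; schedule the next message quad while
    // the current one is consumed (W[t] needs W[t-2], W[t-7], W[t-15/16]).
    for (int i = 0; i < 16; ++i)
    {
        // round_4: add the round constants, then two rnds2 (4 rounds).
        const auto wk = _mm_add_epi32(w[i & 3],
            _mm_load_si128((const __m128i*)&K256[4 * i]));
        hi = _mm_sha256rnds2_epu32(hi, lo, wk);
        lo = _mm_sha256rnds2_epu32(lo, hi, _mm_shuffle_epi32(wk, 0x0e));

        // prepare: msg1 covers sigma0, alignr supplies W[t-7], msg2 sigma1.
        if (i < 12)
        {
            const auto w7 = _mm_alignr_epi8(w[(i + 3) & 3], w[(i + 2) & 3], 4);
            w[i & 3] = _mm_sha256msg2_epu32(_mm_add_epi32(
                _mm_sha256msg1_epu32(w[i & 3], w[(i + 1) & 3]), w7),
                w[(i + 3) & 3]);
        }
    }

    lo = _mm_add_epi32(lo, start_lo);
    hi = _mm_add_epi32(hi, start_hi);

    // unshuffle: restore [ABCD][EFGH] normal form and store.
    t0 = _mm_shuffle_epi32(lo, 0x1b);
    const auto t1 = _mm_shuffle_epi32(hi, 0xb1);
    _mm_storeu_si128((__m128i*)&state[0], _mm_blend_epi16(t0, t1, 0xf0));
    _mm_storeu_si128((__m128i*)&state[4], _mm_alignr_epi8(t1, t0, 8));
}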