Merge pull request #1558 from evoskuil/master

Enable vector parse and vector k-addition.
libbitcoin · Dec 4, 2024 · f464f46 · f464f46
2 parents 4a0fb22 + 9aa6b0d
commit f464f46
Show file tree

Hide file tree

Showing 13 changed files with 237 additions and 202 deletions.
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
@@ -38,25 +38,25 @@ template <size_t Word, size_t Lanes>
 INLINE auto CLASS::
 pack(const xblock_t<Lanes>& xblock) NOEXCEPT
 {
-    using xword = to_extended<word_t, Lanes>;
+    using xword_t = to_extended<word_t, Lanes>;
 
     if constexpr (Lanes == 2)
     {
-        return byteswap<word_t>(set<xword>(
+        return byteswap<word_t>(set<xword_t>(
             xblock[0][Word],
             xblock[1][Word]));
     }
     else if constexpr (Lanes == 4)
     {
-        return byteswap<word_t>(set<xword>(
+        return byteswap<word_t>(set<xword_t>(
             xblock[0][Word],
             xblock[1][Word],
             xblock[2][Word],
             xblock[3][Word]));
     }
     else if constexpr (Lanes == 8)
     {
-        return byteswap<word_t>(set<xword>(
+        return byteswap<word_t>(set<xword_t>(
             xblock[0][Word],
             xblock[1][Word],
             xblock[2][Word],
@@ -68,7 +68,7 @@ pack(const xblock_t<Lanes>& xblock) NOEXCEPT
     }
     else if constexpr (Lanes == 16)
     {
-        return byteswap<word_t>(set<xword>(
+        return byteswap<word_t>(set<xword_t>(
             xblock[ 0][Word],
             xblock[ 1][Word],
             xblock[ 2][Word],
@@ -181,7 +181,7 @@ vector_schedule_sequential_compress(state_t& state, iblocks_t& blocks) NOEXCEPT
     constexpr auto lanes = capacity<xWord, word_t>;
     static_assert(is_valid_lanes<lanes>);
 
-    if constexpr (have<xWord>())
+    if constexpr (have<xWord>)
     {
         if (blocks.size() >= lanes)
         {

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
@@ -50,43 +50,38 @@ template<size_t Round, typename xWord>
 // Adds K constants to a single extended word: wbuffer[Round] is replaced by
 // f::add<s>(wbuffer[Round], set<xWord>(...)), where the set<xWord> arguments
 // are `lanes` consecutive elements of K::get[] (one per lane of xWord).
 // In the new version only lane counts of 16, 8 and 4 are handled; for any
 // other lane count no branch is selected and wbuffer is left unchanged.
 INLINE void CLASS::
 vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT
 {
-    constexpr auto r = Round;
     constexpr auto s = SHA::word_bits;
     constexpr auto lanes = capacity<xWord, word_t>;
     // New version: r is the index of the first K element for this extended
     // word, since each wbuffer element consumes `lanes` K elements. The
     // removed version used r = Round, i.e. did not scale by the lane count.
+    constexpr auto r = Round * lanes;
 
     // The removed version also handled lanes == 2 and passed the K elements
     // in descending index order (r + lanes - 1 down to r + 0); the new version
     // passes them ascending (r + 0 up to r + lanes - 1).
     // NOTE(review): which order is correct depends on how set<xWord> maps its
     // arguments to lanes, which is not visible here - confirm against set().
-    if constexpr (lanes == 2)
+    if constexpr (lanes == 16)
     {
         wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
-            K::get[r + 1], K::get[r + 0]));
-    }
-    else if constexpr (lanes == 4)
-    {
-        wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
-            K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0]));
+            K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
+            K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7],
+            K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11],
+            K::get[r + 12], K::get[r + 13], K::get[r + 14], K::get[r + 15]));
     }
     else if constexpr (lanes == 8)
     {
         wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
-            K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4],
-            K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0]));
+            K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
+            K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7]));
     }
-    else if constexpr (lanes == 16)
+    else if constexpr (lanes == 4)
     {
         wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
-            K::get[r + 15], K::get[r + 14], K::get[r + 13], K::get[r + 12],
-            K::get[r + 11], K::get[r + 10], K::get[r + 9], K::get[r + 8],
-            K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4],
-            K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0]));
+            K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
     }
 }
 
 TEMPLATE
 void CLASS::
 vector_konstant(buffer_t& buffer) NOEXCEPT
 {
-    if constexpr (use_x512)
+    if constexpr (have_lanes<word_t, 16>)
     {
-        auto& wbuffer = array_cast<xint512_t>(buffer);
+        auto& wbuffer = array_cast<to_extended<word_t, 16>>(buffer);
         vector_konstant<0>(wbuffer);
         vector_konstant<1>(wbuffer);
         vector_konstant<2>(wbuffer);
@@ -97,9 +92,9 @@ vector_konstant(buffer_t& buffer) NOEXCEPT
             vector_konstant<4>(wbuffer);
         }
     }
-    else if constexpr (use_x256)
+    else if constexpr (have_lanes<word_t, 8>)
     {
-        auto& wbuffer = array_cast<xint256_t>(buffer);
+        auto& wbuffer = array_cast<to_extended<word_t, 8>>(buffer);
         vector_konstant<0>(wbuffer);
         vector_konstant<1>(wbuffer);
         vector_konstant<2>(wbuffer);
@@ -115,9 +110,9 @@ vector_konstant(buffer_t& buffer) NOEXCEPT
             vector_konstant<9>(wbuffer);
         }
     }
-    else if constexpr (use_x128)
+    else if constexpr (have_lanes<word_t, 4>)
     {
-        auto& wbuffer = array_cast<xint128_t>(buffer);
+        auto& wbuffer = array_cast<to_extended<word_t, 4>>(buffer);
         vector_konstant<0>(wbuffer);
         vector_konstant<1>(wbuffer);
         vector_konstant<2>(wbuffer);
@@ -262,10 +257,10 @@ konstant(buffer_t& buffer) NOEXCEPT
     {
         konstant_(buffer);
     }
-    ////else if constexpr (vector)
-    ////{
-    ////    vector_konstant(buffer);
-    ////}
+    else if constexpr (vector && !with_clang)
+    {
+        vector_konstant(buffer);
+    }
     else
     {
         konstant_(buffer);

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
@@ -351,7 +351,7 @@ merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT
     constexpr auto lanes = capacity<xWord, word_t>;
     static_assert(is_valid_lanes<lanes>);
 
-    if constexpr (have<xWord>())
+    if constexpr (have<xWord>)
     {
         if (blocks.size() >= lanes)
         {

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_parsing.ipp
@@ -58,24 +58,51 @@ input(buffer_t& buffer, const block_t& block) NOEXCEPT
     }
     else if constexpr (bc::is_little_endian)
     {
-        // TODO: evaluate 4/8/16 lane optimization using byteswap.
-        const auto& in = array_cast<word_t>(block);
-        buffer[0] = native_from_big_end(in[0]);
-        buffer[1] = native_from_big_end(in[1]);
-        buffer[2] = native_from_big_end(in[2]);
-        buffer[3] = native_from_big_end(in[3]);
-        buffer[4] = native_from_big_end(in[4]);
-        buffer[5] = native_from_big_end(in[5]);
-        buffer[6] = native_from_big_end(in[6]);
-        buffer[7] = native_from_big_end(in[7]);
-        buffer[8] = native_from_big_end(in[8]);
-        buffer[9] = native_from_big_end(in[9]);
-        buffer[10] = native_from_big_end(in[10]);
-        buffer[11] = native_from_big_end(in[11]);
-        buffer[12] = native_from_big_end(in[12]);
-        buffer[13] = native_from_big_end(in[13]);
-        buffer[14] = native_from_big_end(in[14]);
-        buffer[15] = native_from_big_end(in[15]);
+        if constexpr (have_lanes<word_t, 16> && !with_clang)
+        {
+            using xword_t = to_extended<word_t, 16>;
+            const auto& in = array_cast<xword_t>(block);
+            auto& out = array_cast<xword_t>(buffer);
+            out[0] = byteswap<word_t>(in[0]);
+        }
+        else if constexpr (have_lanes<word_t, 8> && !with_clang)
+        {
+            using xword_t = to_extended<word_t, 8>;
+            const auto& in = array_cast<xword_t>(block);
+            auto& out = array_cast<xword_t>(buffer);
+            out[0] = byteswap<word_t>(in[0]);
+            out[1] = byteswap<word_t>(in[1]);
+        }
+        else if constexpr (have_lanes<word_t, 4> && !with_clang)
+        {
+            using xword_t = to_extended<word_t, 4>;
+            const auto& in = array_cast<xword_t>(block);
+            auto& out = array_cast<xword_t>(buffer);
+            out[0] = byteswap<word_t>(in[0]);
+            out[1] = byteswap<word_t>(in[1]);
+            out[2] = byteswap<word_t>(in[2]);
+            out[3] = byteswap<word_t>(in[3]);
+        }
+        else
+        {
+            const auto& in = array_cast<word_t>(block);
+            buffer[0] = native_from_big_end(in[0]);
+            buffer[1] = native_from_big_end(in[1]);
+            buffer[2] = native_from_big_end(in[2]);
+            buffer[3] = native_from_big_end(in[3]);
+            buffer[4] = native_from_big_end(in[4]);
+            buffer[5] = native_from_big_end(in[5]);
+            buffer[6] = native_from_big_end(in[6]);
+            buffer[7] = native_from_big_end(in[7]);
+            buffer[8] = native_from_big_end(in[8]);
+            buffer[9] = native_from_big_end(in[9]);
+            buffer[10] = native_from_big_end(in[10]);
+            buffer[11] = native_from_big_end(in[11]);
+            buffer[12] = native_from_big_end(in[12]);
+            buffer[13] = native_from_big_end(in[13]);
+            buffer[14] = native_from_big_end(in[14]);
+            buffer[15] = native_from_big_end(in[15]);
+        }
     }
     else
     {
@@ -92,7 +119,6 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT
 
     if (std::is_constant_evaluated())
     {
-        // TODO: evaluate 4/8 lane optimization using byteswap.
         constexpr auto size = SHA::word_bytes;
         from_big<0 * size>(buffer.at(0), half);
         from_big<1 * size>(buffer.at(1), half);
@@ -105,15 +131,33 @@ input_left(buffer_t& buffer, const half_t& half) NOEXCEPT
     }
     else if constexpr (bc::is_little_endian)
     {
-        const auto& in = array_cast<word>(half);
-        buffer[0] = native_from_big_end(in[0]);
-        buffer[1] = native_from_big_end(in[1]);
-        buffer[2] = native_from_big_end(in[2]);
-        buffer[3] = native_from_big_end(in[3]);
-        buffer[4] = native_from_big_end(in[4]);
-        buffer[5] = native_from_big_end(in[5]);
-        buffer[6] = native_from_big_end(in[6]);
-        buffer[7] = native_from_big_end(in[7]);
+        if constexpr (have_lanes<word_t, 8> && !with_clang)
+        {
+            using xword_t = to_extended<word_t, 8>;
+            const auto& in = array_cast<xword_t>(half);
+            auto& out = array_cast<xword_t>(buffer);
+            out[0] = byteswap<word_t>(in[0]);
+        }
+        else if constexpr (have_lanes<word_t, 4> && !with_clang)
+        {
+            using xword_t = to_extended<word_t, 4>;
+            const auto& in = array_cast<xword_t>(half);
+            auto& out = array_cast<xword_t>(buffer);
+            out[0] = byteswap<word_t>(in[0]);
+            out[1] = byteswap<word_t>(in[1]);
+        }
+        else
+        {
+            const auto& in = array_cast<word>(half);
+            buffer[0] = native_from_big_end(in[0]);
+            buffer[1] = native_from_big_end(in[1]);
+            buffer[2] = native_from_big_end(in[2]);
+            buffer[3] = native_from_big_end(in[3]);
+            buffer[4] = native_from_big_end(in[4]);
+            buffer[5] = native_from_big_end(in[5]);
+            buffer[6] = native_from_big_end(in[6]);
+            buffer[7] = native_from_big_end(in[7]);
+        }
     }
     else
     {
@@ -141,16 +185,33 @@ input_right(buffer_t& buffer, const half_t& half) NOEXCEPT
     }
     else if constexpr (bc::is_little_endian)
     {
-        // TODO: evaluate 4/8 lane optimization using byteswap.
-        const auto& in = array_cast<word>(half);
-        buffer[8] = native_from_big_end(in[0]);
-        buffer[9] = native_from_big_end(in[1]);
-        buffer[10] = native_from_big_end(in[2]);
-        buffer[11] = native_from_big_end(in[3]);
-        buffer[12] = native_from_big_end(in[4]);
-        buffer[13] = native_from_big_end(in[5]);
-        buffer[14] = native_from_big_end(in[6]);
-        buffer[15] = native_from_big_end(in[7]);
+        if constexpr (have_lanes<word_t, 8> && !with_clang)
+        {
+            using xword_t = to_extended<word_t, 8>;
+            const auto& in = array_cast<xword_t>(half);
+            auto& out = array_cast<xword_t>(buffer);
+            out[1] = byteswap<word_t>(in[0]);
+        }
+        else if constexpr (have_lanes<word_t, 4> && !with_clang)
+        {
+            using xword_t = to_extended<word_t, 4>;
+            const auto& in = array_cast<xword_t>(half);
+            auto& out = array_cast<xword_t>(buffer);
+            out[2] = byteswap<word_t>(in[0]);
+            out[3] = byteswap<word_t>(in[1]);
+        }
+        else
+        {
+            const auto& in = array_cast<word>(half);
+            buffer[8] = native_from_big_end(in[0]);
+            buffer[9] = native_from_big_end(in[1]);
+            buffer[10] = native_from_big_end(in[2]);
+            buffer[11] = native_from_big_end(in[3]);
+            buffer[12] = native_from_big_end(in[4]);
+            buffer[13] = native_from_big_end(in[5]);
+            buffer[14] = native_from_big_end(in[6]);
+            buffer[15] = native_from_big_end(in[7]);
+        }
     }
     else
     {
@@ -185,30 +246,51 @@ output(const state_t& state) NOEXCEPT
     }
     else if constexpr (bc::is_little_endian)
     {
-        if constexpr (SHA::strength == 160)
+        if constexpr (SHA::strength != 160)
         {
-            return array_cast<byte_t>(state_t
+            if constexpr (have_lanes<word_t, 8> && !with_clang)
             {
-                native_to_big_end(state[0]),
-                native_to_big_end(state[1]),
-                native_to_big_end(state[2]),
-                native_to_big_end(state[3]),
-                native_to_big_end(state[4])
-            });
+                using xword_t = to_extended<word_t, 8>;
+                const auto& in = array_cast<xword_t>(state);
+                return array_cast<byte_t>(wstate_t<xword_t>
+                {
+                    byteswap<word_t>(in[0])
+                });
+            }
+            else if constexpr (have_lanes<word_t, 4> && !with_clang)
+            {
+                using xword_t = to_extended<word_t, 4>;
+                const auto& in = array_cast<xword_t>(state);
+                return array_cast<byte_t>(wstate_t<xword_t>
+                {
+                    byteswap<word_t>(in[0]),
+                    byteswap<word_t>(in[1])
+                });
+            }
+            else
+            {
+                return array_cast<byte_t>(state_t
+                {
+                    native_to_big_end(state[0]),
+                    native_to_big_end(state[1]),
+                    native_to_big_end(state[2]),
+                    native_to_big_end(state[3]),
+                    native_to_big_end(state[4]),
+                    native_to_big_end(state[5]),
+                    native_to_big_end(state[6]),
+                    native_to_big_end(state[7])
+                });
+            }
         }
         else
         {
-            // TODO: evaluate 4/8 lane optimization using byteswap.
             return array_cast<byte_t>(state_t
             {
                 native_to_big_end(state[0]),
                 native_to_big_end(state[1]),
                 native_to_big_end(state[2]),
                 native_to_big_end(state[3]),
-                native_to_big_end(state[4]),
-                native_to_big_end(state[5]),
-                native_to_big_end(state[6]),
-                native_to_big_end(state[7])
+                native_to_big_end(state[4])
             });
         }
     }

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
@@ -96,7 +96,7 @@ TEMPLATE
 void CLASS::
 schedule_sigma(buffer_t& buffer) NOEXCEPT
 {
-    if constexpr (SHA::strength != 160 && have_lanes<word_t, 8>())
+    if constexpr (SHA::strength != 160 && have_lanes<word_t, 8>)
     {
         prepare8<16>(buffer);
         prepare8<24>(buffer);