official-stockfish · bluffblue · Jan 25, 2025
diff --git a/src/cache_optimizer.h b/src/cache_optimizer.h
@@ -0,0 +1,145 @@
+#ifndef CACHE_OPTIMIZER_H_INCLUDED
+#define CACHE_OPTIMIZER_H_INCLUDED
+
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <new>
+#include <utility>
+
+#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_X64) || defined(_M_IX86))
+    #include <xmmintrin.h>  // _mm_prefetch: MSVC has no __builtin_prefetch
+#endif
+
+namespace Stockfish {
+
+// Fixed-size heap array whose storage starts on a CacheLineSize-byte boundary.
+//
+// Elements are default-initialized, so trivially constructible element types
+// hold indeterminate values until written. The array owns its storage
+// exclusively: it can be moved from, but not copied.
+template<typename T, size_t CacheLineSize = 64>
+class CacheAlignedArray {
+    static_assert(CacheLineSize != 0 && (CacheLineSize & (CacheLineSize - 1)) == 0,
+                  "CacheLineSize must be a power of two");
+    static_assert(CacheLineSize >= alignof(T),
+                  "CacheLineSize must be at least the natural alignment of T");
+
+    static constexpr size_t alignment = CacheLineSize;
+    T*     data   = nullptr;
+    size_t length = 0;
+
+    // Returns raw storage obtained from the aligned operator new.
+    struct StorageDeleter {
+        void operator()(void* p) const noexcept {
+            ::operator delete(p, std::align_val_t(alignment));
+        }
+    };
+
+    // Destroys every element and frees the storage; no-op on a moved-from array.
+    void release() noexcept {
+        if (!data)
+            return;
+        std::destroy_n(data, length);
+        StorageDeleter{}(data);
+        data   = nullptr;
+        length = 0;
+    }
+
+public:
+    // Allocates storage for `size` elements and default-constructs each one.
+    // `size` must be small enough for the byte count to fit in size_t.
+    explicit CacheAlignedArray(size_t size) : length(size) {
+        assert(size <= (static_cast<size_t>(-1) - alignment) / sizeof(T));
+
+        // `alignment` spare bytes follow the last element. NOTE(review): aligned
+        // operator new does not need them; confirm no caller relies on the slack.
+        std::unique_ptr<void, StorageDeleter> storage(
+            ::operator new(size * sizeof(T) + alignment, std::align_val_t(alignment)));
+
+        // If a constructor throws, the algorithm destroys the elements built so
+        // far and `storage` gives the memory back.
+        std::uninitialized_default_construct_n(static_cast<T*>(storage.get()), size);
+        data = static_cast<T*>(storage.release());
+    }
+
+    CacheAlignedArray(const CacheAlignedArray&)            = delete;
+    CacheAlignedArray& operator=(const CacheAlignedArray&) = delete;
+
+    // Moving transfers ownership and leaves `other` empty (null data, size 0).
+    CacheAlignedArray(CacheAlignedArray&& other) noexcept :
+        data(std::exchange(other.data, nullptr)),
+        length(std::exchange(other.length, 0)) {}
+
+    CacheAlignedArray& operator=(CacheAlignedArray&& other) noexcept {
+        if (this != &other) {
+            release();
+            data   = std::exchange(other.data, nullptr);
+            length = std::exchange(other.length, 0);
+        }
+        return *this;
+    }
+
+    ~CacheAlignedArray() { release(); }
+
+    // Unchecked element access; `index` must be less than size().
+    T&       operator[](size_t index) { return data[index]; }
+    const T& operator[](size_t index) const { return data[index]; }
+
+    // Pointer to the first element (null after the array has been moved from).
+    T*       get() { return data; }
+    const T* get() const { return data; }
+
+    // Number of elements.
+    size_t size() const { return length; }
+};
+
+// Nominal cache geometry plus thin wrappers around the platform prefetch hint.
+struct CacheOptimizer {
+    // Fixed byte counts; they are not queried from the CPU at run time.
+    static constexpr size_t CACHE_LINE_SIZE = 64;
+    static constexpr size_t L1_CACHE_SIZE   = 32768;    // 32 KiB
+    static constexpr size_t L2_CACHE_SIZE   = 262144;   // 256 KiB
+    static constexpr size_t L3_CACHE_SIZE   = 8388608;  // 8 MiB
+
+    // Read prefetch, highest temporal-locality hint.
+    template<typename T>
+    static void prefetchL1(const T* addr) {
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_prefetch(addr, 0, 3);
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+        _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
+#else
+        (void) addr;  // no prefetch primitive known for this toolchain
+#endif
+    }
+
+    // Read prefetch, moderate temporal-locality hint.
+    template<typename T>
+    static void prefetchL2(const T* addr) {
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_prefetch(addr, 0, 2);
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+        _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T1);
+#else
+        (void) addr;
+#endif
+    }
+
+    // Prefetch in anticipation of a write, highest temporal-locality hint.
+    template<typename T>
+    static void prefetchForModify(const T* addr) {
+#if defined(__GNUC__) || defined(__clang__)
+        __builtin_prefetch(addr, 1, 3);
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+        // <xmmintrin.h> has no write-intent hint, so fall back to a plain T0 prefetch.
+        _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
+#else
+        (void) addr;
+#endif
+    }
+};
+
+}  // namespace Stockfish
+
+#endif  // CACHE_OPTIMIZER_H_INCLUDED
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
@@ -34,6 +34,9 @@
 #include "types.h"
 #include "uci.h"
 #include "nnue/nnue_accumulator.h"
+#include "simd_ops.h"
+#include "cache_optimizer.h"
+#include "nnue/optimized_layers.h"
 
 namespace Stockfish {
 
@@ -90,6 +93,19 @@ Value Eval::evaluate(const Eval::NNUE::Networks&    networks,
     return v;
 }
 
+// Evaluates `pos` through NNUE::evaluate(), handing it a freshly allocated,
+// cache-line-aligned int32_t scratch buffer of NNUE_DIMENSIONS elements.
+// NOTE(review): the buffer is heap-allocated on every call - confirm the cost.
+template<Color Us>
+Score evaluate(const Position& pos) {
+    CacheOptimizer::prefetchL1(&pos);  // hint the Position object into cache
+
+    CacheAlignedArray<int32_t> accumulator(NNUE_DIMENSIONS);
+
+    // NOTE(review): the Us template parameter is not used by this body.
+    return NNUE::evaluate(pos, accumulator.get());
+}
+
 // Like evaluate(), but instead of returning a value, it returns
 // a string (suitable for outputting to stdout) that contains the detailed
 // descriptions and values of each evaluation term. Useful for debugging.

diff --git a/src/nnue/optimized_layers.h b/src/nnue/optimized_layers.h
@@ -0,0 +1,103 @@
+#ifndef STOCKFISH_EVAL_NNUE_OPTIMIZED_LAYERS_H_INCLUDED
+#define STOCKFISH_EVAL_NNUE_OPTIMIZED_LAYERS_H_INCLUDED
+
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+
+#include "../simd_ops.h"
+#include "../cache_optimizer.h"
+#include "nnue_common.h"
+
+namespace Stockfish::Eval::NNUE {
+
+// Dense-layer kernels. The AVX2 code paths are compiled only when the
+// translation unit targets AVX2 (__AVX2__); otherwise the scalar loops that
+// normally handle the tail process every element.
+struct OptimizedLayer {
+    // Row-major matrix-vector product with bias:
+    //   output[i] = biases[i] + sum_j input[j] * weights[i * inputDimensions + j]
+    // for i in [0, outputDimensions) and j in [0, inputDimensions).
+    //
+    // Sums and products wrap modulo 2^32, as the packed 32-bit instructions do.
+    // Pointers need no particular alignment and inputDimensions may be any
+    // value, including one that is not a multiple of 8.
+    static void affine_transform_avx2(
+        const std::int32_t* input,
+        const std::int32_t* weights,
+        std::int32_t* output,
+        const std::int32_t* biases,
+        unsigned inputDimensions,
+        unsigned outputDimensions
+    ) {
+        for (unsigned i = 0; i < outputDimensions; ++i) {
+            // Widen before multiplying so the row offset cannot wrap in `unsigned`.
+            const std::int32_t* row = weights + static_cast<std::size_t>(i) * inputDimensions;
+
+            // Unsigned accumulation keeps wrap-around well defined.
+            std::uint32_t acc = static_cast<std::uint32_t>(biases[i]);
+            unsigned      j   = 0;
+
+#if defined(__AVX2__)
+            __m256i sum = _mm256_setzero_si256();
+
+            // Whole 8-lane chunks only, so no load reaches past the end of `input`
+            // or into the next weight row. Loads are unaligned because a row may
+            // start at any multiple of 4 bytes.
+            for (; inputDimensions - j >= 8; j += 8) {
+                const __m256i in = _mm256_loadu_si256(
+                    reinterpret_cast<const __m256i*>(input + j));
+                const __m256i w = _mm256_loadu_si256(
+                    reinterpret_cast<const __m256i*>(row + j));
+
+                sum = _mm256_add_epi32(sum, _mm256_mullo_epi32(in, w));
+            }
+
+            // Horizontal add: fold the 8 lanes to 4, then 2, then 1.
+            __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum),
+                                           _mm256_extracti128_si256(sum, 1));
+            sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1, 0, 3, 2)));
+            sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(2, 3, 0, 1)));
+            acc += static_cast<std::uint32_t>(_mm_cvtsi128_si32(sum128));
+#endif
+
+            // Scalar tail (every element when AVX2 is unavailable).
+            for (; j < inputDimensions; ++j)
+                acc += static_cast<std::uint32_t>(input[j]) * static_cast<std::uint32_t>(row[j]);
+
+            output[i] = static_cast<std::int32_t>(acc);
+        }
+    }
+
+    // Writes output[i] = input[i] * scale, rounded to an integer, for every i in
+    // [0, size). Both paths round according to the current floating-point
+    // rounding mode (round-to-nearest-even unless it has been changed).
+    // Pointers need no particular alignment and `size` may be any value.
+    static void quantize_weights(
+        const float* input,
+        std::int32_t* output,
+        unsigned size,
+        float scale
+    ) {
+        unsigned i = 0;
+
+#if defined(__AVX2__)
+        const __m256 scale_v = _mm256_set1_ps(scale);
+
+        // Whole 8-lane chunks only, so nothing is read or written past `size`.
+        for (; size - i >= 8; i += 8) {
+            const __m256  scaled  = _mm256_mul_ps(_mm256_loadu_ps(input + i), scale_v);
+            const __m256i rounded = _mm256_cvtps_epi32(scaled);
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(output + i), rounded);
+        }
+#endif
+
+        // Scalar tail (every element when AVX2 is unavailable).
+        for (; i < size; ++i)
+            output[i] = static_cast<std::int32_t>(std::lrintf(input[i] * scale));
+    }
+};
+
+}  // namespace Stockfish::Eval::NNUE
+
+#endif  // STOCKFISH_EVAL_NNUE_OPTIMIZED_LAYERS_H_INCLUDED
diff --git a/src/parallel_search.h b/src/parallel_search.h
@@ -0,0 +1,76 @@
+#ifndef PARALLEL_SEARCH_H_INCLUDED
+#define PARALLEL_SEARCH_H_INCLUDED
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <thread>
+#include <vector>
+
+#include "types.h"
+#include "position.h"
+#include "thread.h"
+
+namespace Stockfish {
+
+// Owns a pool of Thread objects together with the `searching` flag that is
+// handed to every worker when a search starts.
+//
+// NOTE(review): the members are not guarded by a lock; this presumably expects
+// all calls to come from a single controlling thread - confirm.
+class ParallelSearchManager {
+    std::vector<std::unique_ptr<Thread>> threads;
+    std::atomic<bool>                    searching{false};
+    Depth                                splitDepth = 4;
+
+    // Drops the current pool and creates `count` Thread objects, each built
+    // from its index in [0, count).
+    void rebuildThreads(size_t count) {
+        threads.clear();
+        threads.reserve(count);
+        for (size_t i = 0; i < count; ++i)
+            threads.emplace_back(std::make_unique<Thread>(i));
+    }
+
+public:
+    // Creates a pool of `numThreads` workers; the split depth starts at 4.
+    explicit ParallelSearchManager(size_t numThreads = 1) { rebuildThreads(numThreads); }
+
+    // Raises the `searching` flag, then calls startSearching() on every worker
+    // with the same position, limits and flag.
+    void startSearch(Position& pos, const Search::LimitsType& limits) {
+        searching = true;
+
+        for (auto& thread : threads)
+            thread->startSearching(pos, limits, searching);
+    }
+
+    // Clears the `searching` flag and then joins every worker that is still
+    // joinable. Despite the name, it clears the flag itself before joining.
+    void waitForSearchFinish() {
+        searching = false;
+
+        for (auto& thread : threads)
+            if (thread->stdThread.joinable())
+                thread->stdThread.join();
+    }
+
+    void setSplitDepth(Depth depth) { splitDepth = depth; }
+
+    Depth getSplitDepth() const { return splitDepth; }
+
+    size_t getThreadCount() const { return threads.size(); }
+
+    // Replaces the pool with `newSize` fresh workers. Does nothing while the
+    // `searching` flag is set.
+    void resizeThreadPool(size_t newSize) {
+        if (searching)
+            return;
+
+        rebuildThreads(newSize);
+    }
+};
+
+}  // namespace Stockfish
+
+#endif  // PARALLEL_SEARCH_H_INCLUDED
diff --git a/src/simd_ops.h b/src/simd_ops.h
@@ -0,0 +1,77 @@
+#ifndef SIMD_OPS_H_INCLUDED
+#define SIMD_OPS_H_INCLUDED
+
+// The helpers below are x86-only; on any other target this header is empty so
+// that it can be included unconditionally.
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+
+#include <cstdint>
+#include <immintrin.h>
+
+namespace Stockfish {
+
+// Small helpers built on AVX and AVX2 intrinsics. Code that calls them must be
+// compiled for a target on which those intrinsics are available.
+struct SIMDHelper {
+    // All eight 32-bit lanes set to 0x80000000 (only the sign bit).
+    static inline __m256i mm256_msb_mask_32() {
+        // INT32_MIN has that bit pattern and, unlike the unsigned literal
+        // 0x80000000, needs no out-of-range conversion to int.
+        return _mm256_set1_epi32(INT32_MIN);
+    }
+
+    // Bitwise complement of all 256 bits.
+    static inline __m256i mm256_not_si256(__m256i a) {
+        return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
+    }
+
+    // Prefetches the cache line containing `addr` with the T0 hint.
+    template<typename T>
+    static inline void prefetch(T* addr) {
+        // A named cast to const char* cannot silently drop a const qualifier.
+        _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0);
+    }
+
+    // AVX2 emulation of the AVX-512 VBMI multishift operation.
+    //
+    // For each 64-bit lane, byte j of the result holds the 8 bits of the same
+    // lane of `a` that start at bit (byte j of `count`) & 63, wrapping around
+    // from bit 63 to bit 0. Note the argument order: data first, control
+    // second, whereas the native intrinsic takes the control vector first.
+    static inline __m256i mm256_multishift_epi64_epi8(__m256i a, __m256i count) {
+        const __m256i byteMask  = _mm256_set1_epi64x(0xFF);
+        const __m256i shiftMask = _mm256_set1_epi64x(63);
+        const __m256i laneBits  = _mm256_set1_epi64x(64);
+        __m256i       result    = _mm256_setzero_si256();
+
+        // Handle control bytes from the most significant down, pushing each
+        // selected byte in at the bottom of `result`; after eight rounds the
+        // byte selected by control byte j sits in byte j.
+        for (int i = 0; i < 8; ++i) {
+            const __m256i shift = _mm256_and_si256(_mm256_srli_epi64(count, 56), shiftMask);
+
+            // Rotate right by `shift`. A variable shift of 64 yields zero, so a
+            // shift of 0 leaves `a` unchanged.
+            const __m256i rotated =
+                _mm256_or_si256(_mm256_srlv_epi64(a, shift),
+                                _mm256_sllv_epi64(a, _mm256_sub_epi64(laneBits, shift)));
+
+            result = _mm256_or_si256(_mm256_slli_epi64(result, 8),
+                                     _mm256_and_si256(rotated, byteMask));
+            count  = _mm256_slli_epi64(count, 8);
+        }
+
+        return result;
+    }
+
+    // Builds a 256-bit vector with `lo` in bits 0..127 and `hi` in bits 128..255.
+    static inline __m256i mm256_merge_epi32(__m128i lo, __m128i hi) {
+        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+};
+
+}  // namespace Stockfish
+
+#endif  // x86 target
+
+#endif  // SIMD_OPS_H_INCLUDED