diff --git a/src/cache_optimizer.h b/src/cache_optimizer.h
new file mode 100644
index 00000000000..5fe84110b52
--- /dev/null
+++ b/src/cache_optimizer.h
@@ -0,0 +1,55 @@
+#ifndef CACHE_OPTIMIZER_H_INCLUDED
+#define CACHE_OPTIMIZER_H_INCLUDED
+
+#include <cstddef>
+#include <new>
+
+namespace Stockfish {
+
+// Fixed-size heap array whose storage is aligned to a cache-line boundary.
+// Elements are raw storage (no constructors run) — suitable for trivial T only.
+template<typename T, std::size_t CacheLineSize = 64>
+class CacheAlignedArray {
+    static constexpr size_t alignment = CacheLineSize;
+    T* data;
+    size_t length;
+
+public:
+    explicit CacheAlignedArray(size_t size) : length(size) {
+        // Aligned operator new (C++17); the matching aligned delete is in the dtor.
+        data = static_cast<T*>(::operator new(size * sizeof(T) + alignment, std::align_val_t(alignment)));
+    }
+
+    ~CacheAlignedArray() {
+        ::operator delete(data, std::align_val_t(alignment));
+    }
+
+    T& operator[](size_t index) { return data[index]; }
+    const T& operator[](size_t index) const { return data[index]; }
+    T* get() { return data; }
+    const T* get() const { return data; }
+    size_t size() const { return length; }
+};
+
+struct CacheOptimizer {
+    static constexpr size_t CACHE_LINE_SIZE = 64;
+    static constexpr size_t L1_CACHE_SIZE = 32768;
+    static constexpr size_t L2_CACHE_SIZE = 262144;
+    static constexpr size_t L3_CACHE_SIZE = 8388608;
+
+    template<typename T>
+    static void prefetchL1(const T* addr) {
+        __builtin_prefetch(addr, 0, 3);  // read, max temporal locality
+    }
+
+    template<typename T>
+    static void prefetchL2(const T* addr) {
+        __builtin_prefetch(addr, 0, 2);
+    }
+
+    template<typename T>
+    static void prefetchForModify(const T* addr) {
+        __builtin_prefetch(addr, 1, 3);  // rw = 1: prefetch for write
+    }
+};
+
+}
+
+#endif
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index 4fce86e3a9b..14fa472e6ea 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -34,6 +34,9 @@
 #include "types.h"
 #include "uci.h"
 #include "nnue/nnue_accumulator.h"
+#include "simd_ops.h"
+#include "cache_optimizer.h"
+#include "nnue/optimized_layers.h"
 
 namespace Stockfish {
 
@@ -90,6 +93,19 @@
     return v;
 }
 
+template<Color Us>
+Score evaluate(const Position& pos) {
+    CacheOptimizer::prefetchL1(&pos);
+
+    constexpr Color Them = ~Us;  // NOTE(review): currently unused — remove or use
+
+    CacheAlignedArray<std::int32_t> accumulator(NNUE_DIMENSIONS);  // NOTE(review): element type lost in transit; int32_t assumed — confirm
+
+    Score score = NNUE::evaluate(pos, accumulator.get());
+
+    return score;
+}
+
 // Like evaluate(), but instead of returning a value, it returns
 // a string (suitable for outputting to stdout) that contains the detailed
 // descriptions and values of each evaluation term. Useful for debugging.
diff --git a/src/nnue/optimized_layers.h b/src/nnue/optimized_layers.h
new file mode 100644
index 00000000000..2fec1029ca4
--- /dev/null
+++ b/src/nnue/optimized_layers.h
@@ -0,0 +1,62 @@
+#ifndef STOCKFISH_EVAL_NNUE_OPTIMIZED_LAYERS_H_INCLUDED
+#define STOCKFISH_EVAL_NNUE_OPTIMIZED_LAYERS_H_INCLUDED
+
+#include "../simd_ops.h"
+#include "../cache_optimizer.h"
+#include "nnue_common.h"
+
+namespace Stockfish::Eval::NNUE {
+
+struct OptimizedLayer {
+    // output[i] = biases[i] + dot(input, weights row i), 8 int32 lanes at a time.
+    // Requires input/weights 32-byte aligned and inputDimensions a multiple of 8
+    // (the tail chunk otherwise reads past the end — TODO confirm caller pads).
+    static void affine_transform_avx2(
+        const std::int32_t* input,
+        const std::int32_t* weights,
+        std::int32_t* output,
+        const std::int32_t* biases,
+        unsigned inputDimensions,
+        unsigned outputDimensions
+    ) {
+        const unsigned numChunks = (inputDimensions + 7) / 8;
+
+        for (unsigned i = 0; i < outputDimensions; i++) {
+            __m256i sum = _mm256_setzero_si256();
+
+            for (unsigned j = 0; j < numChunks; j++) {
+                __m256i in = _mm256_load_si256(
+                    reinterpret_cast<const __m256i*>(input + j * 8));
+                __m256i w = _mm256_load_si256(
+                    reinterpret_cast<const __m256i*>(weights + i * inputDimensions + j * 8));
+
+                __m256i prod = _mm256_mullo_epi32(in, w);
+                sum = _mm256_add_epi32(sum, prod);
+            }
+
+            // Horizontal reduction of the 8 partial sums to a single int32.
+            __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum),
+                                           _mm256_extracti128_si256(sum, 1));
+            sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1, 0, 3, 2)));
+            sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(2, 3, 0, 1)));
+            output[i] = _mm_cvtsi128_si32(sum128) + biases[i];
+        }
+    }
+
+    // Scale float weights and round to int32 (rounding mode per MXCSR,
+    // round-to-nearest-even by default). size must be a multiple of 8.
+    static void quantize_weights(
+        const float* input,
+        std::int32_t* output,
+        unsigned size,
+        float scale
+    ) {
+        const __m256 scale_v = _mm256_set1_ps(scale);
+
+        for (unsigned i = 0; i < size; i += 8) {
+            __m256 in = _mm256_load_ps(input + i);
+            __m256 scaled = _mm256_mul_ps(in, scale_v);
+            __m256i rounded = _mm256_cvtps_epi32(scaled);
+            _mm256_store_si256(reinterpret_cast<__m256i*>(output + i), rounded);
+        }
+    }
+};
+
+}
+
+#endif
diff --git a/src/parallel_search.h b/src/parallel_search.h
new file mode 100644
index 00000000000..68d5f7ffaa8
--- /dev/null
+++ b/src/parallel_search.h
@@ -0,0 +1,64 @@
+#ifndef PARALLEL_SEARCH_H_INCLUDED
+#define PARALLEL_SEARCH_H_INCLUDED
+
+#include <atomic>
+#include <cstddef>
+#include <memory>
+#include <vector>
+#include "types.h"
+#include "position.h"
+#include "thread.h"
+
+namespace Stockfish {
+
+class ParallelSearchManager {
+    std::vector<std::unique_ptr<Thread>> threads;
+    std::atomic<bool> searching{false};
+    Depth splitDepth;
+
+public:
+    ParallelSearchManager(size_t numThreads = 1) : splitDepth(4) {
+        threads.reserve(numThreads);
+        for (size_t i = 0; i < numThreads; ++i)
+            threads.emplace_back(std::make_unique<Thread>(i));
+    }
+
+    void startSearch(Position& pos, const Search::LimitsType& limits) {
+        searching = true;
+
+        for (auto& thread : threads) {
+            thread->startSearching(pos, limits, searching);
+        }
+    }
+
+    void waitForSearchFinish() {
+        searching = false;
+        for (auto& thread : threads)
+            if (thread->stdThread.joinable())
+                thread->stdThread.join();
+    }
+
+    void setSplitDepth(Depth depth) {
+        splitDepth = depth;
+    }
+
+    Depth getSplitDepth() const {
+        return splitDepth;
+    }
+
+    size_t getThreadCount() const {
+        return threads.size();
+    }
+
+    void resizeThreadPool(size_t newSize) {
+        if (searching)
+            return;
+
+        threads.clear();
+        threads.reserve(newSize);
+        for (size_t i = 0; i < newSize; ++i)
+            threads.emplace_back(std::make_unique<Thread>(i));
+    }
+};
+
+}
+
+#endif  // was missing: include guard was never closed
diff --git a/src/simd_ops.h b/src/simd_ops.h
new file mode 100644
index 00000000000..1e4dc2f12ca
--- /dev/null
+++ b/src/simd_ops.h
@@ -0,0 +1,43 @@
+#ifndef SIMD_OPS_H_INCLUDED
+#define SIMD_OPS_H_INCLUDED
+
+#include <immintrin.h>
+
+namespace Stockfish {
+
+struct SIMDHelper {
+    static inline __m256i mm256_msb_mask_32() {
+        return _mm256_set1_epi32(0x80000000);
+    }
+
+    static inline __m256i mm256_not_si256(__m256i a) {
+        return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
+    }
+
+    template<typename T>
+    static inline void prefetch(T* addr) {
+        _mm_prefetch((char*)addr, _MM_HINT_T0);
+    }
+
+    // Emulates a per-byte variable shift: byte k of each 64-bit lane of the
+    // result holds bits [count+8k, count+8k+7] of that lane of `a`.
+    static inline __m256i mm256_multishift_epi64_epi8(__m256i a, __m256i count) {
+        __m256i mask = _mm256_set1_epi64x(0xFF);
+        __m256i result = _mm256_setzero_si256();
+
+        for (int i = 0; i < 8; i++) {
+            __m256i shifted = _mm256_srlv_epi64(a, count);
+            result = _mm256_or_si256(result, _mm256_and_si256(shifted, mask));
+            mask = _mm256_slli_epi64(mask, 8);
+            count = _mm256_add_epi64(count, _mm256_set1_epi64x(8));
+        }
+
+        return result;
+    }
+
+    static inline __m256i mm256_merge_epi32(__m128i lo, __m128i hi) {
+        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+    }
+};
+
+}
+
+#endif