Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SIMD optimizations and cache management improvements #5818

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions src/cache_optimizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#ifndef CACHE_OPTIMIZER_H_INCLUDED
#define CACHE_OPTIMIZER_H_INCLUDED

#include <cstddef>
#include <new>

namespace Stockfish {

// Fixed-length heap buffer of T whose storage starts on a CacheLineSize-byte
// boundary.
//
// The storage is raw: no T constructors or destructors are run, so elements
// are uninitialised until written and T should be a trivial type.
//
// The buffer is move-only. It owns a raw allocation, so an implicit copy
// would make two objects release the same pointer in their destructors.
template<typename T, size_t CacheLineSize = 64>
class CacheAlignedArray {
    static constexpr size_t alignment = CacheLineSize;
    T*                      data;
    size_t                  length;

    // Returns the storage to the aligned deallocation function. Safe to call
    // on a moved-from object, whose pointer is null.
    void release() noexcept { ::operator delete(data, std::align_val_t(alignment)); }

   public:
    // Allocates room for `size` elements plus `alignment` trailing bytes.
    // NOTE(review): the trailing bytes are kept from the original allocation
    // size; presumably slack for wide loads near the end - confirm.
    explicit CacheAlignedArray(size_t size) :
        data(static_cast<T*>(
          ::operator new(size * sizeof(T) + alignment, std::align_val_t(alignment)))),
        length(size) {}

    ~CacheAlignedArray() { release(); }

    // Copying would alias the owned allocation and double-free it.
    CacheAlignedArray(const CacheAlignedArray&)            = delete;
    CacheAlignedArray& operator=(const CacheAlignedArray&) = delete;

    // Moving transfers ownership and leaves `other` empty (null, size 0).
    CacheAlignedArray(CacheAlignedArray&& other) noexcept :
        data(other.data),
        length(other.length) {
        other.data   = nullptr;
        other.length = 0;
    }

    CacheAlignedArray& operator=(CacheAlignedArray&& other) noexcept {
        if (this != &other)
        {
            release();
            data         = other.data;
            length       = other.length;
            other.data   = nullptr;
            other.length = 0;
        }
        return *this;
    }

    // Unchecked element access.
    T&       operator[](size_t index) { return data[index]; }
    const T& operator[](size_t index) const { return data[index]; }

    // Pointer to the first element (null after the object was moved from).
    T*       get() { return data; }
    const T* get() const { return data; }

    // Number of elements requested at construction.
    size_t size() const { return length; }
};

// Cache-size constants and thin wrappers around the compiler's prefetch
// builtin. The prefetch helpers only issue a hint; they never dereference
// the pointer in C++ code.
struct CacheOptimizer {
    // Sizes in bytes.
    static constexpr size_t CACHE_LINE_SIZE = 64;
    static constexpr size_t L1_CACHE_SIZE   = 32 * 1024;
    static constexpr size_t L2_CACHE_SIZE   = 256 * 1024;
    static constexpr size_t L3_CACHE_SIZE   = 8 * 1024 * 1024;

    // Read prefetch with the highest temporal-locality hint.
    template<typename T>
    static void prefetchL1(const T* addr) {
        __builtin_prefetch(addr, ReadAccess, LocalityHigh);
    }

    // Read prefetch with a moderate temporal-locality hint.
    template<typename T>
    static void prefetchL2(const T* addr) {
        __builtin_prefetch(addr, ReadAccess, LocalityModerate);
    }

    // Prefetch in anticipation of a write, highest temporal-locality hint.
    template<typename T>
    static void prefetchForModify(const T* addr) {
        __builtin_prefetch(addr, WriteAccess, LocalityHigh);
    }

   private:
    // Second argument of __builtin_prefetch: 0 = read, 1 = write.
    static constexpr int ReadAccess  = 0;
    static constexpr int WriteAccess = 1;

    // Third argument of __builtin_prefetch: temporal locality, 0 (none) to 3 (high).
    static constexpr int LocalityModerate = 2;
    static constexpr int LocalityHigh     = 3;
};

}

#endif
16 changes: 16 additions & 0 deletions src/evaluate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@
#include "types.h"
#include "uci.h"
#include "nnue/nnue_accumulator.h"
#include "simd_ops.h"
#include "cache_optimizer.h"
#include "nnue/optimized_layers.h"

namespace Stockfish {

Expand Down Expand Up @@ -90,6 +93,19 @@ Value Eval::evaluate(const Eval::NNUE::Networks& networks,
return v;
}

// Evaluates `pos` by handing it, together with a freshly allocated
// accumulator buffer of NNUE_DIMENSIONS int32 entries, to NNUE::evaluate()
// and returning that function's result unchanged.
//
// `Us` is not referenced in the body; it only selects the instantiation.
template<Color Us>
Score evaluate(const Position& pos) {
    // Hint the position object into cache before it is read.
    CacheOptimizer::prefetchL1(&pos);

    // NOTE(review): this allocates and frees heap memory on every call; if
    // this sits on the search hot path a reusable buffer would be cheaper -
    // confirm with the caller.
    CacheAlignedArray<int32_t> accumulator(NNUE_DIMENSIONS);

    return NNUE::evaluate(pos, accumulator.get());
}

// Like evaluate(), but instead of returning a value, it returns
// a string (suitable for outputting to stdout) that contains the detailed
// descriptions and values of each evaluation term. Useful for debugging.
Expand Down
62 changes: 62 additions & 0 deletions src/nnue/optimized_layers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#ifndef STOCKFISH_EVAL_NNUE_OPTIMIZED_LAYERS_H_INCLUDED
#define STOCKFISH_EVAL_NNUE_OPTIMIZED_LAYERS_H_INCLUDED

#include "../simd_ops.h"
#include "../cache_optimizer.h"
#include "nnue_common.h"

namespace Stockfish::Eval::NNUE {

struct OptimizedLayer {
static void affine_transform_avx2(
const std::int32_t* input,
const std::int32_t* weights,
std::int32_t* output,
const std::int32_t* biases,
unsigned inputDimensions,
unsigned outputDimensions
) {
const unsigned numChunks = (inputDimensions + 7) / 8;

for (unsigned i = 0; i < outputDimensions; i++) {
__m256i sum = _mm256_setzero_si256();

for (unsigned j = 0; j < numChunks; j++) {
__m256i in = _mm256_load_si256(
reinterpret_cast<const __m256i*>(input + j * 8));
__m256i w = _mm256_load_si256(
reinterpret_cast<const __m256i*>(weights + i * inputDimensions + j * 8));

__m256i prod = _mm256_mullo_epi32(in, w);
sum = _mm256_add_epi32(sum, prod);
}


__m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum),
_mm256_extracti128_si256(sum, 1));
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1, 0, 3, 2)));
sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(2, 3, 0, 1)));
output[i] = _mm_cvtsi128_si32(sum128) + biases[i];
}
}

static void quantize_weights(
const float* input,
std::int32_t* output,
unsigned size,
float scale
) {
const __m256 scale_v = _mm256_set1_ps(scale);

for (unsigned i = 0; i < size; i += 8) {
__m256 in = _mm256_load_ps(input + i);
__m256 scaled = _mm256_mul_ps(in, scale_v);
__m256i rounded = _mm256_cvtps_epi32(scaled);
_mm256_store_si256(reinterpret_cast<__m256i*>(output + i), rounded);
}
}
};

}

#endif
64 changes: 64 additions & 0 deletions src/parallel_search.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#ifndef PARALLEL_SEARCH_H_INCLUDED
#define PARALLEL_SEARCH_H_INCLUDED

#include <atomic>
#include <memory>
#include <thread>
#include <vector>
#include "types.h"
#include "position.h"
#include "thread.h"

namespace Stockfish {

// Owns a pool of Thread objects and a shared "searching" flag that is handed
// to every thread when a search is started.
class ParallelSearchManager {
    std::vector<std::unique_ptr<Thread>> threads;
    std::atomic<bool>                    searching{false};
    Depth                                splitDepth;

    // Drops every Thread currently in the pool and creates `count` new ones,
    // constructing them with the indices 0 .. count-1.
    void rebuildPool(size_t count) {
        threads.clear();
        threads.reserve(count);
        for (size_t idx = 0; idx != count; ++idx)
            threads.emplace_back(std::make_unique<Thread>(idx));
    }

   public:
    // Creates `numThreads` threads; the split depth starts at 4.
    ParallelSearchManager(size_t numThreads = 1) :
        splitDepth(4) {
        rebuildPool(numThreads);
    }

    // Raises the searching flag, then asks each thread in pool order to start
    // searching `pos` under `limits`, passing the flag along.
    void startSearch(Position& pos, const Search::LimitsType& limits) {
        searching.store(true);

        for (const auto& worker : threads)
            worker->startSearching(pos, limits, searching);
    }

    // Clears the searching flag first and then joins every joinable thread.
    // NOTE(review): presumably the workers poll the flag and stop once it is
    // cleared, which would make this a "stop and wait" - confirm in Thread.
    void waitForSearchFinish() {
        searching.store(false);

        for (const auto& worker : threads)
        {
            if (!worker->stdThread.joinable())
                continue;
            worker->stdThread.join();
        }
    }

    void setSplitDepth(Depth depth) { splitDepth = depth; }

    Depth getSplitDepth() const { return splitDepth; }

    size_t getThreadCount() const { return threads.size(); }

    // Replaces the pool with `newSize` fresh threads. Does nothing while the
    // searching flag is set.
    void resizeThreadPool(size_t newSize) {
        if (!searching.load())
            rebuildPool(newSize);
    }
};

}
43 changes: 43 additions & 0 deletions src/simd_ops.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#ifndef SIMD_OPS_H_INCLUDED
#define SIMD_OPS_H_INCLUDED

#include <immintrin.h>

namespace Stockfish {

struct SIMDHelper {
static inline __m256i mm256_msb_mask_32() {
return _mm256_set1_epi32(0x80000000);
}

static inline __m256i mm256_not_si256(__m256i a) {
return _mm256_xor_si256(a, _mm256_set1_epi32(-1));
}

template<typename T>
static inline void prefetch(T* addr) {
_mm_prefetch((char*)addr, _MM_HINT_T0);
}

static inline __m256i mm256_multishift_epi64_epi8(__m256i a, __m256i count) {
__m256i mask = _mm256_set1_epi64x(0xFF);
__m256i result = _mm256_setzero_si256();

for (int i = 0; i < 8; i++) {
__m256i shifted = _mm256_srlv_epi64(a, count);
result = _mm256_or_si256(result, _mm256_and_si256(shifted, mask));
mask = _mm256_slli_epi64(mask, 8);
count = _mm256_add_epi64(count, _mm256_set1_epi64x(8));
}

return result;
}

static inline __m256i mm256_merge_epi32(__m128i lo, __m128i hi) {
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
};

}

#endif
Loading