diff --git a/libs/common/Numerics/TensorPrimitives.IBinaryOperator.cs b/libs/common/Numerics/TensorPrimitives.IBinaryOperator.cs
new file mode 100644
index 0000000000..3c94b67b4a
--- /dev/null
+++ b/libs/common/Numerics/TensorPrimitives.IBinaryOperator.cs
@@ -0,0 +1,49 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+
+namespace Garnet.common.Numerics
+{
+ public static unsafe partial class TensorPrimitives
+ {
+ /// x & y
+ public readonly struct BitwiseAndOperator : IBinaryOperator where T : IBitwiseOperators
+ {
+ public static T Invoke(T x, T y) => x & y;
+ public static Vector128 Invoke(Vector128 x, Vector128 y) => x & y;
+ public static Vector256 Invoke(Vector256 x, Vector256 y) => x & y;
+ public static Vector512 Invoke(Vector512 x, Vector512 y) => x & y;
+ }
+
+ /// x | y
+ public readonly struct BitwiseOrOperator : IBinaryOperator where T : IBitwiseOperators
+ {
+ public static T Invoke(T x, T y) => x | y;
+ public static Vector128 Invoke(Vector128 x, Vector128 y) => x | y;
+ public static Vector256 Invoke(Vector256 x, Vector256 y) => x | y;
+ public static Vector512 Invoke(Vector512 x, Vector512 y) => x | y;
+ }
+
+ /// x ^ y
+ public readonly struct BitwiseXorOperator : IBinaryOperator where T : IBitwiseOperators
+ {
+ public static T Invoke(T x, T y) => x ^ y;
+ public static Vector128 Invoke(Vector128 x, Vector128 y) => x ^ y;
+ public static Vector256 Invoke(Vector256 x, Vector256 y) => x ^ y;
+ public static Vector512 Invoke(Vector512 x, Vector512 y) => x ^ y;
+ }
+
+ /// Operator that takes two input values and returns a single value.
+ public interface IBinaryOperator
+ {
+ static abstract T Invoke(T x, T y);
+ static abstract Vector128 Invoke(Vector128 x, Vector128 y);
+ static abstract Vector256 Invoke(Vector256 x, Vector256 y);
+ static abstract Vector512 Invoke(Vector512 x, Vector512 y);
+ }
+ }
+}
\ No newline at end of file
diff --git a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
index 6917e25993..4f6a3d8b33 100644
--- a/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
+++ b/libs/server/Resp/Bitmap/BitmapManagerBitOp.cs
@@ -2,12 +2,16 @@
// Licensed under the MIT license.
using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
using Garnet.common;
-
+using static Garnet.common.Numerics.TensorPrimitives;
namespace Garnet.server
{
+ // TODO: Guard Vector### logic behind IsSupported & IsHardwareAccelerated
+ // TODO: Add Vector512 & Vector128 paths at least
+ // TODO: Get rid of "IBinaryOperator" scalar logic?
+ // FOLLOW-UP: Non-temporal stores after sizes larger than 256KB (like in TensorPrimitives)
+ // FOLLOW-UP: Investigate alignment -> overlapping & jump-table (like in TensorPrimitives)
public unsafe partial class BitmapManager
{
///
@@ -26,16 +30,16 @@ public static bool BitOpMainUnsafeMultiKey(byte* dstPtr, int dstLen, byte** srcS
switch (bitop)
{
case (byte)BitmapOperation.NOT:
- __bitop_multikey_simdX256_not(dstPtr, dstLen, srcStartPtrs[0], srcEndPtrs[0] - srcStartPtrs[0]);
+ InvokeSingleKeyBitwiseNot(dstPtr, dstLen, srcStartPtrs[0], srcEndPtrs[0] - srcStartPtrs[0]);
break;
case (byte)BitmapOperation.AND:
- __bitop_multikey_simdX256_and(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
+ InvokeMultiKeyBitwise, BitwiseAndOperator>(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
break;
case (byte)BitmapOperation.OR:
- __bitop_multikey_simdX256_or(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
+ InvokeMultiKeyBitwise, BitwiseOrOperator>(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
break;
case (byte)BitmapOperation.XOR:
- __bitop_multikey_simdX256_xor(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
+ InvokeMultiKeyBitwise, BitwiseXorOperator>(dstPtr, dstLen, srcStartPtrs, srcEndPtrs, srcKeyCount, minSize);
break;
default:
throw new GarnetException("Unsupported BitOp command");
@@ -44,285 +48,87 @@ public static bool BitOpMainUnsafeMultiKey(byte* dstPtr, int dstLen, byte** srcS
}
///
- /// Negation bitop implementation using 256-wide SIMD registers.
+ /// Invokes unary bitwise-NOT operation for single source key using hardware accelerated SIMD intrinsics when possible.
///
/// Output buffer to write BitOp result
/// Output buffer length.
/// Pointer to source bitmap.
/// Source bitmap length.
- private static void __bitop_multikey_simdX256_not(byte* dstPtr, long dstLen, byte* srcBitmap, long srcLen)
+ private static void InvokeSingleKeyBitwiseNot(byte* dstPtr, long dstLen, byte* srcBitmap, long srcLen)
{
- int batchSize = 8 * 32;
long slen = srcLen;
- long stail = slen & (batchSize - 1);
+ long remainder = slen & ((Vector256.Count * 8) - 1);
//iterate using srcBitmap because always dstLen >= srcLen
byte* srcCurr = srcBitmap;
- byte* srcEnd = srcCurr + (slen - stail);
+ byte* srcEnd = srcCurr + (slen - remainder);
byte* dstCurr = dstPtr;
- #region 8x32
- while (srcCurr < srcEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcCurr);
- Vector256 d01 = Avx.LoadVector256(srcCurr + 32);
- Vector256 d02 = Avx.LoadVector256(srcCurr + 64);
- Vector256 d03 = Avx.LoadVector256(srcCurr + 96);
- Vector256 d04 = Avx.LoadVector256(srcCurr + 128);
- Vector256 d05 = Avx.LoadVector256(srcCurr + 160);
- Vector256 d06 = Avx.LoadVector256(srcCurr + 192);
- Vector256 d07 = Avx.LoadVector256(srcCurr + 224);
-
- Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 32, Avx2.Xor(d01, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 64, Avx2.Xor(d02, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 96, Avx2.Xor(d03, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 128, Avx2.Xor(d04, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 160, Avx2.Xor(d05, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 192, Avx2.Xor(d06, Vector256.AllBitsSet));
- Avx.Store(dstCurr + 224, Avx2.Xor(d07, Vector256.AllBitsSet));
-
- srcCurr += batchSize;
- dstCurr += batchSize;
- }
- if (stail == 0) return;
- #endregion
-
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
while (srcCurr < srcEnd)
{
- Vector256 d00 = Avx.LoadVector256(srcCurr);
- Avx.Store(dstCurr, Avx2.Xor(d00, Vector256.AllBitsSet));
- srcCurr += batchSize;
- dstCurr += batchSize;
+ var d00 = Vector256.Load(srcCurr);
+ var d01 = Vector256.Load(srcCurr + Vector256.Count);
+ var d02 = Vector256.Load(srcCurr + (Vector256.Count * 2));
+ var d03 = Vector256.Load(srcCurr + (Vector256.Count * 3));
+ var d04 = Vector256.Load(srcCurr + (Vector256.Count * 4));
+ var d05 = Vector256.Load(srcCurr + (Vector256.Count * 5));
+ var d06 = Vector256.Load(srcCurr + (Vector256.Count * 6));
+ var d07 = Vector256.Load(srcCurr + (Vector256.Count * 7));
+
+ Vector256.Store(~d00, dstCurr);
+ Vector256.Store(~d01, dstCurr + Vector256.Count);
+ Vector256.Store(~d02, dstCurr + Vector256.Count * 2);
+ Vector256.Store(~d03, dstCurr + Vector256.Count * 3);
+ Vector256.Store(~d04, dstCurr + Vector256.Count * 4);
+ Vector256.Store(~d05, dstCurr + Vector256.Count * 5);
+ Vector256.Store(~d06, dstCurr + Vector256.Count * 6);
+ Vector256.Store(~d07, dstCurr + Vector256.Count * 7);
+
+ srcCurr += Vector256.Count * 8;
+ dstCurr += Vector256.Count * 8;
}
- if (stail == 0) return;
- #endregion
-
- #region 4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
+ if (remainder == 0) return;
+
+ slen = remainder;
+ remainder = slen & (Vector256.Count - 1);
+ srcEnd = srcCurr + (slen - remainder);
while (srcCurr < srcEnd)
{
- long d00 = *(long*)(srcCurr);
- long d01 = *(long*)(srcCurr + 8);
- long d02 = *(long*)(srcCurr + 16);
- long d03 = *(long*)(srcCurr + 24);
-
- *(long*)dstCurr = ~d00;
- *(long*)(dstCurr + 8) = ~d01;
- *(long*)(dstCurr + 16) = ~d02;
- *(long*)(dstCurr + 24) = ~d03;
-
- srcCurr += batchSize;
- dstCurr += batchSize;
+ Vector256.Store(~Vector256.Load(srcCurr), dstCurr);
+
+ srcCurr += Vector256.Count;
+ dstCurr += Vector256.Count;
}
- if (stail == 0) return;
- #endregion
-
- #region 1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- srcEnd = srcCurr + (slen - stail);
+ if (remainder == 0) return;
+
+ slen = remainder;
+ remainder = slen & (sizeof(ulong) - 1);
+ srcEnd = srcCurr + (slen - remainder);
while (srcCurr < srcEnd)
{
- long d00 = *(long*)(srcCurr);
+ *(ulong*)dstCurr = ~*(ulong*)srcCurr;
- *(long*)dstCurr = ~d00;
-
- srcCurr += batchSize;
- dstCurr += batchSize;
+ srcCurr += sizeof(ulong);
+ dstCurr += sizeof(ulong);
}
- if (stail == 0) return;
- #endregion
-
- if (stail >= 7) dstCurr[6] = (byte)(~srcCurr[6]);
- if (stail >= 6) dstCurr[5] = (byte)(~srcCurr[5]);
- if (stail >= 5) dstCurr[4] = (byte)(~srcCurr[4]);
- if (stail >= 4) dstCurr[3] = (byte)(~srcCurr[3]);
- if (stail >= 3) dstCurr[2] = (byte)(~srcCurr[2]);
- if (stail >= 2) dstCurr[1] = (byte)(~srcCurr[1]);
- if (stail >= 1) dstCurr[0] = (byte)(~srcCurr[0]);
+ if (remainder == 0) return;
+
+ if (remainder >= 7) dstCurr[6] = (byte)~srcCurr[6];
+ if (remainder >= 6) dstCurr[5] = (byte)~srcCurr[5];
+ if (remainder >= 5) dstCurr[4] = (byte)~srcCurr[4];
+ if (remainder >= 4) dstCurr[3] = (byte)~srcCurr[3];
+ if (remainder >= 3) dstCurr[2] = (byte)~srcCurr[2];
+ if (remainder >= 2) dstCurr[1] = (byte)~srcCurr[1];
+ if (remainder >= 1) dstCurr[0] = (byte)~srcCurr[0];
}
- ///
- /// AND bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_and(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
+ public static void GenericCodeGenDebugAid(int dstLen, int srcKeyCount, int minSize)
{
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
-
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
-
- #region 8x32
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.And(d00, s00);
- d01 = Avx2.And(d01, s01);
- d02 = Avx2.And(d02, s02);
- d03 = Avx2.And(d03, s03);
- d04 = Avx2.And(d04, s04);
- d05 = Avx2.And(d05, s05);
- d06 = Avx2.And(d06, s06);
- d07 = Avx2.And(d07, s07);
- srcStartPtrs[i] += batchSize;
- }
-
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
-
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
-
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.And(d00, s00);
- srcStartPtrs[i] += batchSize;
- }
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 &= *(long*)(srcStartPtrs[i]);
- d01 &= *(long*)(srcStartPtrs[i] + 8);
- d02 &= *(long*)(srcStartPtrs[i] + 16);
- d03 &= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
- }
-
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 &= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
- }
- #endregion
-
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- int offset = 0;
- while (dstCurr < dstMaxEnd)
- {
- byte d00;
- if (srcStartPtrs[0] + offset < srcEndPtrs[0])
- d00 = srcStartPtrs[0][offset];
- else
- {
- d00 = 0;
- goto writeBack;
- }
-
- for (int i = 1; i < srcKeyCount; i++)
- {
- if (srcStartPtrs[i] + offset < srcEndPtrs[i])
- d00 &= srcStartPtrs[i][offset];
- else
- {
- d00 = 0;
- goto writeBack;
- }
- }
- writeBack:
- *dstCurr++ = d00;
- offset++;
- }
- #endregion
+ InvokeMultiKeyBitwise, BitwiseAndOperator>((byte*)0, dstLen, (byte**)0, (byte**)0, srcKeyCount, minSize);
}
///
- /// OR bitop implementation using 256-wide SIMD registers.
+ /// Invokes bitwise bit-operation for multiple keys using hardware accelerated SIMD intrinsics when possible.
///
/// Output buffer to write BitOp result
/// Output buffer length.
@@ -330,321 +136,149 @@ private static void __bitop_multikey_simdX256_and(byte* dstPtr, int dstLen, byte
/// Pointer to end of bitmap sources
/// Number of source keys.
/// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_or(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
+ private static void InvokeMultiKeyBitwise(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
+ where TBinaryOperator : struct, IBinaryOperator
+ where TBinaryOperator2 : struct, IBinaryOperator
{
- int batchSize = 8 * 32;
long slen = minSize;
- long stail = slen & (batchSize - 1);
+ var remainder = slen & ((Vector256.Count * 8) - 1);
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ var dstEndPtr = dstPtr + dstLen;
+ var dstBatchEndPtr = dstPtr + (slen - remainder);
- #region 8x32
- while (dstCurr < dstEnd)
+ ref var firstKeyPtr = ref srcStartPtrs[0];
+
+ while (dstPtr < dstBatchEndPtr)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
+ var d00 = Vector256.Load(firstKeyPtr);
+ var d01 = Vector256.Load(firstKeyPtr + Vector256.Count);
+ var d02 = Vector256.Load(firstKeyPtr + (Vector256.Count * 2));
+ var d03 = Vector256.Load(firstKeyPtr + (Vector256.Count * 3));
+ var d04 = Vector256.Load(firstKeyPtr + (Vector256.Count * 4));
+ var d05 = Vector256.Load(firstKeyPtr + (Vector256.Count * 5));
+ var d06 = Vector256.Load(firstKeyPtr + (Vector256.Count * 6));
+ var d07 = Vector256.Load(firstKeyPtr + (Vector256.Count * 7));
+
+ firstKeyPtr += Vector256.Count * 8;
+
+ for (var i = 1; i < srcKeyCount; i++)
{
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.Or(d00, s00);
- d01 = Avx2.Or(d01, s01);
- d02 = Avx2.Or(d02, s02);
- d03 = Avx2.Or(d03, s03);
- d04 = Avx2.Or(d04, s04);
- d05 = Avx2.Or(d05, s05);
- d06 = Avx2.Or(d06, s06);
- d07 = Avx2.Or(d07, s07);
- srcStartPtrs[i] += batchSize;
+ ref var keyStartPtr = ref srcStartPtrs[i];
+
+ var s00 = Vector256.Load(keyStartPtr);
+ var s01 = Vector256.Load(keyStartPtr + Vector256.Count);
+ var s02 = Vector256.Load(keyStartPtr + (Vector256.Count * 2));
+ var s03 = Vector256.Load(keyStartPtr + (Vector256.Count * 3));
+ var s04 = Vector256.Load(keyStartPtr + (Vector256.Count * 4));
+ var s05 = Vector256.Load(keyStartPtr + (Vector256.Count * 5));
+ var s06 = Vector256.Load(keyStartPtr + (Vector256.Count * 6));
+ var s07 = Vector256.Load(keyStartPtr + (Vector256.Count * 7));
+
+ d00 = TBinaryOperator.Invoke(d00, s00);
+ d01 = TBinaryOperator.Invoke(d01, s01);
+ d02 = TBinaryOperator.Invoke(d02, s02);
+ d03 = TBinaryOperator.Invoke(d03, s03);
+ d04 = TBinaryOperator.Invoke(d04, s04);
+ d05 = TBinaryOperator.Invoke(d05, s05);
+ d06 = TBinaryOperator.Invoke(d06, s06);
+ d07 = TBinaryOperator.Invoke(d07, s07);
+
+ keyStartPtr += Vector256.Count * 8;
}
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
+ Vector256.Store(d00, dstPtr);
+ Vector256.Store(d01, dstPtr + Vector256.Count);
+ Vector256.Store(d02, dstPtr + Vector256.Count * 2);
+ Vector256.Store(d03, dstPtr + Vector256.Count * 3);
+ Vector256.Store(d04, dstPtr + Vector256.Count * 4);
+ Vector256.Store(d05, dstPtr + Vector256.Count * 5);
+ Vector256.Store(d06, dstPtr + Vector256.Count * 6);
+ Vector256.Store(d07, dstPtr + Vector256.Count * 7);
- dstCurr += batchSize;
+ dstPtr += Vector256.Count * 8;
}
- if (stail == 0) goto fillTail;
- #endregion
+ if (remainder == 0) goto fillTail;
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
+ slen = remainder;
+ remainder = slen & (Vector256.Count - 1);
+ dstBatchEndPtr = dstPtr + (slen - remainder);
- while (dstCurr < dstEnd)
+ while (dstPtr < dstBatchEndPtr)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.Or(d00, s00);
- srcStartPtrs[i] += batchSize;
- }
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 |= *(long*)(srcStartPtrs[i]);
- d01 |= *(long*)(srcStartPtrs[i] + 8);
- d02 |= *(long*)(srcStartPtrs[i] + 16);
- d03 |= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
- }
+ var d00 = Vector256.Load(firstKeyPtr);
+ firstKeyPtr += Vector256.Count;
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
+ for (var i = 1; i < srcKeyCount; i++)
{
- d00 |= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
- }
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
- }
- #endregion
-
- fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- int offset = 0;
- while (dstCurr < dstMaxEnd)
- {
- byte d00 = 0;
- if (srcStartPtrs[0] + offset < srcEndPtrs[0])
- {
- d00 = srcStartPtrs[0][offset];
- if (d00 == 0xff) goto writeBack;
- }
-
- for (int i = 1; i < srcKeyCount; i++)
- {
- if (srcStartPtrs[i] + offset < srcEndPtrs[i])
- {
- d00 |= srcStartPtrs[i][offset];
- if (d00 == 0xff) goto writeBack;
- }
- }
- writeBack:
- *dstCurr++ = d00;
- offset++;
- }
- #endregion
- }
-
- ///
- /// XOR bitop implementation using 256-wide SIMD registers.
- ///
- /// Output buffer to write BitOp result
- /// Output buffer length.
- /// Pointer to start of bitmap sources.
- /// Pointer to end of bitmap sources
- /// Number of source keys.
- /// Minimum size of source bitmaps.
- private static void __bitop_multikey_simdX256_xor(byte* dstPtr, int dstLen, byte** srcStartPtrs, byte** srcEndPtrs, int srcKeyCount, int minSize)
- {
- int batchSize = 8 * 32;
- long slen = minSize;
- long stail = slen & (batchSize - 1);
-
- byte* dstCurr = dstPtr;
- byte* dstEnd = dstCurr + (slen - stail);
+ var s00 = Vector256.Load(srcStartPtrs[i]);
+ d00 = TBinaryOperator.Invoke(d00, s00);
- #region 8x32
- while (dstCurr < dstEnd)
- {
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- Vector256 d01 = Avx.LoadVector256(srcStartPtrs[0] + 32);
- Vector256 d02 = Avx.LoadVector256(srcStartPtrs[0] + 64);
- Vector256 d03 = Avx.LoadVector256(srcStartPtrs[0] + 96);
- Vector256 d04 = Avx.LoadVector256(srcStartPtrs[0] + 128);
- Vector256 d05 = Avx.LoadVector256(srcStartPtrs[0] + 160);
- Vector256 d06 = Avx.LoadVector256(srcStartPtrs[0] + 192);
- Vector256 d07 = Avx.LoadVector256(srcStartPtrs[0] + 224);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- Vector256 s01 = Avx.LoadVector256(srcStartPtrs[i] + 32);
- Vector256 s02 = Avx.LoadVector256(srcStartPtrs[i] + 64);
- Vector256 s03 = Avx.LoadVector256(srcStartPtrs[i] + 96);
- Vector256 s04 = Avx.LoadVector256(srcStartPtrs[i] + 128);
- Vector256 s05 = Avx.LoadVector256(srcStartPtrs[i] + 160);
- Vector256 s06 = Avx.LoadVector256(srcStartPtrs[i] + 192);
- Vector256 s07 = Avx.LoadVector256(srcStartPtrs[i] + 224);
-
- d00 = Avx2.Xor(d00, s00);
- d01 = Avx2.Xor(d01, s01);
- d02 = Avx2.Xor(d02, s02);
- d03 = Avx2.Xor(d03, s03);
- d04 = Avx2.Xor(d04, s04);
- d05 = Avx2.Xor(d05, s05);
- d06 = Avx2.Xor(d06, s06);
- d07 = Avx2.Xor(d07, s07);
- srcStartPtrs[i] += batchSize;
+ srcStartPtrs[i] += Vector256.Count;
}
- Avx.Store(dstCurr, d00);
- Avx.Store(dstCurr + 32, d01);
- Avx.Store(dstCurr + 64, d02);
- Avx.Store(dstCurr + 96, d03);
- Avx.Store(dstCurr + 128, d04);
- Avx.Store(dstCurr + 160, d05);
- Avx.Store(dstCurr + 192, d06);
- Avx.Store(dstCurr + 224, d07);
+ Vector256.Store(d00, dstPtr);
- dstCurr += batchSize;
+ dstPtr += Vector256.Count;
}
- #endregion
+ if (remainder == 0) goto fillTail;
- #region 1x32
- slen = stail;
- batchSize = 1 * 32;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
+ slen = remainder;
+ remainder = slen & (sizeof(ulong) - 1);
+ dstBatchEndPtr = dstPtr + (slen - remainder);
- while (dstCurr < dstEnd)
+ while (dstPtr < dstBatchEndPtr)
{
- Vector256 d00 = Avx.LoadVector256(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- Vector256 s00 = Avx.LoadVector256(srcStartPtrs[i]);
- d00 = Avx2.Xor(d00, s00);
- srcStartPtrs[i] += batchSize;
- }
- Avx.Store(dstCurr, d00);
- dstCurr += batchSize;
- }
- #endregion
-
- #region scalar_4x8
- slen = stail;
- batchSize = 4 * 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- long d01 = *(long*)(srcStartPtrs[0] + 8);
- long d02 = *(long*)(srcStartPtrs[0] + 16);
- long d03 = *(long*)(srcStartPtrs[0] + 24);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
- {
- d00 ^= *(long*)(srcStartPtrs[i]);
- d01 ^= *(long*)(srcStartPtrs[i] + 8);
- d02 ^= *(long*)(srcStartPtrs[i] + 16);
- d03 ^= *(long*)(srcStartPtrs[i] + 24);
- srcStartPtrs[i] += batchSize;
- }
+ ulong d00 = *(ulong*)firstKeyPtr;
+ firstKeyPtr += sizeof(ulong);
- *(long*)dstCurr = d00;
- *(long*)(dstCurr + 8) = d01;
- *(long*)(dstCurr + 16) = d02;
- *(long*)(dstCurr + 24) = d03;
- dstCurr += batchSize;
- }
- if (stail == 0) goto fillTail;
- #endregion
-
- #region scalar_1x8
- slen = stail;
- batchSize = 8;
- stail = slen & (batchSize - 1);
- dstEnd = dstCurr + (slen - stail);
- while (dstCurr < dstEnd)
- {
- long d00 = *(long*)(srcStartPtrs[0]);
- srcStartPtrs[0] += batchSize;
- for (int i = 1; i < srcKeyCount; i++)
+ for (var i = 1; i < srcKeyCount; i++)
{
- d00 ^= *(long*)(srcStartPtrs[i]);
- srcStartPtrs[i] += batchSize;
+ d00 = TBinaryOperator2.Invoke(d00, *(ulong*)srcStartPtrs[i]);
+ srcStartPtrs[i] += sizeof(ulong);
}
- *(long*)dstCurr = d00;
- dstCurr += batchSize;
+
+ *(ulong*)dstPtr = d00;
+ dstPtr += sizeof(ulong);
}
- #endregion
fillTail:
- #region scalar_1x1
- byte* dstMaxEnd = dstPtr + dstLen;
- while (dstCurr < dstMaxEnd)
+ while (dstPtr < dstEndPtr)
{
byte d00 = 0;
- if (srcStartPtrs[0] < srcEndPtrs[0])
+
+ if (firstKeyPtr < srcEndPtrs[0])
{
- d00 = *srcStartPtrs[0];
- srcStartPtrs[0]++;
+ d00 = *firstKeyPtr;
+ firstKeyPtr++;
}
- for (int i = 1; i < srcKeyCount; i++)
+ for (var i = 1; i < srcKeyCount; i++)
{
if (srcStartPtrs[i] < srcEndPtrs[i])
{
- d00 ^= *srcStartPtrs[i];
+ d00 = TBinaryOperator.Invoke(d00, *srcStartPtrs[i]);
srcStartPtrs[i]++;
}
+ else
+ {
+ if (typeof(TBinaryOperator) == typeof(BitwiseAndOperator))
+ {
+ d00 = 0;
+ }
+ else if (typeof(TBinaryOperator) == typeof(BitwiseOrOperator))
+ {
+ // nop
+ }
+ else if (typeof(TBinaryOperator) == typeof(BitwiseXorOperator))
+ {
+ // TODO: I _think_ there's an error in this logic and it should instead be:
+ // d00 ^= 0;
+ }
+ }
}
- *dstCurr++ = d00;
+
+ *dstPtr++ = d00;
}
- #endregion
}
-
}
}
\ No newline at end of file
diff --git a/test/Garnet.test/GarnetBitmapTests.cs b/test/Garnet.test/GarnetBitmapTests.cs
index 627a4984d4..11a2327417 100644
--- a/test/Garnet.test/GarnetBitmapTests.cs
+++ b/test/Garnet.test/GarnetBitmapTests.cs
@@ -32,14 +32,9 @@ public void TearDown()
TestUtils.DeleteDirectory(TestUtils.MethodTestDir);
}
- private long LongRandom() => ((long)this.r.Next() << 32) | (long)this.r.Next();
+ private long LongRandom() => r.NextInt64();
- private ulong ULongRandom()
- {
- ulong lsb = (ulong)(this.r.Next());
- ulong msb = (ulong)(this.r.Next()) << 32;
- return (msb | lsb);
- }
+ private ulong ULongRandom() => (ulong)r.NextInt64(long.MinValue, long.MaxValue);
private unsafe long ResponseToLong(byte[] response, int offset)
{
@@ -879,16 +874,18 @@ public void BitmapSimpleBitOpTests()
}
}
- private static void InitBitmap(ref byte[] dst, byte[] srcA, bool invert = false)
+ private static byte[] CopyBitmap(byte[] sourceBitmap, bool invert = false)
{
- dst = new byte[srcA.Length];
+ var dst = new byte[sourceBitmap.Length];
if (invert)
- for (int i = 0; i < srcA.Length; i++) dst[i] = (byte)~srcA[i];
+ for (int i = 0; i < sourceBitmap.Length; i++) dst[i] = (byte)~sourceBitmap[i];
else
- for (int i = 0; i < srcA.Length; i++) dst[i] = srcA[i];
+ sourceBitmap.AsSpan().CopyTo(dst);
+
+ return dst;
}
- private static void ApplyBitop(ref byte[] dst, byte[] srcA, Func f8)
+ private static void ApplyBitop(ref byte[] dst, byte[] srcA, Func op)
{
if (dst.Length < srcA.Length)
{
@@ -899,12 +896,12 @@ private static void ApplyBitop(ref byte[] dst, byte[] srcA, Func f8 = null;
- switch (bitwiseOps[j])
- {
- case Bitwise.And:
- f8 = (a, b) => (byte)(a & b);
- break;
- case Bitwise.Or:
- f8 = (a, b) => (byte)(a | b);
- break;
- case Bitwise.Xor:
- f8 = (a, b) => (byte)(a ^ b);
- break;
- }
+ Func op = bitwiseOps[j] switch
+ {
+ Bitwise.And => static (a, b) => (byte)(a & b),
+ Bitwise.Or => static (a, b) => (byte)(a | b),
+ Bitwise.Xor => static (a, b) => (byte)(a ^ b)
+ };
- dataX = null;
- InitBitmap(ref dataX, dataA);
- ApplyBitop(ref dataX, dataB, f8);
- ApplyBitop(ref dataX, dataC, f8);
- ApplyBitop(ref dataX, dataD, f8);
+ byte[] dataX = CopyBitmap(dataA);
+ ApplyBitop(ref dataX, dataB, op);
+ ApplyBitop(ref dataX, dataC, op);
+ ApplyBitop(ref dataX, dataD, op);
long size = db.StringBitOperation(bitwiseOps[j], x, keys);
ClassicAssert.AreEqual(size, dataX.Length);
@@ -1032,7 +1020,7 @@ public void BitmapSimpleBitOpVarLenGrowingSizeTests()
string x = "x";
byte[] dataA, dataB, dataC, dataD;
- byte[] dataX;
+
int minSize = 512;
Bitwise[] bitwiseOps = [Bitwise.And, Bitwise.Or, Bitwise.Xor, Bitwise.And, Bitwise.Or, Bitwise.Xor];
RedisKey[] keys = [a, b, c, d];
@@ -1042,15 +1030,14 @@ public void BitmapSimpleBitOpVarLenGrowingSizeTests()
{
dataA = new byte[r.Next(minSize, minSize + 32)];
r.NextBytes(dataA);
- db.StringSet(a, dataA);
+ byte[] expectedX = CopyBitmap(dataA, invert: true);
- dataX = null;
- InitBitmap(ref dataX, dataA, true);
+ db.StringSet(a, dataA);
long size = db.StringBitOperation(Bitwise.Not, x, a);
- ClassicAssert.AreEqual(size, dataX.Length);
+ ClassicAssert.AreEqual(expectedX.Length, size);
- byte[] expectedX = db.StringGet(x);
- ClassicAssert.AreEqual(dataX, expectedX);
+ byte[] actualX = db.StringGet(x);
+ ClassicAssert.AreEqual(expectedX, actualX);
}
//Test AND, OR, XOR
@@ -1062,8 +1049,7 @@ public void BitmapSimpleBitOpVarLenGrowingSizeTests()
dataB = new byte[r.Next(minSize, minSize + 16)]; minSize = dataB.Length;
dataC = new byte[r.Next(minSize, minSize + 16)]; minSize = dataC.Length;
dataD = new byte[r.Next(minSize, minSize + 16)]; minSize = dataD.Length;
- minSize = 17;
-
+
r.NextBytes(dataA);
r.NextBytes(dataB);
r.NextBytes(dataC);
@@ -1074,32 +1060,24 @@ public void BitmapSimpleBitOpVarLenGrowingSizeTests()
db.StringSet(c, dataC);
db.StringSet(d, dataD);
- Func f8 = null;
- switch (bitwiseOps[j])
+ Func op = bitwiseOps[j] switch
{
- case Bitwise.And:
- f8 = (a, b) => (byte)(a & b);
- break;
- case Bitwise.Or:
- f8 = (a, b) => (byte)(a | b);
- break;
- case Bitwise.Xor:
- f8 = (a, b) => (byte)(a ^ b);
- break;
- }
+ Bitwise.And => static (a, b) => (byte)(a & b),
+ Bitwise.Or => static (a, b) => (byte)(a | b),
+ Bitwise.Xor => static (a, b) => (byte)(a ^ b)
+ };
- dataX = null;
- InitBitmap(ref dataX, dataA);
- ApplyBitop(ref dataX, dataB, f8);
- ApplyBitop(ref dataX, dataC, f8);
- ApplyBitop(ref dataX, dataD, f8);
+ byte[] expectedX = CopyBitmap(dataA);
+ ApplyBitop(ref expectedX, dataB, op);
+ ApplyBitop(ref expectedX, dataC, op);
+ ApplyBitop(ref expectedX, dataD, op);
long size = db.StringBitOperation(bitwiseOps[j], x, keys);
- ClassicAssert.AreEqual(size, dataX.Length);
- byte[] expectedX = db.StringGet(x);
+ ClassicAssert.AreEqual(expectedX.Length, size);
+ byte[] dataX = db.StringGet(x);
- ClassicAssert.AreEqual(expectedX.Length, dataX.Length);
- ClassicAssert.AreEqual(dataX, expectedX);
+ ClassicAssert.AreEqual(expectedX.Length, expectedX.Length);
+ ClassicAssert.AreEqual(expectedX, dataX);
}
}
}