From 47b7a99f03aa6758308449e0a60b54a67111fc3c Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Mon, 24 Jan 2022 10:27:14 +0000
Subject: [PATCH 1/8] add benchmark for vikunja transform

- setup infrastructure for catch2 benchmark
---
 CMakeLists.txt                                |   1 +
 test/CMakeLists.txt                           |   3 +
 test/benchmarks/CMakeLists.txt                |  19 +++
 test/benchmarks/helper/CMakeLists.txt         |  25 ++++
 test/benchmarks/helper/test_bench_helper.cpp  |  90 +++++++++++++
 .../include/vikunja/bench/memory.hpp          | 120 ++++++++++++++++++
 test/benchmarks/transform/CMakeLists.txt      |  25 ++++
 .../transform/bench_vikunja_transform.cpp     | 118 +++++++++++++++++
 test/include/vikunja/test/utility.hpp         |   7 +
 9 files changed, 408 insertions(+)
 create mode 100644 test/benchmarks/CMakeLists.txt
 create mode 100644 test/benchmarks/helper/CMakeLists.txt
 create mode 100644 test/benchmarks/helper/test_bench_helper.cpp
 create mode 100644 test/benchmarks/include/vikunja/bench/memory.hpp
 create mode 100644 test/benchmarks/transform/CMakeLists.txt
 create mode 100644 test/benchmarks/transform/bench_vikunja_transform.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc34500..c0515d0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,6 +23,7 @@ option(VIKUNJA_ENABLE_EXTRA_WARNINGS "Enable extra warnings" OFF)
 option(BUILD_TESTING "Build the testing tree." OFF)
 cmake_dependent_option(VIKUNJA_SYSTEM_CATCH2 "Use your local installation of Catch2" ON BUILD_TESTING OFF)
 cmake_dependent_option(VIKUNJA_ENABLE_CXX_TEST "Builds test that checks if the C++ standard is set correctly" OFF BUILD_TESTING OFF)
+cmake_dependent_option(VIKUNJA_ENABLE_BENCHMARKS "Enable benchmarks" OFF BUILD_TESTING OFF)
 
 # activate support for host/device lambdas in cuda
 # needs to be set before alpaka is included
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 319e6ef..5212a51 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -38,3 +38,6 @@ add_library(vikunja::testSetup ALIAS vikunjaTestSetup)
 list(APPEND _VIKUNJA_TEST_OPTIONS "--use-colour yes")
 add_subdirectory("unit/")
 add_subdirectory("integ/")
+if(VIKUNJA_ENABLE_BENCHMARKS)
+    add_subdirectory("benchmarks/")
+endif()
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
new file mode 100644
index 0000000..49f7773
--- /dev/null
+++ b/test/benchmarks/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Copyright 2021 Simeon Ehrig
+#
+# This file is part of vikunja.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+cmake_minimum_required(VERSION 3.18)
+
+add_library(vikunjaBenchSetup INTERFACE)
+target_compile_definitions(vikunjaBenchSetup INTERFACE CATCH_CONFIG_ENABLE_BENCHMARKING)
+target_include_directories(vikunjaBenchSetup INTERFACE include)
+add_library(vikunja::benchSetup ALIAS vikunjaBenchSetup)
+
+target_compile_definitions(vikunjaTestSetup PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
+
+add_subdirectory("helper/")
+add_subdirectory("transform/")
diff --git a/test/benchmarks/helper/CMakeLists.txt b/test/benchmarks/helper/CMakeLists.txt
new file mode 100644
index 0000000..1b8b692
--- /dev/null
+++ b/test/benchmarks/helper/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright 2021 Simeon Ehrig
+#
+# This file is part of vikunja.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+cmake_minimum_required(VERSION 3.18)
+
+set(_TARGET_NAME "test_bench_helper")
+
+alpaka_add_executable(
+  ${_TARGET_NAME}
+  test_bench_helper.cpp
+  )
+
+target_link_libraries(${_TARGET_NAME}
+  PRIVATE
+  vikunja::testSetup
+  vikunja::benchSetup
+  vikunja::internalvikunja
+)
+
+add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_VIKUNJA_TEST_OPTIONS})
diff --git a/test/benchmarks/helper/test_bench_helper.cpp b/test/benchmarks/helper/test_bench_helper.cpp
new file mode 100644
index 0000000..b779ec8
--- /dev/null
+++ b/test/benchmarks/helper/test_bench_helper.cpp
@@ -0,0 +1,90 @@
+/* Copyright 2021 Simeon Ehrig
+ *
+ * This file is part of vikunja.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <vikunja/bench/memory.hpp>
+#include <vikunja/test/AlpakaSetup.hpp>
+#include <vikunja/test/utility.hpp>
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <catch2/catch.hpp>
+
+
+TEMPLATE_TEST_CASE("allocate_mem_iota compare std::iota", "[iota]", int, float, double)
+{
+    using Data = TestType;
+    using Setup = vikunja::test::TestAlpakaSetup<
+        alpaka::DimInt<1u>, // dim
+        int, // Idx
+        alpaka::AccCpuSerial, // host type
+        alpaka::ExampleDefaultAcc, // device type
+        alpaka::Blocking // queue type
+        >;
+    using Vec = alpaka::Vec<Setup::Dim, Setup::Idx>;
+
+    Setup::Idx size = GENERATE(1, 10, 3045, 2'000'000);
+    Data begin = GENERATE(0, 1, 45, -42);
+
+    INFO((vikunja::test::print_acc_info<Setup::Dim>(size)));
+    INFO("begin: " + std::to_string(begin));
+
+    Setup setup;
+    Vec extent = Vec::all(static_cast<Setup::Idx>(size));
+
+    auto devMem = vikunja::bench::allocate_mem_iota<Data>(setup, extent, begin);
+    auto hostMem(alpaka::allocBuf<Data, typename Setup::Idx>(setup.devHost, extent));
+    Data* const hostMemPtr(alpaka::getPtrNative(hostMem));
+
+    alpaka::memcpy(setup.queueAcc, hostMem, devMem, extent);
+
+    std::vector<Data> expected_result(size);
+    std::iota(std::begin(expected_result), std::end(expected_result), begin);
+
+    for(Setup::Idx i = 0; i < size; ++i)
+    {
+        REQUIRE(static_cast<Data>(expected_result[i]) == hostMemPtr[i]);
+    }
+}
+
+TEMPLATE_TEST_CASE("allocate_mem_iota different increment", "[iota]", int, float, double)
+{
+    using Data = TestType;
+    using Setup = vikunja::test::TestAlpakaSetup<
+        alpaka::DimInt<1u>, // dim
+        int, // Idx
+        alpaka::AccCpuSerial, // host type
+        alpaka::ExampleDefaultAcc, // device type
+        alpaka::Blocking // queue type
+        >;
+    using Vec = alpaka::Vec<Setup::Dim, Setup::Idx>;
+
+    Setup::Idx size = GENERATE(1, 10, 3045);
+    Data begin = GENERATE(0, 1, 45, -42);
+    Data increment = GENERATE(1, -1, 45, -42);
+
+    INFO((vikunja::test::print_acc_info<Setup::Dim>(size)));
+    INFO("begin: " + std::to_string(begin));
+    INFO("increment: " + std::to_string(increment));
+
+    Setup setup;
+    Vec extent = Vec::all(static_cast<Setup::Idx>(size));
+
+    auto devMem = vikunja::bench::allocate_mem_iota<Data>(setup, extent, begin, increment);
+    auto hostMem(alpaka::allocBuf<Data, typename Setup::Idx>(setup.devHost, extent));
+    Data* const hostMemPtr(alpaka::getPtrNative(hostMem));
+
+    alpaka::memcpy(setup.queueAcc, hostMem, devMem, extent);
+
+    for(Setup::Idx i = 0; i < size; ++i)
+    {
+        Data expected_result = begin + static_cast<Data>(i) * increment;
+        REQUIRE_MESSAGE(expected_result == hostMemPtr[i], "failed with index: " + std::to_string(i));
+    }
+}
diff --git a/test/benchmarks/include/vikunja/bench/memory.hpp b/test/benchmarks/include/vikunja/bench/memory.hpp
new file mode 100644
index 0000000..1e88e2b
--- /dev/null
+++ b/test/benchmarks/include/vikunja/bench/memory.hpp
@@ -0,0 +1,120 @@
+/* Copyright 2021 Simeon Ehrig
+ *
+ * This file is part of vikunja.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <alpaka/alpaka.hpp>
+
+#include <algorithm>
+#include <vector>
+
+namespace vikunja::bench
+{
+    template<typename TData>
+    class IotaFunctor
+    {
+    private:
+        TData const m_begin;
+        TData const m_increment;
+
+    public:
+        //! Functor for iota implementation with generic data type.
+        //!
+        //! \tparam TData Type of each element.
+        //! \param begin Value of the first element.
+        //! \param increment Distance between two elements.
+        IotaFunctor(TData const begin, TData const increment) : m_begin(begin), m_increment(increment)
+        {
+        }
+
+        //! Writes the result of `begin + index * increment` to each element of the output vector.
+        //!
+        //! \tparam TAcc The accelerator environment to be executed on.
+        //! \tparam TIdx The index type used for the element count and the loop index.
+        //! \param acc The accelerator to be executed on.
+        //! \param output The destination vector.
+        //! \param numElements The number of elements.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc, typename TIdx>
+        ALPAKA_FN_ACC auto operator()(TAcc const& acc, TData* const output, TIdx const& numElements) const -> void
+        {
+            static_assert(alpaka::Dim<TAcc>::value == 1, "The IotaFunctor expects 1-dimensional indices!");
+
+            TIdx const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+            TIdx const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+            TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
+
+            if(threadFirstElemIdx < numElements)
+            {
+                // Calculate the number of elements for this thread.
+                // The result is uniform for all but the last thread.
+                TIdx const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
+                TIdx const threadLastElemIdxClipped(
+                    (numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
+
+                for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
+                {
+                    output[i] = m_begin + static_cast<TData>(i) * m_increment;
+                }
+            }
+        }
+    };
+
+
+    //! Allocates memory and initialises each value with `begin + index * increment`,
+    //! where index is the position in the output vector. The allocation is done with `setup.devAcc`.
+    //!
+    //! \tparam TData Data type of the memory buffer.
+    //! \tparam TSetup Fully specialized type of `vikunja::test::TestAlpakaSetup`.
+    //! \tparam TExtent Type of the extent.
+    //! \tparam TBuf Type of the alpaka memory buffer.
+    //! \param setup Instance of `vikunja::test::TestAlpakaSetup`. `setup.devAcc` and `setup.queueAcc` are used
+    //! for allocation and initialization of the memory.
+    //! \param extent Size of the memory buffer. Needs to be 1 dimensional.
+    //! \param begin Value of the first element. Depending on TData, it can be negative.
+    //! \param increment Distance between two elements of the vector. If the value is negative, each
+    //! element is smaller than its previous element.
+    template<
+        typename TData,
+        typename TSetup,
+        typename TExtent,
+        typename TBuf = alpaka::Buf<typename TSetup::DevAcc, TData, alpaka::DimInt<1u>, typename TSetup::Idx>>
+    TBuf allocate_mem_iota(
+        TSetup& setup,
+        TExtent const& extent,
+        TData const begin = TData{0},
+        TData const increment = TData{1})
+    {
+        // TODO: test also 2 and 3 dimensional memory
+        static_assert(TExtent::Dim::value == 1);
+
+        // TODO: optimize utilization for CPU backends
+        typename TSetup::Idx const elementsPerThread = 1;
+        typename TSetup::Idx linSize = extent.prod();
+
+        TBuf devMem(alpaka::allocBuf<TData, typename TSetup::Idx>(setup.devAcc, extent));
+
+        alpaka::WorkDivMembers<typename TSetup::Dim, typename TSetup::Idx> const workDiv(
+            alpaka::getValidWorkDiv<typename TSetup::Acc>(
+                setup.devAcc,
+                extent,
+                elementsPerThread,
+                false,
+                alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+        IotaFunctor iotaFunctor(begin, increment);
+
+        alpaka::exec<typename TSetup::Acc>(
+            setup.queueAcc,
+            workDiv,
+            iotaFunctor,
+            alpaka::getPtrNative(devMem),
+            linSize);
+
+        return devMem;
+    }
+} // namespace vikunja::bench
diff --git a/test/benchmarks/transform/CMakeLists.txt b/test/benchmarks/transform/CMakeLists.txt
new file mode 100644
index 0000000..5e1c6a9
--- /dev/null
+++ b/test/benchmarks/transform/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright 2021 Simeon Ehrig
+#
+# This file is part of vikunja.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+cmake_minimum_required(VERSION 3.18)
+
+set(_TARGET_NAME_VIKUNJA_TRANSFORM "bench_vikunja_transform")
+
+alpaka_add_executable(
+  ${_TARGET_NAME_VIKUNJA_TRANSFORM}
+  bench_vikunja_transform.cpp
+  )
+
+target_link_libraries(${_TARGET_NAME_VIKUNJA_TRANSFORM}
+  PRIVATE
+  vikunja::testSetup
+  vikunja::benchSetup
+  vikunja::internalvikunja
+)
+
+add_test(NAME ${_TARGET_NAME_VIKUNJA_TRANSFORM} COMMAND ${_TARGET_NAME_VIKUNJA_TRANSFORM} ${_VIKUNJA_TEST_OPTIONS})
diff --git a/test/benchmarks/transform/bench_vikunja_transform.cpp b/test/benchmarks/transform/bench_vikunja_transform.cpp
new file mode 100644
index 0000000..9962beb
--- /dev/null
+++ b/test/benchmarks/transform/bench_vikunja_transform.cpp
@@ -0,0 +1,118 @@
+/* Copyright 2021 Simeon Ehrig
+ *
+ * This file is part of vikunja.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <vikunja/bench/memory.hpp>
+#include <vikunja/test/AlpakaSetup.hpp>
+#include <vikunja/test/utility.hpp>
+#include <vikunja/transform/transform.hpp>
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <thread>
+#include <vector>
+
+#include <catch2/catch.hpp>
+
+template<typename TData>
+inline void transform_benchmark(int size)
+{
+    using Setup = vikunja::test::TestAlpakaSetup<
+        alpaka::DimInt<1u>, // dim
+        std::uint64_t, // Idx
+        alpaka::AccCpuSerial, // host type
+        alpaka::ExampleDefaultAcc, // device type
+        alpaka::Blocking // queue type
+        >;
+    using Vec = alpaka::Vec<Setup::Dim, Setup::Idx>;
+
+    INFO((vikunja::test::print_acc_info<Setup::Dim>(size)));
+
+    Setup setup;
+    Vec extent = Vec::all(static_cast<Setup::Idx>(size));
+
+    auto devMemInput = vikunja::bench::allocate_mem_iota<TData>(
+        setup,
+        extent,
+        static_cast<TData>(1), // first value
+        static_cast<TData>(1) // increment
+    );
+    TData* devMemInputPtrBegin = alpaka::getPtrNative(devMemInput);
+    TData* devMemInputPtrEnd = devMemInputPtrBegin + size;
+
+    auto devMemOutput = alpaka::allocBuf<TData, Setup::Idx>(setup.devAcc, extent);
+    TData* devMemOutputPtrBegin = alpaka::getPtrNative(devMemOutput);
+
+    auto hostMemOutput = alpaka::allocBuf<TData, Setup::Idx>(setup.devHost, extent);
+    TData* hostMemOutputPtrBegin = alpaka::getPtrNative(hostMemOutput);
+    TData* hostMemOutputPtrEnd = hostMemOutputPtrBegin + size;
+
+    auto functor = [] ALPAKA_FN_HOST_ACC(TData const i) -> TData { return 2 * i; };
+
+    vikunja::transform::deviceTransform<Setup::Acc>(
+        setup.devAcc,
+        setup.queueAcc,
+        devMemInputPtrBegin,
+        devMemInputPtrEnd,
+        devMemOutputPtrBegin,
+        functor);
+
+    alpaka::memcpy(setup.queueAcc, hostMemOutput, devMemOutput, extent);
+
+    TData result = std::reduce(hostMemOutputPtrBegin, hostMemOutputPtrEnd, static_cast<TData>(0));
+    TData expected_result = extent.prod() * (extent.prod() + 1);
+
+    // verify, that vikunja transform is working with problem size
+    REQUIRE(expected_result == Approx(result));
+
+    // honeypot to check that the function call in the benchmark block has not been removed by the optimizer
+    hostMemOutputPtrBegin[0] = static_cast<TData>(42);
+
+
+    BENCHMARK("transform vikunja")
+    {
+        vikunja::transform::deviceTransform<Setup::Acc>(
+            setup.devAcc,
+            setup.queueAcc,
+            devMemInputPtrBegin,
+            devMemInputPtrEnd,
+            devMemOutputPtrBegin,
+            functor);
+    };
+
+    alpaka::memcpy(setup.queueAcc, hostMemOutput, devMemOutput, extent);
+
+    result = std::reduce(hostMemOutputPtrBegin, hostMemOutputPtrEnd, static_cast<TData>(0));
+    REQUIRE(expected_result == Approx(result));
+}
+
+TEST_CASE("benchmark transform - int", "[transform][vikunja][int]")
+{
+    using Data = int;
+    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
+
+    transform_benchmark<Data>(size);
+}
+
+TEST_CASE("benchmark transform - float", "[transform][vikunja][float]")
+{
+    using Data = float;
+    // removed 1'270'000 because of rounding errors.
+    int size = GENERATE(100, 100'000, 2'000'000);
+
+    transform_benchmark<Data>(size);
+}
+
+TEST_CASE("benchmark transform - double", "[transform][vikunja][double]")
+{
+    using Data = double;
+    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
+
+    transform_benchmark<Data>(size);
+}
diff --git a/test/include/vikunja/test/utility.hpp b/test/include/vikunja/test/utility.hpp
index b9baaf1..d375cbd 100644
--- a/test/include/vikunja/test/utility.hpp
+++ b/test/include/vikunja/test/utility.hpp
@@ -16,6 +16,13 @@
 
 #include <sstream>
 
+#define REQUIRE_MESSAGE(cond, msg)                                                                                    \
+    do                                                                                                                \
+    {                                                                                                                 \
+        INFO(msg);                                                                                                    \
+        REQUIRE(cond);                                                                                                \
+    } while((void) 0, 0)
+
 namespace vikunja
 {
     namespace test

From 187265a05ee9bf7e2fedabe76061e26fde2eb3a5 Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Mon, 24 Jan 2022 14:09:59 +0000
Subject: [PATCH 2/8] Sphinx Doc: add section about testing and benchmarking

---
 docs/source/advanced/cmake.rst     | 14 ++++++--
 docs/source/basic/algorithm.rst    |  6 ++--
 docs/source/basic/installation.rst |  6 ++--
 docs/source/basic/introduction.rst |  2 +-
 docs/source/development/test.rst   | 55 ++++++++++++++++++++++++++++++
 docs/source/index.rst              |  1 +
 6 files changed, 75 insertions(+), 9 deletions(-)
 create mode 100644 docs/source/development/test.rst

diff --git a/docs/source/advanced/cmake.rst b/docs/source/advanced/cmake.rst
index 8a1a809..84b2658 100644
--- a/docs/source/advanced/cmake.rst
+++ b/docs/source/advanced/cmake.rst
@@ -20,6 +20,7 @@ Common
 
 Testing
 +++++++
+.. _cmake-test:
 
 **BUILD_TESTING** (OFF)
     .. code-block::
@@ -38,7 +39,14 @@ Testing
 
         Only works if BUILD_TESTING is ON.
         Special test that checks if ALPAKA_CXX_STANDARD works correctly.
-        The implementation is very compiler specific, so it is possible that the test is not supported by your used C++ compiler.
+        The implementation is very compiler specific, so it is possible that the test is not
+        supported by your used C++ compiler.
+
+**VIKUNJA_ENABLE_BENCHMARKS** (OFF)
+    .. code-block::
+
+        Only works if BUILD_TESTING is ON.
+        Enable the benchmarks. The benchmarks are built automatically and can be executed via ctest.
 
 alpaka
 ++++++
@@ -64,7 +72,7 @@ The following CMake variables are provided by alpaka. This section contains only
         - ALPAKA_ACC_GPU_CUDA_ENABLE
         - ALPAKA_ACC_GPU_HIP_ENABLE
 
-        Important: Not all alpaka accelerator backends are tested together with vikunja, 
+        Important: Not all alpaka accelerator backends are tested together with vikunja,
         see CI tests.
 
 **ALPAKA_CUDA_NVCC_EXPT_EXTENDED_LAMBDA** (ON)
@@ -75,4 +83,4 @@ The following CMake variables are provided by alpaka. This section contains only
 **ALPAKA_CUDA_EXPT_EXTENDED_LAMBDA** (ON)
     .. code-block::
 
-        Enable lambda support in Alpaka 0.7.x and later for the CUDA accelerator.
\ No newline at end of file
+        Enable lambda support in Alpaka 0.7.x and later for the CUDA accelerator.
diff --git a/docs/source/basic/algorithm.rst b/docs/source/basic/algorithm.rst
index 2054657..e81c55c 100644
--- a/docs/source/basic/algorithm.rst
+++ b/docs/source/basic/algorithm.rst
@@ -1,5 +1,5 @@
 Algorithms
-=========
+==========
 
 This page provides an overview of all algorithms implemented in vikunja.
 
@@ -8,7 +8,7 @@ All algorithms have the property that the order in which the input elements are
 Transform
 ---------
 
-Takes a range of elements as input, applies an unary operator to each element, and writes the result to an output range in the same order. 
+Takes a range of elements as input, applies an unary operator to each element, and writes the result to an output range in the same order.
 
 .. only:: html
 
@@ -35,4 +35,4 @@ Takes a range of elements as input and reduces it to a single element via an ope
 .. only:: latex
 
   .. image:: images/reduction.pdf
-    :alt: scheme: reduce algorithm
\ No newline at end of file
+    :alt: scheme: reduce algorithm
diff --git a/docs/source/basic/installation.rst b/docs/source/basic/installation.rst
index 4defddf..841c55a 100644
--- a/docs/source/basic/installation.rst
+++ b/docs/source/basic/installation.rst
@@ -19,8 +19,8 @@ Vikunja builds and installs itself using `CMake <https://cmake.org/>`_. Before y
 .. code-block:: bash
 
   git clone https://github.com/alpaka-group/vikunja.git
-  mkdir vikunja/build 
-  cd vikunja/build  
+  mkdir vikunja/build
+  cd vikunja/build
   cmake ..
   cmake --build .
   cmake --install .
@@ -38,6 +38,8 @@ Enable and run the tests:
    cmake --build .
    ctest
 
+Read this :doc:`section </development/test>` for more information about the tests.
+
 Enable and run an example:
 
 .. code-block:: bash
diff --git a/docs/source/basic/introduction.rst b/docs/source/basic/introduction.rst
index 0af13a2..47a98af 100644
--- a/docs/source/basic/introduction.rst
+++ b/docs/source/basic/introduction.rst
@@ -7,7 +7,7 @@ The basic concept of vikunja is to run an ``algorithm`` with an ``operator`` ove
 
   * **Transform**: Takes a range of elements as input, applies an operator to each element, and writes the result to an output range.
   * **Reduce**: Takes a range of elements as input and returns a single element. The reduce operator takes two elements of the input range, applies an operation to them, and returns a single element. The operator is applied up to the point where only one element remains.
-  * For more examples see: :ref:`Algorithm <Algorithm>`
+  * For more examples see: :doc:`Algorithm </basic/algorithm>`
 * An ``operator`` describes an algorithm which is applied to one (unary operator) or two (binary operator) elements and returns a result. The following examples assume that **i** is the first and **j** the second input element:
 
   * **sum**: `return i+j;`
diff --git a/docs/source/development/test.rst b/docs/source/development/test.rst
new file mode 100644
index 0000000..dce9cc1
--- /dev/null
+++ b/docs/source/development/test.rst
@@ -0,0 +1,55 @@
+Testing and Benchmarking
+========================
+
+Vikunja offers different types of tests. The source code is tested via unit and integration tests with `Catch2 <https://github.com/catchorg/Catch2/tree/v2.x>`_. The CMake code is tested with integration tests and custom scripts.
+
+Source Code Tests
+-----------------
+
+Before you start writing source code tests, you should read the `Catch2 documentation <https://github.com/catchorg/Catch2/blob/v2.x/docs/tutorial.md#top>`_. Tests written with Catch2 are standalone applications. Therefore, they have their own source code files and ``CMakeLists.txt`` files located in the ``test/unit`` and ``test/integ`` folders. If you set the CMake argument ``-DBUILD_TESTING=ON``, the executable files of the tests will be built automatically. All test cases are registered via the CMake function ``add_test``. Therefore, you can automatically run all tests in the build folder with the ``ctest`` command:
+
+.. code-block:: bash
+
+    mkdir build && cd build
+    cmake .. -DBUILD_TESTING=ON
+    cmake --build .
+    ctest
+
+For more CMake arguments for the tests, see the :ref:`CMake section <cmake-test>`.
+
+If you only want to run a single test, you can run the test executable directly. All test executables are located in the ``<build_folder>/test`` folder. It is also possible to run the executable with the ``--help`` flag to show additional options. For example, the ``-s`` flag displays additional information created with the Catch2 function ``INFO()``.
+
+.. code-block:: bash
+
+    mkdir build && cd build
+    cmake .. -DBUILD_TESTING=ON
+    cmake --build .
+    # display extra test options
+    test/integ/reduce/test_reduce --help
+    # run test with extra output
+    test/integ/reduce/test_reduce -s
+
+CMake Tests
+-----------
+
+The CMake integration tests check whether vikunja can be used correctly in another project via the CMake functions ``find_package()`` or ``add_subdirectory()``. The CI contains test jobs for creating projects that use the vikunja library. The job names start with ``integration``. All associated files for the tests are in ``script/integration_test``.
+
+CXX Test
+++++++++
+
+There is a special Catch2 test that tests the vikunja CMake to see if the C++ standard is set correctly. The name of the test is ``test_cxx``. The test compares the C++ standard set by the compiler with an expected standard passed as an argument. By default, ``ctest`` automatically passes the expected C++ standard depending on the CMake variable ``ALPAKA_CXX_STANDARD``. If you run the test manually, you must pass it yourself:
+
+.. code-block:: bash
+
+    # expected, that the code was compiled with C++ 17
+    test/unit/cxx/test_cxx --cxx 17
+
+
+Benchmarks
+----------
+
+Vikunja uses `Catch2 benchmark <https://github.com/catchorg/Catch2/blob/v2.x/docs/benchmarks.md#top>`_ to automatically run benchmarks. By default, the benchmarks are not enabled. To enable the benchmarks, the CMake arguments ``-DBUILD_TESTING=ON -DVIKUNJA_ENABLE_BENCHMARKS=ON`` must be set. The benchmarks are created automatically and can be run with ``ctest``. As with the tests, you can run a particular benchmark directly from the executable file, e.g. ``test/benchmarks/transform/bench_vikunja_transform``. All benchmark executables are located in ``<build_folder>/test/benchmarks``.
+
+.. tip::
+
+    If you run ``benchmark_exe --help``, you get benchmark specific options.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 4f8635d..6dc3666 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -41,6 +41,7 @@ Generally, **follow the manual pages in-order** to get started. Individual chapt
    :maxdepth: 1
    :caption: Development
 
+   development/test.rst
    development/docs.rst
    development/styleguide.rst
    development/ci.rst

From 306befee92d35299700803d928d715cf022fb712 Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Mon, 24 Jan 2022 16:03:15 +0000
Subject: [PATCH 3/8] add thrust transform benchmark

- benchmark result on Quadro V100: thrust and vikunja are nearly equal
- vikunja is a little bit faster
---
 CMakeLists.txt                                |  1 +
 docs/source/advanced/cmake.rst                |  6 ++
 test/benchmarks/CMakeLists.txt                |  6 ++
 test/benchmarks/transform/CMakeLists.txt      | 19 +++++
 .../transform/bench_thrust_transform.cpp      | 72 +++++++++++++++++++
 .../transform/bench_vikunja_transform.cpp     |  3 +-
 6 files changed, 105 insertions(+), 2 deletions(-)
 create mode 100644 test/benchmarks/transform/bench_thrust_transform.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c0515d0..624ba44 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,6 +24,7 @@ option(BUILD_TESTING "Build the testing tree." OFF)
 cmake_dependent_option(VIKUNJA_SYSTEM_CATCH2 "Use your local installation of Catch2" ON BUILD_TESTING OFF)
 cmake_dependent_option(VIKUNJA_ENABLE_CXX_TEST "Builds test that checks if the C++ standard is set correctly" OFF BUILD_TESTING OFF)
 cmake_dependent_option(VIKUNJA_ENABLE_BENCHMARKS "Enable benchmarks" OFF BUILD_TESTING OFF)
+cmake_dependent_option(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS "Enable CUDA thrust benchmarks for comparison" OFF VIKUNJA_ENABLE_BENCHMARKS OFF)
 
 # activate support for host/device lambdas in cuda
 # needs to be set before alpaka is included
diff --git a/docs/source/advanced/cmake.rst b/docs/source/advanced/cmake.rst
index 84b2658..0513b7c 100644
--- a/docs/source/advanced/cmake.rst
+++ b/docs/source/advanced/cmake.rst
@@ -48,6 +48,12 @@ Testing
         Only works if BUILD_TESTING is ON.
         Enable the benchmarks. The benchmarks are built automatically and can be executed via ctest.
 
+**VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS** (OFF)
+    .. code-block::
+
+        Only works if VIKUNJA_ENABLE_BENCHMARKS and ALPAKA_ACC_GPU_CUDA_ENABLE are ON.
+        Enable thrust benchmarks for comparison.
+
 alpaka
 ++++++
 
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index 49f7773..e8b5ccc 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -15,5 +15,11 @@ add_library(vikunja::benchSetup ALIAS vikunjaBenchSetup)
 
 target_compile_definitions(vikunjaTestSetup PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
 
+if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
+    if(NOT ALPAKA_ACC_GPU_CUDA_ENABLE)
+        message(FATAL_ERROR "VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS requires the ALPAKA_ACC_GPU_CUDA_ENABLE backend to be enabled.")
+    endif()
+endif()
+
 add_subdirectory("helper/")
 add_subdirectory("transform/")
diff --git a/test/benchmarks/transform/CMakeLists.txt b/test/benchmarks/transform/CMakeLists.txt
index 5e1c6a9..fb477b2 100644
--- a/test/benchmarks/transform/CMakeLists.txt
+++ b/test/benchmarks/transform/CMakeLists.txt
@@ -23,3 +23,22 @@ target_link_libraries(${_TARGET_NAME_VIKUNJA_TRANSFORM}
 )
 
 add_test(NAME ${_TARGET_NAME_VIKUNJA_TRANSFORM} COMMAND ${_TARGET_NAME_VIKUNJA_TRANSFORM} ${_VIKUNJA_TEST_OPTIONS})
+
+
+if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
+  set(_TARGET_NAME_THRUST_TRANSFORM "bench_thrust_transform")
+
+  alpaka_add_executable(
+    ${_TARGET_NAME_THRUST_TRANSFORM}
+    bench_thrust_transform.cpp
+  )
+
+  target_link_libraries(${_TARGET_NAME_THRUST_TRANSFORM}
+    PRIVATE
+    vikunja::testSetup
+    vikunja::benchSetup
+    vikunja::internalvikunja
+  )
+
+  add_test(NAME ${_TARGET_NAME_THRUST_TRANSFORM} COMMAND ${_TARGET_NAME_THRUST_TRANSFORM} ${_VIKUNJA_TEST_OPTIONS})
+endif()
diff --git a/test/benchmarks/transform/bench_thrust_transform.cpp b/test/benchmarks/transform/bench_thrust_transform.cpp
new file mode 100644
index 0000000..3529ab8
--- /dev/null
+++ b/test/benchmarks/transform/bench_thrust_transform.cpp
@@ -0,0 +1,72 @@
+/* Copyright 2021 Simeon Ehrig
+ *
+ * This file is part of vikunja.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <numeric>
+#include <vector>
+
+#include <catch2/catch.hpp>
+#include <thrust/device_vector.h>
+
+template<typename TData>
+inline void benchmark_transform(int size)
+{
+    std::vector<TData> hostMemInput(size);
+    for(int i = 0; i < size; ++i)
+    {
+        hostMemInput[i] = static_cast<TData>(i) + static_cast<TData>(1);
+    }
+
+    thrust::device_vector<TData> devMemInput(hostMemInput);
+    thrust::device_vector<TData> devMemOutput(size);
+
+    auto functor = [] __device__(TData const i) -> TData { return 2 * i; };
+    thrust::transform(devMemInput.begin(), devMemInput.end(), devMemOutput.begin(), functor);
+
+    std::vector<TData> hostMemOutput(size);
+    thrust::copy(devMemOutput.begin(), devMemOutput.end(), hostMemOutput.begin());
+
+    TData result = std::reduce(hostMemOutput.begin(), hostMemOutput.end(), static_cast<TData>(0));
+    TData expected_result = static_cast<TData>(size) * (static_cast<TData>(size) + 1);
+    // verify, that vikunja transform is working with problem size
+    REQUIRE(expected_result == Approx(result));
+
+    // honeypot to check that the function call in the benchmark block has not been removed by the optimizer
+    hostMemOutput[0] = static_cast<TData>(42);
+
+    BENCHMARK("transform thrust")
+    {
+        thrust::transform(devMemInput.begin(), devMemInput.end(), devMemOutput.begin(), functor);
+    };
+
+    thrust::copy(devMemOutput.begin(), devMemOutput.end(), hostMemOutput.begin());
+
+    result = std::reduce(hostMemOutput.begin(), hostMemOutput.end(), static_cast<TData>(0));
+    REQUIRE(expected_result == Approx(result));
+}
+
+TEST_CASE("bechmark transform - int", "[transform][thrust][int]")
+{
+    using Data = int;
+    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
+    benchmark_transform<Data>(size);
+}
+
+TEST_CASE("bechmark transform - float", "[transform][thrust][float]")
+{
+    using Data = float;
+    int size = GENERATE(100, 100'000, 2'000'000);
+    benchmark_transform<Data>(size);
+}
+
+TEST_CASE("bechmark transform - double", "[transform][thrust][double]")
+{
+    using Data = double;
+    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
+    benchmark_transform<Data>(size);
+}
diff --git a/test/benchmarks/transform/bench_vikunja_transform.cpp b/test/benchmarks/transform/bench_vikunja_transform.cpp
index 9962beb..1050d12 100644
--- a/test/benchmarks/transform/bench_vikunja_transform.cpp
+++ b/test/benchmarks/transform/bench_vikunja_transform.cpp
@@ -15,8 +15,7 @@
 #include <alpaka/alpaka.hpp>
 #include <alpaka/example/ExampleDefaultAcc.hpp>
 
-#include <thread>
-#include <vector>
+#include <numeric>
 
 #include <catch2/catch.hpp>
 

From 156f7519e69904b5bd8daae53f56dd5bcec7ebea Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Mon, 24 Jan 2022 16:51:49 +0000
Subject: [PATCH 4/8] add reduce benchmarks

- benchmark result on Quadro V100: thrust is faster than vikunja (by less than 10%)
---
 test/benchmarks/CMakeLists.txt                |   1 +
 test/benchmarks/reduce/CMakeLists.txt         |  44 ++++++++
 .../benchmarks/reduce/bench_thrust_reduce.cpp |  63 +++++++++++
 .../reduce/bench_vikunja_reduce.cpp           | 106 ++++++++++++++++++
 4 files changed, 214 insertions(+)
 create mode 100644 test/benchmarks/reduce/CMakeLists.txt
 create mode 100644 test/benchmarks/reduce/bench_thrust_reduce.cpp
 create mode 100644 test/benchmarks/reduce/bench_vikunja_reduce.cpp

diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index e8b5ccc..4e168fc 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -23,3 +23,4 @@ endif()
 
 add_subdirectory("helper/")
 add_subdirectory("transform/")
+add_subdirectory("reduce/")
diff --git a/test/benchmarks/reduce/CMakeLists.txt b/test/benchmarks/reduce/CMakeLists.txt
new file mode 100644
index 0000000..e31c98c
--- /dev/null
+++ b/test/benchmarks/reduce/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Copyright 2021 Simeon Ehrig
+#
+# This file is part of vikunja.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+cmake_minimum_required(VERSION 3.18)
+
+set(_TARGET_NAME_VIKUNJA_REDUCE "bench_vikunja_reduce")
+
+alpaka_add_executable(
+  ${_TARGET_NAME_VIKUNJA_REDUCE}
+  bench_vikunja_reduce.cpp
+  )
+
+target_link_libraries(${_TARGET_NAME_VIKUNJA_REDUCE}
+  PRIVATE
+  vikunja::testSetup
+  vikunja::benchSetup
+  vikunja::internalvikunja
+)
+
+add_test(NAME ${_TARGET_NAME_VIKUNJA_REDUCE} COMMAND ${_TARGET_NAME_VIKUNJA_REDUCE} ${_VIKUNJA_TEST_OPTIONS})
+
+
+if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
+  set(_TARGET_NAME_THRUST_REDUCE "bench_thrust_reduce")
+
+  alpaka_add_executable(
+    ${_TARGET_NAME_THRUST_REDUCE}
+    bench_thrust_reduce.cpp
+  )
+
+  target_link_libraries(${_TARGET_NAME_THRUST_REDUCE}
+    PRIVATE
+    vikunja::testSetup
+    vikunja::benchSetup
+    vikunja::internalvikunja
+  )
+
+  add_test(NAME ${_TARGET_NAME_THRUST_REDUCE} COMMAND ${_TARGET_NAME_THRUST_REDUCE} ${_VIKUNJA_TEST_OPTIONS})
+endif()
diff --git a/test/benchmarks/reduce/bench_thrust_reduce.cpp b/test/benchmarks/reduce/bench_thrust_reduce.cpp
new file mode 100644
index 0000000..1139c10
--- /dev/null
+++ b/test/benchmarks/reduce/bench_thrust_reduce.cpp
@@ -0,0 +1,63 @@
+/* Copyright 2021 Simeon Ehrig
+ *
+ * This file is part of vikunja.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <numeric>
+#include <vector>
+
+#include <catch2/catch.hpp>
+#include <thrust/device_vector.h>
+
+template<typename TData>
+inline void benchmark_reduce(int size)
+{
+    std::vector<TData> hostMemInput(size);
+    for(int i = 0; i < size; ++i)
+    {
+        hostMemInput[i] = static_cast<TData>(i) + static_cast<TData>(1);
+    }
+
+    thrust::device_vector<TData> devMemInput(hostMemInput);
+
+    TData result = thrust::reduce(devMemInput.begin(), devMemInput.end(), static_cast<TData>(0));
+
+    TData expected_result = (static_cast<TData>(size) * (static_cast<TData>(size) + 1)) / static_cast<TData>(2);
+    // verify, that vikunja reduce is working with problem size
+    REQUIRE(expected_result == Approx(result));
+
+    // honeypot to check that the function call in the benchmark block has not been removed by the optimizer
+    result = static_cast<TData>(0);
+
+    BENCHMARK("reduce thrust")
+    {
+        result = thrust::reduce(devMemInput.begin(), devMemInput.end(), static_cast<TData>(0));
+    };
+
+    REQUIRE(expected_result == Approx(result));
+}
+
+TEST_CASE("bechmark reduce - int", "[reduce][thrust][int]")
+{
+    using Data = int;
+    int size = GENERATE(100, 100'000, 1'270'000, 1'600'000);
+    benchmark_reduce<Data>(size);
+}
+
+TEST_CASE("bechmark reduce - float", "[reduce][thrust][float]")
+{
+    using Data = float;
+    int size = GENERATE(100, 100'000, 2'000'000);
+    benchmark_reduce<Data>(size);
+}
+
+TEST_CASE("bechmark reduce - double", "[reduce][thrust][double]")
+{
+    using Data = double;
+    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
+    benchmark_reduce<Data>(size);
+}
diff --git a/test/benchmarks/reduce/bench_vikunja_reduce.cpp b/test/benchmarks/reduce/bench_vikunja_reduce.cpp
new file mode 100644
index 0000000..49ca6b2
--- /dev/null
+++ b/test/benchmarks/reduce/bench_vikunja_reduce.cpp
@@ -0,0 +1,106 @@
+/* Copyright 2021 Simeon Ehrig
+ *
+ * This file is part of vikunja.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#include <vikunja/bench/memory.hpp>
+#include <vikunja/reduce/reduce.hpp>
+#include <vikunja/test/AlpakaSetup.hpp>
+#include <vikunja/test/utility.hpp>
+
+#include <alpaka/alpaka.hpp>
+#include <alpaka/example/ExampleDefaultAcc.hpp>
+
+#include <numeric>
+
+#include <catch2/catch.hpp>
+
+template<typename TData>
+inline void reduce_benchmark(int size)
+{
+    using Setup = vikunja::test::TestAlpakaSetup<
+        alpaka::DimInt<1u>, // dim
+        std::uint64_t, // Idx
+        alpaka::AccCpuSerial, // host type
+        alpaka::ExampleDefaultAcc, // device type
+        alpaka::Blocking // queue type
+        >;
+    using Vec = alpaka::Vec<Setup::Dim, Setup::Idx>;
+
+    INFO((vikunja::test::print_acc_info<Setup::Dim>(size)));
+
+    Setup setup;
+    Vec extent = Vec::all(static_cast<Setup::Idx>(size));
+
+    auto devMemInput = vikunja::bench::allocate_mem_iota<TData>(
+        setup,
+        extent,
+        static_cast<TData>(1), // first value
+        static_cast<TData>(1) // increment
+    );
+    TData* devMemInputPtrBegin = alpaka::getPtrNative(devMemInput);
+    TData* devMemInputPtrEnd = devMemInputPtrBegin + size;
+
+    auto devMemOutput = alpaka::allocBuf<TData, Setup::Idx>(setup.devAcc, extent);
+    TData* devMemOutputPtrBegin = alpaka::getPtrNative(devMemOutput);
+
+    auto functor = [] ALPAKA_FN_HOST_ACC(TData const i, TData const j) -> TData { return i + j; };
+
+    TData result = vikunja::reduce::deviceReduce<Setup::Acc>(
+        setup.devAcc,
+        setup.devHost,
+        setup.queueAcc,
+        devMemInputPtrBegin,
+        devMemInputPtrEnd,
+        functor);
+
+    TData expected_result = (extent.prod() * (extent.prod() + static_cast<TData>(1)) / static_cast<TData>(2));
+
+    // verify, that vikunja reduce is working with problem size
+    REQUIRE(expected_result == Approx(result));
+
+    // honeypot to check that the function call in the benchmark block has not been removed by the optimizer
+    result = static_cast<TData>(0);
+
+    BENCHMARK("reduce vikunja")
+    {
+        result = vikunja::reduce::deviceReduce<Setup::Acc>(
+            setup.devAcc,
+            setup.devHost,
+            setup.queueAcc,
+            devMemInputPtrBegin,
+            devMemInputPtrEnd,
+            functor);
+    };
+
+    REQUIRE(expected_result == Approx(result));
+}
+
+TEST_CASE("benchmark reduce - int", "[reduce][vikunja][int]")
+{
+    using Data = int;
+    int size = GENERATE(100, 100'000, 1'270'000, 1'600'000);
+
+    reduce_benchmark<Data>(size);
+}
+
+TEST_CASE("benchmark reduce - float", "[reduce][vikunja][float]")
+{
+    using Data = float;
+    // removed 1'270'000 because of rounding errors.
+    int size = GENERATE(100, 100'000, 2'000'000);
+
+    reduce_benchmark<Data>(size);
+}
+
+TEST_CASE("benchmark reduce - double", "[reduce][vikunja][double]")
+{
+    using Data = double;
+    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
+
+    reduce_benchmark<Data>(size);
+}

From b787d407d95355112bd13bfeb66faed7c774486f Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Mon, 24 Jan 2022 18:03:48 +0000
Subject: [PATCH 5/8] always run benchmarks serially

---
 test/benchmarks/reduce/CMakeLists.txt    | 3 +++
 test/benchmarks/transform/CMakeLists.txt | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/test/benchmarks/reduce/CMakeLists.txt b/test/benchmarks/reduce/CMakeLists.txt
index e31c98c..774142d 100644
--- a/test/benchmarks/reduce/CMakeLists.txt
+++ b/test/benchmarks/reduce/CMakeLists.txt
@@ -23,6 +23,8 @@ target_link_libraries(${_TARGET_NAME_VIKUNJA_REDUCE}
 )
 
 add_test(NAME ${_TARGET_NAME_VIKUNJA_REDUCE} COMMAND ${_TARGET_NAME_VIKUNJA_REDUCE} ${_VIKUNJA_TEST_OPTIONS})
+# avoid running the benchmarks in parallel
+set_tests_properties(${_TARGET_NAME_VIKUNJA_REDUCE} PROPERTIES RUN_SERIAL TRUE)
 
 
 if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
@@ -41,4 +43,5 @@ if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
   )
 
   add_test(NAME ${_TARGET_NAME_THRUST_REDUCE} COMMAND ${_TARGET_NAME_THRUST_REDUCE} ${_VIKUNJA_TEST_OPTIONS})
+  set_tests_properties(${_TARGET_NAME_THRUST_REDUCE} PROPERTIES RUN_SERIAL TRUE)
 endif()
diff --git a/test/benchmarks/transform/CMakeLists.txt b/test/benchmarks/transform/CMakeLists.txt
index fb477b2..6c89c40 100644
--- a/test/benchmarks/transform/CMakeLists.txt
+++ b/test/benchmarks/transform/CMakeLists.txt
@@ -23,7 +23,8 @@ target_link_libraries(${_TARGET_NAME_VIKUNJA_TRANSFORM}
 )
 
 add_test(NAME ${_TARGET_NAME_VIKUNJA_TRANSFORM} COMMAND ${_TARGET_NAME_VIKUNJA_TRANSFORM} ${_VIKUNJA_TEST_OPTIONS})
-
+# avoid running the benchmarks in parallel
+set_tests_properties(${_TARGET_NAME_VIKUNJA_TRANSFORM} PROPERTIES RUN_SERIAL TRUE)
 
 if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
   set(_TARGET_NAME_THRUST_TRANSFORM "bench_thrust_transform")
@@ -41,4 +42,5 @@ if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
   )
 
   add_test(NAME ${_TARGET_NAME_THRUST_TRANSFORM} COMMAND ${_TARGET_NAME_THRUST_TRANSFORM} ${_VIKUNJA_TEST_OPTIONS})
+  set_tests_properties(${_TARGET_NAME_THRUST_TRANSFORM} PROPERTIES RUN_SERIAL TRUE)
 endif()

From 7e1a2e01f8b4a6d1ced8371e8a9700963ab722a8 Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Tue, 25 Jan 2022 12:11:54 +0000
Subject: [PATCH 6/8] several fixes on the benchmarks

- add template parameter TIdx in the vikunja benchmarks
- improve result verification in transform benchmarks to avoid overflows
- use TEMPLATE_TEST_CASE instead of TEST_CASE for the benchmarks

Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
---
 test/benchmarks/CMakeLists.txt                |  2 +-
 test/benchmarks/helper/CMakeLists.txt         |  2 +-
 test/benchmarks/helper/test_bench_helper.cpp  |  2 +-
 .../include/vikunja/bench/memory.hpp          |  5 +-
 test/benchmarks/reduce/CMakeLists.txt         |  2 +-
 .../benchmarks/reduce/bench_thrust_reduce.cpp | 38 +++++-----
 .../reduce/bench_vikunja_reduce.cpp           | 69 +++++++++----------
 test/benchmarks/transform/CMakeLists.txt      |  2 +-
 .../transform/bench_thrust_transform.cpp      | 40 ++++-------
 .../transform/bench_vikunja_transform.cpp     | 63 ++++++-----------
 10 files changed, 90 insertions(+), 135 deletions(-)

diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index 4e168fc..1aba1c6 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2021 Simeon Ehrig
+# Copyright 2022 Simeon Ehrig
 #
 # This file is part of vikunja.
 #
diff --git a/test/benchmarks/helper/CMakeLists.txt b/test/benchmarks/helper/CMakeLists.txt
index 1b8b692..ac2fa6d 100644
--- a/test/benchmarks/helper/CMakeLists.txt
+++ b/test/benchmarks/helper/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2021 Simeon Ehrig
+# Copyright 2022 Simeon Ehrig
 #
 # This file is part of vikunja.
 #
diff --git a/test/benchmarks/helper/test_bench_helper.cpp b/test/benchmarks/helper/test_bench_helper.cpp
index b779ec8..3749a7e 100644
--- a/test/benchmarks/helper/test_bench_helper.cpp
+++ b/test/benchmarks/helper/test_bench_helper.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2021 Simeon Ehrig
+/* Copyright 2022 Simeon Ehrig
  *
  * This file is part of vikunja.
  *
diff --git a/test/benchmarks/include/vikunja/bench/memory.hpp b/test/benchmarks/include/vikunja/bench/memory.hpp
index 1e88e2b..ff49406 100644
--- a/test/benchmarks/include/vikunja/bench/memory.hpp
+++ b/test/benchmarks/include/vikunja/bench/memory.hpp
@@ -1,4 +1,4 @@
-/* Copyright 2021 Simeon Ehrig
+/* Copyright 2022 Simeon Ehrig
  *
  * This file is part of vikunja.
  *
@@ -53,8 +53,7 @@ namespace vikunja::bench
                 // Calculate the number of elements for this thread.
                 // The result is uniform for all but the last thread.
                 TIdx const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
-                TIdx const threadLastElemIdxClipped(
-                    (numElements > threadLastElemIdx) ? threadLastElemIdx : numElements);
+                TIdx const threadLastElemIdxClipped(alpaka::math::min(acc, numElements, threadLastElemIdx));
 
                 for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
                 {
diff --git a/test/benchmarks/reduce/CMakeLists.txt b/test/benchmarks/reduce/CMakeLists.txt
index 774142d..c6a882e 100644
--- a/test/benchmarks/reduce/CMakeLists.txt
+++ b/test/benchmarks/reduce/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2021 Simeon Ehrig
+# Copyright 2022 Simeon Ehrig
 #
 # This file is part of vikunja.
 #
diff --git a/test/benchmarks/reduce/bench_thrust_reduce.cpp b/test/benchmarks/reduce/bench_thrust_reduce.cpp
index 1139c10..463ace0 100644
--- a/test/benchmarks/reduce/bench_thrust_reduce.cpp
+++ b/test/benchmarks/reduce/bench_thrust_reduce.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2021 Simeon Ehrig
+/* Copyright 2022 Simeon Ehrig
  *
  * This file is part of vikunja.
  *
@@ -14,7 +14,7 @@
 #include <thrust/device_vector.h>
 
 template<typename TData>
-inline void benchmark_reduce(int size)
+inline void reduce_benchmark(int size)
 {
     std::vector<TData> hostMemInput(size);
     for(int i = 0; i < size; ++i)
@@ -35,29 +35,27 @@ inline void benchmark_reduce(int size)
 
     BENCHMARK("reduce thrust")
     {
-        result = thrust::reduce(devMemInput.begin(), devMemInput.end(), static_cast<TData>(0));
+        return result = thrust::reduce(devMemInput.begin(), devMemInput.end(), static_cast<TData>(0));
     };
 
     REQUIRE(expected_result == Approx(result));
 }
 
-TEST_CASE("bechmark reduce - int", "[reduce][thrust][int]")
+TEMPLATE_TEST_CASE("bechmark reduce", "[benchmark][reduce][thrust]", int, float, double)
 {
-    using Data = int;
-    int size = GENERATE(100, 100'000, 1'270'000, 1'600'000);
-    benchmark_reduce<Data>(size);
-}
-
-TEST_CASE("bechmark reduce - float", "[reduce][thrust][float]")
-{
-    using Data = float;
-    int size = GENERATE(100, 100'000, 2'000'000);
-    benchmark_reduce<Data>(size);
-}
+    using Data = TestType;
 
-TEST_CASE("bechmark reduce - double", "[reduce][thrust][double]")
-{
-    using Data = double;
-    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
-    benchmark_reduce<Data>(size);
+    if constexpr(std::is_same_v<Data, int>)
+    {
+        reduce_benchmark<Data>(GENERATE(100, 100'000, 1'270'000, 1'600'000));
+    }
+    else if constexpr(std::is_same_v<Data, float>)
+    {
+        // removed 1'270'000 because of rounding errors.
+        reduce_benchmark<Data>(GENERATE(100, 100'000, 2'000'000));
+    }
+    else if constexpr(std::is_same_v<Data, double>)
+    {
+        reduce_benchmark<Data>(GENERATE(100, 100'000, 1'270'000, 2'000'000));
+    }
 }
diff --git a/test/benchmarks/reduce/bench_vikunja_reduce.cpp b/test/benchmarks/reduce/bench_vikunja_reduce.cpp
index 49ca6b2..b3227d0 100644
--- a/test/benchmarks/reduce/bench_vikunja_reduce.cpp
+++ b/test/benchmarks/reduce/bench_vikunja_reduce.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2021 Simeon Ehrig
+/* Copyright 2022 Simeon Ehrig
  *
  * This file is part of vikunja.
  *
@@ -19,22 +19,22 @@
 
 #include <catch2/catch.hpp>
 
-template<typename TData>
-inline void reduce_benchmark(int size)
+template<typename TData, typename TIdx>
+inline void reduce_benchmark(TIdx size)
 {
     using Setup = vikunja::test::TestAlpakaSetup<
         alpaka::DimInt<1u>, // dim
-        std::uint64_t, // Idx
+        TIdx, // Idx
         alpaka::AccCpuSerial, // host type
         alpaka::ExampleDefaultAcc, // device type
         alpaka::Blocking // queue type
         >;
-    using Vec = alpaka::Vec<Setup::Dim, Setup::Idx>;
+    using Vec = alpaka::Vec<typename Setup::Dim, typename Setup::Idx>;
 
-    INFO((vikunja::test::print_acc_info<Setup::Dim>(size)));
+    INFO((vikunja::test::print_acc_info<typename Setup::Dim>(size)));
 
     Setup setup;
-    Vec extent = Vec::all(static_cast<Setup::Idx>(size));
+    Vec extent = Vec::all(static_cast<typename Setup::Idx>(size));
 
     auto devMemInput = vikunja::bench::allocate_mem_iota<TData>(
         setup,
@@ -45,12 +45,12 @@ inline void reduce_benchmark(int size)
     TData* devMemInputPtrBegin = alpaka::getPtrNative(devMemInput);
     TData* devMemInputPtrEnd = devMemInputPtrBegin + size;
 
-    auto devMemOutput = alpaka::allocBuf<TData, Setup::Idx>(setup.devAcc, extent);
+    auto devMemOutput = alpaka::allocBuf<TData, typename Setup::Idx>(setup.devAcc, extent);
     TData* devMemOutputPtrBegin = alpaka::getPtrNative(devMemOutput);
 
     auto functor = [] ALPAKA_FN_HOST_ACC(TData const i, TData const j) -> TData { return i + j; };
 
-    TData result = vikunja::reduce::deviceReduce<Setup::Acc>(
+    TData result = vikunja::reduce::deviceReduce<typename Setup::Acc>(
         setup.devAcc,
         setup.devHost,
         setup.queueAcc,
@@ -68,39 +68,34 @@ inline void reduce_benchmark(int size)
 
     BENCHMARK("reduce vikunja")
     {
-        result = vikunja::reduce::deviceReduce<Setup::Acc>(
-            setup.devAcc,
-            setup.devHost,
-            setup.queueAcc,
-            devMemInputPtrBegin,
-            devMemInputPtrEnd,
-            functor);
+        return result = vikunja::reduce::deviceReduce<typename Setup::Acc>(
+                   setup.devAcc,
+                   setup.devHost,
+                   setup.queueAcc,
+                   devMemInputPtrBegin,
+                   devMemInputPtrEnd,
+                   functor);
     };
 
     REQUIRE(expected_result == Approx(result));
 }
 
-TEST_CASE("benchmark reduce - int", "[reduce][vikunja][int]")
+TEMPLATE_TEST_CASE("bechmark reduce", "[benchmark][reduce][vikunja]", int, float, double)
 {
-    using Data = int;
-    int size = GENERATE(100, 100'000, 1'270'000, 1'600'000);
+    using Data = TestType;
+    using Idx = std::uint64_t;
 
-    reduce_benchmark<Data>(size);
-}
-
-TEST_CASE("benchmark reduce - float", "[reduce][vikunja][float]")
-{
-    using Data = float;
-    // removed 1'270'000 because of rounding errors.
-    int size = GENERATE(100, 100'000, 2'000'000);
-
-    reduce_benchmark<Data>(size);
-}
-
-TEST_CASE("benchmark reduce - double", "[reduce][vikunja][double]")
-{
-    using Data = double;
-    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
-
-    reduce_benchmark<Data>(size);
+    if constexpr(std::is_same_v<Data, int>)
+    {
+        reduce_benchmark<Data, Idx>(GENERATE(100, 100'000, 1'270'000, 1'600'000));
+    }
+    else if constexpr(std::is_same_v<Data, float>)
+    {
+        // removed 1'270'000 because of rounding errors.
+        reduce_benchmark<Data, Idx>(GENERATE(100, 100'000, 2'000'000));
+    }
+    else if constexpr(std::is_same_v<Data, double>)
+    {
+        reduce_benchmark<Data, Idx>(GENERATE(100, 100'000, 1'270'000, 2'000'000));
+    }
 }
diff --git a/test/benchmarks/transform/CMakeLists.txt b/test/benchmarks/transform/CMakeLists.txt
index 6c89c40..5693714 100644
--- a/test/benchmarks/transform/CMakeLists.txt
+++ b/test/benchmarks/transform/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2021 Simeon Ehrig
+# Copyright 2022 Simeon Ehrig
 #
 # This file is part of vikunja.
 #
diff --git a/test/benchmarks/transform/bench_thrust_transform.cpp b/test/benchmarks/transform/bench_thrust_transform.cpp
index 3529ab8..ed86842 100644
--- a/test/benchmarks/transform/bench_thrust_transform.cpp
+++ b/test/benchmarks/transform/bench_thrust_transform.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2021 Simeon Ehrig
+/* Copyright 2022 Simeon Ehrig
  *
  * This file is part of vikunja.
  *
@@ -7,14 +7,13 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 
-#include <numeric>
 #include <vector>
 
 #include <catch2/catch.hpp>
 #include <thrust/device_vector.h>
 
 template<typename TData>
-inline void benchmark_transform(int size)
+inline void transform_benchmark(int size)
 {
     std::vector<TData> hostMemInput(size);
     for(int i = 0; i < size; ++i)
@@ -31,42 +30,27 @@ inline void benchmark_transform(int size)
     std::vector<TData> hostMemOutput(size);
     thrust::copy(devMemOutput.begin(), devMemOutput.end(), hostMemOutput.begin());
 
-    TData result = std::reduce(hostMemOutput.begin(), hostMemOutput.end(), static_cast<TData>(0));
-    TData expected_result = static_cast<TData>(size) * (static_cast<TData>(size) + 1);
-    // verify, that vikunja transform is working with problem size
-    REQUIRE(expected_result == Approx(result));
-
+    for(int i = 0; i < size; ++i)
+    {
+        TData expected_result = static_cast<TData>(2) * static_cast<TData>(i + 1);
+        REQUIRE(expected_result == Approx(hostMemOutput[i]));
+    }
     // honeypot to check that the function call in the benchmark block has not been removed by the optimizer
     hostMemOutput[0] = static_cast<TData>(42);
 
     BENCHMARK("transform thrust")
     {
-        thrust::transform(devMemInput.begin(), devMemInput.end(), devMemOutput.begin(), functor);
+        return thrust::transform(devMemInput.begin(), devMemInput.end(), devMemOutput.begin(), functor);
     };
 
     thrust::copy(devMemOutput.begin(), devMemOutput.end(), hostMemOutput.begin());
 
-    result = std::reduce(hostMemOutput.begin(), hostMemOutput.end(), static_cast<TData>(0));
-    REQUIRE(expected_result == Approx(result));
+    REQUIRE(static_cast<TData>(2) == Approx(hostMemOutput[0]));
 }
 
-TEST_CASE("bechmark transform - int", "[transform][thrust][int]")
+TEMPLATE_TEST_CASE("bechmark transform", "[benchmark][thrust][vikunja]", int, float, double)
 {
-    using Data = int;
-    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
-    benchmark_transform<Data>(size);
-}
+    using Data = TestType;
 
-TEST_CASE("bechmark transform - float", "[transform][thrust][float]")
-{
-    using Data = float;
-    int size = GENERATE(100, 100'000, 2'000'000);
-    benchmark_transform<Data>(size);
-}
-
-TEST_CASE("bechmark transform - double", "[transform][thrust][double]")
-{
-    using Data = double;
-    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
-    benchmark_transform<Data>(size);
+    transform_benchmark<Data>(GENERATE(100, 100'000, 1'270'000, 2'000'000));
 }
diff --git a/test/benchmarks/transform/bench_vikunja_transform.cpp b/test/benchmarks/transform/bench_vikunja_transform.cpp
index 1050d12..2c71e3b 100644
--- a/test/benchmarks/transform/bench_vikunja_transform.cpp
+++ b/test/benchmarks/transform/bench_vikunja_transform.cpp
@@ -1,4 +1,4 @@
-/* Copyright 2021 Simeon Ehrig
+/* Copyright 2022 Simeon Ehrig
  *
  * This file is part of vikunja.
  *
@@ -15,26 +15,24 @@
 #include <alpaka/alpaka.hpp>
 #include <alpaka/example/ExampleDefaultAcc.hpp>
 
-#include <numeric>
-
 #include <catch2/catch.hpp>
 
-template<typename TData>
-inline void transform_benchmark(int size)
+template<typename TData, typename TIdx>
+inline void transform_benchmark(TIdx size)
 {
     using Setup = vikunja::test::TestAlpakaSetup<
         alpaka::DimInt<1u>, // dim
-        std::uint64_t, // Idx
+        TIdx, // Idx
         alpaka::AccCpuSerial, // host type
         alpaka::ExampleDefaultAcc, // device type
         alpaka::Blocking // queue type
         >;
-    using Vec = alpaka::Vec<Setup::Dim, Setup::Idx>;
+    using Vec = alpaka::Vec<typename Setup::Dim, typename Setup::Idx>;
 
-    INFO((vikunja::test::print_acc_info<Setup::Dim>(size)));
+    INFO((vikunja::test::print_acc_info<typename Setup::Dim>(size)));
 
     Setup setup;
-    Vec extent = Vec::all(static_cast<Setup::Idx>(size));
+    Vec extent = Vec::all(static_cast<typename Setup::Idx>(size));
 
     auto devMemInput = vikunja::bench::allocate_mem_iota<TData>(
         setup,
@@ -45,16 +43,15 @@ inline void transform_benchmark(int size)
     TData* devMemInputPtrBegin = alpaka::getPtrNative(devMemInput);
     TData* devMemInputPtrEnd = devMemInputPtrBegin + size;
 
-    auto devMemOutput = alpaka::allocBuf<TData, Setup::Idx>(setup.devAcc, extent);
+    auto devMemOutput = alpaka::allocBuf<TData, typename Setup::Idx>(setup.devAcc, extent);
     TData* devMemOutputPtrBegin = alpaka::getPtrNative(devMemOutput);
 
-    auto hostMemOutput = alpaka::allocBuf<TData, Setup::Idx>(setup.devHost, extent);
+    auto hostMemOutput = alpaka::allocBuf<TData, typename Setup::Idx>(setup.devHost, extent);
     TData* hostMemOutputPtrBegin = alpaka::getPtrNative(hostMemOutput);
-    TData* hostMemOutputPtrEnd = hostMemOutputPtrBegin + size;
 
     auto functor = [] ALPAKA_FN_HOST_ACC(TData const i) -> TData { return 2 * i; };
 
-    vikunja::transform::deviceTransform<Setup::Acc>(
+    vikunja::transform::deviceTransform<typename Setup::Acc>(
         setup.devAcc,
         setup.queueAcc,
         devMemInputPtrBegin,
@@ -64,11 +61,11 @@ inline void transform_benchmark(int size)
 
     alpaka::memcpy(setup.queueAcc, hostMemOutput, devMemOutput, extent);
 
-    TData result = std::reduce(hostMemOutputPtrBegin, hostMemOutputPtrEnd, static_cast<TData>(0));
-    TData expected_result = extent.prod() * (extent.prod() + 1);
-
-    // verify, that vikunja transform is working with problem size
-    REQUIRE(expected_result == Approx(result));
+    for(auto i = static_cast<typename Setup::Idx>(0); i < size; ++i)
+    {
+        TData expected_result = static_cast<TData>(2) * static_cast<TData>(i + 1);
+        REQUIRE(expected_result == Approx(hostMemOutputPtrBegin[i]));
+    }
 
     // honeypot to check that the function call in the benchmark block has not been removed by the optimizer
     hostMemOutputPtrBegin[0] = static_cast<TData>(42);
@@ -76,7 +73,7 @@ inline void transform_benchmark(int size)
 
     BENCHMARK("transform vikunja")
     {
-        vikunja::transform::deviceTransform<Setup::Acc>(
+        return vikunja::transform::deviceTransform<typename Setup::Acc>(
             setup.devAcc,
             setup.queueAcc,
             devMemInputPtrBegin,
@@ -87,31 +84,13 @@ inline void transform_benchmark(int size)
 
     alpaka::memcpy(setup.queueAcc, hostMemOutput, devMemOutput, extent);
 
-    result = std::reduce(hostMemOutputPtrBegin, hostMemOutputPtrEnd, static_cast<TData>(0));
-    REQUIRE(expected_result == Approx(result));
-}
-
-TEST_CASE("benchmark transform - int", "[transform][vikunja][int]")
-{
-    using Data = int;
-    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
-
-    transform_benchmark<Data>(size);
-}
-
-TEST_CASE("benchmark transform - float", "[transform][vikunja][float]")
-{
-    using Data = float;
-    // removed 1'270'000 because of rounding errors.
-    int size = GENERATE(100, 100'000, 2'000'000);
-
-    transform_benchmark<Data>(size);
+    REQUIRE(static_cast<TData>(2) == Approx(hostMemOutputPtrBegin[0]));
 }
 
-TEST_CASE("benchmark transform - double", "[transform][vikunja][double]")
+TEMPLATE_TEST_CASE("bechmark transform", "[benchmark][transform][vikunja]", int, float, double)
 {
-    using Data = double;
-    int size = GENERATE(100, 100'000, 1'270'000, 2'000'000);
+    using Data = TestType;
+    using Idx = std::uint64_t;
 
-    transform_benchmark<Data>(size);
+    transform_benchmark<Data, Idx>(GENERATE(100, 100'000, 1'270'000, 2'000'000));
 }

From 1ae302ef8101e178e06893aa119167d567aa9b68 Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Tue, 25 Jan 2022 16:19:08 +0000
Subject: [PATCH 7/8] add benchmark helper function allocate_mem_constant()

---
 test/benchmarks/helper/test_bench_helper.cpp  | 33 +++++++
 .../include/vikunja/bench/memory.hpp          | 94 +++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/test/benchmarks/helper/test_bench_helper.cpp b/test/benchmarks/helper/test_bench_helper.cpp
index 3749a7e..c7b4ace 100644
--- a/test/benchmarks/helper/test_bench_helper.cpp
+++ b/test/benchmarks/helper/test_bench_helper.cpp
@@ -88,3 +88,36 @@ TEMPLATE_TEST_CASE("allocate_mem_iota different increment", "[iota]", int, float
         REQUIRE_MESSAGE(expected_result == hostMemPtr[i], "failed with index: " + std::to_string(i));
     }
 }
+
+TEMPLATE_TEST_CASE("allocate_mem_constant", "[iota]", int, float, double)
+{
+    using Data = TestType;
+    using Setup = vikunja::test::TestAlpakaSetup<
+        alpaka::DimInt<1u>, // dim
+        int, // Idx
+        alpaka::AccCpuSerial, // host type
+        alpaka::ExampleDefaultAcc, // device type
+        alpaka::Blocking // queue type
+        >;
+    using Vec = alpaka::Vec<Setup::Dim, Setup::Idx>;
+
+    Setup::Idx size = GENERATE(1, 10, 3045, 2'000'000);
+    Data constant = GENERATE(0, 1, 45, -42);
+
+    INFO((vikunja::test::print_acc_info<Setup::Dim>(size)));
+    INFO("constant: " + std::to_string(constant));
+
+    Setup setup;
+    Vec extent = Vec::all(static_cast<Setup::Idx>(size));
+
+    auto devMem = vikunja::bench::allocate_mem_constant<Data>(setup, extent, constant);
+    auto hostMem(alpaka::allocBuf<Data, typename Setup::Idx>(setup.devHost, extent));
+    Data* const hostMemPtr(alpaka::getPtrNative(hostMem));
+
+    alpaka::memcpy(setup.queueAcc, hostMem, devMem, extent);
+
+    for(Setup::Idx i = 0; i < size; ++i)
+    {
+        REQUIRE(static_cast<Data>(constant) == hostMemPtr[i]);
+    }
+}
diff --git a/test/benchmarks/include/vikunja/bench/memory.hpp b/test/benchmarks/include/vikunja/bench/memory.hpp
index ff49406..e6f4e67 100644
--- a/test/benchmarks/include/vikunja/bench/memory.hpp
+++ b/test/benchmarks/include/vikunja/bench/memory.hpp
@@ -116,4 +116,98 @@ namespace vikunja::bench
 
         return devMem;
     }
+
+    template<typename TData>
+    class ConstantInitFunctor
+    {
+    private:
+        TData const m_constant;
+
+    public:
+        //! Functor to write constant value in each element of a vector.
+        //!
+        //! \tparam TData Type of each element
+        //! \param begin Value of all elements.
+        ConstantInitFunctor(TData const constant) : m_constant(constant)
+        {
+        }
+
+        //! Writes the constant to each element of the output vector.
+        //!
+        //! \tparam TAcc The accelerator environment to be executed on.
+        //! \tparam TIdx The index type used for element indices and the element count.
+        //! \param acc The accelerator to be executed on.
+        //! \param output The destination vector.
+        //! \param numElements The number of elements.
+        ALPAKA_NO_HOST_ACC_WARNING
+        template<typename TAcc, typename TIdx>
+        ALPAKA_FN_ACC auto operator()(TAcc const& acc, TData* const output, TIdx const& numElements) const -> void
+        {
+            static_assert(alpaka::Dim<TAcc>::value == 1, "ConstantInitFunctor expects 1-dimensional indices!");
+
+            TIdx const gridThreadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+            TIdx const threadElemExtent(alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+            TIdx const threadFirstElemIdx(gridThreadIdx * threadElemExtent);
+
+            if(threadFirstElemIdx < numElements)
+            {
+                // Compute the exclusive end index of this thread's element range and clip it
+                // to numElements, so the last thread does not write past the end of the output.
+                TIdx const threadLastElemIdx(threadFirstElemIdx + threadElemExtent);
+                TIdx const threadLastElemIdxClipped(alpaka::math::min(acc, numElements, threadLastElemIdx));
+
+                for(TIdx i(threadFirstElemIdx); i < threadLastElemIdxClipped; ++i)
+                {
+                    output[i] = m_constant;
+                }
+            }
+        }
+    };
+
+    //! Allocates memory and initialises each value with a constant value.
+    //! The allocation is done with `setup.devAcc`.
+    //!
+    //! \tparam TData Data type of the memory buffer.
+    //! \tparam TSetup Fully specialized type of `vikunja::test::TestAlpakaSetup`.
+    //! \tparam Type of the extent.
+    //! \tparam TBuf Type of the alpaka memory buffer.
+    //! \param setup Instance of `vikunja::test::TestAlpakaSetup`. The `setup.devAcc` and `setup.queueDev` is used
+    //! for allocation and initialization of the the memory.
+    //! \param extent Size of the memory buffer. Needs to be 1 dimensional.
+    //! \param begin Value of the constant.
+    template<
+        typename TData,
+        typename TSetup,
+        typename TExtent,
+        typename TBuf = alpaka::Buf<typename TSetup::DevAcc, TData, alpaka::DimInt<1u>, typename TSetup::Idx>>
+    TBuf allocate_mem_constant(TSetup& setup, TExtent const& extent, TData const constant)
+    {
+        // TODO: test also 2 and 3 dimensional memory
+        static_assert(TExtent::Dim::value == 1);
+
+        // TODO: optimize utilization for CPU backends
+        typename TSetup::Idx const elementsPerThread = 1;
+        typename TSetup::Idx linSize = extent.prod();
+
+        TBuf devMem(alpaka::allocBuf<TData, typename TSetup::Idx>(setup.devAcc, extent));
+
+        alpaka::WorkDivMembers<typename TSetup::Dim, typename TSetup::Idx> const workDiv(
+            alpaka::getValidWorkDiv<typename TSetup::Acc>(
+                setup.devAcc,
+                extent,
+                elementsPerThread,
+                false,
+                alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
+
+        ConstantInitFunctor constantInitFunctor(constant);
+
+        alpaka::exec<typename TSetup::Acc>(
+            setup.queueAcc,
+            workDiv,
+            constantInitFunctor,
+            alpaka::getPtrNative(devMem),
+            linSize);
+
+        return devMem;
+    }
 } // namespace vikunja::bench

From c2fcf288a8e2cf1329537e980cc969a26cf29db9 Mon Sep 17 00:00:00 2001
From: Simeon Ehrig <s.ehrig@hzdr.de>
Date: Thu, 27 Jan 2022 13:05:10 +0000
Subject: [PATCH 8/8] fix typos in benchmark related documentation

- improve CMake for benchmarks
- Thanks to Hannes

Co-authored-by: Jan Stephan <j.stephan@hzdr.de>
---
 CMakeLists.txt                                |  2 +-
 docs/source/advanced/cmake.rst                | 14 +++++-----
 docs/source/development/test.rst              | 20 +++++++------
 test/CMakeLists.txt                           |  3 ++
 test/benchmarks/CMakeLists.txt                |  8 ------
 .../include/vikunja/bench/memory.hpp          | 28 +++++++++----------
 .../reduce/bench_vikunja_reduce.cpp           |  2 +-
 7 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 624ba44..5aedee0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,7 @@ option(BUILD_TESTING "Build the testing tree." OFF)
 cmake_dependent_option(VIKUNJA_SYSTEM_CATCH2 "Use your local installation of Catch2" ON BUILD_TESTING OFF)
 cmake_dependent_option(VIKUNJA_ENABLE_CXX_TEST "Builds test that checks if the C++ standard is set correctly" OFF BUILD_TESTING OFF)
 cmake_dependent_option(VIKUNJA_ENABLE_BENCHMARKS "Enable benchmarks" OFF BUILD_TESTING OFF)
-cmake_dependent_option(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS "Enable benchmarks" OFF VIKUNJA_ENABLE_BENCHMARKS OFF)
+cmake_dependent_option(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS "Enable benchmarks using CUDA Thrust" OFF "VIKUNJA_ENABLE_BENCHMARKS;ALPAKA_ACC_GPU_CUDA_ENABLE" OFF)
 
 # activate support for host/device lambdas in cuda
 # needs to be set before alpaka is included
diff --git a/docs/source/advanced/cmake.rst b/docs/source/advanced/cmake.rst
index 0513b7c..78baa93 100644
--- a/docs/source/advanced/cmake.rst
+++ b/docs/source/advanced/cmake.rst
@@ -30,29 +30,29 @@ Testing
 **VIKUNJA_SYSTEM_CATCH2** (OFF)
     .. code-block::
 
-        Only works if BUILD_TESTING is ON.
+        Requires BUILD_TESTING to be ON.
         Use your local installation of Catch2.
         Otherwise, it will be automatically downloaded and installed in the local build folder.
 
 **VIKUNJA_ENABLE_CXX_TEST** (OFF)
     .. code-block::
 
-        Only works if BUILD_TESTING is ON.
+        Requires BUILD_TESTING to be ON.
         Special test that checks if ALPAKA_CXX_STANDARD works correctly.
         The implementation is very compiler specific, so it is possible that the test is not
-        supported by your used C++ compiler.
+        supported by your C++ compiler.
 
 **VIKUNJA_ENABLE_BENCHMARKS** (OFF)
     .. code-block::
 
-        Only works if BUILD_TESTING is ON.
-        Enable the benchmarks. The benchmarks are built automatically and can be executed via ctest.
+        Requires BUILD_TESTING to be ON.
+        Enables the benchmarks. The benchmarks are built automatically and can be executed via CTest.
 
 **VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS** (OFF)
     .. code-block::
 
-        Only works if VIKUNJA_ENABLE_BENCHMARKS and ALPAKA_ACC_GPU_CUDA_ENABLE are ON.
-        Enable thrust benchmarks for comparison.
+        Requires VIKUNJA_ENABLE_BENCHMARKS and ALPAKA_ACC_GPU_CUDA_ENABLE to be ON.
+        Enables Thrust benchmarks for comparison.
 
 alpaka
 ++++++
diff --git a/docs/source/development/test.rst b/docs/source/development/test.rst
index dce9cc1..1e16606 100644
--- a/docs/source/development/test.rst
+++ b/docs/source/development/test.rst
@@ -1,12 +1,12 @@
 Testing and Benchmarking
 ========================
 
-Vikunja offers different types of tests. The source code is tested via unit and integration tests with `Cacht2 <https://github.com/catchorg/Catch2/tree/v2.x>`_. The CMake code is tested with integration tests and custom scripts.
+Vikunja offers different types of tests. The source code is tested via unit and integration tests with `Catch2 <https://github.com/catchorg/Catch2/tree/v2.x>`_. The CMake code is tested with integration tests and custom scripts.
 
 Source Code Tests
 -----------------
 
-Before you start writing source code tests, you should read the `Catch2 documentation <https://github.com/catchorg/Catch2/blob/v2.x/docs/tutorial.md#top>`_. Tests written with Catch2 are standalone application. Therefore, they have their own source code files and ``CMakeLists.txt`` files located in the ``test/unit`` and ``test/integ`` folders. If you set the CMake argument ``-DBUILD_TESTING=ON``, the executable files of the tests will be built automatically. All test cases are registered via the CMake function ``add_test``. Therefore, you can automatically run all tests in the build folder with the ``ctest`` command:
+Before you start writing source code tests, you should read the `Catch2 documentation <https://github.com/catchorg/Catch2/blob/v2.x/docs/tutorial.md#top>`_. Tests written with Catch2 are standalone executables. They have their own source code files and ``CMakeLists.txt`` files located in the ``test/unit`` and ``test/integ`` folders. If you set the CMake argument ``-DBUILD_TESTING=ON``, the tests will be built automatically. All test executables are registered via the CMake function ``add_test``. Therefore, you can automatically run all tests from the build folder with the ``ctest`` command:
 
 .. code-block:: bash
 
@@ -17,7 +17,7 @@ Before you start writing source code tests, you should read the `Catch2 document
 
 For more CMake arguments for the tests, see the :ref:`CMake section <cmake-test>`.
 
-If you only want to run a single test, you can run the test executable directly. All test executables are located in the ``<build_folder>/tests``. It is also possible to run the executable with the ``--help`` flag to show additional options. For example, the ``-s`` flag displays additional information created with the Catch2 function ``INFO()``.
+If you only want to run a single test, you can run the test executable directly. All test executables are located in ``<build_folder>/tests``. It is also possible to run the executable with the ``--help`` flag to show additional options. For example, the ``-s`` flag displays additional information created with the Catch2 function ``INFO()``.
 
 .. code-block:: bash
 
@@ -29,27 +29,31 @@ If you only want to run a single test, you can run the test executable directly.
     # run test with extra output
     test/integ/reduce/test_reduce -s
 
+.. tip::
+
+    Each test is a CMake target that you can build separately. A test target always starts with ``test_``. To get all available test CMake targets, run ``cmake --build . -t help | grep 'test_'`` in the build folder. You can build a specific test with ``cmake --build . -t test_IndividualTestCase``.
+
 CMake Tests
 -----------
 
-The CMake integration tests check whether vikunja can be used correctly in another project via the CMake functions ``find_package()``) or ``add_subdirectory``. The CI contains test jobs for creating projects that use the vikunja library. The job names start with ``integration``. All associated files for the tests are in ``script/integration_test``.
+The CMake integration tests check whether vikunja can be used correctly in another project via the CMake functions ``find_package()`` or ``add_subdirectory``. The CI contains test jobs which create dummy projects that use the vikunja library. The job names start with ``integration``. All associated files for the tests are in ``script/integration_test``.
 
 CXX Test
 ++++++++
 
-There is a special Catch2 test that tests the vikunja CMake to see if the C++ standard is set correctly. The name of the test is ``test_cxx``. The test compares the C++ standard set by the compiler with an expected standard passed as an argument. By default, ``ctest`` automatically passes the expected C++ standard depending on the CMake variable ``ALPAKA_CXX_STANDARD``. If you run the test manually, you must pass it yourself:
+There is a special Catch2 test that tests vikunja's build system to see if the C++ standard is set correctly. The name of the test is ``test_cxx``. It compares the C++ standard set by the compiler with an expected standard passed as an argument. By default, ``ctest`` automatically passes the expected C++ standard depending on the CMake variable ``ALPAKA_CXX_STANDARD``. If you run the test manually, you must pass it yourself:
 
 .. code-block:: bash
 
-    # expected, that the code was compiled with C++ 17
+    # expects that the code was compiled with C++17
     test/unit/cxx/test_cxx --cxx 17
 
 
 Benchmarks
 ----------
 
-Vikunja uses `Catch2 benchmark <https://github.com/catchorg/Catch2/blob/v2.x/docs/benchmarks.md#top>`_ to automatically run benchmarks. By default, the benchmarks are not enabled. To enable the benchmarks, the CMake arguments ``-DBUILD_TESTING=ON -DVIKUNJA_ENABLE_BENCHMARKS=ON`` must be set. The benchmarks are created automatically and can be run with ``ctest``. As with the tests, you can run a particular benchmark directly from the executable file, e.g. ``test/benchmarks/transform/bench_vikunja_transform``. All benchmark executables are located in ``<build_folder>/test/benchmarks``.
+Vikunja uses `Catch2 benchmark <https://github.com/catchorg/Catch2/blob/v2.x/docs/benchmarks.md#top>`_ to automatically run benchmarks. By default, benchmarks are not enabled. To enable them, the CMake arguments ``-DBUILD_TESTING=ON -DVIKUNJA_ENABLE_BENCHMARKS=ON`` must be set. The benchmarks are created automatically and can be run with ``ctest``. As with the tests, you can run a particular benchmark directly from the executable file, e.g. ``test/benchmarks/transform/bench_vikunja_transform``. All benchmark executables are located in ``<build_folder>/test/benchmarks``.
 
 .. tip::
 
-    If you run ``bechmark_exe --help``, you get benchmark specific options.
+    If you run ``<benchmark_exe> --help``, you get benchmark-specific options.
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 5212a51..ac73469 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -33,6 +33,9 @@ target_link_libraries(vikunjaTestSetup
   PUBLIC
   Catch2::Catch2
   )
+if(VIKUNJA_ENABLE_BENCHMARKS)
+  target_compile_definitions(vikunjaTestSetup PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
+endif()
 add_library(vikunja::testSetup ALIAS vikunjaTestSetup)
 
 list(APPEND _VIKUNJA_TEST_OPTIONS "--use-colour yes")
diff --git a/test/benchmarks/CMakeLists.txt b/test/benchmarks/CMakeLists.txt
index 1aba1c6..5928b76 100644
--- a/test/benchmarks/CMakeLists.txt
+++ b/test/benchmarks/CMakeLists.txt
@@ -13,14 +13,6 @@ target_compile_definitions(vikunjaBenchSetup INTERFACE CATCH_CONFIG_ENABLE_BENCH
 target_include_directories(vikunjaBenchSetup INTERFACE include)
 add_library(vikunja::benchSetup ALIAS vikunjaBenchSetup)
 
-target_compile_definitions(vikunjaTestSetup PRIVATE CATCH_CONFIG_ENABLE_BENCHMARKING)
-
-if(VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS)
-    if(NOT ALPAKA_ACC_GPU_CUDA_ENABLE)
-        message(FATAL_ERROR "VIKUNJA_ENABLE_CUDA_THRUST_BENCHMARKS requires the ALPAKA_ACC_GPU_CUDA_ENABLE backend to be enabled.")
-    endif()
-endif()
-
 add_subdirectory("helper/")
 add_subdirectory("transform/")
 add_subdirectory("reduce/")
diff --git a/test/benchmarks/include/vikunja/bench/memory.hpp b/test/benchmarks/include/vikunja/bench/memory.hpp
index e6f4e67..5b2a0c8 100644
--- a/test/benchmarks/include/vikunja/bench/memory.hpp
+++ b/test/benchmarks/include/vikunja/bench/memory.hpp
@@ -22,16 +22,16 @@ namespace vikunja::bench
         TData const m_increment;
 
     public:
-        //! Functor for iota implementation with generic data type.
+        //! Iota functor for generic data types.
         //!
         //! \tparam TData Type of each element
-        //! \param begin Value of the first element.
+        //! \param init Value of the first element.
         //! \param increment Distance between two elements.
-        IotaFunctor(TData const begin, TData const increment) : m_begin(begin), m_increment(increment)
+        IotaFunctor(TData const init, TData const increment) : m_begin(init), m_increment(increment)
         {
         }
 
-        //! Writes the result of `begin + index * increment` to each element of the output vector.
+        //! Writes the result of `init + index * increment` to each element of the output vector.
         //!
         //! \tparam TAcc The accelerator environment to be executed on.
         //! \tparam TElem The element type.
@@ -64,17 +64,17 @@ namespace vikunja::bench
     };
 
 
-    //! Allocates memory and initialises each value with `begin + index * increment`,
+    //! Allocates memory and initializes each value with `init + index * increment`,
     //! where index is the position in the output vector. The allocation is done with `setup.devAcc`.
     //!
     //! \tparam TData Data type of the memory buffer.
     //! \tparam TSetup Fully specialized type of `vikunja::test::TestAlpakaSetup`.
     //! \tparam Type of the extent.
     //! \tparam TBuf Type of the alpaka memory buffer.
-    //! \param setup Instance of `vikunja::test::TestAlpakaSetup`. The `setup.devAcc` and `setup.queueDev` is used
+    //! \param setup Instance of `vikunja::test::TestAlpakaSetup`. `setup.devAcc` and `setup.queueAcc` are used
     //! for allocation and initialization of the the memory.
     //! \param extent Size of the memory buffer. Needs to be 1 dimensional.
-    //! \param begin Value of the first element. Depending of TData, it can be negative.
+    //! \param init Value of the first element. Depending on TData, it can be negative.
     //! \param increment Distance between two elements of the vector. If the value is negative, the value of an
     //! element is greater than its previous element.
     template<
@@ -85,7 +85,7 @@ namespace vikunja::bench
     TBuf allocate_mem_iota(
         TSetup& setup,
         TExtent const& extent,
-        TData const begin = TData{0},
+        TData const init = TData{0},
         TData const increment = TData{1})
     {
         // TODO: test also 2 and 3 dimensional memory
@@ -105,7 +105,7 @@ namespace vikunja::bench
                 false,
                 alpaka::GridBlockExtentSubDivRestrictions::Unrestricted));
 
-        IotaFunctor iotaFunctor(begin, increment);
+        IotaFunctor iotaFunctor(init, increment);
 
         alpaka::exec<typename TSetup::Acc>(
             setup.queueAcc,
@@ -124,10 +124,10 @@ namespace vikunja::bench
         TData const m_constant;
 
     public:
-        //! Functor to write constant value in each element of a vector.
+        //! Functor to write a constant value into each element of a vector.
         //!
         //! \tparam TData Type of each element
-        //! \param begin Value of all elements.
+        //! \param constant Value to which all elements are set.
         ConstantInitFunctor(TData const constant) : m_constant(constant)
         {
         }
@@ -164,17 +164,17 @@ namespace vikunja::bench
         }
     };
 
-    //! Allocates memory and initialises each value with a constant value.
+    //! Allocates memory and initializes each value with a constant value.
     //! The allocation is done with `setup.devAcc`.
     //!
     //! \tparam TData Data type of the memory buffer.
     //! \tparam TSetup Fully specialized type of `vikunja::test::TestAlpakaSetup`.
     //! \tparam Type of the extent.
     //! \tparam TBuf Type of the alpaka memory buffer.
-    //! \param setup Instance of `vikunja::test::TestAlpakaSetup`. The `setup.devAcc` and `setup.queueDev` is used
+    //! \param setup Instance of `vikunja::test::TestAlpakaSetup`. `setup.devAcc` and `setup.queueAcc` are used
     //! for allocation and initialization of the the memory.
     //! \param extent Size of the memory buffer. Needs to be 1 dimensional.
-    //! \param begin Value of the constant.
+    //! \param constant Value of the constant.
     template<
         typename TData,
         typename TSetup,
diff --git a/test/benchmarks/reduce/bench_vikunja_reduce.cpp b/test/benchmarks/reduce/bench_vikunja_reduce.cpp
index b3227d0..2fb4913 100644
--- a/test/benchmarks/reduce/bench_vikunja_reduce.cpp
+++ b/test/benchmarks/reduce/bench_vikunja_reduce.cpp
@@ -91,7 +91,7 @@ TEMPLATE_TEST_CASE("bechmark reduce", "[benchmark][reduce][vikunja]", int, float
     }
     else if constexpr(std::is_same_v<Data, float>)
     {
-        // removed 1'270'000 because of rounding errors.
+        // removed 1'270'000 because of precision errors.
         reduce_benchmark<Data, Idx>(GENERATE(100, 100'000, 2'000'000));
     }
     else if constexpr(std::is_same_v<Data, double>)