Test mdspan for handling memory #91

Open
wants to merge 3 commits into base: master

Changes from 1 commit
1 change: 1 addition & 0 deletions example/CMakeLists.txt
@@ -1,3 +1,4 @@
cmake_minimum_required(VERSION 3.18)
add_subdirectory("reduce/")
add_subdirectory("transform/")
add_subdirectory("mdspan_transform/")
4 changes: 4 additions & 0 deletions example/mdspan_transform/CMakeLists.txt
@@ -0,0 +1,4 @@
cmake_minimum_required(VERSION 3.18)
set(_TARGET_NAME "example_mdspan_transform")
alpaka_add_executable(${_TARGET_NAME} src/transform-main.cpp)
target_link_libraries(${_TARGET_NAME} PUBLIC vikunja::internalvikunja)
124 changes: 124 additions & 0 deletions example/mdspan_transform/src/transform-main.cpp
@@ -0,0 +1,124 @@
// C++ Insights cannot compile the alpaka.hpp header
#define CPPINSIGHT_TEST 0

#include <vikunja/access/MdspanLinear.hpp>

#if CPPINSIGHT_TEST == 0

# include <vikunja/algorithm/transform.hpp>

# include <alpaka/alpaka.hpp>

#endif

#include <experimental/mdspan>
#include <array> // std::array (C++ Insights branch)
#include <cstdint> // std::uint64_t
#include <iostream>
#include <numeric> // std::accumulate
#include <type_traits>

/**
 * @brief Does the same as std::iota, but for an n-dimensional mdspan. The iteration order is from the rightmost to
 * the leftmost dimension.
 *
 * @tparam TSpan type of the mdspan
 * @tparam TData type of the start value
 * @param span The mdspan
 * @param index value of the first element
 */
template<typename TSpan, typename TData>
void iota_span(TSpan span, TData index)
{
    static_assert(TSpan::rank() > 0);
    auto functor = [&index](TData /* ignored input element */) { return index++; };
    Iterate_mdspan<TSpan::rank()>{}(span, span, functor);
}
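// Example: for a 2 x 3 mdspan, iota_span(span, 0) writes 0 to element (0, 0), 1 to (0, 1), 2 to (0, 2),
// then 3 to (1, 0) and so on; with the default layout_right the underlying memory ends up as 0 1 2 3 4 5
// (the rightmost dimension varies fastest).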

#if CPPINSIGHT_TEST == 1

int main()
{
    std::array<int, 12> d;

    // stdex::mdspan m{d.data(), stdex::extents{12}};
    // stdex::mdspan m{d.data(), stdex::extents{2, 6}};
    // stdex::mdspan m{d.data(), stdex::extents{2, 4, 2}};
    stdex::mdspan m{d.data(), stdex::extents{2, 2, 1, 4}};

    iota_span(m, 42);

    for(auto const& v : d)
    {
        std::cout << v << " ";
    }
    std::cout << std::endl;

    return 0;
}

#else

int main()
{
    using Idx = std::uint64_t;
    Idx const num_dims = 5;
    Idx const dim_size = 6;

    using Acc = alpaka::AccCpuSerial<alpaka::DimInt<num_dims>, Idx>;
    // using Acc = alpaka::AccGpuCudaRt<alpaka::DimInt<num_dims>, Idx>;

    auto const devAcc(alpaka::getDevByIdx<Acc>(0u));
    auto const devHost(alpaka::getDevByIdx<alpaka::PltfCpu>(0u));

    using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
    QueueAcc queueAcc(devAcc);

    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Data = uint64_t;

    using Vec = alpaka::Vec<Dim, Idx>;
    Vec extent(Vec::all(num_dims));
    for(Idx dim = 0; dim < num_dims; ++dim)
    {
        extent[dim] = static_cast<Idx>(dim_size);
    }

    auto deviceMem(alpaka::allocBuf<Data, Idx>(devAcc, extent));
    auto deviceSpan = alpaka::getMdSpan(deviceMem);
    auto hostMem(alpaka::allocBuf<Data, Idx>(devHost, extent));
    Data* hostNativePtr = alpaka::getPtrNative(hostMem);
    auto hostSpan = alpaka::getMdSpan(hostMem);

    iota_span(hostSpan, static_cast<Data>(1)); // fill the host memory with 1, 2, 3, ...

    auto doubleNum = [] ALPAKA_FN_HOST_ACC(Data const& i) { return 2 * i; };

    alpaka::memcpy(queueAcc, deviceMem, hostMem, extent);

    vikunja::device::transform<Acc>(devAcc, queueAcc, deviceSpan, deviceSpan, doubleNum);

    // Copy the data back to the host for validation.
    alpaka::memcpy(queueAcc, hostMem, deviceMem, extent);

    Data resultSum = std::accumulate(hostNativePtr, hostNativePtr + extent.prod(), 0);
Review comment (Member), suggested change:
-    Data resultSum = std::accumulate(hostNativePtr, hostNativePtr + extent.prod(), 0);
+    Data resultSum = std::reduce(hostNativePtr, hostNativePtr + extent.prod());
(See the note after this file on why the accumulation type differs between the two.)

    // iota_span filled the host data with 1..N and the transform doubled every element, so the expected sum is
    // 2 * (N * (N + 1) / 2) = N * (N + 1) with N = extent.prod().
    Data expectedResult = (extent.prod() * (extent.prod() + 1));

    std::cout << "Testing accelerator: " << alpaka::getAccName<Acc>() << " with size: " << extent.prod() << "\n";
    if(expectedResult == resultSum)
    {
        std::cout << "Transform was successful!\n";
    }
    else
    {
        std::cout << "Transform was not successful!\n"
                  << "expected result: " << expectedResult << "\n"
                  << "actual result: " << resultSum << std::endl;
    }

    return 0;
}
#endif
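Note on the review suggestion above: std::accumulate sums in the type of its initial value, so the int literal 0 makes the running sum an int that can overflow before it is stored in resultSum, while std::reduce without an initial value starts from the iterator's value type (here Data) and may reorder the commutative sum. A minimal stand-alone sketch of the two variants (not part of the diff; the helper names are only illustrative):

#include <cstdint>
#include <numeric>

using Data = std::uint64_t;

// std::accumulate: the accumulator has the type of the initial value,
// so an explicit Data{0} is needed to sum in 64 bit.
Data sum_accumulate(Data const* first, Data const* last)
{
    return std::accumulate(first, last, Data{0});
}

// std::reduce: without an initial value the accumulator is the value type (Data);
// the reduction may be reordered, which is fine for this commutative sum.
Data sum_reduce(Data const* first, Data const* last)
{
    return std::reduce(first, last);
}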
111 changes: 111 additions & 0 deletions include/vikunja/access/MdspanLinear.hpp
@@ -0,0 +1,111 @@
#pragma once

#include <experimental/mdspan>

#include <cassert>
#include <cstddef> // std::size_t

namespace stdex = std::experimental;

/**
 * @brief Construct a submdspan of an mdspan. The submdspan has one rank less than the mdspan. The leftmost dimension
 * is fixed to a specific index; the remaining dimensions contain the full range.
 *
 * @tparam TRank Rank of the new submdspan (needs to be mdspan::rank() - 1).
 */
template<int TRank>
struct Construct_Submdspan;

template<>
struct Construct_Submdspan<1>
{
    template<typename TSpan, typename... Types>
    constexpr auto construct(TSpan span, std::size_t const fixed_index_pos, Types... args)
    {
        return stdex::submdspan(span, fixed_index_pos, args...);
    }
};

template<int TRank>
struct Construct_Submdspan
{
    /**
     * @brief Returns the submdspan of an mdspan, with one dimension less.
     *
     * @tparam TSpan Type of the span
     * @tparam Types needs to be std::experimental::full_extent_t
     * @param span mdspan from which the submdspan is created
     * @param fixed_index_pos Index position of the fixed dimension
     * @param args needs to be std::experimental::full_extent
     * @return constexpr auto returns a stdex::submdspan
     */
    template<typename TSpan, typename... Types>
    constexpr auto construct(TSpan span, std::size_t const fixed_index_pos, Types... args)
    {
        return Construct_Submdspan<TRank - 1>{}.construct(span, fixed_index_pos, stdex::full_extent, args...);
    }
};

/**
 * @brief Returns a submdspan of an mdspan. The submdspan has one rank less than the mdspan. The leftmost dimension
 * is fixed to a specific index; the remaining dimensions contain the full range.
 *
 * @tparam TMDSpan
 * @param span mdspan from which the submdspan is created
 * @param fixed_index_pos Index position of the fixed dimension
 * @return constexpr auto returns a stdex::submdspan
 */
template<typename TMDSpan>
constexpr auto submdspan_remove_dim(TMDSpan span, std::size_t const fixed_index_pos)
{
    constexpr auto rank = TMDSpan::rank();
    return Construct_Submdspan<rank - 1>{}.construct(span, fixed_index_pos, stdex::full_extent);
}

/**
 * @brief Iterates over all elements of an n-dimensional mdspan. The iteration order is from the rightmost to the
 * leftmost dimension.
 *
 * @tparam TDim Rank of the mdspan
 */
template<int TDim>
struct Iterate_mdspan;

template<>
struct Iterate_mdspan<1>
{
    template<typename TSpan, typename TFunc>
    void operator()(TSpan input, TSpan output, TFunc& functor)
    {
        assert(input.extent(0) <= output.extent(0));
        for(std::size_t i = 0; i < input.extent(0); ++i)
        {
            output(i) = functor(input(i));
        }
    }
};

template<int TDim>
struct Iterate_mdspan
{
    /**
     * @brief Iterate over all elements of an mdspan and apply the functor to each of them.
     *
     * @tparam TSpan type of the mdspans
     * @tparam TFunc type of the functor
     * @param input The input mdspan
     * @param output The output mdspan
     * @param functor The functor
     */
    template<typename TSpan, typename TFunc>
    void operator()(TSpan input, TSpan output, TFunc& functor)
    {
        assert(input.extent(0) <= output.extent(0));

        for(std::size_t i = 0; i < input.extent(0); ++i)
        {
            auto subinput = submdspan_remove_dim(input, i);
            auto suboutput = submdspan_remove_dim(output, i); // fix: take the subspan of output, not input
            Iterate_mdspan<TSpan::rank() - 1>{}(subinput, suboutput, functor);
        }
    }
};
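To make the recursion in this header concrete, here is a minimal host-only sketch (not part of the diff) that uses submdspan_remove_dim and Iterate_mdspan on a 2 x 3 mdspan; it assumes the header is reachable as vikunja/access/MdspanLinear.hpp, as in the rest of this PR:

#include <vikunja/access/MdspanLinear.hpp>

#include <array>
#include <iostream>

int main()
{
    std::array<int, 6> data{1, 2, 3, 4, 5, 6};
    // Rank-2 view: 2 rows x 3 columns over the flat array (default layout_right, i.e. row-major).
    stdex::mdspan m{data.data(), stdex::extents{2, 3}};

    // Fixing the leftmost dimension to index 1 yields a rank-1 span over row 1 = {4, 5, 6}.
    auto row1 = submdspan_remove_dim(m, 1);
    std::cout << row1(0) << " " << row1(2) << "\n"; // prints: 4 6

    // Apply a functor to every element (in place); the rightmost dimension varies fastest.
    auto square = [](int x) { return x * x; };
    Iterate_mdspan<2>{}(m, m, square);

    for(int v : data)
    {
        std::cout << v << " "; // prints: 1 4 9 16 25 36
    }
    std::cout << "\n";
    return 0;
}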
43 changes: 43 additions & 0 deletions include/vikunja/algorithm/transform.hpp
@@ -0,0 +1,43 @@
#pragma once

//#include <alpaka/alpaka.hpp>

#include <vikunja/access/BlockStrategy.hpp>
#include <vikunja/access/MdspanLinear.hpp>
#include <vikunja/operators/operators.hpp>
#include <vikunja/workdiv/BlockBasedWorkDiv.hpp>

#include <experimental/mdspan>

namespace vikunja
{
    namespace device
    {
        // FIXME: I'm only running on a single core CPU :-(
        template<
            typename TAcc,
            typename WorkDivPolicy = vikunja::workdiv::BlockBasedPolicy<TAcc>,
            typename MemAccessPolicy = vikunja::MemAccess::MemAccessPolicy<TAcc>,
            typename TDevAcc,
            typename TQueue,
            typename TData,
            typename TLayoutPolicy,
            typename TAccessorPolicy,
            typename TInputExtend,
            typename TOutputExtend,
            typename TFunc,
            typename TOperator = vikunja::operators::UnaryOp<TAcc, TFunc, TData>>
        void transform(
            TDevAcc& devAcc,
            TQueue& queue,
            std::experimental::mdspan<TData, TInputExtend, TLayoutPolicy, TAccessorPolicy> input,
            std::experimental::mdspan<TData, TOutputExtend, TLayoutPolicy, TAccessorPolicy> output,
            TFunc const& func)
        {
            constexpr auto input_rank = decltype(input)::rank();
            static_assert(input_rank > 0);

            Iterate_mdspan<input_rank>{}(input, output, func);
        }
    } // namespace device
} // namespace vikunja