Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test mdspan for handling memory #91

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions example/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Minimum CMake version needed to configure the examples.
cmake_minimum_required(VERSION 3.18)
# Add every example directory to the build; each one is configured by its own CMakeLists.txt.
add_subdirectory("reduce/")
add_subdirectory("transform/")
add_subdirectory("mdspan_transform/")
4 changes: 4 additions & 0 deletions example/mdspan_transform/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Minimum CMake version needed to configure this example.
cmake_minimum_required(VERSION 3.18)
# Name of the executable target built from this example.
set(_TARGET_NAME "example_mdspan_transform")
# NOTE(review): alpaka_add_executable is not defined in this file; presumably it comes from
# alpaka's CMake package - confirm it is available before this directory is processed.
alpaka_add_executable(${_TARGET_NAME} src/transform-main.cpp)
# Link the example against the vikunja::internalvikunja target.
target_link_libraries(${_TARGET_NAME} PUBLIC vikunja::internalvikunja)
159 changes: 159 additions & 0 deletions example/mdspan_transform/src/transform-main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
// cppinsight cannot compile the alpaka.hpp header
#define CPPINSIGHT_TEST 0

#include <vikunja/access/MdspanLinear.hpp>

#if CPPINSIGHT_TEST == 0

# include <vikunja/algorithm/transform.hpp>

# include <alpaka/alpaka.hpp>

#endif

#include <array>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <experimental/mdspan>
#include <iostream>
#include <numeric>
#include <tuple>
#include <type_traits>

/**
 * @brief Fills an n-dimensional mdspan with consecutive values, like std::iota does for a range.
 * The elements are visited with the rightmost index varying fastest.
 *
 * @tparam TSpan type of the mdspan
 * @tparam TData type of the start value and of the running counter; each counter value is converted to the
 * element type of the span when it is stored
 * @param span The mdspan to fill
 * @param index value of the first element
 */
template<typename TSpan, typename TData>
void iota_span(TSpan span, TData index)
{
    static_assert(TSpan::rank() > 0);
    // The current element is bound to an unnamed const reference and never read: the span may refer to
    // uninitialized memory, and a by-value parameter would load (and convert) every element for nothing.
    auto functor = [&index](auto const&) { return index++; };
    // The span is used as input and output at the same time, i.e. it is overwritten in place.
    Iterate_mdspan<TSpan::rank()>{}(span, span, functor);
}

#if CPPINSIGHT_TEST == 1

/**
 * @brief Minimal host-only test of iota_span: fills a small array through an mdspan view and prints it.
 */
int main()
{
    // Backing storage. The product of the extents of the mdspan below must not exceed the 12 elements
    // available here, otherwise iota_span writes past the end of the array.
    std::array<int, 12> d{};

    // Alternative shapes for the same 12 elements:
    // stdex::mdspan m{d.data(), stdex::extents{12}};
    // stdex::mdspan m{d.data(), stdex::extents{2, 6}};
    // stdex::mdspan m{d.data(), stdex::extents{2, 3, 2}};
    stdex::mdspan m{d.data(), stdex::extents{2, 3, 1, 2}};

    iota_span(m, 42);

    for(auto const& v : d)
    {
        std::cout << v << " ";
    }
    std::cout << std::endl;

    return 0;
}

#else

/**
 * @brief Example and micro benchmark for vikunja::device::transform on mdspan views.
 *
 * Fills a host buffer with 1, 2, 3, ..., copies it to the accelerator, doubles every element with
 * vikunja::device::transform, copies the result back and validates the sum of all elements. The transform is
 * repeated bench_runs times to measure the average run time.
 *
 * @return EXIT_SUCCESS if the validation passed, EXIT_FAILURE otherwise
 */
int main()
{
    using Idx = std::uint64_t;
    constexpr Idx num_dims = 3;
    constexpr Idx dim_size = 100;

    using Acc = alpaka::AccCpuSerial<alpaka::DimInt<num_dims>, Idx>;
    // using Acc = alpaka::AccGpuCudaRt<alpaka::DimInt<num_dims>, Idx>;

    auto const devAcc(alpaka::getDevByIdx<Acc>(0u));
    auto const devHost(alpaka::getDevByIdx<alpaka::PltfCpu>(0u));

    using QueueAcc = alpaka::Queue<Acc, alpaka::Blocking>;
    QueueAcc queueAcc(devAcc);

    using Dim = alpaka::Dim<Acc>;
    using Data = std::uint64_t;
    using Vec = alpaka::Vec<Dim, Idx>;

    // Every dimension has the same number of elements.
    Vec const extent(Vec::all(dim_size));
    Idx const num_elements = extent.prod();

    auto deviceInputMem(alpaka::allocBuf<Data, Idx>(devAcc, extent));
    auto deviceInputSpan = alpaka::experimental::getMdSpan(deviceInputMem);
    auto deviceOutputMem(alpaka::allocBuf<Data, Idx>(devAcc, extent));
    auto deviceOutputSpan = alpaka::experimental::getMdSpan(deviceOutputMem);
    auto hostMem(alpaka::allocBuf<Data, Idx>(devHost, extent));
    Data* hostNativePtr = alpaka::getPtrNative(hostMem);
    auto hostSpan = alpaka::experimental::getMdSpan(hostMem);

    // Input values are 1, 2, ..., num_elements. The counter uses the element type so it cannot overflow
    // before the buffer itself does.
    iota_span(hostSpan, Data{1});

    auto doubleNum = [] ALPAKA_FN_HOST_ACC(Data const& i) { return 2 * i; };

    alpaka::memcpy(queueAcc, deviceInputMem, hostMem, extent);

    constexpr std::size_t bench_runs = 10000;
    using Clock = std::chrono::high_resolution_clock;

    // One (start, stop) pair per benchmark run.
    std::array<std::tuple<Clock::time_point, Clock::time_point>, bench_runs> measure_points;

    // warm up
    vikunja::device::transform<Acc>(devAcc, queueAcc, deviceInputSpan, deviceOutputSpan, doubleNum);

    for(std::size_t run = 0; run < bench_runs; ++run)
    {
        std::get<0>(measure_points[run]) = Clock::now();
        vikunja::device::transform<Acc>(devAcc, queueAcc, deviceInputSpan, deviceOutputSpan, doubleNum);
        std::get<1>(measure_points[run]) = Clock::now();
    }

    // Copy the data back to the host for validation.
    alpaka::memcpy(queueAcc, hostMem, deviceOutputMem, extent);

    Data const resultSum = std::reduce(hostNativePtr, hostNativePtr + num_elements);

    // sum_{i=1..N} 2*i = N * (N + 1)
    Data const expectedResult = num_elements * (num_elements + 1);

    bool const success = (expectedResult == resultSum);

    std::cout << "Testing accelerator: " << alpaka::getAccName<Acc>() << " with size: " << num_elements << "\n";
    if(success)
    {
        std::cout << "Transform was successful!\n";
    }
    else
    {
        std::cout << "Transform was not successful!\n"
                  << "expected result: " << expectedResult << "\n"
                  << "actual result: " << resultSum << std::endl;
    }

    // Accumulate in floating point milliseconds so that no run is truncated to whole microseconds.
    std::chrono::duration<double, std::milli> total_runtime{0.0};
    for(auto const& [start, stop] : measure_points)
    {
        total_runtime += stop - start;
    }
    double const avg_runtime = total_runtime.count() / static_cast<double>(bench_runs);

    std::cout << "Execution time: " << avg_runtime << "ms" << std::endl;

    // Report a failed validation through the exit code so that scripts and CI can detect it.
    return success ? EXIT_SUCCESS : EXIT_FAILURE;
}
#endif
111 changes: 111 additions & 0 deletions include/vikunja/access/MdspanLinear.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#pragma once

#include <experimental/mdspan>

#include <assert.h>

namespace stdex = std::experimental;

/**
 * @brief Helper that builds a submdspan with one rank less than the source mdspan. The leftmost dimension is
 * pinned to one index, every remaining dimension keeps its full range.
 *
 * @tparam TRank Rank of the resulting submdspan, i.e. mdspan::rank() - 1.
 */
template<int TRank>
struct Construct_Submdspan;

/**
 * @brief End of the recursion: all slice specifiers have been collected and are handed to stdex::submdspan.
 */
template<>
struct Construct_Submdspan<1>
{
    /**
     * @brief Creates the submdspan from the fixed index and the collected slice specifiers.
     *
     * @tparam TSpan Type of the source mdspan
     * @tparam TSlices Types of the slice specifiers for the remaining dimensions
     * @param span mdspan from which the submdspan is created
     * @param fixed_index_pos Index at which the leftmost dimension is fixed
     * @param slices Slice specifiers, forwarded unchanged to stdex::submdspan
     * @return The result of stdex::submdspan(span, fixed_index_pos, slices...)
     */
    template<typename TSpan, typename... TSlices>
    constexpr auto construct(TSpan span, std::size_t const fixed_index_pos, TSlices... slices)
    {
        return stdex::submdspan(span, fixed_index_pos, slices...);
    }
};

/**
 * @brief Recursive case: appends one stdex::full_extent per remaining dimension and delegates to the next
 * lower rank until the specialization for rank 1 is reached.
 *
 * @tparam TRank Rank of the resulting submdspan, needs to be greater than 1 here.
 */
template<int TRank>
struct Construct_Submdspan
{
    // Rank 1 is handled by the explicit specialization. Anything smaller would count down forever.
    static_assert(TRank > 1, "Construct_Submdspan requires a resulting rank of at least 1");

    /**
     * @brief Returns the submdspan of a mdspan, with one dimension less.
     *
     * @tparam TSpan Type of the span
     * @tparam Types need to be std::experimental::full_extent_t
     * @param span mdspan from which the submdspan is created
     * @param fixed_index_pos Index position of the fixed dimension
     * @param args need to be std::experimental::full_extent
     * @return constexpr auto returns a stdex::submdspan
     */
    template<typename TSpan, typename... Types>
    constexpr auto construct(TSpan span, std::size_t const fixed_index_pos, Types... args)
    {
        // The guard stops the template recursion after the static_assert above has fired, so an invalid
        // rank produces one clear diagnostic instead of exhausting the instantiation depth.
        if constexpr(TRank > 1)
        {
            return Construct_Submdspan<TRank - 1>{}.construct(span, fixed_index_pos, stdex::full_extent, args...);
        }
    }
};

/**
 * @brief Returns a submdspan of a mdspan. The submdspan has one rank less than the mdspan. The leftmost
 * dimension is fixed to a specific index. The remaining dimensions keep their full range.
 *
 * @tparam TMDSpan Type of the mdspan, needs to have a rank of at least 2
 * @param span mdspan from which the submdspan is created
 * @param fixed_index_pos Index position of the fixed dimension
 * @return constexpr auto returns a stdex::submdspan
 */
template<typename TMDSpan>
constexpr auto submdspan_remove_dim(TMDSpan span, std::size_t const fixed_index_pos)
{
    constexpr auto rank = TMDSpan::rank();
    // Construct_Submdspan always passes at least one full_extent, so the result has at least rank 1.
    static_assert(rank >= 2, "submdspan_remove_dim requires a mdspan with a rank of at least 2");

    // Without this guard a too small rank would instantiate Construct_Submdspan<0>, <-1>, ... without end.
    if constexpr(rank >= 2)
    {
        return Construct_Submdspan<static_cast<int>(rank) - 1>{}.construct(span, fixed_index_pos, stdex::full_extent);
    }
}

/**
 * @brief Iterates over all elements of an n-dimensional mdspan and applies a functor to each of them. The
 * rightmost index varies fastest.
 *
 * @tparam TDim Rank of the mdspan
 */
template<int TDim>
struct Iterate_mdspan;

/**
 * @brief End of the recursion: applies the functor element by element to a one-dimensional span.
 */
template<>
struct Iterate_mdspan<1>
{
    /**
     * @brief Writes functor(input(i)) to output(i) for every index i of the input span.
     *
     * Input and output may be different span types (for example a read-only input view), as long as the
     * result of the functor is assignable to the elements of the output.
     *
     * @tparam TInputSpan type of the input span
     * @tparam TOutputSpan type of the output span
     * @tparam TFunc type of the functor
     * @param input The input span
     * @param output The output span, needs to have at least as many elements as the input
     * @param functor The functor, called once per element in increasing index order
     */
    template<typename TInputSpan, typename TOutputSpan, typename TFunc>
    void operator()(TInputSpan input, TOutputSpan output, TFunc& functor) const
    {
        assert(input.extent(0) <= output.extent(0));

        // Use the extent's own type for the index. A plain int would be compared against an unsigned
        // value and could overflow for large extents.
        using index_type = decltype(input.extent(0));
        index_type const count = input.extent(0);
        for(index_type i = 0; i < count; ++i)
        {
            output(i) = functor(input(i));
        }
    }
};

/**
 * @brief Recursive case: fixes the leftmost index and iterates over the remaining dimensions.
 *
 * @tparam TDim Rank of the mdspan
 */
template<int TDim>
struct Iterate_mdspan
{
    /**
     * @brief Iterate over all elements of an mdspan and apply the functor on it.
     *
     * For every index of the leftmost dimension a submdspan with one rank less is created from input and
     * output, which is then processed by the Iterate_mdspan of the next lower rank.
     *
     * @tparam TInputSpan type of the input mdspan
     * @tparam TOutputSpan type of the output mdspan, needs to have the same rank as the input
     * @tparam TFunc type of the functor
     * @param input The input mdspan
     * @param output The output mdspan
     * @param functor The functor
     */
    template<typename TInputSpan, typename TOutputSpan, typename TFunc>
    void operator()(TInputSpan input, TOutputSpan output, TFunc& functor) const
    {
        static_assert(
            TInputSpan::rank() == TOutputSpan::rank(),
            "input and output mdspan need to have the same rank");
        // Removing a dimension needs at least two of them; rank 1 is handled by Iterate_mdspan<1>.
        static_assert(TInputSpan::rank() >= 2, "the recursive case requires a rank of at least 2");

        assert(input.extent(0) <= output.extent(0));

        // Use the extent's own type for the index. A plain int would be compared against an unsigned
        // value and could overflow for large extents.
        using index_type = decltype(input.extent(0));
        index_type const count = input.extent(0);
        for(index_type i = 0; i < count; ++i)
        {
            auto subinput = submdspan_remove_dim(input, i);
            auto suboutput = submdspan_remove_dim(output, i);
            Iterate_mdspan<TInputSpan::rank() - 1>{}(subinput, suboutput, functor);
        }
    }
};
43 changes: 43 additions & 0 deletions include/vikunja/algorithm/transform.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#pragma once

//#include <alpaka/alpaka.hpp>

#include <vikunja/access/BlockStrategy.hpp>
#include <vikunja/access/MdspanLinear.hpp>
#include <vikunja/operators/operators.hpp>
#include <vikunja/workdiv/BlockBasedWorkDiv.hpp>

#include <experimental/mdspan>

namespace vikunja
{
namespace device
{
// FIXME: I'm only running on a single core CPU :-(
template<
typename TAcc,
typename WorkDivPolicy = vikunja::workdiv::BlockBasedPolicy<TAcc>,
typename MemAccessPolicy = vikunja::MemAccess::MemAccessPolicy<TAcc>,
typename TDevAcc,
typename TQueue,
typename TData,
typename TLayoutPolicy,
typename TAccessorPolicy,
typename TInputExtend,
typename TOutputExtend,
typename TFunc,
typename TOperator = vikunja::operators::UnaryOp<TAcc, TFunc, TData>>
void transform(
TDevAcc& devAcc,
TQueue& queue,
std::experimental::mdspan<TData, TInputExtend, TLayoutPolicy, TAccessorPolicy> input,
std::experimental::mdspan<TData, TOutputExtend, TLayoutPolicy, TAccessorPolicy> output,
TFunc const& func)
{
constexpr auto input_rank = decltype(input)::rank();
static_assert(input_rank > 0);

Iterate_mdspan<input_rank>{}(input, output, func);
}
} // namespace device
} // namespace vikunja