Skip to content

Commit

Permalink
Arg reduction WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
ssheorey committed Dec 31, 2024
1 parent 8db6e35 commit 6e82734
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 35 deletions.
4 changes: 4 additions & 0 deletions cpp/open3d/core/kernel/Reduction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ void Reduction(const Tensor& src,
dims);
}
}
if (src.NumElements() == 0) {
utility::LogError(
"Zero-size Tensor does not support Arg Reductions.");
}
}

SizeVector keepdim_shape =
Expand Down
194 changes: 171 additions & 23 deletions cpp/open3d/core/kernel/ReductionSYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,29 @@ namespace core {
namespace kernel {

namespace {
// Based on OneAPI GPU optimization guide code sample (Blocked access to input
// data + SYCL builtin reduction ops for final reduction)

template <typename scalar_t>
struct ArgMinReduction {
using basic_reduction = sycl::minimum<scalar_t>;
std::pair<int64_t, scalar_t> operator()(
std::pair<int64_t, scalar_t> a,
std::pair<int64_t, scalar_t> b) const {
return a.second < b.second ? a : b;
}
};

template <typename scalar_t>
struct ArgMaxReduction {
using basic_reduction = sycl::maximum<scalar_t>;
std::pair<int64_t, scalar_t> operator()(
std::pair<int64_t, scalar_t> a,
std::pair<int64_t, scalar_t> b) const {
return a.second > b.second ? a : b;
}
};

template <class ReductionOp, typename scalar_t>
void SYCLReductionEngine(
Device device,
Indexer indexer,
scalar_t identity =
sycl::known_identity<ReductionOp, scalar_t>::value) {
void SYCLReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto device_props =
sy::SYCLContext::GetInstance().GetDeviceProperties(device);
auto queue = device_props.queue;
Expand All @@ -37,9 +52,10 @@ void SYCLReductionEngine(
size_t log2workitems_per_group = 8;
auto workitems_per_group = (1 << log2workitems_per_group); // 256
auto elements_per_work_item =
elements_per_group / workitems_per_group; // 32 (= max SIMD sizse)
elements_per_group / workitems_per_group; // 32 (= max SIMD size)
auto mask = ~(~0 << log2workitems_per_group);
ReductionOp red_op;

for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements();
output_idx++) {
// sub_indexer.NumWorkloads() == ipo.
Expand All @@ -51,12 +67,16 @@ void SYCLReductionEngine(
++num_work_groups;
// ensure each work group has work_group_size work items
auto num_work_items = num_work_groups * work_group_size;
auto output =
reinterpret_cast<scalar_t*>(scalar_out_indexer.GetOutputPtr(0));
auto e = queue.submit([&](auto& cgh) {

auto red_cg = [&](auto& cgh) {
auto output = reinterpret_cast<scalar_t*>(
scalar_out_indexer.GetOutputPtr(0));
// Setting this still doesn't initialize to identity -
// output buffer must be initialized separately.
auto sycl_reducer = sycl::reduction(
output, identity, red_op,
{sycl::property::reduction::initialize_to_identity()});
sycl::stream out_stream(10240, 128, cgh);
cgh.parallel_for(
sycl::nd_range<1>{num_work_items, work_group_size},
sycl_reducer, [=](sycl::nd_item<1> item, auto& red_arg) {
Expand All @@ -69,13 +89,110 @@ void SYCLReductionEngine(
size_t idx =
(i << log2workitems_per_group) + offset;
if (idx >= num_elements) break;
scalar_t* val = reinterpret_cast<scalar_t*>(
auto val = *reinterpret_cast<scalar_t*>(
scalar_out_indexer.GetInputPtr(0, idx));
item_out = red_op(item_out, *val);
item_out = red_op(item_out, val);
}
out_stream << glob_id << ',' << item_out << '\t';
red_arg.combine(item_out);
});
});
};

auto e = queue.submit(red_cg);
}
queue.wait_and_throw();
}

// Based on OneAPI GPU optimization guide code sample (Blocked access to
// input data + SYCL builtin reduction ops for final reduction)
template <class ReductionOp, typename scalar_t>
void SYCLArgReductionEngine(Device device, Indexer indexer, scalar_t identity) {
auto device_props =
sy::SYCLContext::GetInstance().GetDeviceProperties(device);
auto queue = device_props.queue;
auto work_group_size = device_props.max_work_group_size;
size_t log2elements_per_group = 13;
auto elements_per_group = (1 << log2elements_per_group); // 8192
size_t log2workitems_per_group = 8;
auto workitems_per_group = (1 << log2workitems_per_group); // 256
auto elements_per_work_item =
elements_per_group / workitems_per_group; // 32 (= max SIMD size)
auto mask = ~(~0 << log2workitems_per_group);
ReductionOp red_op;

// atomic flag. Must be 4 bytes.
sycl::buffer<int32_t, 1> output_in_use{indexer.NumOutputElements()};
auto e_fill = queue.submit([&](auto& cgh) {
auto acc_output_in_use =
output_in_use.get_access<sycl::access_mode::write>(cgh);
cgh.fill(acc_output_in_use, 0);
});

for (int64_t output_idx = 0; output_idx < indexer.NumOutputElements();
output_idx++) {
// sub_indexer.NumWorkloads() == ipo.
// sub_indexer's workload_idx is indexer's ipo_idx.
Indexer scalar_out_indexer = indexer.GetPerOutputIndexer(output_idx);
auto num_elements = scalar_out_indexer.NumWorkloads();
auto num_work_groups = num_elements / elements_per_group;
if (num_elements > elements_per_group * num_work_groups)
++num_work_groups;
// ensure each work group has work_group_size work items
auto num_work_items = num_work_groups * work_group_size;

sycl::buffer<int32_t, 1> this_output_in_use{output_in_use, output_idx,
1};
auto arg_red_cg = [&](auto& cgh) {
auto acc_in_use =
this_output_in_use
.get_access<sycl::access_mode::read_write>(cgh);
sycl::stream out_stream(10240, 1024, cgh);
cgh.parallel_for(
sycl::nd_range<1>{num_work_items, work_group_size},
[=](sycl::nd_item<1> item) {
auto glob_id = item.get_global_id(0);
auto this_group = item.get_group();
auto offset = ((glob_id >> log2workitems_per_group)
<< log2elements_per_group) +
(glob_id & mask);
std::pair<int64_t, scalar_t> item_out{0, identity};
for (size_t i = 0; i < elements_per_work_item; i++) {
size_t idx =
(i << log2workitems_per_group) + offset;
if (idx >= num_elements) break;
auto val = *reinterpret_cast<scalar_t*>(
scalar_out_indexer.GetInputPtr(0, idx));
item_out = red_op(item_out, {idx, val});
}
auto group_output_val = sycl::reduce_over_group(
this_group, item_out.second, identity,
typename ReductionOp::basic_reduction());
// atomic (serial) reduction over all groups. SYCL does
// not have a barrier over groups. Work item(s) with min
// / max value update the output. (non-deterministic)
if (item_out.second == group_output_val) {
out_stream << "group_output: " << group_output_val
<< item_out.first << sycl::endl;
auto& out_idx = *reinterpret_cast<int64_t*>(
scalar_out_indexer.GetOutputPtr(0));
auto& out_val = *reinterpret_cast<scalar_t*>(
scalar_out_indexer.GetOutputPtr(1));
// TODO: Look for a better option to a spinlock
// mutex.
auto in_use = sycl::atomic_ref<
int32_t, sycl::memory_order::acq_rel,
sycl::memory_scope::device>(acc_in_use[0]);
while (in_use.exchange(1) == 1) {
}
std::tie(out_idx, out_val) =
red_op({out_idx, out_val},
{item_out.first, group_output_val});
in_use.store(0);
}
});
};

auto e = queue.submit(arg_red_cg);
}
queue.wait_and_throw();
}
Expand All @@ -90,33 +207,38 @@ void ReductionSYCL(const Tensor& src,
if (s_regular_reduce_ops.find(op_code) != s_regular_reduce_ops.end()) {
Indexer indexer({src}, dst, DtypePolicy::ALL_SAME, dims);
DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
scalar_t identity;
switch (op_code) {
case ReductionOpCode::Sum:
dst.Fill(0);
SYCLReductionEngine<sycl::plus<scalar_t>, scalar_t>(
device, indexer);
device, indexer, 0);
break;
case ReductionOpCode::Prod:
dst.Fill(1);
SYCLReductionEngine<sycl::multiplies<scalar_t>, scalar_t>(
device, indexer);
device, indexer, 1);
break;
case ReductionOpCode::Min:
if (indexer.NumWorkloads() == 0) {
utility::LogError(
"Zero-size Tensor does not support Min.");
} else {
identity = std::numeric_limits<scalar_t>::max();
dst.Fill(identity);
SYCLReductionEngine<sycl::minimum<scalar_t>, scalar_t>(
device, indexer);
device, indexer, identity);
}
break;
case ReductionOpCode::Max:
if (indexer.NumWorkloads() == 0) {
utility::LogError(
"Zero-size Tensor does not support Max.");
} else {
identity = std::numeric_limits<scalar_t>::lowest();
dst.Fill(identity);
SYCLReductionEngine<sycl::maximum<scalar_t>, scalar_t>(
device, indexer);
device, indexer, identity);
}
break;
default:
Expand All @@ -125,7 +247,33 @@ void ReductionSYCL(const Tensor& src,
}
});
} else if (s_arg_reduce_ops.find(op_code) != s_arg_reduce_ops.end()) {
utility::LogError("SYCL Arg-reduction is not implemented.");
if (dst.GetDtype() != core::Int64) {
utility::LogError("Arg-reduction must have int64 output dtype.");
}
// Accumulation buffer to store temporary min/max values.
Tensor dst_acc(dst.GetShape(), src.GetDtype(), src.GetDevice());
Indexer indexer({src}, {dst, dst_acc}, DtypePolicy::INPUT_SAME, dims);
DISPATCH_DTYPE_TO_TEMPLATE(src.GetDtype(), [&]() {
scalar_t identity;
switch (op_code) {
case ReductionOpCode::ArgMin:
identity = std::numeric_limits<scalar_t>::max();
dst_acc.Fill(identity);
SYCLArgReductionEngine<ArgMinReduction<scalar_t>, scalar_t>(
device, indexer, identity);
break;
case ReductionOpCode::ArgMax:
identity = std::numeric_limits<scalar_t>::lowest();
dst_acc.Fill(identity);
SYCLArgReductionEngine<ArgMaxReduction<scalar_t>, scalar_t>(
device, indexer, identity);
break;
default:
utility::LogError("Unsupported op code.");
break;
}
});
utility::LogInfo("dst_acc: {}", dst_acc.ToString());
} else if (s_boolean_reduce_ops.find(op_code) !=
s_boolean_reduce_ops.end()) {
if (src.GetDtype() != core::Bool) {
Expand All @@ -141,14 +289,14 @@ void ReductionSYCL(const Tensor& src,
case ReductionOpCode::All:
// Identity == true. 0-sized tensor, returns true.
dst.Fill(true);
SYCLReductionEngine<sycl::logical_and<bool>, bool>(device,
indexer);
SYCLReductionEngine<sycl::logical_and<bool>, bool>(
device, indexer, true);
break;
case ReductionOpCode::Any:
// Identity == false. 0-sized tensor, returns false.
dst.Fill(false);
SYCLReductionEngine<sycl::logical_or<bool>, bool>(device,
indexer);
SYCLReductionEngine<sycl::logical_or<bool>, bool>(
device, indexer, false);
break;
default:
utility::LogError("Unsupported op code.");
Expand Down
17 changes: 12 additions & 5 deletions cpp/open3d/utility/CompilerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,19 @@ std::string CompilerInfo::CUDACompilerVersion() const {

void CompilerInfo::Print() const {
#ifdef BUILD_CUDA_MODULE
utility::LogInfo("CompilerInfo: C++ {}, {} {}, {} {}.", CXXStandard(),
CXXCompilerId(), CXXCompilerVersion(), CUDACompilerId(),
CUDACompilerVersion());
utility::LogInfo("CompilerInfo: C++ {}, {} {}, {} {}, SYCL disabled.",
CXXStandard(), CXXCompilerId(), CXXCompilerVersion(),
CUDACompilerId(), CUDACompilerVersion());
#else
utility::LogInfo("CompilerInfo: C++ {}, {} {}, CUDA disabled.",
CXXStandard(), CXXCompilerId(), CXXCompilerVersion());
#ifdef BUILD_SYCL_MODULE
utility::LogInfo(
"CompilerInfo: C++ {}, {} {}, CUDA disabled, SYCL enabled.",
CXXStandard(), CXXCompilerId(), CXXCompilerVersion());
#else
utility::LogInfo(
"CompilerInfo: C++ {}, {} {}, CUDA disabled, SYCL disabled",
CXXStandard(), CXXCompilerId(), CXXCompilerVersion());
#endif
#endif
}

Expand Down
14 changes: 7 additions & 7 deletions cpp/tests/core/Tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ namespace tests {
class TensorPermuteDevices : public PermuteDevices {};
INSTANTIATE_TEST_SUITE_P(Tensor,
TensorPermuteDevices,
testing::ValuesIn(PermuteDevices::TestCases()));
testing::ValuesIn(TensorPermuteDevices::TestCases()));

class TensorPermuteDevicesWithSYCL : public PermuteDevices {};
class TensorPermuteDevicesWithSYCL : public PermuteDevicesWithSYCL {};
INSTANTIATE_TEST_SUITE_P(
Tensor,
TensorPermuteDevicesWithSYCL,
testing::ValuesIn(PermuteDevicesWithSYCL::TestCases()));
testing::ValuesIn(TensorPermuteDevicesWithSYCL::TestCases()));

class TensorPermuteDevicePairs : public PermuteDevicePairs {};
INSTANTIATE_TEST_SUITE_P(
Expand Down Expand Up @@ -1533,7 +1533,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, T) {
EXPECT_THROW(t_3d.T(), std::runtime_error);
}

TEST_P(TensorPermuteDevices, Det) {
TEST_P(TensorPermuteDevicesWithSYCL, Det) {
core::Device device = GetParam();
// Det supports both Float32 and Float64.
core::Dtype dtype = core::Float32;
Expand Down Expand Up @@ -2230,7 +2230,7 @@ TEST_P(TensorPermuteDevicesWithSYCL, ReduceMax) {
EXPECT_EQ(dst.ToFlatVector<float>(), std::vector<float>({23.f}));
}

TEST_P(TensorPermuteDevices, ReduceMaxFloatLimit) {
TEST_P(TensorPermuteDevicesWithSYCL, ReduceMaxFloatLimit) {
// std::numeric_limits<scalar_t> should use lowest() instead of min().
core::Device device = GetParam();
core::Tensor src = core::Tensor::Init<float>({-2.f, -1.f}, device);
Expand All @@ -2242,7 +2242,7 @@ TEST_P(TensorPermuteDevices, ReduceMaxFloatLimit) {
EXPECT_EQ(dst.ToFlatVector<int64_t>(), std::vector<int64_t>({1}));
}

TEST_P(TensorPermuteDevices, ReduceArgMin) {
TEST_P(TensorPermuteDevicesWithSYCL, ReduceArgMin) {
core::Device device = GetParam();
core::Tensor src = core::Tensor::Init<float>(
{{{22, 23, 20, 9}, {6, 14, 18, 13}, {15, 3, 17, 0}},
Expand Down Expand Up @@ -2270,7 +2270,7 @@ TEST_P(TensorPermuteDevices, ReduceArgMin) {
std::vector<int64_t>({3, 0, 3, 3, 1, 0}));
}

TEST_P(TensorPermuteDevices, ReduceArgMax) {
TEST_P(TensorPermuteDevicesWithSYCL, ReduceArgMax) {
core::Device device = GetParam();
core::Tensor src = core::Tensor::Init<float>(
{{{22, 23, 20, 9}, {6, 14, 18, 13}, {15, 3, 17, 0}},
Expand Down

0 comments on commit 6e82734

Please sign in to comment.