diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f4fd3dcfb77..60d8e96d932 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0 +- numba-cuda>=0.2.0,<0.3.0a0 +- numba>=0.59.1,<0.61.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index ec7ae3f0706..fe1a32ccb87 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0 +- numba-cuda>=0.2.0,<0.3.0a0 +- numba>=0.59.1,<0.61.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index aa1879fc7bd..5df7f97346a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,8 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.2.0,<0.3.0 + - numba-cuda >=0.2.0,<0.3.0a0 + - numba >=0.59.1,<0.61.0a0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index 3ef174e4206..3158445841e 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -44,6 +44,8 @@ namespace CUDF_EXPORT cudf { * - instead of storing NA/NaN for output rows that do not meet the minimum number of observations * this function updates the valid bitmask of the column to indicate which elements are valid. * + * @note Windows near the endpoints of the input are automatically clamped to be in-bounds. + * * Notes on return column types: * - The returned column for count aggregation always has `INT32` type. * - The returned column for VARIANCE/STD aggregations always has `FLOAT64` type. @@ -594,6 +596,11 @@ std::unique_ptr grouped_range_rolling_window( * column of the same type as the input. Therefore it is suggested to convert integer column types * (especially low-precision integers) to `FLOAT32` or `FLOAT64` before doing a rolling `MEAN`. * + * @note All entries in `preceding_window` and `following_window` must produce window extents that + * are in-bounds for the `input`. That is, for all `i`, it is required that the set of rows defined + * by the interval `[i - preceding_window[i] + 1, ..., i + following_window[i] + 1)` is a subset of + * `[0, input.size())`. 
+ * * @throws cudf::logic_error if window column type is not INT32 * * @param[in] input The input column diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 469f933f918..aeaa87e2202 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -35,10 +35,10 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_count device_span str_col_indexes) { // Index of the column in the `str_col_indexes` array - auto const str_col_idx = blockIdx.y; + auto const str_col_idx = blockIdx.x % str_col_indexes.size(); // Index of the column in the `orc_columns` array auto const col_idx = str_col_indexes[str_col_idx]; - auto const row_group_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const row_group_idx = (blockIdx.x / str_col_indexes.size()) * blockDim.x + threadIdx.x; if (row_group_idx >= rowgroup_bounds.size().first) { return; } auto const& str_col = orc_columns[col_idx]; @@ -63,18 +63,17 @@ void rowgroup_char_counts(device_2dspan counts, if (rowgroup_bounds.count() == 0) { return; } auto const num_rowgroups = rowgroup_bounds.size().first; - auto const num_str_cols = str_col_indexes.size(); - if (num_str_cols == 0) { return; } + if (str_col_indexes.empty()) { return; } int block_size = 0; // suggested thread count to use int min_grid_size = 0; // minimum block count required CUDF_CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, rowgroup_char_counts_kernel)); - auto const grid_size = - dim3(cudf::util::div_rounding_up_unsafe(num_rowgroups, block_size), - static_cast(num_str_cols)); + auto const num_blocks = + cudf::util::div_rounding_up_unsafe(num_rowgroups, block_size) * + str_col_indexes.size(); - rowgroup_char_counts_kernel<<>>( + rowgroup_char_counts_kernel<<>>( counts, orc_columns, rowgroup_bounds, str_col_indexes); } @@ -104,8 +103,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, device_span columns) { - auto const col_idx = blockIdx.x; - auto const stripe_idx = blockIdx.y; + auto const col_idx = blockIdx.x / dictionaries.size().second; + auto const stripe_idx = blockIdx.x % dictionaries.size().second; auto const t = threadIdx.x; auto& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; @@ -166,8 +165,8 @@ template CUDF_KERNEL void __launch_bounds__(block_size) collect_map_entries_kernel(device_2dspan dictionaries) { - auto const col_idx = blockIdx.x; - auto const stripe_idx = blockIdx.y; + auto const col_idx = blockIdx.x / dictionaries.size().second; + auto const stripe_idx = blockIdx.x % dictionaries.size().second; auto const& dict = dictionaries[col_idx][stripe_idx]; if (not dict.is_enabled) { return; } @@ -200,8 +199,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) get_dictionary_indices_kernel(device_2dspan dictionaries, device_span columns) { - auto const col_idx = blockIdx.x; - auto const stripe_idx = blockIdx.y; + auto const col_idx = blockIdx.x / dictionaries.size().second; + auto const stripe_idx = blockIdx.x % dictionaries.size().second; auto const t = threadIdx.x; auto const& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; @@ -244,9 +243,8 @@ void populate_dictionary_hash_maps(device_2dspan dictionaries { if (dictionaries.count() == 0) { return; } constexpr int block_size = 256; - dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second); populate_dictionary_hash_maps_kernel - <<>>(dictionaries, columns); + <<>>(dictionaries, columns); } void 
collect_map_entries(device_2dspan dictionaries, @@ -254,8 +252,8 @@ void collect_map_entries(device_2dspan dictionaries, { if (dictionaries.count() == 0) { return; } constexpr int block_size = 1024; - dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second); - collect_map_entries_kernel<<>>(dictionaries); + collect_map_entries_kernel + <<>>(dictionaries); } void get_dictionary_indices(device_2dspan dictionaries, @@ -264,9 +262,8 @@ void get_dictionary_indices(device_2dspan dictionaries, { if (dictionaries.count() == 0) { return; } constexpr int block_size = 1024; - dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second); get_dictionary_indices_kernel - <<>>(dictionaries, columns); + <<>>(dictionaries, columns); } } // namespace cudf::io::orc::gpu diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 2fce981e8a5..e01b93262d7 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "io/utilities/block_utils.cuh" #include "orc_gpu.hpp" +#include #include #include @@ -40,8 +40,9 @@ CUDF_KERNEL void __launch_bounds__(init_threads_per_block) device_2dspan rowgroup_bounds) { __shared__ __align__(4) statistics_group group_g[init_groups_per_block]; - auto const col_id = blockIdx.y; - auto const chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y; + auto const col_id = blockIdx.x % rowgroup_bounds.size().second; + auto const chunk_id = + (blockIdx.x / rowgroup_bounds.size().second * init_groups_per_block) + threadIdx.y; auto const t = threadIdx.x; auto const num_rowgroups = rowgroup_bounds.size().first; statistics_group* group = &group_g[threadIdx.y]; @@ -452,10 +453,12 @@ void orc_init_statistics_groups(statistics_group* groups, device_2dspan rowgroup_bounds, rmm::cuda_stream_view stream) { - dim3 dim_grid((rowgroup_bounds.size().first + init_groups_per_block - 1) / init_groups_per_block, - rowgroup_bounds.size().second); + auto const num_blocks = + cudf::util::div_rounding_up_safe(rowgroup_bounds.size().first, init_groups_per_block) * + rowgroup_bounds.size().second; + dim3 dim_block(init_threads_per_group, init_groups_per_block); - gpu_init_statistics_groups<<>>( + gpu_init_statistics_groups<<>>( groups, cols, rowgroup_bounds); } @@ -490,8 +493,8 @@ void orc_encode_statistics(uint8_t* blob_bfr, uint32_t statistics_count, rmm::cuda_stream_view stream) { - unsigned int num_blocks = - (statistics_count + encode_chunks_per_block - 1) / encode_chunks_per_block; + auto const num_blocks = + cudf::util::div_rounding_up_safe(statistics_count, encode_chunks_per_block); dim3 dim_block(encode_threads_per_chunk, encode_chunks_per_block); gpu_encode_statistics<<>>( blob_bfr, groups, chunks, statistics_count); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1f84d1f81dc..f560b806894 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1288,9 +1288,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) } temp_storage; orcdec_state_s* const s = &state_g; - bool const is_nulldec = (blockIdx.y >= num_stripes); - uint32_t const column = blockIdx.x; - uint32_t const stripe = (is_nulldec) ? 
blockIdx.y - num_stripes : blockIdx.y;
+  // blockIdx.x spans all column/stripe pairs; blockIdx.y selects null decoding vs. string dictionary decoding
+  uint32_t const column = blockIdx.x / num_stripes;
+  uint32_t const stripe = blockIdx.x % num_stripes;
   uint32_t const chunk_id = stripe * num_columns + column;
   int t = threadIdx.x;
@@ -1298,6 +1298,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   __syncthreads();
   size_t const max_num_rows = s->chunk.column_num_rows - s->chunk.parent_validity_info.null_count;
+  bool const is_nulldec = (blockIdx.y == 0);
   if (is_nulldec) {
     uint32_t null_count = 0;
     // Decode NULLs
@@ -1576,7 +1577,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto num_rowgroups = row_groups.size().first;
   if (num_rowgroups > 0) {
-    if (t == 0) { s->top.data.index = row_groups[blockIdx.y][blockIdx.x]; }
+    if (t == 0) {
+      auto const rg_idx  = blockIdx.x % num_rowgroups;
+      auto const col_idx = blockIdx.x / num_rowgroups;
+      s->top.data.index  = row_groups[rg_idx][col_idx];
+    }
     __syncthreads();
     chunk_id = s->top.data.index.chunk_id;
   } else {
@@ -1601,10 +1606,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) {
       atomicAdd(error_count, 1);
     }
-    auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
-    auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
+    auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
+    auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
+    uint32_t const rg_idx = blockIdx.x % num_rowgroups;
     uint32_t rowgroup_rowofs =
-      (level == 0) ? (blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride
+      (level == 0) ? (rg_idx - cuda::std::min(s->chunk.rowgroup_id, rg_idx)) * rowidx_stride
                    : s->top.data.index.start_row;
     ;
     s->chunk.streams[CI_DATA] += ofs0;
@@ -2025,7 +2031,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   }
   if (t == 0 and (s->chunk.type_kind == LIST or s->chunk.type_kind == MAP)) {
     if (num_rowgroups > 0) {
-      row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows;
+      auto const rg_idx  = blockIdx.x % num_rowgroups;
+      auto const col_idx = blockIdx.x / num_rowgroups;
+      row_groups[rg_idx][col_idx].num_child_rows = s->num_child_rows;
     }
     cuda::atomic_ref ref{chunks[chunk_id].num_child_rows};
     ref.fetch_add(s->num_child_rows, cuda::std::memory_order_relaxed);
@@ -2049,9 +2057,9 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
                                                int64_t first_row,
                                                rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(block_size, 1);
-  dim3 dim_grid(num_columns, num_stripes * 2);  // 1024 threads per chunk
-  gpuDecodeNullsAndStringDictionaries<<>>(
+  dim3 dim_grid(num_columns * num_stripes, 2);
+
+  gpuDecodeNullsAndStringDictionaries<<>>(
     chunks, global_dictionary, num_columns, num_stripes, first_row);
 }
@@ -2083,11 +2091,8 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks,
                                   size_type* error_count,
                                   rmm::cuda_stream_view stream)
 {
-  auto const num_chunks = num_columns * num_stripes;
-  dim3 dim_block(block_size, 1);  // 1024 threads per chunk
-  dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks,
-                (num_rowgroups > 0) ? num_rowgroups : 1);
-  gpuDecodeOrcColumnData<<>>(
+  auto const num_blocks = num_columns * (num_rowgroups > 0 ?
num_rowgroups : num_stripes); + gpuDecodeOrcColumnData<<>>( chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count); } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 4f296bb5bfc..bcdd059bf67 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -741,8 +741,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) } temp_storage; orcenc_state_s* const s = &state_g; - uint32_t col_id = blockIdx.x; - uint32_t group_id = blockIdx.y; + uint32_t col_id = blockIdx.x / chunks.size().second; + uint32_t group_id = blockIdx.x % chunks.size().second; int t = threadIdx.x; if (t == 0) { s->chunk = chunks[col_id][group_id]; @@ -1105,8 +1105,8 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) device_span dsts, device_span sizes) { - auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); - auto const stream_id = blockIdx.y; + auto const stripe_id = (blockIdx.x / strm_desc.size().second) * blockDim.x + threadIdx.x; + auto const stream_id = blockIdx.x % strm_desc.size().second; if (stripe_id >= strm_desc.size().first) { return; } auto const out_id = stream_id * strm_desc.size().first + stripe_id; @@ -1147,7 +1147,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) // blockDim {256,1,1} CUDF_KERNEL void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, - device_2dspan streams, // const? + device_2dspan streams, device_span> inputs, device_span> outputs, device_span results, @@ -1162,8 +1162,8 @@ CUDF_KERNEL void __launch_bounds__(256) auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); - auto const stripe_id = blockIdx.x; - auto const stream_id = blockIdx.y; + auto const stripe_id = blockIdx.x / strm_desc.size().second; + auto const stream_id = blockIdx.x % strm_desc.size().second; uint32_t t = threadIdx.x; uint32_t num_blocks; uint8_t *src, *dst; @@ -1213,8 +1213,8 @@ CUDF_KERNEL void __launch_bounds__(1024) __shared__ uint8_t const* comp_src_g; __shared__ uint32_t comp_len_g; - auto const stripe_id = blockIdx.x; - auto const stream_id = blockIdx.y; + auto const stripe_id = blockIdx.x / strm_desc.size().second; + auto const stream_id = blockIdx.x % strm_desc.size().second; uint32_t t = threadIdx.x; uint32_t num_blocks, b, blk_size; uint8_t const* src; @@ -1290,8 +1290,10 @@ CUDF_KERNEL void decimal_sizes_to_offsets_kernel(device_2dspan chunks, device_2dspan streams, rmm::cuda_stream_view stream) { - dim3 dim_block(encode_block_size, 1); // `encode_block_size` threads per chunk - dim3 dim_grid(chunks.size().first, chunks.size().second); + auto const num_blocks = chunks.size().first * chunks.size().second; gpuEncodeOrcColumnData - <<>>(chunks, streams); + <<>>(chunks, streams); } void EncodeStripeDictionaries(stripe_dictionary const* stripes, @@ -1324,10 +1325,10 @@ void EncodeStripeDictionaries(stripe_dictionary const* stripes, device_2dspan enc_streams, rmm::cuda_stream_view stream) { - dim3 dim_block(512, 1); // 512 threads per dictionary + constexpr int block_size = 512; // 512 threads per dictionary dim3 dim_grid(num_string_columns * num_stripes, 2); - gpuEncodeStringDictionaries<512> - <<>>(stripes, columns, chunks, enc_streams); + gpuEncodeStringDictionaries + <<>>(stripes, columns, chunks, enc_streams); } void CompactOrcDataStreams(device_2dspan strm_desc, @@ -1345,10 +1346,10 @@ void 
CompactOrcDataStreams(device_2dspan strm_desc, auto lengths = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), - strm_desc.size().second); - gpuInitBatchedMemcpy<<>>( + auto const num_blocks = + cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size) * + strm_desc.size().second; + gpuInitBatchedMemcpy<<>>( strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. @@ -1372,22 +1373,20 @@ std::optional CompressOrcDataStreams( rmm::device_uvector> comp_in(num_compressed_blocks, stream); rmm::device_uvector> comp_out(num_compressed_blocks, stream); - dim3 dim_block_init(256, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuInitCompressionBlocks<<>>(strm_desc, - enc_streams, - comp_in, - comp_out, - comp_res, - compressed_data, - comp_blk_size, - max_comp_blk_size, - comp_block_align); + size_t const num_blocks = strm_desc.size().first * strm_desc.size().second; + gpuInitCompressionBlocks<<>>(strm_desc, + enc_streams, + comp_in, + comp_out, + comp_res, + compressed_data, + comp_blk_size, + max_comp_blk_size, + comp_block_align); cudf::io::detail::compress(compression, comp_in, comp_out, comp_res, stream); - dim3 dim_block_compact(1024, 1); - gpuCompactCompressedBlocks<<>>( + gpuCompactCompressedBlocks<<>>( strm_desc, comp_in, comp_out, comp_res, compressed_data, comp_blk_size, max_comp_blk_size); if (collect_statistics) { @@ -1415,10 +1414,10 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, h_sizes, stream, cudf::get_current_device_resource_ref()); constexpr int block_size = 256; - dim3 const grid_size{static_cast(elem_sizes.size()), // num decimal columns - static_cast(rg_bounds.size().first)}; // num rowgroups + // number of rowgroups * number of decimal columns + auto const num_blocks = elem_sizes.size() * rg_bounds.size().first; decimal_sizes_to_offsets_kernel - <<>>(rg_bounds, d_sizes); + <<>>(rg_bounds, d_sizes); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 89dbbcb796c..0c739f59b0a 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -457,7 +457,9 @@ CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_g { __shared__ __align__(16) rowindex_state_s state_g; rowindex_state_s* const s = &state_g; - uint32_t chunk_id = blockIdx.y * num_columns + blockIdx.x; + auto const col_idx = blockIdx.x / num_stripes; + auto const stripe_idx = blockIdx.x % num_stripes; + uint32_t chunk_id = stripe_idx * num_columns + col_idx; int t = threadIdx.x; if (t == 0) { @@ -496,19 +498,18 @@ CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_g for (int i = t32; i < num_rowgroups; i += 32) { auto const num_rows = (use_base_stride) ? rowidx_stride - : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows; + : row_groups[(s->rowgroup_start + i) * num_columns + col_idx].num_rows; auto const start_row = - (use_base_stride) - ? 
i * rowidx_stride - : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; + (use_base_stride) ? i * rowidx_stride + : row_groups[(s->rowgroup_start + i) * num_columns + col_idx].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { - ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = + ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + col_idx])[j] = ((uint32_t*)&s->rowgroups[i])[j]; } - row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; + row_groups[(s->rowgroup_start + i) * num_columns + col_idx].num_rows = num_rows; // Updating in case of struct - row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_child_rows = num_rows; - row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row = start_row; + row_groups[(s->rowgroup_start + i) * num_columns + col_idx].num_child_rows = num_rows; + row_groups[(s->rowgroup_start + i) * num_columns + col_idx].start_row = start_row; } __syncthreads(); if (t == 0) { s->rowgroup_start += num_rowgroups; } @@ -525,8 +526,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - auto const column_id = blockIdx.x; - auto const rowgroup_id = blockIdx.y; + auto const column_id = blockIdx.x / rowgroup_bounds.size().first; + auto const rowgroup_id = blockIdx.x % rowgroup_bounds.size().first; auto const column = orc_columns[column_id]; auto const t = threadIdx.x; @@ -563,9 +564,7 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, { auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block if (num_blocks > 0) { - dim3 dim_block(128, 1); - dim3 dim_grid(num_blocks, 1); - gpuParseCompressedStripeData<<>>( + gpuParseCompressedStripeData<<>>( strm_info, num_streams, compression_block_size, log2maxcr); } } @@ -576,10 +575,7 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, { auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block if (num_blocks > 0) { - dim3 dim_block(128, 1); - dim3 dim_grid(num_blocks, 1); - gpuPostDecompressionReassemble<<>>(strm_info, - num_streams); + gpuPostDecompressionReassemble<<>>(strm_info, num_streams); } } @@ -592,9 +588,8 @@ void __host__ ParseRowGroupIndex(RowGroup* row_groups, bool use_base_stride, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid(num_columns, num_stripes); // 1 column chunk per block - gpuParseRowGroupIndex<<>>( + auto const num_blocks = num_columns * num_stripes; + gpuParseRowGroupIndex<<>>( row_groups, strm_info, chunks, num_columns, num_stripes, rowidx_stride, use_base_stride); } @@ -603,10 +598,10 @@ void __host__ reduce_pushdown_masks(device_span co device_2dspan valid_counts, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid(columns.size(), rowgroups.size().first); // 1 rowgroup per block - gpu_reduce_pushdown_masks<128> - <<>>(columns, rowgroups, valid_counts); + auto const num_blocks = columns.size() * rowgroups.size().first; // 1 block per rowgroup + constexpr int block_size = 128; + gpu_reduce_pushdown_masks + <<>>(columns, rowgroups, valid_counts); } } // namespace gpu diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 69e4723eded..af524e1f70a 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -32,6 +32,7 @@ #include 
#include #include +#include #include #include @@ -66,9 +67,12 @@ struct bloom_filter_caster { ast::literal const* const literal, rmm::cuda_stream_view stream) const { - using key_type = T; - using policy_type = cuco::arrow_filter_policy; - using word_type = typename policy_type::word_type; + using key_type = T; + using policy_type = cuco::arrow_filter_policy; + using bloom_filter_type = cuco:: + bloom_filter_ref, cuco::thread_scope_thread, policy_type>; + using filter_block_type = typename bloom_filter_type::filter_block_type; + using word_type = typename policy_type::word_type; // Check if the literal has the same type as the predicate column CUDF_EXPECTS( @@ -104,16 +108,13 @@ struct bloom_filter_caster { auto const num_filter_blocks = filter_size / bytes_per_block; // Create a bloom filter view. - cuco::bloom_filter_ref, - cuco::thread_scope_thread, - policy_type> - filter{reinterpret_cast(filter_span[filter_idx].data()), - num_filter_blocks, - {}, // Thread scope as the same literal is being searched across different bitsets - // per thread - {}}; // Arrow policy with cudf::hashing::detail::XXHash_64 seeded with 0 for Arrow - // compatibility + bloom_filter_type filter{ + reinterpret_cast(filter_span[filter_idx].data()), + num_filter_blocks, + {}, // Thread scope as the same literal is being searched across different bitsets per + // thread + {}}; // Arrow policy with cudf::hashing::detail::XXHash_64 seeded with 0 for Arrow + // compatibility // If int96_timestamp type, convert literal to string_view and query bloom // filter @@ -381,6 +382,7 @@ class bloom_filter_expression_converter : public equality_literals_collector { * @param bloom_filter_sizes Bloom filter sizes for all chunks * @param chunk_source_map Association between each column chunk and its source * @param stream CUDA stream used for device memory operations and kernel launches + * @param aligned_mr Aligned device memory resource to allocate bloom filter buffers */ void read_bloom_filter_data(host_span const> sources, size_t num_chunks, @@ -388,8 +390,19 @@ void read_bloom_filter_data(host_span const> sources cudf::host_span> bloom_filter_offsets, cudf::host_span> bloom_filter_sizes, std::vector const& chunk_source_map, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref aligned_mr) { + // Using `cuco::arrow_filter_policy` with a temporary `cuda::std::byte` key type to extract bloom + // filter properties + using policy_type = cuco::arrow_filter_policy; + auto constexpr filter_block_alignment = + alignof(cuco::bloom_filter_ref, + cuco::thread_scope_thread, + policy_type>::filter_block_type); + auto constexpr words_per_block = policy_type::words_per_block; + // Read tasks for bloom filter data std::vector> read_tasks; @@ -422,12 +435,6 @@ void read_bloom_filter_data(host_span const> sources CompactProtocolReader cp{buffer->data(), buffer->size()}; cp.read(&header); - // Get the hardcoded words_per_block value from `cuco::arrow_filter_policy` using a temporary - // `std::byte` key type. - auto constexpr words_per_block = - cuco::arrow_filter_policy::words_per_block; - // Check if the bloom filter header is valid. auto const is_header_valid = (header.num_bytes % words_per_block) == 0 and @@ -448,15 +455,25 @@ void read_bloom_filter_data(host_span const> sources // Check if we already read in the filter bitset in the initial read. 
if (initial_read_size >= bloom_filter_header_size + bitset_size) { - bloom_filter_data[chunk] = - rmm::device_buffer{buffer->data() + bloom_filter_header_size, bitset_size, stream}; + bloom_filter_data[chunk] = rmm::device_buffer{ + buffer->data() + bloom_filter_header_size, bitset_size, stream, aligned_mr}; + // The allocated bloom filter buffer must be aligned + CUDF_EXPECTS(reinterpret_cast(bloom_filter_data[chunk].data()) % + filter_block_alignment == + 0, + "Encountered misaligned bloom filter block"); } // Read the bitset from datasource. else { auto const bitset_offset = bloom_filter_offset + bloom_filter_header_size; // Directly read to device if preferred if (source->is_device_read_preferred(bitset_size)) { - bloom_filter_data[chunk] = rmm::device_buffer{bitset_size, stream}; + bloom_filter_data[chunk] = rmm::device_buffer{bitset_size, stream, aligned_mr}; + // The allocated bloom filter buffer must be aligned + CUDF_EXPECTS(reinterpret_cast(bloom_filter_data[chunk].data()) % + filter_block_alignment == + 0, + "Encountered misaligned bloom filter block"); auto future_read_size = source->device_read_async(bitset_offset, bitset_size, @@ -465,8 +482,14 @@ void read_bloom_filter_data(host_span const> sources read_tasks.emplace_back(std::move(future_read_size)); } else { - buffer = source->host_read(bitset_offset, bitset_size); - bloom_filter_data[chunk] = rmm::device_buffer{buffer->data(), buffer->size(), stream}; + buffer = source->host_read(bitset_offset, bitset_size); + bloom_filter_data[chunk] = + rmm::device_buffer{buffer->data(), buffer->size(), stream, aligned_mr}; + // The allocated bloom filter buffer must be aligned + CUDF_EXPECTS(reinterpret_cast(bloom_filter_data[chunk].data()) % + filter_block_alignment == + 0, + "Encountered misaligned bloom filter block"); } } }); @@ -484,7 +507,8 @@ std::vector aggregate_reader_metadata::read_bloom_filters( host_span const> row_group_indices, host_span column_schemas, size_type total_row_groups, - rmm::cuda_stream_view stream) const + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref aligned_mr) const { // Descriptors for all the chunks that make up the selected columns auto const num_input_columns = column_schemas.size(); @@ -543,7 +567,8 @@ std::vector aggregate_reader_metadata::read_bloom_filters( bloom_filter_offsets, bloom_filter_sizes, chunk_source_map, - stream); + stream, + aligned_mr); // Return bloom filter data return bloom_filter_data; @@ -612,10 +637,22 @@ std::optional>> aggregate_reader_metadata::ap // Return early if no column with equality predicate(s) if (equality_col_schemas.empty()) { return std::nullopt; } + // Required alignment: + // https://github.com/NVIDIA/cuCollections/blob/deab5799f3e4226cb8a49acf2199c03b14941ee4/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh#L55-L67 + using policy_type = cuco::arrow_filter_policy; + auto constexpr alignment = alignof(cuco::bloom_filter_ref, + cuco::thread_scope_thread, + policy_type>::filter_block_type); + + // Aligned resource adaptor to allocate bloom filter buffers with + auto aligned_mr = + rmm::mr::aligned_resource_adaptor(cudf::get_current_device_resource(), alignment); + // Read a vector of bloom filter bitset device buffers for all columns with equality // predicate(s) across all row groups auto bloom_filter_data = read_bloom_filters( - sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream); + sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream, aligned_mr); // No bloom filter buffers, 
return the original row group indices if (bloom_filter_data.empty()) { return std::nullopt; } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index a28ce616e2c..ba5e53e3104 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -204,6 +204,7 @@ class aggregate_reader_metadata { * @param[out] bloom_filter_data List of bloom filter data device buffers * @param column_schemas Schema indices of columns whose bloom filters will be read * @param stream CUDA stream used for device memory operations and kernel launches + * @param aligned_mr Aligned device memory resource to allocate bloom filter buffers * * @return A flattened list of bloom filter bitset device buffers for each predicate column across * row group @@ -213,7 +214,8 @@ class aggregate_reader_metadata { host_span const> row_group_indices, host_span column_schemas, size_type num_row_groups, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref aligned_mr) const; /** * @brief Collects Parquet types for the columns with the specified schema indices diff --git a/cpp/src/rolling/detail/lead_lag_nested.cuh b/cpp/src/rolling/detail/lead_lag_nested.cuh index 5d5fe9e4aa3..bce441cf323 100644 --- a/cpp/src/rolling/detail/lead_lag_nested.cuh +++ b/cpp/src/rolling/detail/lead_lag_nested.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -147,11 +147,7 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, gather_map.begin(), cuda::proclaim_return_type( [following, input_size, null_index, row_offset] __device__(size_type i) { - // Note: grouped_*rolling_window() trims preceding/following to - // the beginning/end of the group. `rolling_window()` does not. - // Must trim _following[i] so as not to go past the column end. - auto _following = min(following[i], input_size - i - 1); - return (row_offset > _following) ? null_index : (i + row_offset); + return (row_offset > following[i]) ? null_index : (i + row_offset); })); } else { thrust::transform(rmm::exec_policy(stream), @@ -160,11 +156,7 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, gather_map.begin(), cuda::proclaim_return_type( [preceding, input_size, null_index, row_offset] __device__(size_type i) { - // Note: grouped_*rolling_window() trims preceding/following to - // the beginning/end of the group. `rolling_window()` does not. - // Must trim _preceding[i] so as not to go past the column start. - auto _preceding = min(preceding[i], i + 1); - return (row_offset > (_preceding - 1)) ? null_index : (i - row_offset); + return (row_offset > (preceding[i] - 1)) ? null_index : (i - row_offset); })); } diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index bc0ee2eb519..07495eb02e2 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1010,7 +1010,7 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation * @param[out] output_valid_count Output count of valid values * @param[in] device_operator The operator used to perform a single window operation * @param[in] preceding_window_begin Rolling window size iterator, accumulates from - * in_col[i-preceding_window] to in_col[i] inclusive + * in_col[i-preceding_window + 1] to in_col[i] inclusive * @param[in] following_window_begin Rolling window size iterator in the forward * direction, accumulates from in_col[i] to in_col[i+following_window] inclusive */ @@ -1034,28 +1034,27 @@ __launch_bounds__(block_size) CUDF_KERNEL size_type warp_valid_count{0}; - auto active_threads = __ballot_sync(0xffff'ffffu, i < input.size()); - while (i < input.size()) { - // to prevent overflow issues when computing bounds use int64_t - int64_t const preceding_window = preceding_window_begin[i]; - int64_t const following_window = following_window_begin[i]; + auto const num_rows = input.size(); + auto active_threads = __ballot_sync(0xffff'ffffu, i < num_rows); + while (i < num_rows) { + // The caller is required to provide window bounds that will + // result in indexing that is in-bounds for the column. Therefore all + // of these calculations cannot overflow and we can do everything + // in size_type arithmetic. Moreover, we require that start <= + // end, i.e., the window is never "reversed" though it may be empty. + auto const preceding_window = preceding_window_begin[i]; + auto const following_window = following_window_begin[i]; - // compute bounds - auto const start = static_cast( - min(static_cast(input.size()), max(int64_t{0}, i - preceding_window + 1))); - auto const end = static_cast( - min(static_cast(input.size()), max(int64_t{0}, i + following_window + 1))); - auto const start_index = min(start, end); - auto const end_index = max(start, end); + size_type const start = i - preceding_window + 1; + size_type const end = i + following_window + 1; // aggregate // TODO: We should explore using shared memory to avoid redundant loads. // This might require separating the kernel into a special version // for dynamic and static sizes. - volatile bool output_is_valid = false; - output_is_valid = device_operator.template operator()( - input, default_outputs, output, start_index, end_index, i); + bool const output_is_valid = device_operator.template operator()( + input, default_outputs, output, start, end, i); // set the mask cudf::bitmask_type const result_mask{__ballot_sync(active_threads, output_is_valid)}; @@ -1068,7 +1067,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // process next element i += stride; - active_threads = __ballot_sync(active_threads, i < input.size()); + active_threads = __ballot_sync(active_threads, i < num_rows); } // sum the valid counts across the whole block diff --git a/cpp/src/rolling/detail/rolling_collect_list.cuh b/cpp/src/rolling/detail/rolling_collect_list.cuh index f3eff6b0689..7bb6ace6b02 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cuh +++ b/cpp/src/rolling/detail/rolling_collect_list.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -154,8 +154,8 @@ std::pair, std::unique_ptr> purge_null_entries( template std::unique_ptr rolling_collect_list(column_view const& input, column_view const& default_outputs, - PrecedingIter preceding_begin_raw, - FollowingIter following_begin_raw, + PrecedingIter preceding_begin, + FollowingIter following_begin, size_type min_periods, null_policy null_handling, rmm::cuda_stream_view stream, @@ -166,23 +166,6 @@ std::unique_ptr rolling_collect_list(column_view const& input, if (input.is_empty()) return empty_like(input); - // Fix up preceding/following iterators to respect column boundaries, - // similar to gpu_rolling(). - // `rolling_window()` does not fix up preceding/following so as not to read past - // column boundaries. - // `grouped_rolling_window()` and `time_range_based_grouped_rolling_window() do. - auto preceding_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type([preceding_begin_raw] __device__(auto i) { - return thrust::min(preceding_begin_raw[i], i + 1); - })); - auto following_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [following_begin_raw, size = input.size()] __device__(auto i) { - return thrust::min(following_begin_raw[i], size - i - 1); - })); - // Materialize collect list's offsets. auto offsets = create_collect_offsets(input.size(), preceding_begin, following_begin, min_periods, stream, mr); diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu index 0603f27852a..7526c858899 100644 --- a/cpp/src/rolling/detail/rolling_fixed_window.cu +++ b/cpp/src/rolling/detail/rolling_fixed_window.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,16 +15,15 @@ */ #include "rolling.cuh" - -#include +#include "rolling_utils.cuh" #include +#include #include #include #include #include -#include namespace cudf::detail { @@ -62,28 +61,14 @@ std::unique_ptr rolling_window(column_view const& input, stream, mr); } else { - // Clamp preceding/following to column boundaries. - // E.g. 
If preceding_window == 2, then for a column of 5 elements, preceding_window will be: - // [1, 2, 2, 2, 1] - - auto const preceding_calc = cuda::proclaim_return_type( - [preceding_window] __device__(size_type i) { return thrust::min(i + 1, preceding_window); }); - - auto const following_calc = cuda::proclaim_return_type( - [col_size = input.size(), following_window] __device__(size_type i) { - return thrust::min(col_size - i - 1, following_window); - }); - - auto const preceding_column = expand_to_column(preceding_calc, input.size(), stream); - auto const following_column = expand_to_column(following_calc, input.size(), stream); - return cudf::detail::rolling_window(input, - default_outputs, - preceding_column->view().begin(), - following_column->view().begin(), - min_periods, - agg, - stream, - mr); + namespace utils = cudf::detail::rolling; + auto groups = utils::ungrouped{input.size()}; + auto preceding = + utils::make_clamped_window_iterator(preceding_window, groups); + auto following = + utils::make_clamped_window_iterator(following_window, groups); + return cudf::detail::rolling_window( + input, default_outputs, preceding, following, min_periods, agg, stream, mr); } } } // namespace cudf::detail diff --git a/cpp/src/rolling/detail/rolling_utils.cuh b/cpp/src/rolling/detail/rolling_utils.cuh new file mode 100644 index 00000000000..ec6d83fd68a --- /dev/null +++ b/cpp/src/rolling/detail/rolling_utils.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace CUDF_EXPORT cudf { + +namespace detail::rolling { + +/** + * @brief A group descriptor for an ungrouped rolling window. + * + * @param num_rows The number of rows to be rolled over. + * + * @note This is used for uniformity of interface between grouped and ungrouped iterator + * construction. + */ +struct ungrouped { + cudf::size_type num_rows; + + [[nodiscard]] __device__ constexpr cudf::size_type label(cudf::size_type) const noexcept + { + return 0; + } + [[nodiscard]] __device__ constexpr cudf::size_type start(cudf::size_type) const noexcept + { + return 0; + } + [[nodiscard]] __device__ constexpr cudf::size_type end(cudf::size_type) const noexcept + { + return num_rows; + } +}; + +/** + * @brief A group descriptor for a grouped rolling window. + * + * @param labels The group labels, mapping from input rows to group. + * @param offsets The group offsets providing the endpoints of each group. + * + * @note This is used for uniformity of interface between grouped and ungrouped iterator + * construction. + */ +struct grouped { + // Taking raw pointers here to avoid stealing two registers for the sizes which are never needed. 
+ cudf::size_type const* labels; + cudf::size_type const* offsets; + + [[nodiscard]] __device__ constexpr cudf::size_type label(cudf::size_type i) const noexcept + { + return labels[i]; + } + [[nodiscard]] __device__ constexpr cudf::size_type start(cudf::size_type label) const noexcept + { + return offsets[label]; + } + [[nodiscard]] __device__ constexpr cudf::size_type end(cudf::size_type label) const noexcept + { + return offsets[label + 1]; + } +}; + +enum class direction : bool { + PRECEDING, + FOLLOWING, +}; + +template +struct fixed_window_clamper { + Grouping groups; + cudf::size_type delta; + [[nodiscard]] __device__ constexpr cudf::size_type operator()(cudf::size_type i) const + { + auto const label = groups.label(i); + // i is contained in [start, end) + auto const start = groups.start(label); + auto const end = groups.end(label); + if constexpr (Direction == direction::PRECEDING) { + return cuda::std::min(i + 1 - start, cuda::std::max(delta, i + 1 - end)); + } else { + return cuda::std::max(start - i - 1, cuda::std::min(delta, end - i - 1)); + } + } +}; + +/** + * @brief Construct a clamped counting iterator for a row-based window offset + * + * @tparam Direction the direction of the window `PRECEDING` or `FOLLOWING`. + * @tparam Grouping the group specification. + * @param delta the window offset. + * @param grouper the grouping object. + * + * @return An iterator suitable for passing to `cudf::detail::rolling_window` + */ +template +[[nodiscard]] auto inline make_clamped_window_iterator(cudf::size_type delta, Grouping grouper) +{ + return cudf::detail::make_counting_transform_iterator( + cudf::size_type{0}, fixed_window_clamper{grouper, delta}); +} +} // namespace detail::rolling +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu index d4851df740b..a345c0d0c28 100644 --- a/cpp/src/rolling/detail/rolling_variable_window.cu +++ b/cpp/src/rolling/detail/rolling_variable_window.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,24 +63,10 @@ std::unique_ptr rolling_window(column_view const& input, } else { auto defaults_col = cudf::is_dictionary(input.type()) ? dictionary_column_view(input).indices() : input; - // Clamp preceding/following to column boundaries. - // E.g. 
If preceding_window == [2, 2, 2, 2, 2] for a column of 5 elements, the new - // preceding_window will be: [1, 2, 2, 2, 1] - auto const preceding_window_begin = cudf::detail::make_counting_transform_iterator( - 0, - cuda::proclaim_return_type( - [preceding = preceding_window.begin()] __device__(size_type i) { - return thrust::min(i + 1, preceding[i]); - })); - auto const following_window_begin = cudf::detail::make_counting_transform_iterator( - 0, - cuda::proclaim_return_type( - [col_size = input.size(), following = following_window.begin()] __device__( - size_type i) { return thrust::min(col_size - i - 1, following[i]); })); return cudf::detail::rolling_window(input, empty_like(defaults_col)->view(), - preceding_window_begin, - following_window_begin, + preceding_window.begin(), + following_window.begin(), min_periods, agg, stream, diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 66d46a51577..18c793029b6 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -19,6 +19,7 @@ #include "detail/range_window_bounds.hpp" #include "detail/rolling.cuh" #include "detail/rolling_jit.hpp" +#include "detail/rolling_utils.cuh" #include #include @@ -43,109 +44,6 @@ namespace cudf { namespace detail { -/// Preceding window calculation functor. -template -struct row_based_preceding_calc { - cudf::size_type const* _group_offsets_begin; - cudf::size_type const* _group_labels_begin; - cudf::size_type const _preceding_window; - - row_based_preceding_calc(rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& preceding_window) - : _group_offsets_begin(group_offsets.data()), - _group_labels_begin(group_labels.data()), - _preceding_window(preceding_window) - { - } - - __device__ cudf::size_type operator()(cudf::size_type const& idx) const - { - auto group_label = _group_labels_begin[idx]; - if constexpr (preceding_less_than_1) { // where 1 indicates only the current row. - auto group_end = _group_offsets_begin[group_label + 1]; - return thrust::maximum{}(_preceding_window, -(group_end - 1 - idx)); - } else { - auto group_start = _group_offsets_begin[group_label]; - return thrust::minimum{}(_preceding_window, - idx - group_start + 1); // Preceding includes current row. - } - } -}; - -/// Helper to materialize preceding-window column, corrected to respect group boundaries. -/// E.g. If preceding window == 5, then, -/// 1. For the first row in the group, the preceding is set to 1, -/// 2. For the next row in the group, preceding is set to 2, etc. -std::unique_ptr make_preceding_column( - rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& preceding_window, - cudf::size_type const& num_rows, - rmm::cuda_stream_view stream) -{ - if (preceding_window < 1) { - auto const calc = row_based_preceding_calc(group_offsets, group_labels, preceding_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } else { - auto const calc = - row_based_preceding_calc(group_offsets, group_labels, preceding_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } -} - -/// Following window calculation functor. 
-template -struct row_based_following_calc { - cudf::size_type const* _group_offsets_begin; - cudf::size_type const* _group_labels_begin; - cudf::size_type const _following_window; - - row_based_following_calc(rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& following_window) - : _group_offsets_begin(group_offsets.data()), - _group_labels_begin(group_labels.data()), - _following_window(following_window) - { - } - - __device__ cudf::size_type operator()(cudf::size_type const& idx) const - { - auto group_label = _group_labels_begin[idx]; - if constexpr (following_less_than_0) { - auto group_start = _group_offsets_begin[group_label]; - return thrust::maximum{}(_following_window, -(idx - group_start) - 1); - } else { - auto group_end = - _group_offsets_begin[group_label + 1]; // Cannot fall off the end, since offsets - // is capped with `input.size()`. - return thrust::minimum{}(_following_window, (group_end - 1) - idx); - } - } -}; - -/// Helper to materialize following-window column, corrected to respect group boundaries. -/// i.e. If following window == 5, then: -/// 1. For the last row in the group, the following is set to 0. -/// 2. For the second last row in the group, following is set to 1, etc. -std::unique_ptr make_following_column( - rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& following_window, - cudf::size_type const& num_rows, - rmm::cuda_stream_view stream) -{ - if (following_window < 0) { - auto const calc = row_based_following_calc(group_offsets, group_labels, following_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } else { - auto const calc = - row_based_following_calc(group_offsets, group_labels, following_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } -} - std::unique_ptr grouped_rolling_window(table_view const& group_keys, column_view const& input, column_view const& default_outputs, @@ -208,10 +106,6 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, // groups.) // 3. [0, 500, 1000] indicates two equal-sized groups: [0,500), and [500,1000). 
-    assert(group_offsets.size() >= 2 && group_offsets.element(0, stream) == 0 &&
-           group_offsets.element(group_offsets.size() - 1, stream) == input.size() &&
-           "Must have at least one group.");
-
   if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) {
     cudf::detail::preceding_window_wrapper grouped_preceding_window{
       group_offsets.data(), group_labels.data(), preceding_window};
@@ -229,18 +123,14 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys,
                                         stream,
                                         mr);
   } else {
-    auto const preceding_column =
-      make_preceding_column(group_offsets, group_labels, preceding_window, input.size(), stream);
-    auto const following_column =
-      make_following_column(group_offsets, group_labels, following_window, input.size(), stream);
-    return cudf::detail::rolling_window(input,
-                                        default_outputs,
-                                        preceding_column->view().begin(),
-                                        following_column->view().begin(),
-                                        min_periods,
-                                        aggr,
-                                        stream,
-                                        mr);
+    namespace utils = cudf::detail::rolling;
+    auto groups     = utils::grouped{group_labels.data(), group_offsets.data()};
+    auto preceding  =
+      utils::make_clamped_window_iterator(preceding_window, groups);
+    auto following  =
+      utils::make_clamped_window_iterator(following_window, groups);
+    return cudf::detail::rolling_window(
+      input, default_outputs, preceding, following, min_periods, aggr, stream, mr);
   }
 }
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 6c9c2aada9a..1ade2143494 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -2196,6 +2196,76 @@ TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite)
   EXPECT_EQ(out_buffer.size(), 0);
 }
+
+// Tests that reading works when the number of row groups exceeds the grid Y-dimension limit (65535)
+// Disabled because of the high execution time (especially compared to the likelihood of regression)
+TEST_F(OrcReaderTest, DISABLED_Over65kRowGroups)
+{
+  auto constexpr row_group_size = 512;
+  constexpr auto num_rows       = (1 << 16) * row_group_size + 1;
+
+  auto vals_col = random_values(num_rows);
+  dec64_col col{vals_col.begin(), vals_col.end(), numeric::scale_type{2}};
+  table_view chunk_table({col});
+
+  std::vector out_buffer;
+  cudf::io::orc_writer_options out_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, chunk_table)
+      .row_index_stride(row_group_size);
+
+  cudf::io::write_orc(out_opts);
+
+  cudf::io::orc_reader_options read_opts = cudf::io::orc_reader_options::builder(
+    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
+  auto result = cudf::io::read_orc(read_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(chunk_table, result.tbl->view());
+}
+
+// Tests that reading works when the number of stripes exceeds the grid Y-dimension limit (65535)
+// Disabled because of the high execution time (especially compared to the likelihood of regression)
+TEST_F(OrcReaderTest, DISABLED_Over65kStripes)
+{
+  auto constexpr stripe_size = 512;
+  constexpr auto num_rows    = (1 << 16) * stripe_size + 1;
+
+  auto vals_col = random_values(num_rows);
+  dec64_col col{vals_col.begin(), vals_col.end(), numeric::scale_type{2}};
+  table_view chunk_table({col});
+
+  std::vector out_buffer;
+  cudf::io::orc_writer_options out_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, chunk_table)
+      .stripe_size_rows(stripe_size)
+      .compression(cudf::io::compression_type::NONE);
+
+  cudf::io::write_orc(out_opts);
+
+  cudf::io::orc_reader_options read_opts = cudf::io::orc_reader_options::builder(
+    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
+  auto result = cudf::io::read_orc(read_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(chunk_table, result.tbl->view());
+}
+
+// Tests that writing works when the number of columns exceeds the grid Y-dimension limit (65535)
+// Disabled because of the high execution time (especially compared to the likelihood of regression)
+TEST_F(OrcWriterTest, DISABLED_Over65kColumns)
+{
+  auto vals_col = random_values(8);
+  dec64_col col{vals_col.begin(), vals_col.end(), numeric::scale_type{2}};
+  cudf::column_view col_view = col;
+  table_view expected(std::vector{(1 << 16) + 1, col_view});
+
+  std::vector out_buffer;
+  cudf::io::orc_writer_options out_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected)
+      .compression(cudf::io::compression_type::NONE);
+  cudf::io::write_orc(out_opts);
+
+  cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(
+    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
+  auto result = cudf::io::read_orc(in_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
 TEST_F(OrcWriterTest, MultipleBlocksInStripeFooter)
 {
   std::vector vals_col(8, "a");
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index 72a511fd5f1..5b57ec46d19 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@
 #include
+#include
 #include
 #include
@@ -660,7 +661,7 @@ TEST_F(RollingErrorTest, WindowArraySizeMismatch)
   cudf::test::fixed_width_column_wrapper input(
     col_data.begin(), col_data.end(), col_valid.begin());
-  std::vector five({2, 1, 2, 1, 4});
+  std::vector five({1, 1, 2, 1, 0});
   std::vector four({1, 2, 3, 4});
   cudf::test::fixed_width_column_wrapper five_elements(five.begin(), five.end());
   cudf::test::fixed_width_column_wrapper four_elements(four.begin(), four.end());
@@ -985,22 +986,7 @@ TYPED_TEST(RollingTest, SimpleDynamic)
   cudf::test::fixed_width_column_wrapper input(
     col_data.begin(), col_data.end(), col_mask.begin());
   std::vector preceding_window({1, 2, 3, 4, 2});
-  std::vector following_window({2, 1, 2, 1, 2});
-
-  // dynamic sizes
-  this->run_test_col_agg(input, preceding_window, following_window, 1);
-}
-
-// this is a special test to check the volatile count variable issue (see rolling.cu for detail)
-TYPED_TEST(RollingTest, VolatileCount)
-{
-  auto const col_data = cudf::test::make_type_param_vector({8, 70, 45, 20, 59, 80});
-  const std::vector col_mask = {1, 1, 0, 0, 1, 0};
-
-  cudf::test::fixed_width_column_wrapper input(
-    col_data.begin(), col_data.end(), col_mask.begin());
-  std::vector preceding_window({5, 9, 4, 8, 3, 3});
-  std::vector following_window({1, 1, 9, 2, 8, 9});
+  std::vector following_window({2, 1, 2, 1, 0});
 
   // dynamic sizes
   this->run_test_col_agg(input, preceding_window, following_window, 1);
@@ -1128,14 +1114,19 @@ TYPED_TEST(RollingTest, RandomDynamicAllValid)
   // random parameters
   cudf::test::UniformRandomGenerator window_rng(0, max_window_size);
-  auto generator = [&]() { return window_rng.generate(); };
 
   std::vector preceding_window(num_rows);
   std::vector following_window(num_rows);
-  std::generate(preceding_window.begin(), preceding_window.end(), generator);
-  std::generate(following_window.begin(), following_window.end(), generator);
-
+  auto it = thrust::make_counting_iterator(0);
+  std::transform(it, it +
num_rows, preceding_window.begin(), [&window_rng, num_rows](auto i) { + auto p = window_rng.generate(); + return std::min(i + 1, std::max(p, i + 1 - num_rows)); + }); + std::transform(it, it + num_rows, following_window.begin(), [&window_rng, num_rows](auto i) { + auto f = window_rng.generate(); + return std::max(-i - 1, std::min(f, num_rows - i - 1)); + }); this->run_test_col_agg(input, preceding_window, following_window, max_window_size); } @@ -1157,14 +1148,19 @@ TYPED_TEST(RollingTest, RandomDynamicWithInvalid) // random parameters cudf::test::UniformRandomGenerator window_rng(0, max_window_size); - auto generator = [&]() { return window_rng.generate(); }; std::vector preceding_window(num_rows); std::vector following_window(num_rows); - std::generate(preceding_window.begin(), preceding_window.end(), generator); - std::generate(following_window.begin(), following_window.end(), generator); - + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, preceding_window.begin(), [&window_rng, num_rows](auto i) { + auto p = window_rng.generate(); + return std::min(i + 1, std::max(p, i + 1 - num_rows)); + }); + std::transform(it, it + num_rows, following_window.begin(), [&window_rng, num_rows](auto i) { + auto f = window_rng.generate(); + return std::max(-i - 1, std::min(f, num_rows - i - 1)); + }); this->run_test_col_agg(input, preceding_window, following_window, max_window_size); } diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp index 7d2467fd18d..c7d7b4aa5c4 100644 --- a/cpp/tests/streams/rolling_test.cpp +++ b/cpp/tests/streams/rolling_test.cpp @@ -54,8 +54,8 @@ TEST_F(RollingTest, FixedSizeDefault) TEST_F(RollingTest, VariableSize) { cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper preceding({2, 2, 2, 2, 3, 3, 3, 3, 3}); - cudf::test::fixed_width_column_wrapper following({3, 3, 3, 3, 3, 2, 2, 2, 2}); + cudf::test::fixed_width_column_wrapper preceding({1, 2, 2, 2, 3, 3, 3, 3, 3}); + cudf::test::fixed_width_column_wrapper following({3, 3, 3, 3, 3, 2, 2, 1, 0}); cudf::rolling_window(input, preceding, diff --git a/dependencies.yaml b/dependencies.yaml index 25866e85a0b..edd83e6e07d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -438,12 +438,21 @@ dependencies: matrices: - matrix: cuda: "12.*" + use_cuda_wheels: "true" packages: - nvidia-nvcomp-cu12==4.1.0.6 - matrix: cuda: "11.*" + use_cuda_wheels: "true" packages: - nvidia-nvcomp-cu11==4.1.0.6 + # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels + # (e.g. 
for DLFW and pip devcontainers) + - matrix: + use_cuda_wheels: "false" + packages: + # if no matching matrix selectors passed, list the unsuffixed packages + # (just as a source of documentation, as this populates pyproject.toml in source control) - matrix: packages: - nvidia-nvcomp==4.1.0.6 @@ -708,7 +717,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0 + - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0a0 + - &numba-dep numba>=0.59.1,<0.61.0a0 - nvtx>=0.2.1 - packaging - rich @@ -835,6 +845,7 @@ dependencies: - matrix: {dependencies: "latest"} packages: - *numba-cuda-dep + - *numba-dep - pandas==2.2.3 - matrix: packages: @@ -917,6 +928,7 @@ dependencies: packages: - dask-cuda==25.2.*,>=0.0.0a0 - *numba-cuda-dep + - *numba-dep specific: - output_types: [conda, requirements] matrices: diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index c07a58ed8a5..42fa871abeb 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -396,7 +396,16 @@ private HostUDFAggregation(HostUDFWrapper wrapper) { @Override long createNativeInstance() { - return Aggregation.createHostUDFAgg(wrapper.udfNativeHandle); + long udf = 0; + try { + udf = wrapper.createUDFInstance(); + return Aggregation.createHostUDFAgg(udf); + } finally { + // a new UDF is cloned in `createHostUDFAgg`, here should close the UDF instance. + if (udf != 0) { + HostUDFWrapper.closeUDFInstance(udf); + } + } } @Override diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index 7251ce643d4..9686cf17b60 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -367,6 +367,23 @@ public final int getInt(long offset) { return UnsafeMemoryAccessor.getInt(requestedAddress); } + /** + * Copy a set of ints to an array from the buffer starting at offset. + * @param dst destination int array + * @param dstIndex starting index within the destination array + * @param srcOffset starting offset within this buffer + * @param count number of ints to copy + */ + public final void getInts(int[] dst, long dstIndex, long srcOffset, int count) { + assert count >= 0; + assert count <= dst.length - dstIndex; + assert srcOffset >= 0; + long requestedAddress = this.address + srcOffset; + addressOutOfBoundsCheck(requestedAddress, count * 4L, "getInts"); + UnsafeMemoryAccessor.getInts(dst, dstIndex, requestedAddress, count); + } + + /** * Sets the Integer value at that offset * @param offset - offset from the address diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java index 124f2c99188..0bd3ccad0d3 100644 --- a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java +++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
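The hunk below reworks `HostUDFWrapper` from an `AutoCloseable` holder of a native handle into an abstract factory: implementations now override `createUDFInstance`, `computeHashCode`, and `isEqual`, and native instances are released through the static `closeUDFInstance`. As a rough sketch of what a subclass of the new contract could look like (illustration only; the class name and the JNI factory `createMyNativeUDF` are invented for this example and are not part of the patch):

// Hypothetical subclass of the reworked HostUDFWrapper (sketch, not part of this change).
public class MyHostUDF extends HostUDFWrapper {
  @Override
  public long createUDFInstance() {
    // Return a fresh native host UDF handle; the caller is expected to release it with
    // HostUDFWrapper.closeUDFInstance(handle) once the aggregation has cloned it.
    return createMyNativeUDF(); // assumed JNI factory, not provided by cudf
  }

  @Override
  public int computeHashCode() {
    // Stateless UDF: hash on the class identity.
    return MyHostUDF.class.hashCode();
  }

  @Override
  public boolean isEqual(Object obj) {
    return obj instanceof MyHostUDF;
  }

  private static native long createMyNativeUDF();
}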
@@ -19,18 +19,56 @@
 /**
  * A wrapper around native host UDF aggregations.
  * <p>
- * This class is used to store the native handle of a host UDF aggregation and is used as
+ * This class is used to create the native handle of a host UDF aggregation and is used as
  * a proxy object to compute hash code and compare two host UDF aggregations for equality.
  * <p>
  * A new host UDF aggregation implementation must extend this class and override the
- * {@code hashCode} and {@code equals} methods for such purposes.
- * In addition, since this class implements {@code AutoCloseable}, the {@code close} method must
- * also be overridden to automatically delete the native UDF instance upon class destruction.
+ * {@code computeHashCode} and {@code isEqual} methods for such purposes.
+ *
  */
-public abstract class HostUDFWrapper implements AutoCloseable {
-  public final long udfNativeHandle;
+public abstract class HostUDFWrapper {
+
+  /**
+   * Create a derived host UDF native instance.
+   * The instance created by this function MUST be closed by `closeUDFInstance`
+   * <p>Typical usage, refer to Aggregation.java:</p>
+   * <pre>
+   * long udf = 0;
+   * try {
+   *     udf = wrapper.createUDFInstance();
+   *     return Aggregation.createHostUDFAgg(udf);
+   * } finally {
+   *     // a new UDF is cloned in `createHostUDFAgg`, so the UDF instance should be closed here.
+   *     if (udf != 0) {
+   *         HostUDFWrapper.closeUDFInstance(udf);
+   *     }
+   * }
+   * </pre>
+ * + */ + public abstract long createUDFInstance(); + + /** + * Close the derived UDF instance created by `createUDFInstance`. + * @param hostUDFInstance the UDF instance + */ + public static void closeUDFInstance(long hostUDFInstance) { + close(hostUDFInstance); + } + + public abstract int computeHashCode(); - public HostUDFWrapper(long udfNativeHandle) { - this.udfNativeHandle = udfNativeHandle; + @Override + public int hashCode() { + return computeHashCode(); } + + public abstract boolean isEqual(Object obj); + + @Override + public boolean equals(Object obj) { + return isEqual(obj); + } + + static native void close(long hostUDFInstance); } diff --git a/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java b/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java index ebc3f43ccf4..fcb3f5e40fc 100644 --- a/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java +++ b/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -154,6 +154,19 @@ public static int getInt(long address) { return UNSAFE.getInt(address); } + /** + * Copy out an array of ints. + * @param dst where to write the data + * @param dstIndex index into values to start writing at. + * @param address src memory address + * @param count the number of ints to copy + * @throws IndexOutOfBoundsException + */ + public static void getInts(int[] dst, long dstIndex, long address, int count) { + copyMemory(null, address, + dst, UnsafeMemoryAccessor.INT_ARRAY_OFFSET + (dstIndex * 4), count * 4); + } + /** * Sets the Integer value at that address * @param address - memory address diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index bd1714aa476..3923d8b45e3 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -148,6 +148,7 @@ add_library( src/DataSourceHelperJni.cpp src/HashJoinJni.cpp src/HostMemoryBufferNativeUtilsJni.cpp + src/HostUDFWrapperJni.cpp src/NvcompJni.cpp src/NvtxRangeJni.cpp src/NvtxUniqueRangeJni.cpp diff --git a/java/src/main/native/src/HostUDFWrapperJni.cpp b/java/src/main/native/src/HostUDFWrapperJni.cpp new file mode 100644 index 00000000000..1dd1b483fac --- /dev/null +++ b/java/src/main/native/src/HostUDFWrapperJni.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cudf_jni_apis.hpp" + +#include + +extern "C" { + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_HostUDFWrapper_close(JNIEnv* env, + jclass class_object, + jlong ptr) +{ + try { + auto to_del = reinterpret_cast(ptr); + delete to_del; + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d1a1ff2c95c..326b4251335 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2983,7 +2983,7 @@ void testWindowStaticCounts() { @Test void testWindowDynamicNegative() { - try (ColumnVector precedingCol = ColumnVector.fromInts(3, 3, 3, 4, 4); + try (ColumnVector precedingCol = ColumnVector.fromInts(1, 2, 3, 4, 4); ColumnVector followingCol = ColumnVector.fromInts(-1, -1, -1, -1, 0)) { try (WindowOptions window = WindowOptions.builder() .minPeriods(2).window(precedingCol, followingCol).build()) { @@ -3014,7 +3014,7 @@ void testWindowLag() { @Test void testWindowDynamic() { try (ColumnVector precedingCol = ColumnVector.fromInts(1, 2, 3, 1, 2); - ColumnVector followingCol = ColumnVector.fromInts(2, 2, 2, 2, 2)) { + ColumnVector followingCol = ColumnVector.fromInts(2, 2, 2, 1, 0)) { try (WindowOptions window = WindowOptions.builder().minPeriods(2) .window(precedingCol, followingCol).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); diff --git a/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java b/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java index b7fde511c38..e0cfeff97e1 100644 --- a/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java +++ b/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,6 +87,20 @@ public void testGetInt() { } } + @Test + public void testGetInts() { + try (HostMemoryBuffer hostMemoryBuffer = HostMemoryBuffer.allocate(16)) { + hostMemoryBuffer.setInt(0, 1); + hostMemoryBuffer.setInt(4, 2); + hostMemoryBuffer.setInt(8, 3); + hostMemoryBuffer.setInt(12, 4); + int[] expectedInts = new int[] {1, 2, 3, 4}; + int[] result = new int[expectedInts.length]; + hostMemoryBuffer.getInts(result, 0, 0, 4); + assertArrayEquals(expectedInts, result); + } + } + @Test public void testGetByte() { try (HostMemoryBuffer hostMemoryBuffer = HostMemoryBuffer.allocate(16)) { diff --git a/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java b/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java index a3684cb42b9..645d0789ef8 100644 --- a/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java +++ b/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -66,6 +66,24 @@ public void setIntAndGetInt() { } } + @Test + public void setAndGetInts() { + int numInts = 289; + long address = UnsafeMemoryAccessor.allocate(numInts * 4); + try { + for (int i = 0; i < numInts; i++) { + UnsafeMemoryAccessor.setInt(address + i * 4, i); + } + int[] ints = new int[numInts]; + UnsafeMemoryAccessor.getInts(ints, 0, address, numInts); + for (int i = 0; i < numInts; i++) { + assertEquals(i, ints[i]); + } + } finally { + UnsafeMemoryAccessor.free(address); + } + } + @Test public void setMemoryValue() { long address = UnsafeMemoryAccessor.allocate(4); diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4c6f8a9c152..50e6b42129e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3659,6 +3659,9 @@ def sort_values( by_columns = by_in_index else: raise KeyError(by) + + if cudf.get_option("mode.pandas_compatible"): + by_columns = by_columns.nans_to_nulls() # argsort the `by` column out = self._gather( GatherMap.from_column_unchecked( diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 7e5ce713c7e..556a3826eb7 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import string from itertools import product @@ -7,7 +7,7 @@ import pandas as pd import pytest -from cudf import DataFrame, Series +from cudf import DataFrame, Series, option_context from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.column import NumericalColumn from cudf.testing import assert_eq @@ -46,6 +46,14 @@ def test_dataframe_sort_values(nelem, dtype): assert_eq(sorted_df["b"].values, bb[sorted_index]) +def test_sort_values_nans_pandas_compat(): + data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} + with option_context("mode.pandas_compatible", True): + result = DataFrame(data).sort_values("b", na_position="first") + expected = pd.DataFrame(data).sort_values("b", na_position="first") + assert_eq(result, expected) + + @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("index", ["a", "b", ["a", "b"]]) def test_dataframe_sort_values_ignore_index(index, ignore_index): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index c6a5887f85d..2b03f515657 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,7 +24,8 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0", + "numba-cuda>=0.2.0,<0.3.0a0", + "numba>=0.59.1,<0.61.0a0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 15079f7f06a..3725722a8ae 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -47,7 +47,8 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.2.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0", + "numba-cuda>=0.2.0,<0.3.0a0", + "numba>=0.59.1,<0.61.0a0", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index ac85298bc40..9fe930d754c 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -75,7 +75,7 @@ regex = "(?P.*)" [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" -matrix-entry = "cuda_suffixed=true" +matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.26.4,!=3.30.0", "libkvikio==25.2.*,>=0.0.0a0",
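As a closing illustration of the window-bounds changes exercised by the updated rolling tests above (rolling_test.cpp, streams/rolling_test.cpp, and ColumnVectorTest.java), the sketch below clamps per-row preceding/following sizes so that no window reaches outside the column, mirroring the clamping formulas used in the updated C++ tests. The class name and the fixed requested sizes are invented for the example.

import ai.rapids.cudf.ColumnVector;

// Sketch: build in-bounds preceding/following window columns for a column of numRows rows.
public class ClampedWindowsExample {
  // Requested preceding size p for row i, clamped so the window start stays inside the column.
  static int clampPreceding(int p, int i, int numRows) {
    return Math.min(i + 1, Math.max(p, i + 1 - numRows));
  }

  // Requested following size f for row i, clamped so the window end stays inside the column.
  static int clampFollowing(int f, int i, int numRows) {
    return Math.max(-i - 1, Math.min(f, numRows - i - 1));
  }

  public static void main(String[] args) {
    int numRows = 5;
    int[] preceding = new int[numRows];
    int[] following = new int[numRows];
    for (int i = 0; i < numRows; i++) {
      preceding[i] = clampPreceding(3, i, numRows); // request up to 3 rows ending at row i
      following[i] = clampFollowing(2, i, numRows); // request up to 2 rows after row i
    }
    try (ColumnVector precedingCol = ColumnVector.fromInts(preceding);
         ColumnVector followingCol = ColumnVector.fromInts(following)) {
      // precedingCol/followingCol can now be passed to WindowOptions.builder().window(...).
    }
  }
}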