Skip to content

Commit

Permalink
Merge branch 'fix_long_strings' of https://github.com/pmattione-nvidia/cudf into fix_long_strings
Browse files Browse the repository at this point in the history
  • Loading branch information
pmattione-nvidia committed Jan 24, 2025
2 parents fd3aed2 + 29558e5 commit dcc11c2
Show file tree
Hide file tree
Showing 36 changed files with 641 additions and 402 deletions.
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.2.0,<0.3.0
- numba-cuda>=0.2.0,<0.3.0a0
- numba>=0.59.1,<0.61.0a0
- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-64=11.8
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.2.0,<0.3.0
- numba-cuda>=0.2.0,<0.3.0a0
- numba>=0.59.1,<0.61.0a0
- numpy>=1.23,<3.0a0
- numpydoc
- nvcomp==4.1.0.6
Expand Down
3 changes: 2 additions & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ requirements:
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.4dev0
- cupy >=12.0.0
- numba-cuda >=0.2.0,<0.3.0
- numba-cuda >=0.2.0,<0.3.0a0
- numba >=0.59.1,<0.61.0a0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- libcudf ={{ version }}
Expand Down
7 changes: 7 additions & 0 deletions cpp/include/cudf/rolling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ namespace CUDF_EXPORT cudf {
* - instead of storing NA/NaN for output rows that do not meet the minimum number of observations
* this function updates the valid bitmask of the column to indicate which elements are valid.
*
* @note Windows near the endpoints of the input are automatically clamped to be in-bounds.
*
* Notes on return column types:
* - The returned column for count aggregation always has `INT32` type.
* - The returned column for VARIANCE/STD aggregations always has `FLOAT64` type.
Expand Down Expand Up @@ -594,6 +596,11 @@ std::unique_ptr<column> grouped_range_rolling_window(
* column of the same type as the input. Therefore it is suggested to convert integer column types
* (especially low-precision integers) to `FLOAT32` or `FLOAT64` before doing a rolling `MEAN`.
*
* @note All entries in `preceding_window` and `following_window` must produce window extents that
* are in-bounds for the `input`. That is, for all `i`, it is required that the set of rows defined
* by the interval `[i - preceding_window[i] + 1, ..., i + following_window[i] + 1)` is a subset of
* `[0, input.size())`.
*
* @throws cudf::logic_error if window column type is not INT32
*
* @param[in] input The input column
Expand Down
37 changes: 17 additions & 20 deletions cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan<size_type> char_count
device_span<uint32_t const> str_col_indexes)
{
// Index of the column in the `str_col_indexes` array
auto const str_col_idx = blockIdx.y;
auto const str_col_idx = blockIdx.x % str_col_indexes.size();
// Index of the column in the `orc_columns` array
auto const col_idx = str_col_indexes[str_col_idx];
auto const row_group_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const row_group_idx = (blockIdx.x / str_col_indexes.size()) * blockDim.x + threadIdx.x;
if (row_group_idx >= rowgroup_bounds.size().first) { return; }

auto const& str_col = orc_columns[col_idx];
Expand All @@ -63,18 +63,17 @@ void rowgroup_char_counts(device_2dspan<size_type> counts,
if (rowgroup_bounds.count() == 0) { return; }

auto const num_rowgroups = rowgroup_bounds.size().first;
auto const num_str_cols = str_col_indexes.size();
if (num_str_cols == 0) { return; }
if (str_col_indexes.empty()) { return; }

int block_size = 0; // suggested thread count to use
int min_grid_size = 0; // minimum block count required
CUDF_CUDA_TRY(
cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, rowgroup_char_counts_kernel));
auto const grid_size =
dim3(cudf::util::div_rounding_up_unsafe<unsigned int>(num_rowgroups, block_size),
static_cast<unsigned int>(num_str_cols));
auto const num_blocks =
cudf::util::div_rounding_up_unsafe<unsigned int>(num_rowgroups, block_size) *
str_col_indexes.size();

rowgroup_char_counts_kernel<<<grid_size, block_size, 0, stream.value()>>>(
rowgroup_char_counts_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
counts, orc_columns, rowgroup_bounds, str_col_indexes);
}

Expand Down Expand Up @@ -104,8 +103,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
populate_dictionary_hash_maps_kernel(device_2dspan<stripe_dictionary> dictionaries,
device_span<orc_column_device_view const> columns)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const t = threadIdx.x;
auto& dict = dictionaries[col_idx][stripe_idx];
auto const& col = columns[dict.column_idx];
Expand Down Expand Up @@ -166,8 +165,8 @@ template <int block_size>
CUDF_KERNEL void __launch_bounds__(block_size)
collect_map_entries_kernel(device_2dspan<stripe_dictionary> dictionaries)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const& dict = dictionaries[col_idx][stripe_idx];

if (not dict.is_enabled) { return; }
Expand Down Expand Up @@ -200,8 +199,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
get_dictionary_indices_kernel(device_2dspan<stripe_dictionary> dictionaries,
device_span<orc_column_device_view const> columns)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const t = threadIdx.x;
auto const& dict = dictionaries[col_idx][stripe_idx];
auto const& col = columns[dict.column_idx];
Expand Down Expand Up @@ -244,18 +243,17 @@ void populate_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 256;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
populate_dictionary_hash_maps_kernel<block_size>
<<<dim_grid, block_size, 0, stream.value()>>>(dictionaries, columns);
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries, columns);
}

void collect_map_entries(device_2dspan<stripe_dictionary> dictionaries,
rmm::cuda_stream_view stream)
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 1024;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
collect_map_entries_kernel<block_size><<<dim_grid, block_size, 0, stream.value()>>>(dictionaries);
collect_map_entries_kernel<block_size>
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries);
}

void get_dictionary_indices(device_2dspan<stripe_dictionary> dictionaries,
Expand All @@ -264,9 +262,8 @@ void get_dictionary_indices(device_2dspan<stripe_dictionary> dictionaries,
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 1024;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
get_dictionary_indices_kernel<block_size>
<<<dim_grid, block_size, 0, stream.value()>>>(dictionaries, columns);
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries, columns);
}

} // namespace cudf::io::orc::gpu
21 changes: 12 additions & 9 deletions cpp/src/io/orc/stats_enc.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,9 +14,9 @@
* limitations under the License.
*/

#include "io/utilities/block_utils.cuh"
#include "orc_gpu.hpp"

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/orc_types.hpp>
#include <cudf/strings/detail/convert/fixed_point_to_string.cuh>

Expand All @@ -40,8 +40,9 @@ CUDF_KERNEL void __launch_bounds__(init_threads_per_block)
device_2dspan<rowgroup_rows const> rowgroup_bounds)
{
__shared__ __align__(4) statistics_group group_g[init_groups_per_block];
auto const col_id = blockIdx.y;
auto const chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y;
auto const col_id = blockIdx.x % rowgroup_bounds.size().second;
auto const chunk_id =
(blockIdx.x / rowgroup_bounds.size().second * init_groups_per_block) + threadIdx.y;
auto const t = threadIdx.x;
auto const num_rowgroups = rowgroup_bounds.size().first;
statistics_group* group = &group_g[threadIdx.y];
Expand Down Expand Up @@ -452,10 +453,12 @@ void orc_init_statistics_groups(statistics_group* groups,
device_2dspan<rowgroup_rows const> rowgroup_bounds,
rmm::cuda_stream_view stream)
{
dim3 dim_grid((rowgroup_bounds.size().first + init_groups_per_block - 1) / init_groups_per_block,
rowgroup_bounds.size().second);
auto const num_blocks =
cudf::util::div_rounding_up_safe<size_t>(rowgroup_bounds.size().first, init_groups_per_block) *
rowgroup_bounds.size().second;

dim3 dim_block(init_threads_per_group, init_groups_per_block);
gpu_init_statistics_groups<<<dim_grid, dim_block, 0, stream.value()>>>(
gpu_init_statistics_groups<<<num_blocks, dim_block, 0, stream.value()>>>(
groups, cols, rowgroup_bounds);
}

Expand Down Expand Up @@ -490,8 +493,8 @@ void orc_encode_statistics(uint8_t* blob_bfr,
uint32_t statistics_count,
rmm::cuda_stream_view stream)
{
unsigned int num_blocks =
(statistics_count + encode_chunks_per_block - 1) / encode_chunks_per_block;
auto const num_blocks =
cudf::util::div_rounding_up_safe(statistics_count, encode_chunks_per_block);
dim3 dim_block(encode_threads_per_chunk, encode_chunks_per_block);
gpu_encode_statistics<<<num_blocks, dim_block, 0, stream.value()>>>(
blob_bfr, groups, chunks, statistics_count);
Expand Down
37 changes: 21 additions & 16 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1288,16 +1288,17 @@ CUDF_KERNEL void __launch_bounds__(block_size)
} temp_storage;

orcdec_state_s* const s = &state_g;
bool const is_nulldec = (blockIdx.y >= num_stripes);
uint32_t const column = blockIdx.x;
uint32_t const stripe = (is_nulldec) ? blockIdx.y - num_stripes : blockIdx.y;
// Need the modulo because we have twice as many threads as columns*stripes
uint32_t const column = blockIdx.x / num_stripes;
uint32_t const stripe = blockIdx.x % num_stripes;
uint32_t const chunk_id = stripe * num_columns + column;
int t = threadIdx.x;

if (t == 0) s->chunk = chunks[chunk_id];
__syncthreads();
size_t const max_num_rows = s->chunk.column_num_rows - s->chunk.parent_validity_info.null_count;

bool const is_nulldec = (blockIdx.y == 0);
if (is_nulldec) {
uint32_t null_count = 0;
// Decode NULLs
Expand Down Expand Up @@ -1576,7 +1577,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
auto num_rowgroups = row_groups.size().first;

if (num_rowgroups > 0) {
if (t == 0) { s->top.data.index = row_groups[blockIdx.y][blockIdx.x]; }
if (t == 0) {
auto const rg_idx = blockIdx.x % num_rowgroups;
auto const col_idx = blockIdx.x / num_rowgroups;
s->top.data.index = row_groups[rg_idx][col_idx];
}
__syncthreads();
chunk_id = s->top.data.index.chunk_id;
} else {
Expand All @@ -1601,10 +1606,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) {
atomicAdd(error_count, 1);
}
auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
uint32_t const rg_idx = blockIdx.x % num_rowgroups;
uint32_t rowgroup_rowofs =
(level == 0) ? (blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride
(level == 0) ? (rg_idx - cuda::std::min(s->chunk.rowgroup_id, rg_idx)) * rowidx_stride
: s->top.data.index.start_row;
;
s->chunk.streams[CI_DATA] += ofs0;
Expand Down Expand Up @@ -2025,7 +2031,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
}
if (t == 0 and (s->chunk.type_kind == LIST or s->chunk.type_kind == MAP)) {
if (num_rowgroups > 0) {
row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows;
auto const rg_idx = blockIdx.x % num_rowgroups;
auto const col_idx = blockIdx.x / num_rowgroups;
row_groups[rg_idx][col_idx].num_child_rows = s->num_child_rows;
}
cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{chunks[chunk_id].num_child_rows};
ref.fetch_add(s->num_child_rows, cuda::std::memory_order_relaxed);
Expand All @@ -2049,9 +2057,9 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
int64_t first_row,
rmm::cuda_stream_view stream)
{
dim3 dim_block(block_size, 1);
dim3 dim_grid(num_columns, num_stripes * 2); // 1024 threads per chunk
gpuDecodeNullsAndStringDictionaries<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
dim3 dim_grid(num_columns * num_stripes, 2);

gpuDecodeNullsAndStringDictionaries<block_size><<<dim_grid, block_size, 0, stream.value()>>>(
chunks, global_dictionary, num_columns, num_stripes, first_row);
}

Expand Down Expand Up @@ -2083,11 +2091,8 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks,
size_type* error_count,
rmm::cuda_stream_view stream)
{
auto const num_chunks = num_columns * num_stripes;
dim3 dim_block(block_size, 1); // 1024 threads per chunk
dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks,
(num_rowgroups > 0) ? num_rowgroups : 1);
gpuDecodeOrcColumnData<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
auto const num_blocks = num_columns * (num_rowgroups > 0 ? num_rowgroups : num_stripes);
gpuDecodeOrcColumnData<block_size><<<num_blocks, block_size, 0, stream.value()>>>(
chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count);
}

Expand Down
Loading

0 comments on commit dcc11c2

Please sign in to comment.