Skip to content

Commit

Permalink
Merge branch 'fix_long_strings' of https://github.com/pmattione-nvidia/cudf into fix_long_strings
Browse files Browse the repository at this point in the history
  • Loading branch information
pmattione-nvidia committed Jan 24, 2025
2 parents fd3aed2 + 29558e5 commit dcc11c2
Show file tree
Hide file tree
Showing 36 changed files with 641 additions and 402 deletions.
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.2.0,<0.3.0
- numba-cuda>=0.2.0,<0.3.0a0
- numba>=0.59.1,<0.61.0a0
- numpy>=1.23,<3.0a0
- numpydoc
- nvcc_linux-64=11.8
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ dependencies:
- nbsphinx
- ninja
- notebook
- numba-cuda>=0.2.0,<0.3.0
- numba-cuda>=0.2.0,<0.3.0a0
- numba>=0.59.1,<0.61.0a0
- numpy>=1.23,<3.0a0
- numpydoc
- nvcomp==4.1.0.6
Expand Down
3 changes: 2 additions & 1 deletion conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ requirements:
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.4dev0
- cupy >=12.0.0
- numba-cuda >=0.2.0,<0.3.0
- numba-cuda >=0.2.0,<0.3.0a0
- numba >=0.59.1,<0.61.0a0
- numpy >=1.23,<3.0a0
- pyarrow>=14.0.0,<18.0.0a0
- libcudf ={{ version }}
Expand Down
7 changes: 7 additions & 0 deletions cpp/include/cudf/rolling.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ namespace CUDF_EXPORT cudf {
* - instead of storing NA/NaN for output rows that do not meet the minimum number of observations
* this function updates the valid bitmask of the column to indicate which elements are valid.
*
* @note Windows near the endpoints of the input are automatically clamped to be in-bounds.
*
* Notes on return column types:
* - The returned column for count aggregation always has `INT32` type.
* - The returned column for VARIANCE/STD aggregations always has `FLOAT64` type.
Expand Down Expand Up @@ -594,6 +596,11 @@ std::unique_ptr<column> grouped_range_rolling_window(
* column of the same type as the input. Therefore it is suggested to convert integer column types
* (especially low-precision integers) to `FLOAT32` or `FLOAT64` before doing a rolling `MEAN`.
*
* @note All entries in `preceding_window` and `following_window` must produce window extents that
* are in-bounds for the `input`. That is, for all `i`, it is required that the set of rows defined
* by the interval `[i - preceding_window[i] + 1, ..., i + following_window[i] + 1)` is a subset of
* `[0, input.size())`.
*
* @throws cudf::logic_error if window column type is not INT32
*
* @param[in] input The input column
Expand Down
37 changes: 17 additions & 20 deletions cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,10 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan<size_type> char_count
device_span<uint32_t const> str_col_indexes)
{
// Index of the column in the `str_col_indexes` array
auto const str_col_idx = blockIdx.y;
auto const str_col_idx = blockIdx.x % str_col_indexes.size();
// Index of the column in the `orc_columns` array
auto const col_idx = str_col_indexes[str_col_idx];
auto const row_group_idx = blockIdx.x * blockDim.x + threadIdx.x;
auto const row_group_idx = (blockIdx.x / str_col_indexes.size()) * blockDim.x + threadIdx.x;
if (row_group_idx >= rowgroup_bounds.size().first) { return; }

auto const& str_col = orc_columns[col_idx];
Expand All @@ -63,18 +63,17 @@ void rowgroup_char_counts(device_2dspan<size_type> counts,
if (rowgroup_bounds.count() == 0) { return; }

auto const num_rowgroups = rowgroup_bounds.size().first;
auto const num_str_cols = str_col_indexes.size();
if (num_str_cols == 0) { return; }
if (str_col_indexes.empty()) { return; }

int block_size = 0; // suggested thread count to use
int min_grid_size = 0; // minimum block count required
CUDF_CUDA_TRY(
cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, rowgroup_char_counts_kernel));
auto const grid_size =
dim3(cudf::util::div_rounding_up_unsafe<unsigned int>(num_rowgroups, block_size),
static_cast<unsigned int>(num_str_cols));
auto const num_blocks =
cudf::util::div_rounding_up_unsafe<unsigned int>(num_rowgroups, block_size) *
str_col_indexes.size();

rowgroup_char_counts_kernel<<<grid_size, block_size, 0, stream.value()>>>(
rowgroup_char_counts_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
counts, orc_columns, rowgroup_bounds, str_col_indexes);
}

Expand Down Expand Up @@ -104,8 +103,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
populate_dictionary_hash_maps_kernel(device_2dspan<stripe_dictionary> dictionaries,
device_span<orc_column_device_view const> columns)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const t = threadIdx.x;
auto& dict = dictionaries[col_idx][stripe_idx];
auto const& col = columns[dict.column_idx];
Expand Down Expand Up @@ -166,8 +165,8 @@ template <int block_size>
CUDF_KERNEL void __launch_bounds__(block_size)
collect_map_entries_kernel(device_2dspan<stripe_dictionary> dictionaries)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const& dict = dictionaries[col_idx][stripe_idx];

if (not dict.is_enabled) { return; }
Expand Down Expand Up @@ -200,8 +199,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
get_dictionary_indices_kernel(device_2dspan<stripe_dictionary> dictionaries,
device_span<orc_column_device_view const> columns)
{
auto const col_idx = blockIdx.x;
auto const stripe_idx = blockIdx.y;
auto const col_idx = blockIdx.x / dictionaries.size().second;
auto const stripe_idx = blockIdx.x % dictionaries.size().second;
auto const t = threadIdx.x;
auto const& dict = dictionaries[col_idx][stripe_idx];
auto const& col = columns[dict.column_idx];
Expand Down Expand Up @@ -244,18 +243,17 @@ void populate_dictionary_hash_maps(device_2dspan<stripe_dictionary> dictionaries
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 256;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
populate_dictionary_hash_maps_kernel<block_size>
<<<dim_grid, block_size, 0, stream.value()>>>(dictionaries, columns);
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries, columns);
}

void collect_map_entries(device_2dspan<stripe_dictionary> dictionaries,
rmm::cuda_stream_view stream)
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 1024;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
collect_map_entries_kernel<block_size><<<dim_grid, block_size, 0, stream.value()>>>(dictionaries);
collect_map_entries_kernel<block_size>
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries);
}

void get_dictionary_indices(device_2dspan<stripe_dictionary> dictionaries,
Expand All @@ -264,9 +262,8 @@ void get_dictionary_indices(device_2dspan<stripe_dictionary> dictionaries,
{
if (dictionaries.count() == 0) { return; }
constexpr int block_size = 1024;
dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second);
get_dictionary_indices_kernel<block_size>
<<<dim_grid, block_size, 0, stream.value()>>>(dictionaries, columns);
<<<dictionaries.count(), block_size, 0, stream.value()>>>(dictionaries, columns);
}

} // namespace cudf::io::orc::gpu
21 changes: 12 additions & 9 deletions cpp/src/io/orc/stats_enc.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,9 +14,9 @@
* limitations under the License.
*/

#include "io/utilities/block_utils.cuh"
#include "orc_gpu.hpp"

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/orc_types.hpp>
#include <cudf/strings/detail/convert/fixed_point_to_string.cuh>

Expand All @@ -40,8 +40,9 @@ CUDF_KERNEL void __launch_bounds__(init_threads_per_block)
device_2dspan<rowgroup_rows const> rowgroup_bounds)
{
__shared__ __align__(4) statistics_group group_g[init_groups_per_block];
auto const col_id = blockIdx.y;
auto const chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y;
auto const col_id = blockIdx.x % rowgroup_bounds.size().second;
auto const chunk_id =
(blockIdx.x / rowgroup_bounds.size().second * init_groups_per_block) + threadIdx.y;
auto const t = threadIdx.x;
auto const num_rowgroups = rowgroup_bounds.size().first;
statistics_group* group = &group_g[threadIdx.y];
Expand Down Expand Up @@ -452,10 +453,12 @@ void orc_init_statistics_groups(statistics_group* groups,
device_2dspan<rowgroup_rows const> rowgroup_bounds,
rmm::cuda_stream_view stream)
{
dim3 dim_grid((rowgroup_bounds.size().first + init_groups_per_block - 1) / init_groups_per_block,
rowgroup_bounds.size().second);
auto const num_blocks =
cudf::util::div_rounding_up_safe<size_t>(rowgroup_bounds.size().first, init_groups_per_block) *
rowgroup_bounds.size().second;

dim3 dim_block(init_threads_per_group, init_groups_per_block);
gpu_init_statistics_groups<<<dim_grid, dim_block, 0, stream.value()>>>(
gpu_init_statistics_groups<<<num_blocks, dim_block, 0, stream.value()>>>(
groups, cols, rowgroup_bounds);
}

Expand Down Expand Up @@ -490,8 +493,8 @@ void orc_encode_statistics(uint8_t* blob_bfr,
uint32_t statistics_count,
rmm::cuda_stream_view stream)
{
unsigned int num_blocks =
(statistics_count + encode_chunks_per_block - 1) / encode_chunks_per_block;
auto const num_blocks =
cudf::util::div_rounding_up_safe(statistics_count, encode_chunks_per_block);
dim3 dim_block(encode_threads_per_chunk, encode_chunks_per_block);
gpu_encode_statistics<<<num_blocks, dim_block, 0, stream.value()>>>(
blob_bfr, groups, chunks, statistics_count);
Expand Down
37 changes: 21 additions & 16 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1288,16 +1288,17 @@ CUDF_KERNEL void __launch_bounds__(block_size)
} temp_storage;

orcdec_state_s* const s = &state_g;
bool const is_nulldec = (blockIdx.y >= num_stripes);
uint32_t const column = blockIdx.x;
uint32_t const stripe = (is_nulldec) ? blockIdx.y - num_stripes : blockIdx.y;
// Need the modulo because we have twice as many threads as columns*stripes
uint32_t const column = blockIdx.x / num_stripes;
uint32_t const stripe = blockIdx.x % num_stripes;
uint32_t const chunk_id = stripe * num_columns + column;
int t = threadIdx.x;

if (t == 0) s->chunk = chunks[chunk_id];
__syncthreads();
size_t const max_num_rows = s->chunk.column_num_rows - s->chunk.parent_validity_info.null_count;

bool const is_nulldec = (blockIdx.y == 0);
if (is_nulldec) {
uint32_t null_count = 0;
// Decode NULLs
Expand Down Expand Up @@ -1576,7 +1577,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
auto num_rowgroups = row_groups.size().first;

if (num_rowgroups > 0) {
if (t == 0) { s->top.data.index = row_groups[blockIdx.y][blockIdx.x]; }
if (t == 0) {
auto const rg_idx = blockIdx.x % num_rowgroups;
auto const col_idx = blockIdx.x / num_rowgroups;
s->top.data.index = row_groups[rg_idx][col_idx];
}
__syncthreads();
chunk_id = s->top.data.index.chunk_id;
} else {
Expand All @@ -1601,10 +1606,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) {
atomicAdd(error_count, 1);
}
auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
uint32_t const rg_idx = blockIdx.x % num_rowgroups;
uint32_t rowgroup_rowofs =
(level == 0) ? (blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride
(level == 0) ? (rg_idx - cuda::std::min(s->chunk.rowgroup_id, rg_idx)) * rowidx_stride
: s->top.data.index.start_row;
;
s->chunk.streams[CI_DATA] += ofs0;
Expand Down Expand Up @@ -2025,7 +2031,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
}
if (t == 0 and (s->chunk.type_kind == LIST or s->chunk.type_kind == MAP)) {
if (num_rowgroups > 0) {
row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows;
auto const rg_idx = blockIdx.x % num_rowgroups;
auto const col_idx = blockIdx.x / num_rowgroups;
row_groups[rg_idx][col_idx].num_child_rows = s->num_child_rows;
}
cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{chunks[chunk_id].num_child_rows};
ref.fetch_add(s->num_child_rows, cuda::std::memory_order_relaxed);
Expand All @@ -2049,9 +2057,9 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
int64_t first_row,
rmm::cuda_stream_view stream)
{
dim3 dim_block(block_size, 1);
dim3 dim_grid(num_columns, num_stripes * 2); // 1024 threads per chunk
gpuDecodeNullsAndStringDictionaries<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
dim3 dim_grid(num_columns * num_stripes, 2);

gpuDecodeNullsAndStringDictionaries<block_size><<<dim_grid, block_size, 0, stream.value()>>>(
chunks, global_dictionary, num_columns, num_stripes, first_row);
}

Expand Down Expand Up @@ -2083,11 +2091,8 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks,
size_type* error_count,
rmm::cuda_stream_view stream)
{
auto const num_chunks = num_columns * num_stripes;
dim3 dim_block(block_size, 1); // 1024 threads per chunk
dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks,
(num_rowgroups > 0) ? num_rowgroups : 1);
gpuDecodeOrcColumnData<block_size><<<dim_grid, dim_block, 0, stream.value()>>>(
auto const num_blocks = num_columns * (num_rowgroups > 0 ? num_rowgroups : num_stripes);
gpuDecodeOrcColumnData<block_size><<<num_blocks, block_size, 0, stream.value()>>>(
chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count);
}

Expand Down
Loading

0 comments on commit dcc11c2

Please sign in to comment.