diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f4fd3dcfb77..60d8e96d932 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0 +- numba-cuda>=0.2.0,<0.3.0a0 +- numba>=0.59.1,<0.61.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index ec7ae3f0706..fe1a32ccb87 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,8 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0 +- numba-cuda>=0.2.0,<0.3.0a0 +- numba>=0.59.1,<0.61.0a0 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index aa1879fc7bd..5df7f97346a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -78,7 +78,8 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.2.0,<0.3.0 + - numba-cuda >=0.2.0,<0.3.0a0 + - numba >=0.59.1,<0.61.0a0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp index 3ef174e4206..3158445841e 100644 --- a/cpp/include/cudf/rolling.hpp +++ b/cpp/include/cudf/rolling.hpp @@ -44,6 +44,8 @@ namespace CUDF_EXPORT cudf { * - instead of storing NA/NaN for output rows that do not meet the minimum number of observations * this function updates the valid bitmask of the column to indicate which elements are valid. * + * @note Windows near the endpoints of the input are automatically clamped to be in-bounds. + * * Notes on return column types: * - The returned column for count aggregation always has `INT32` type. * - The returned column for VARIANCE/STD aggregations always has `FLOAT64` type. @@ -594,6 +596,11 @@ std::unique_ptr grouped_range_rolling_window( * column of the same type as the input. Therefore it is suggested to convert integer column types * (especially low-precision integers) to `FLOAT32` or `FLOAT64` before doing a rolling `MEAN`. * + * @note All entries in `preceding_window` and `following_window` must produce window extents that + * are in-bounds for the `input`. That is, for all `i`, it is required that the set of rows defined + * by the interval `[i - preceding_window[i] + 1, ..., i + following_window[i] + 1)` is a subset of + * `[0, input.size())`. 
+ * * @throws cudf::logic_error if window column type is not INT32 * * @param[in] input The input column diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 469f933f918..aeaa87e2202 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -35,10 +35,10 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan char_count device_span str_col_indexes) { // Index of the column in the `str_col_indexes` array - auto const str_col_idx = blockIdx.y; + auto const str_col_idx = blockIdx.x % str_col_indexes.size(); // Index of the column in the `orc_columns` array auto const col_idx = str_col_indexes[str_col_idx]; - auto const row_group_idx = blockIdx.x * blockDim.x + threadIdx.x; + auto const row_group_idx = (blockIdx.x / str_col_indexes.size()) * blockDim.x + threadIdx.x; if (row_group_idx >= rowgroup_bounds.size().first) { return; } auto const& str_col = orc_columns[col_idx]; @@ -63,18 +63,17 @@ void rowgroup_char_counts(device_2dspan counts, if (rowgroup_bounds.count() == 0) { return; } auto const num_rowgroups = rowgroup_bounds.size().first; - auto const num_str_cols = str_col_indexes.size(); - if (num_str_cols == 0) { return; } + if (str_col_indexes.empty()) { return; } int block_size = 0; // suggested thread count to use int min_grid_size = 0; // minimum block count required CUDF_CUDA_TRY( cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, rowgroup_char_counts_kernel)); - auto const grid_size = - dim3(cudf::util::div_rounding_up_unsafe(num_rowgroups, block_size), - static_cast(num_str_cols)); + auto const num_blocks = + cudf::util::div_rounding_up_unsafe(num_rowgroups, block_size) * + str_col_indexes.size(); - rowgroup_char_counts_kernel<<>>( + rowgroup_char_counts_kernel<<>>( counts, orc_columns, rowgroup_bounds, str_col_indexes); } @@ -104,8 +103,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) populate_dictionary_hash_maps_kernel(device_2dspan dictionaries, device_span columns) { - auto const col_idx = blockIdx.x; - auto const stripe_idx = blockIdx.y; + auto const col_idx = blockIdx.x / dictionaries.size().second; + auto const stripe_idx = blockIdx.x % dictionaries.size().second; auto const t = threadIdx.x; auto& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; @@ -166,8 +165,8 @@ template CUDF_KERNEL void __launch_bounds__(block_size) collect_map_entries_kernel(device_2dspan dictionaries) { - auto const col_idx = blockIdx.x; - auto const stripe_idx = blockIdx.y; + auto const col_idx = blockIdx.x / dictionaries.size().second; + auto const stripe_idx = blockIdx.x % dictionaries.size().second; auto const& dict = dictionaries[col_idx][stripe_idx]; if (not dict.is_enabled) { return; } @@ -200,8 +199,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) get_dictionary_indices_kernel(device_2dspan dictionaries, device_span columns) { - auto const col_idx = blockIdx.x; - auto const stripe_idx = blockIdx.y; + auto const col_idx = blockIdx.x / dictionaries.size().second; + auto const stripe_idx = blockIdx.x % dictionaries.size().second; auto const t = threadIdx.x; auto const& dict = dictionaries[col_idx][stripe_idx]; auto const& col = columns[dict.column_idx]; @@ -244,9 +243,8 @@ void populate_dictionary_hash_maps(device_2dspan dictionaries { if (dictionaries.count() == 0) { return; } constexpr int block_size = 256; - dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second); populate_dictionary_hash_maps_kernel - <<>>(dictionaries, columns); + <<>>(dictionaries, columns); } void 
collect_map_entries(device_2dspan dictionaries, @@ -254,8 +252,8 @@ void collect_map_entries(device_2dspan dictionaries, { if (dictionaries.count() == 0) { return; } constexpr int block_size = 1024; - dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second); - collect_map_entries_kernel<<>>(dictionaries); + collect_map_entries_kernel + <<>>(dictionaries); } void get_dictionary_indices(device_2dspan dictionaries, @@ -264,9 +262,8 @@ void get_dictionary_indices(device_2dspan dictionaries, { if (dictionaries.count() == 0) { return; } constexpr int block_size = 1024; - dim3 const dim_grid(dictionaries.size().first, dictionaries.size().second); get_dictionary_indices_kernel - <<>>(dictionaries, columns); + <<>>(dictionaries, columns); } } // namespace cudf::io::orc::gpu diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 2fce981e8a5..e01b93262d7 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "io/utilities/block_utils.cuh" #include "orc_gpu.hpp" +#include #include #include @@ -40,8 +40,9 @@ CUDF_KERNEL void __launch_bounds__(init_threads_per_block) device_2dspan rowgroup_bounds) { __shared__ __align__(4) statistics_group group_g[init_groups_per_block]; - auto const col_id = blockIdx.y; - auto const chunk_id = (blockIdx.x * init_groups_per_block) + threadIdx.y; + auto const col_id = blockIdx.x % rowgroup_bounds.size().second; + auto const chunk_id = + (blockIdx.x / rowgroup_bounds.size().second * init_groups_per_block) + threadIdx.y; auto const t = threadIdx.x; auto const num_rowgroups = rowgroup_bounds.size().first; statistics_group* group = &group_g[threadIdx.y]; @@ -452,10 +453,12 @@ void orc_init_statistics_groups(statistics_group* groups, device_2dspan rowgroup_bounds, rmm::cuda_stream_view stream) { - dim3 dim_grid((rowgroup_bounds.size().first + init_groups_per_block - 1) / init_groups_per_block, - rowgroup_bounds.size().second); + auto const num_blocks = + cudf::util::div_rounding_up_safe(rowgroup_bounds.size().first, init_groups_per_block) * + rowgroup_bounds.size().second; + dim3 dim_block(init_threads_per_group, init_groups_per_block); - gpu_init_statistics_groups<<>>( + gpu_init_statistics_groups<<>>( groups, cols, rowgroup_bounds); } @@ -490,8 +493,8 @@ void orc_encode_statistics(uint8_t* blob_bfr, uint32_t statistics_count, rmm::cuda_stream_view stream) { - unsigned int num_blocks = - (statistics_count + encode_chunks_per_block - 1) / encode_chunks_per_block; + auto const num_blocks = + cudf::util::div_rounding_up_safe(statistics_count, encode_chunks_per_block); dim3 dim_block(encode_threads_per_chunk, encode_chunks_per_block); gpu_encode_statistics<<>>( blob_bfr, groups, chunks, statistics_count); diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1f84d1f81dc..f560b806894 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1288,9 +1288,9 @@ CUDF_KERNEL void __launch_bounds__(block_size) } temp_storage; orcdec_state_s* const s = &state_g; - bool const is_nulldec = (blockIdx.y >= num_stripes); - uint32_t const column = blockIdx.x; - uint32_t const stripe = (is_nulldec) ? 
blockIdx.y - num_stripes : blockIdx.y;
+  // blockIdx.x spans all column/stripe pairs; blockIdx.y selects null decoding vs. string dictionary decoding
+  uint32_t const column = blockIdx.x / num_stripes;
+  uint32_t const stripe = blockIdx.x % num_stripes;
   uint32_t const chunk_id = stripe * num_columns + column;
   int t = threadIdx.x;
@@ -1298,6 +1298,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   __syncthreads();
   size_t const max_num_rows = s->chunk.column_num_rows - s->chunk.parent_validity_info.null_count;
+  bool const is_nulldec = (blockIdx.y == 0);
   if (is_nulldec) {
     uint32_t null_count = 0;
     // Decode NULLs
@@ -1576,7 +1577,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto num_rowgroups = row_groups.size().first;
   if (num_rowgroups > 0) {
-    if (t == 0) { s->top.data.index = row_groups[blockIdx.y][blockIdx.x]; }
+    if (t == 0) {
+      auto const rg_idx  = blockIdx.x % num_rowgroups;
+      auto const col_idx = blockIdx.x / num_rowgroups;
+      s->top.data.index  = row_groups[rg_idx][col_idx];
+    }
     __syncthreads();
     chunk_id = s->top.data.index.chunk_id;
   } else {
@@ -1601,10 +1606,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     if (s->top.data.index.strm_offset[1] > s->chunk.strm_len[CI_DATA2]) {
       atomicAdd(error_count, 1);
     }
-    auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
-    auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
+    auto const ofs0 = min(s->top.data.index.strm_offset[0], s->chunk.strm_len[CI_DATA]);
+    auto const ofs1 = min(s->top.data.index.strm_offset[1], s->chunk.strm_len[CI_DATA2]);
+    uint32_t const rg_idx = blockIdx.x % num_rowgroups;
     uint32_t rowgroup_rowofs =
-      (level == 0) ? (blockIdx.y - min(s->chunk.rowgroup_id, blockIdx.y)) * rowidx_stride
+      (level == 0) ? (rg_idx - cuda::std::min(s->chunk.rowgroup_id, rg_idx)) * rowidx_stride
                    : s->top.data.index.start_row;
     ;
     s->chunk.streams[CI_DATA] += ofs0;
@@ -2025,7 +2031,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   }
   if (t == 0 and (s->chunk.type_kind == LIST or s->chunk.type_kind == MAP)) {
     if (num_rowgroups > 0) {
-      row_groups[blockIdx.y][blockIdx.x].num_child_rows = s->num_child_rows;
+      auto const rg_idx  = blockIdx.x % num_rowgroups;
+      auto const col_idx = blockIdx.x / num_rowgroups;
+      row_groups[rg_idx][col_idx].num_child_rows = s->num_child_rows;
     }
     cuda::atomic_ref ref{chunks[chunk_id].num_child_rows};
     ref.fetch_add(s->num_child_rows, cuda::std::memory_order_relaxed);
@@ -2049,9 +2057,9 @@ void __host__ DecodeNullsAndStringDictionaries(ColumnDesc* chunks,
                                                int64_t first_row,
                                                rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(block_size, 1);
-  dim3 dim_grid(num_columns, num_stripes * 2);  // 1024 threads per chunk
-  gpuDecodeNullsAndStringDictionaries<<>>(
+  dim3 dim_grid(num_columns * num_stripes, 2);
+
+  gpuDecodeNullsAndStringDictionaries<<>>(
     chunks, global_dictionary, num_columns, num_stripes, first_row);
 }
@@ -2083,11 +2091,8 @@ void __host__ DecodeOrcColumnData(ColumnDesc* chunks,
                                   size_type* error_count,
                                   rmm::cuda_stream_view stream)
 {
-  auto const num_chunks = num_columns * num_stripes;
-  dim3 dim_block(block_size, 1);  // 1024 threads per chunk
-  dim3 dim_grid((num_rowgroups > 0) ? num_columns : num_chunks,
-                (num_rowgroups > 0) ? num_rowgroups : 1);
-  gpuDecodeOrcColumnData<<>>(
+  auto const num_blocks = num_columns * (num_rowgroups > 0 ?
num_rowgroups : num_stripes); + gpuDecodeOrcColumnData<<>>( chunks, global_dictionary, tz_table, row_groups, first_row, rowidx_stride, level, error_count); } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 4f296bb5bfc..bcdd059bf67 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -741,8 +741,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) } temp_storage; orcenc_state_s* const s = &state_g; - uint32_t col_id = blockIdx.x; - uint32_t group_id = blockIdx.y; + uint32_t col_id = blockIdx.x / chunks.size().second; + uint32_t group_id = blockIdx.x % chunks.size().second; int t = threadIdx.x; if (t == 0) { s->chunk = chunks[col_id][group_id]; @@ -1105,8 +1105,8 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) device_span dsts, device_span sizes) { - auto const stripe_id = cudf::detail::grid_1d::global_thread_id(); - auto const stream_id = blockIdx.y; + auto const stripe_id = (blockIdx.x / strm_desc.size().second) * blockDim.x + threadIdx.x; + auto const stream_id = blockIdx.x % strm_desc.size().second; if (stripe_id >= strm_desc.size().first) { return; } auto const out_id = stream_id * strm_desc.size().first + stripe_id; @@ -1147,7 +1147,7 @@ CUDF_KERNEL void __launch_bounds__(compact_streams_block_size) // blockDim {256,1,1} CUDF_KERNEL void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, - device_2dspan streams, // const? + device_2dspan streams, device_span> inputs, device_span> outputs, device_span results, @@ -1162,8 +1162,8 @@ CUDF_KERNEL void __launch_bounds__(256) auto const padded_block_header_size = util::round_up_unsafe(block_header_size, comp_block_align); auto const padded_comp_block_size = util::round_up_unsafe(max_comp_blk_size, comp_block_align); - auto const stripe_id = blockIdx.x; - auto const stream_id = blockIdx.y; + auto const stripe_id = blockIdx.x / strm_desc.size().second; + auto const stream_id = blockIdx.x % strm_desc.size().second; uint32_t t = threadIdx.x; uint32_t num_blocks; uint8_t *src, *dst; @@ -1213,8 +1213,8 @@ CUDF_KERNEL void __launch_bounds__(1024) __shared__ uint8_t const* comp_src_g; __shared__ uint32_t comp_len_g; - auto const stripe_id = blockIdx.x; - auto const stream_id = blockIdx.y; + auto const stripe_id = blockIdx.x / strm_desc.size().second; + auto const stream_id = blockIdx.x % strm_desc.size().second; uint32_t t = threadIdx.x; uint32_t num_blocks, b, blk_size; uint8_t const* src; @@ -1290,8 +1290,10 @@ CUDF_KERNEL void decimal_sizes_to_offsets_kernel(device_2dspan chunks, device_2dspan streams, rmm::cuda_stream_view stream) { - dim3 dim_block(encode_block_size, 1); // `encode_block_size` threads per chunk - dim3 dim_grid(chunks.size().first, chunks.size().second); + auto const num_blocks = chunks.size().first * chunks.size().second; gpuEncodeOrcColumnData - <<>>(chunks, streams); + <<>>(chunks, streams); } void EncodeStripeDictionaries(stripe_dictionary const* stripes, @@ -1324,10 +1325,10 @@ void EncodeStripeDictionaries(stripe_dictionary const* stripes, device_2dspan enc_streams, rmm::cuda_stream_view stream) { - dim3 dim_block(512, 1); // 512 threads per dictionary + constexpr int block_size = 512; // 512 threads per dictionary dim3 dim_grid(num_string_columns * num_stripes, 2); - gpuEncodeStringDictionaries<512> - <<>>(stripes, columns, chunks, enc_streams); + gpuEncodeStringDictionaries + <<>>(stripes, columns, chunks, enc_streams); } void CompactOrcDataStreams(device_2dspan strm_desc, @@ -1345,10 +1346,10 @@ void 
CompactOrcDataStreams(device_2dspan strm_desc, auto lengths = cudf::detail::make_zeroed_device_uvector_async( num_chunks, stream, rmm::mr::get_current_device_resource()); - dim3 dim_block(compact_streams_block_size, 1); - dim3 dim_grid(cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size), - strm_desc.size().second); - gpuInitBatchedMemcpy<<>>( + auto const num_blocks = + cudf::util::div_rounding_up_unsafe(num_stripes, compact_streams_block_size) * + strm_desc.size().second; + gpuInitBatchedMemcpy<<>>( strm_desc, enc_streams, srcs, dsts, lengths); // Copy streams in a batched manner. @@ -1372,22 +1373,20 @@ std::optional CompressOrcDataStreams( rmm::device_uvector> comp_in(num_compressed_blocks, stream); rmm::device_uvector> comp_out(num_compressed_blocks, stream); - dim3 dim_block_init(256, 1); - dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuInitCompressionBlocks<<>>(strm_desc, - enc_streams, - comp_in, - comp_out, - comp_res, - compressed_data, - comp_blk_size, - max_comp_blk_size, - comp_block_align); + size_t const num_blocks = strm_desc.size().first * strm_desc.size().second; + gpuInitCompressionBlocks<<>>(strm_desc, + enc_streams, + comp_in, + comp_out, + comp_res, + compressed_data, + comp_blk_size, + max_comp_blk_size, + comp_block_align); cudf::io::detail::compress(compression, comp_in, comp_out, comp_res, stream); - dim3 dim_block_compact(1024, 1); - gpuCompactCompressedBlocks<<>>( + gpuCompactCompressedBlocks<<>>( strm_desc, comp_in, comp_out, comp_res, compressed_data, comp_blk_size, max_comp_blk_size); if (collect_statistics) { @@ -1415,10 +1414,10 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, h_sizes, stream, cudf::get_current_device_resource_ref()); constexpr int block_size = 256; - dim3 const grid_size{static_cast(elem_sizes.size()), // num decimal columns - static_cast(rg_bounds.size().first)}; // num rowgroups + // number of rowgroups * number of decimal columns + auto const num_blocks = elem_sizes.size() * rg_bounds.size().first; decimal_sizes_to_offsets_kernel - <<>>(rg_bounds, d_sizes); + <<>>(rg_bounds, d_sizes); } } // namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 89dbbcb796c..0c739f59b0a 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -457,7 +457,9 @@ CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_g { __shared__ __align__(16) rowindex_state_s state_g; rowindex_state_s* const s = &state_g; - uint32_t chunk_id = blockIdx.y * num_columns + blockIdx.x; + auto const col_idx = blockIdx.x / num_stripes; + auto const stripe_idx = blockIdx.x % num_stripes; + uint32_t chunk_id = stripe_idx * num_columns + col_idx; int t = threadIdx.x; if (t == 0) { @@ -496,19 +498,18 @@ CUDF_KERNEL void __launch_bounds__(128, 8) gpuParseRowGroupIndex(RowGroup* row_g for (int i = t32; i < num_rowgroups; i += 32) { auto const num_rows = (use_base_stride) ? rowidx_stride - : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows; + : row_groups[(s->rowgroup_start + i) * num_columns + col_idx].num_rows; auto const start_row = - (use_base_stride) - ? 
i * rowidx_stride - : row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row; + (use_base_stride) ? i * rowidx_stride + : row_groups[(s->rowgroup_start + i) * num_columns + col_idx].start_row; for (int j = t4; j < rowgroup_size4; j += 4) { - ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x])[j] = + ((uint32_t*)&row_groups[(s->rowgroup_start + i) * num_columns + col_idx])[j] = ((uint32_t*)&s->rowgroups[i])[j]; } - row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_rows = num_rows; + row_groups[(s->rowgroup_start + i) * num_columns + col_idx].num_rows = num_rows; // Updating in case of struct - row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].num_child_rows = num_rows; - row_groups[(s->rowgroup_start + i) * num_columns + blockIdx.x].start_row = start_row; + row_groups[(s->rowgroup_start + i) * num_columns + col_idx].num_child_rows = num_rows; + row_groups[(s->rowgroup_start + i) * num_columns + col_idx].start_row = start_row; } __syncthreads(); if (t == 0) { s->rowgroup_start += num_rowgroups; } @@ -525,8 +526,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - auto const column_id = blockIdx.x; - auto const rowgroup_id = blockIdx.y; + auto const column_id = blockIdx.x / rowgroup_bounds.size().first; + auto const rowgroup_id = blockIdx.x % rowgroup_bounds.size().first; auto const column = orc_columns[column_id]; auto const t = threadIdx.x; @@ -563,9 +564,7 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, { auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block if (num_blocks > 0) { - dim3 dim_block(128, 1); - dim3 dim_grid(num_blocks, 1); - gpuParseCompressedStripeData<<>>( + gpuParseCompressedStripeData<<>>( strm_info, num_streams, compression_block_size, log2maxcr); } } @@ -576,10 +575,7 @@ void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, { auto const num_blocks = (num_streams + 3) >> 2; // 1 stream per warp, 4 warps per block if (num_blocks > 0) { - dim3 dim_block(128, 1); - dim3 dim_grid(num_blocks, 1); - gpuPostDecompressionReassemble<<>>(strm_info, - num_streams); + gpuPostDecompressionReassemble<<>>(strm_info, num_streams); } } @@ -592,9 +588,8 @@ void __host__ ParseRowGroupIndex(RowGroup* row_groups, bool use_base_stride, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid(num_columns, num_stripes); // 1 column chunk per block - gpuParseRowGroupIndex<<>>( + auto const num_blocks = num_columns * num_stripes; + gpuParseRowGroupIndex<<>>( row_groups, strm_info, chunks, num_columns, num_stripes, rowidx_stride, use_base_stride); } @@ -603,10 +598,10 @@ void __host__ reduce_pushdown_masks(device_span co device_2dspan valid_counts, rmm::cuda_stream_view stream) { - dim3 dim_block(128, 1); - dim3 dim_grid(columns.size(), rowgroups.size().first); // 1 rowgroup per block - gpu_reduce_pushdown_masks<128> - <<>>(columns, rowgroups, valid_counts); + auto const num_blocks = columns.size() * rowgroups.size().first; // 1 block per rowgroup + constexpr int block_size = 128; + gpu_reduce_pushdown_masks + <<>>(columns, rowgroups, valid_counts); } } // namespace gpu diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu index 69e4723eded..af524e1f70a 100644 --- a/cpp/src/io/parquet/bloom_filter_reader.cu +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -32,6 +32,7 @@ #include 
#include #include +#include #include #include @@ -66,9 +67,12 @@ struct bloom_filter_caster { ast::literal const* const literal, rmm::cuda_stream_view stream) const { - using key_type = T; - using policy_type = cuco::arrow_filter_policy; - using word_type = typename policy_type::word_type; + using key_type = T; + using policy_type = cuco::arrow_filter_policy; + using bloom_filter_type = cuco:: + bloom_filter_ref, cuco::thread_scope_thread, policy_type>; + using filter_block_type = typename bloom_filter_type::filter_block_type; + using word_type = typename policy_type::word_type; // Check if the literal has the same type as the predicate column CUDF_EXPECTS( @@ -104,16 +108,13 @@ struct bloom_filter_caster { auto const num_filter_blocks = filter_size / bytes_per_block; // Create a bloom filter view. - cuco::bloom_filter_ref, - cuco::thread_scope_thread, - policy_type> - filter{reinterpret_cast(filter_span[filter_idx].data()), - num_filter_blocks, - {}, // Thread scope as the same literal is being searched across different bitsets - // per thread - {}}; // Arrow policy with cudf::hashing::detail::XXHash_64 seeded with 0 for Arrow - // compatibility + bloom_filter_type filter{ + reinterpret_cast(filter_span[filter_idx].data()), + num_filter_blocks, + {}, // Thread scope as the same literal is being searched across different bitsets per + // thread + {}}; // Arrow policy with cudf::hashing::detail::XXHash_64 seeded with 0 for Arrow + // compatibility // If int96_timestamp type, convert literal to string_view and query bloom // filter @@ -381,6 +382,7 @@ class bloom_filter_expression_converter : public equality_literals_collector { * @param bloom_filter_sizes Bloom filter sizes for all chunks * @param chunk_source_map Association between each column chunk and its source * @param stream CUDA stream used for device memory operations and kernel launches + * @param aligned_mr Aligned device memory resource to allocate bloom filter buffers */ void read_bloom_filter_data(host_span const> sources, size_t num_chunks, @@ -388,8 +390,19 @@ void read_bloom_filter_data(host_span const> sources cudf::host_span> bloom_filter_offsets, cudf::host_span> bloom_filter_sizes, std::vector const& chunk_source_map, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref aligned_mr) { + // Using `cuco::arrow_filter_policy` with a temporary `cuda::std::byte` key type to extract bloom + // filter properties + using policy_type = cuco::arrow_filter_policy; + auto constexpr filter_block_alignment = + alignof(cuco::bloom_filter_ref, + cuco::thread_scope_thread, + policy_type>::filter_block_type); + auto constexpr words_per_block = policy_type::words_per_block; + // Read tasks for bloom filter data std::vector> read_tasks; @@ -422,12 +435,6 @@ void read_bloom_filter_data(host_span const> sources CompactProtocolReader cp{buffer->data(), buffer->size()}; cp.read(&header); - // Get the hardcoded words_per_block value from `cuco::arrow_filter_policy` using a temporary - // `std::byte` key type. - auto constexpr words_per_block = - cuco::arrow_filter_policy::words_per_block; - // Check if the bloom filter header is valid. auto const is_header_valid = (header.num_bytes % words_per_block) == 0 and @@ -448,15 +455,25 @@ void read_bloom_filter_data(host_span const> sources // Check if we already read in the filter bitset in the initial read. 
if (initial_read_size >= bloom_filter_header_size + bitset_size) { - bloom_filter_data[chunk] = - rmm::device_buffer{buffer->data() + bloom_filter_header_size, bitset_size, stream}; + bloom_filter_data[chunk] = rmm::device_buffer{ + buffer->data() + bloom_filter_header_size, bitset_size, stream, aligned_mr}; + // The allocated bloom filter buffer must be aligned + CUDF_EXPECTS(reinterpret_cast(bloom_filter_data[chunk].data()) % + filter_block_alignment == + 0, + "Encountered misaligned bloom filter block"); } // Read the bitset from datasource. else { auto const bitset_offset = bloom_filter_offset + bloom_filter_header_size; // Directly read to device if preferred if (source->is_device_read_preferred(bitset_size)) { - bloom_filter_data[chunk] = rmm::device_buffer{bitset_size, stream}; + bloom_filter_data[chunk] = rmm::device_buffer{bitset_size, stream, aligned_mr}; + // The allocated bloom filter buffer must be aligned + CUDF_EXPECTS(reinterpret_cast(bloom_filter_data[chunk].data()) % + filter_block_alignment == + 0, + "Encountered misaligned bloom filter block"); auto future_read_size = source->device_read_async(bitset_offset, bitset_size, @@ -465,8 +482,14 @@ void read_bloom_filter_data(host_span const> sources read_tasks.emplace_back(std::move(future_read_size)); } else { - buffer = source->host_read(bitset_offset, bitset_size); - bloom_filter_data[chunk] = rmm::device_buffer{buffer->data(), buffer->size(), stream}; + buffer = source->host_read(bitset_offset, bitset_size); + bloom_filter_data[chunk] = + rmm::device_buffer{buffer->data(), buffer->size(), stream, aligned_mr}; + // The allocated bloom filter buffer must be aligned + CUDF_EXPECTS(reinterpret_cast(bloom_filter_data[chunk].data()) % + filter_block_alignment == + 0, + "Encountered misaligned bloom filter block"); } } }); @@ -484,7 +507,8 @@ std::vector aggregate_reader_metadata::read_bloom_filters( host_span const> row_group_indices, host_span column_schemas, size_type total_row_groups, - rmm::cuda_stream_view stream) const + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref aligned_mr) const { // Descriptors for all the chunks that make up the selected columns auto const num_input_columns = column_schemas.size(); @@ -543,7 +567,8 @@ std::vector aggregate_reader_metadata::read_bloom_filters( bloom_filter_offsets, bloom_filter_sizes, chunk_source_map, - stream); + stream, + aligned_mr); // Return bloom filter data return bloom_filter_data; @@ -612,10 +637,22 @@ std::optional>> aggregate_reader_metadata::ap // Return early if no column with equality predicate(s) if (equality_col_schemas.empty()) { return std::nullopt; } + // Required alignment: + // https://github.com/NVIDIA/cuCollections/blob/deab5799f3e4226cb8a49acf2199c03b14941ee4/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh#L55-L67 + using policy_type = cuco::arrow_filter_policy; + auto constexpr alignment = alignof(cuco::bloom_filter_ref, + cuco::thread_scope_thread, + policy_type>::filter_block_type); + + // Aligned resource adaptor to allocate bloom filter buffers with + auto aligned_mr = + rmm::mr::aligned_resource_adaptor(cudf::get_current_device_resource(), alignment); + // Read a vector of bloom filter bitset device buffers for all columns with equality // predicate(s) across all row groups auto bloom_filter_data = read_bloom_filters( - sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream); + sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream, aligned_mr); // No bloom filter buffers, 
return the original row group indices if (bloom_filter_data.empty()) { return std::nullopt; } diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index a28ce616e2c..ba5e53e3104 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -204,6 +204,7 @@ class aggregate_reader_metadata { * @param[out] bloom_filter_data List of bloom filter data device buffers * @param column_schemas Schema indices of columns whose bloom filters will be read * @param stream CUDA stream used for device memory operations and kernel launches + * @param aligned_mr Aligned device memory resource to allocate bloom filter buffers * * @return A flattened list of bloom filter bitset device buffers for each predicate column across * row group @@ -213,7 +214,8 @@ class aggregate_reader_metadata { host_span const> row_group_indices, host_span column_schemas, size_type num_row_groups, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref aligned_mr) const; /** * @brief Collects Parquet types for the columns with the specified schema indices diff --git a/cpp/src/rolling/detail/lead_lag_nested.cuh b/cpp/src/rolling/detail/lead_lag_nested.cuh index 5d5fe9e4aa3..bce441cf323 100644 --- a/cpp/src/rolling/detail/lead_lag_nested.cuh +++ b/cpp/src/rolling/detail/lead_lag_nested.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -147,11 +147,7 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, gather_map.begin(), cuda::proclaim_return_type( [following, input_size, null_index, row_offset] __device__(size_type i) { - // Note: grouped_*rolling_window() trims preceding/following to - // the beginning/end of the group. `rolling_window()` does not. - // Must trim _following[i] so as not to go past the column end. - auto _following = min(following[i], input_size - i - 1); - return (row_offset > _following) ? null_index : (i + row_offset); + return (row_offset > following[i]) ? null_index : (i + row_offset); })); } else { thrust::transform(rmm::exec_policy(stream), @@ -160,11 +156,7 @@ std::unique_ptr compute_lead_lag_for_nested(aggregation::Kind op, gather_map.begin(), cuda::proclaim_return_type( [preceding, input_size, null_index, row_offset] __device__(size_type i) { - // Note: grouped_*rolling_window() trims preceding/following to - // the beginning/end of the group. `rolling_window()` does not. - // Must trim _preceding[i] so as not to go past the column start. - auto _preceding = min(preceding[i], i + 1); - return (row_offset > (_preceding - 1)) ? null_index : (i - row_offset); + return (row_offset > (preceding[i] - 1)) ? null_index : (i - row_offset); })); } diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh index bc0ee2eb519..07495eb02e2 100644 --- a/cpp/src/rolling/detail/rolling.cuh +++ b/cpp/src/rolling/detail/rolling.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1010,7 +1010,7 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation * @param[out] output_valid_count Output count of valid values * @param[in] device_operator The operator used to perform a single window operation * @param[in] preceding_window_begin Rolling window size iterator, accumulates from - * in_col[i-preceding_window] to in_col[i] inclusive + * in_col[i-preceding_window + 1] to in_col[i] inclusive * @param[in] following_window_begin Rolling window size iterator in the forward * direction, accumulates from in_col[i] to in_col[i+following_window] inclusive */ @@ -1034,28 +1034,27 @@ __launch_bounds__(block_size) CUDF_KERNEL size_type warp_valid_count{0}; - auto active_threads = __ballot_sync(0xffff'ffffu, i < input.size()); - while (i < input.size()) { - // to prevent overflow issues when computing bounds use int64_t - int64_t const preceding_window = preceding_window_begin[i]; - int64_t const following_window = following_window_begin[i]; + auto const num_rows = input.size(); + auto active_threads = __ballot_sync(0xffff'ffffu, i < num_rows); + while (i < num_rows) { + // The caller is required to provide window bounds that will + // result in indexing that is in-bounds for the column. Therefore all + // of these calculations cannot overflow and we can do everything + // in size_type arithmetic. Moreover, we require that start <= + // end, i.e., the window is never "reversed" though it may be empty. + auto const preceding_window = preceding_window_begin[i]; + auto const following_window = following_window_begin[i]; - // compute bounds - auto const start = static_cast( - min(static_cast(input.size()), max(int64_t{0}, i - preceding_window + 1))); - auto const end = static_cast( - min(static_cast(input.size()), max(int64_t{0}, i + following_window + 1))); - auto const start_index = min(start, end); - auto const end_index = max(start, end); + size_type const start = i - preceding_window + 1; + size_type const end = i + following_window + 1; // aggregate // TODO: We should explore using shared memory to avoid redundant loads. // This might require separating the kernel into a special version // for dynamic and static sizes. - volatile bool output_is_valid = false; - output_is_valid = device_operator.template operator()( - input, default_outputs, output, start_index, end_index, i); + bool const output_is_valid = device_operator.template operator()( + input, default_outputs, output, start, end, i); // set the mask cudf::bitmask_type const result_mask{__ballot_sync(active_threads, output_is_valid)}; @@ -1068,7 +1067,7 @@ __launch_bounds__(block_size) CUDF_KERNEL // process next element i += stride; - active_threads = __ballot_sync(active_threads, i < input.size()); + active_threads = __ballot_sync(active_threads, i < num_rows); } // sum the valid counts across the whole block diff --git a/cpp/src/rolling/detail/rolling_collect_list.cuh b/cpp/src/rolling/detail/rolling_collect_list.cuh index f3eff6b0689..7bb6ace6b02 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cuh +++ b/cpp/src/rolling/detail/rolling_collect_list.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -154,8 +154,8 @@ std::pair, std::unique_ptr> purge_null_entries( template std::unique_ptr rolling_collect_list(column_view const& input, column_view const& default_outputs, - PrecedingIter preceding_begin_raw, - FollowingIter following_begin_raw, + PrecedingIter preceding_begin, + FollowingIter following_begin, size_type min_periods, null_policy null_handling, rmm::cuda_stream_view stream, @@ -166,23 +166,6 @@ std::unique_ptr rolling_collect_list(column_view const& input, if (input.is_empty()) return empty_like(input); - // Fix up preceding/following iterators to respect column boundaries, - // similar to gpu_rolling(). - // `rolling_window()` does not fix up preceding/following so as not to read past - // column boundaries. - // `grouped_rolling_window()` and `time_range_based_grouped_rolling_window() do. - auto preceding_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type([preceding_begin_raw] __device__(auto i) { - return thrust::min(preceding_begin_raw[i], i + 1); - })); - auto following_begin = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - cuda::proclaim_return_type( - [following_begin_raw, size = input.size()] __device__(auto i) { - return thrust::min(following_begin_raw[i], size - i - 1); - })); - // Materialize collect list's offsets. auto offsets = create_collect_offsets(input.size(), preceding_begin, following_begin, min_periods, stream, mr); diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu index 0603f27852a..7526c858899 100644 --- a/cpp/src/rolling/detail/rolling_fixed_window.cu +++ b/cpp/src/rolling/detail/rolling_fixed_window.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,16 +15,15 @@ */ #include "rolling.cuh" - -#include +#include "rolling_utils.cuh" #include +#include #include #include #include #include -#include namespace cudf::detail { @@ -62,28 +61,14 @@ std::unique_ptr rolling_window(column_view const& input, stream, mr); } else { - // Clamp preceding/following to column boundaries. - // E.g. 
If preceding_window == 2, then for a column of 5 elements, preceding_window will be: - // [1, 2, 2, 2, 1] - - auto const preceding_calc = cuda::proclaim_return_type( - [preceding_window] __device__(size_type i) { return thrust::min(i + 1, preceding_window); }); - - auto const following_calc = cuda::proclaim_return_type( - [col_size = input.size(), following_window] __device__(size_type i) { - return thrust::min(col_size - i - 1, following_window); - }); - - auto const preceding_column = expand_to_column(preceding_calc, input.size(), stream); - auto const following_column = expand_to_column(following_calc, input.size(), stream); - return cudf::detail::rolling_window(input, - default_outputs, - preceding_column->view().begin(), - following_column->view().begin(), - min_periods, - agg, - stream, - mr); + namespace utils = cudf::detail::rolling; + auto groups = utils::ungrouped{input.size()}; + auto preceding = + utils::make_clamped_window_iterator(preceding_window, groups); + auto following = + utils::make_clamped_window_iterator(following_window, groups); + return cudf::detail::rolling_window( + input, default_outputs, preceding, following, min_periods, agg, stream, mr); } } } // namespace cudf::detail diff --git a/cpp/src/rolling/detail/rolling_utils.cuh b/cpp/src/rolling/detail/rolling_utils.cuh new file mode 100644 index 00000000000..ec6d83fd68a --- /dev/null +++ b/cpp/src/rolling/detail/rolling_utils.cuh @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +namespace CUDF_EXPORT cudf { + +namespace detail::rolling { + +/** + * @brief A group descriptor for an ungrouped rolling window. + * + * @param num_rows The number of rows to be rolled over. + * + * @note This is used for uniformity of interface between grouped and ungrouped iterator + * construction. + */ +struct ungrouped { + cudf::size_type num_rows; + + [[nodiscard]] __device__ constexpr cudf::size_type label(cudf::size_type) const noexcept + { + return 0; + } + [[nodiscard]] __device__ constexpr cudf::size_type start(cudf::size_type) const noexcept + { + return 0; + } + [[nodiscard]] __device__ constexpr cudf::size_type end(cudf::size_type) const noexcept + { + return num_rows; + } +}; + +/** + * @brief A group descriptor for a grouped rolling window. + * + * @param labels The group labels, mapping from input rows to group. + * @param offsets The group offsets providing the endpoints of each group. + * + * @note This is used for uniformity of interface between grouped and ungrouped iterator + * construction. + */ +struct grouped { + // Taking raw pointers here to avoid stealing two registers for the sizes which are never needed. 
+ cudf::size_type const* labels; + cudf::size_type const* offsets; + + [[nodiscard]] __device__ constexpr cudf::size_type label(cudf::size_type i) const noexcept + { + return labels[i]; + } + [[nodiscard]] __device__ constexpr cudf::size_type start(cudf::size_type label) const noexcept + { + return offsets[label]; + } + [[nodiscard]] __device__ constexpr cudf::size_type end(cudf::size_type label) const noexcept + { + return offsets[label + 1]; + } +}; + +enum class direction : bool { + PRECEDING, + FOLLOWING, +}; + +template +struct fixed_window_clamper { + Grouping groups; + cudf::size_type delta; + [[nodiscard]] __device__ constexpr cudf::size_type operator()(cudf::size_type i) const + { + auto const label = groups.label(i); + // i is contained in [start, end) + auto const start = groups.start(label); + auto const end = groups.end(label); + if constexpr (Direction == direction::PRECEDING) { + return cuda::std::min(i + 1 - start, cuda::std::max(delta, i + 1 - end)); + } else { + return cuda::std::max(start - i - 1, cuda::std::min(delta, end - i - 1)); + } + } +}; + +/** + * @brief Construct a clamped counting iterator for a row-based window offset + * + * @tparam Direction the direction of the window `PRECEDING` or `FOLLOWING`. + * @tparam Grouping the group specification. + * @param delta the window offset. + * @param grouper the grouping object. + * + * @return An iterator suitable for passing to `cudf::detail::rolling_window` + */ +template +[[nodiscard]] auto inline make_clamped_window_iterator(cudf::size_type delta, Grouping grouper) +{ + return cudf::detail::make_counting_transform_iterator( + cudf::size_type{0}, fixed_window_clamper{grouper, delta}); +} +} // namespace detail::rolling +} // namespace CUDF_EXPORT cudf diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu index d4851df740b..a345c0d0c28 100644 --- a/cpp/src/rolling/detail/rolling_variable_window.cu +++ b/cpp/src/rolling/detail/rolling_variable_window.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,24 +63,10 @@ std::unique_ptr rolling_window(column_view const& input, } else { auto defaults_col = cudf::is_dictionary(input.type()) ? dictionary_column_view(input).indices() : input; - // Clamp preceding/following to column boundaries. - // E.g. 
If preceding_window == [2, 2, 2, 2, 2] for a column of 5 elements, the new - // preceding_window will be: [1, 2, 2, 2, 1] - auto const preceding_window_begin = cudf::detail::make_counting_transform_iterator( - 0, - cuda::proclaim_return_type( - [preceding = preceding_window.begin()] __device__(size_type i) { - return thrust::min(i + 1, preceding[i]); - })); - auto const following_window_begin = cudf::detail::make_counting_transform_iterator( - 0, - cuda::proclaim_return_type( - [col_size = input.size(), following = following_window.begin()] __device__( - size_type i) { return thrust::min(col_size - i - 1, following[i]); })); return cudf::detail::rolling_window(input, empty_like(defaults_col)->view(), - preceding_window_begin, - following_window_begin, + preceding_window.begin(), + following_window.begin(), min_periods, agg, stream, diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu index 66d46a51577..18c793029b6 100644 --- a/cpp/src/rolling/grouped_rolling.cu +++ b/cpp/src/rolling/grouped_rolling.cu @@ -19,6 +19,7 @@ #include "detail/range_window_bounds.hpp" #include "detail/rolling.cuh" #include "detail/rolling_jit.hpp" +#include "detail/rolling_utils.cuh" #include #include @@ -43,109 +44,6 @@ namespace cudf { namespace detail { -/// Preceding window calculation functor. -template -struct row_based_preceding_calc { - cudf::size_type const* _group_offsets_begin; - cudf::size_type const* _group_labels_begin; - cudf::size_type const _preceding_window; - - row_based_preceding_calc(rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& preceding_window) - : _group_offsets_begin(group_offsets.data()), - _group_labels_begin(group_labels.data()), - _preceding_window(preceding_window) - { - } - - __device__ cudf::size_type operator()(cudf::size_type const& idx) const - { - auto group_label = _group_labels_begin[idx]; - if constexpr (preceding_less_than_1) { // where 1 indicates only the current row. - auto group_end = _group_offsets_begin[group_label + 1]; - return thrust::maximum{}(_preceding_window, -(group_end - 1 - idx)); - } else { - auto group_start = _group_offsets_begin[group_label]; - return thrust::minimum{}(_preceding_window, - idx - group_start + 1); // Preceding includes current row. - } - } -}; - -/// Helper to materialize preceding-window column, corrected to respect group boundaries. -/// E.g. If preceding window == 5, then, -/// 1. For the first row in the group, the preceding is set to 1, -/// 2. For the next row in the group, preceding is set to 2, etc. -std::unique_ptr make_preceding_column( - rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& preceding_window, - cudf::size_type const& num_rows, - rmm::cuda_stream_view stream) -{ - if (preceding_window < 1) { - auto const calc = row_based_preceding_calc(group_offsets, group_labels, preceding_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } else { - auto const calc = - row_based_preceding_calc(group_offsets, group_labels, preceding_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } -} - -/// Following window calculation functor. 
-template -struct row_based_following_calc { - cudf::size_type const* _group_offsets_begin; - cudf::size_type const* _group_labels_begin; - cudf::size_type const _following_window; - - row_based_following_calc(rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& following_window) - : _group_offsets_begin(group_offsets.data()), - _group_labels_begin(group_labels.data()), - _following_window(following_window) - { - } - - __device__ cudf::size_type operator()(cudf::size_type const& idx) const - { - auto group_label = _group_labels_begin[idx]; - if constexpr (following_less_than_0) { - auto group_start = _group_offsets_begin[group_label]; - return thrust::maximum{}(_following_window, -(idx - group_start) - 1); - } else { - auto group_end = - _group_offsets_begin[group_label + 1]; // Cannot fall off the end, since offsets - // is capped with `input.size()`. - return thrust::minimum{}(_following_window, (group_end - 1) - idx); - } - } -}; - -/// Helper to materialize following-window column, corrected to respect group boundaries. -/// i.e. If following window == 5, then: -/// 1. For the last row in the group, the following is set to 0. -/// 2. For the second last row in the group, following is set to 1, etc. -std::unique_ptr make_following_column( - rmm::device_uvector const& group_offsets, - rmm::device_uvector const& group_labels, - cudf::size_type const& following_window, - cudf::size_type const& num_rows, - rmm::cuda_stream_view stream) -{ - if (following_window < 0) { - auto const calc = row_based_following_calc(group_offsets, group_labels, following_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } else { - auto const calc = - row_based_following_calc(group_offsets, group_labels, following_window); - return cudf::detail::expand_to_column(calc, num_rows, stream); - } -} - std::unique_ptr grouped_rolling_window(table_view const& group_keys, column_view const& input, column_view const& default_outputs, @@ -208,10 +106,6 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys, // groups.) // 3. [0, 500, 1000] indicates two equal-sized groups: [0,500), and [500,1000). 
-    assert(group_offsets.size() >= 2 && group_offsets.element(0, stream) == 0 &&
-           group_offsets.element(group_offsets.size() - 1, stream) == input.size() &&
-           "Must have at least one group.");
-
   if (aggr.kind == aggregation::CUDA || aggr.kind == aggregation::PTX) {
     cudf::detail::preceding_window_wrapper grouped_preceding_window{
       group_offsets.data(), group_labels.data(), preceding_window};
@@ -229,18 +123,14 @@ std::unique_ptr grouped_rolling_window(table_view const& group_keys,
                                         stream,
                                         mr);
   } else {
-    auto const preceding_column =
-      make_preceding_column(group_offsets, group_labels, preceding_window, input.size(), stream);
-    auto const following_column =
-      make_following_column(group_offsets, group_labels, following_window, input.size(), stream);
-    return cudf::detail::rolling_window(input,
-                                        default_outputs,
-                                        preceding_column->view().begin(),
-                                        following_column->view().begin(),
-                                        min_periods,
-                                        aggr,
-                                        stream,
-                                        mr);
+    namespace utils = cudf::detail::rolling;
+    auto groups     = utils::grouped{group_labels.data(), group_offsets.data()};
+    auto preceding  =
+      utils::make_clamped_window_iterator(preceding_window, groups);
+    auto following  =
+      utils::make_clamped_window_iterator(following_window, groups);
+    return cudf::detail::rolling_window(
+      input, default_outputs, preceding, following, min_periods, aggr, stream, mr);
   }
 }
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 6c9c2aada9a..1ade2143494 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -2196,6 +2196,76 @@ TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite)
   EXPECT_EQ(out_buffer.size(), 0);
 }
+
+// Tests that reading works when the number of row groups exceeds the grid Y-dimension limit (65535)
+// Disabled because of the high execution time (especially compared to the likelihood of regression)
+TEST_F(OrcReaderTest, DISABLED_Over65kRowGroups)
+{
+  auto constexpr row_group_size = 512;
+  constexpr auto num_rows       = (1 << 16) * row_group_size + 1;
+
+  auto vals_col = random_values(num_rows);
+  dec64_col col{vals_col.begin(), vals_col.end(), numeric::scale_type{2}};
+  table_view chunk_table({col});
+
+  std::vector out_buffer;
+  cudf::io::orc_writer_options out_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, chunk_table)
+      .row_index_stride(row_group_size);
+
+  cudf::io::write_orc(out_opts);
+
+  cudf::io::orc_reader_options read_opts = cudf::io::orc_reader_options::builder(
+    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
+  auto result = cudf::io::read_orc(read_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(chunk_table, result.tbl->view());
+}
+
+// Tests that reading works when the number of stripes exceeds the grid Y-dimension limit (65535)
+// Disabled because of the high execution time (especially compared to the likelihood of regression)
+TEST_F(OrcReaderTest, DISABLED_Over65kStripes)
+{
+  auto constexpr stripe_size = 512;
+  constexpr auto num_rows    = (1 << 16) * stripe_size + 1;
+
+  auto vals_col = random_values(num_rows);
+  dec64_col col{vals_col.begin(), vals_col.end(), numeric::scale_type{2}};
+  table_view chunk_table({col});
+
+  std::vector out_buffer;
+  cudf::io::orc_writer_options out_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, chunk_table)
+      .stripe_size_rows(stripe_size)
+      .compression(cudf::io::compression_type::NONE);
+
+  cudf::io::write_orc(out_opts);
+
+  cudf::io::orc_reader_options read_opts = cudf::io::orc_reader_options::builder(
+    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
+  auto result = cudf::io::read_orc(read_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(chunk_table, result.tbl->view());
+}
+
+// Tests that writing works when the number of columns exceeds the grid Y-dimension limit (65535)
+// Disabled because of the high execution time (especially compared to the likelihood of regression)
+TEST_F(OrcWriterTest, DISABLED_Over65kColumns)
+{
+  auto vals_col = random_values(8);
+  dec64_col col{vals_col.begin(), vals_col.end(), numeric::scale_type{2}};
+  cudf::column_view col_view = col;
+  table_view expected(std::vector{(1 << 16) + 1, col_view});
+
+  std::vector out_buffer;
+  cudf::io::orc_writer_options out_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected)
+      .compression(cudf::io::compression_type::NONE);
+  cudf::io::write_orc(out_opts);
+
+  cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(
+    cudf::io::source_info{out_buffer.data(), out_buffer.size()});
+  auto result = cudf::io::read_orc(in_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
 TEST_F(OrcWriterTest, MultipleBlocksInStripeFooter)
 {
   std::vector vals_col(8, "a");
diff --git a/cpp/tests/rolling/rolling_test.cpp b/cpp/tests/rolling/rolling_test.cpp
index 72a511fd5f1..5b57ec46d19 100644
--- a/cpp/tests/rolling/rolling_test.cpp
+++ b/cpp/tests/rolling/rolling_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@
 #include
+#include
 #include
 #include
@@ -660,7 +661,7 @@ TEST_F(RollingErrorTest, WindowArraySizeMismatch)
   cudf::test::fixed_width_column_wrapper input(
     col_data.begin(), col_data.end(), col_valid.begin());
-  std::vector five({2, 1, 2, 1, 4});
+  std::vector five({1, 1, 2, 1, 0});
   std::vector four({1, 2, 3, 4});
   cudf::test::fixed_width_column_wrapper five_elements(five.begin(), five.end());
   cudf::test::fixed_width_column_wrapper four_elements(four.begin(), four.end());
@@ -985,22 +986,7 @@ TYPED_TEST(RollingTest, SimpleDynamic)
   cudf::test::fixed_width_column_wrapper input(
     col_data.begin(), col_data.end(), col_mask.begin());
   std::vector preceding_window({1, 2, 3, 4, 2});
-  std::vector following_window({2, 1, 2, 1, 2});
-
-  // dynamic sizes
-  this->run_test_col_agg(input, preceding_window, following_window, 1);
-}
-
-// this is a special test to check the volatile count variable issue (see rolling.cu for detail)
-TYPED_TEST(RollingTest, VolatileCount)
-{
-  auto const col_data = cudf::test::make_type_param_vector({8, 70, 45, 20, 59, 80});
-  const std::vector col_mask = {1, 1, 0, 0, 1, 0};
-
-  cudf::test::fixed_width_column_wrapper input(
-    col_data.begin(), col_data.end(), col_mask.begin());
-  std::vector preceding_window({5, 9, 4, 8, 3, 3});
-  std::vector following_window({1, 1, 9, 2, 8, 9});
+  std::vector following_window({2, 1, 2, 1, 0});
 
   // dynamic sizes
   this->run_test_col_agg(input, preceding_window, following_window, 1);
@@ -1128,14 +1114,19 @@ TYPED_TEST(RollingTest, RandomDynamicAllValid)
   // random parameters
   cudf::test::UniformRandomGenerator window_rng(0, max_window_size);
-  auto generator = [&]() { return window_rng.generate(); };
 
   std::vector preceding_window(num_rows);
   std::vector following_window(num_rows);
-  std::generate(preceding_window.begin(), preceding_window.end(), generator);
-  std::generate(following_window.begin(), following_window.end(), generator);
-
+  auto it = thrust::make_counting_iterator(0);
+  std::transform(it, it +
num_rows, preceding_window.begin(), [&window_rng, num_rows](auto i) { + auto p = window_rng.generate(); + return std::min(i + 1, std::max(p, i + 1 - num_rows)); + }); + std::transform(it, it + num_rows, following_window.begin(), [&window_rng, num_rows](auto i) { + auto f = window_rng.generate(); + return std::max(-i - 1, std::min(f, num_rows - i - 1)); + }); this->run_test_col_agg(input, preceding_window, following_window, max_window_size); } @@ -1157,14 +1148,19 @@ TYPED_TEST(RollingTest, RandomDynamicWithInvalid) // random parameters cudf::test::UniformRandomGenerator window_rng(0, max_window_size); - auto generator = [&]() { return window_rng.generate(); }; std::vector preceding_window(num_rows); std::vector following_window(num_rows); - std::generate(preceding_window.begin(), preceding_window.end(), generator); - std::generate(following_window.begin(), following_window.end(), generator); - + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, preceding_window.begin(), [&window_rng, num_rows](auto i) { + auto p = window_rng.generate(); + return std::min(i + 1, std::max(p, i + 1 - num_rows)); + }); + std::transform(it, it + num_rows, following_window.begin(), [&window_rng, num_rows](auto i) { + auto f = window_rng.generate(); + return std::max(-i - 1, std::min(f, num_rows - i - 1)); + }); this->run_test_col_agg(input, preceding_window, following_window, max_window_size); } diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp index 7d2467fd18d..c7d7b4aa5c4 100644 --- a/cpp/tests/streams/rolling_test.cpp +++ b/cpp/tests/streams/rolling_test.cpp @@ -54,8 +54,8 @@ TEST_F(RollingTest, FixedSizeDefault) TEST_F(RollingTest, VariableSize) { cudf::test::fixed_width_column_wrapper input({1, 2, 3, 4, 5, 6, 7, 8, 9}); - cudf::test::fixed_width_column_wrapper preceding({2, 2, 2, 2, 3, 3, 3, 3, 3}); - cudf::test::fixed_width_column_wrapper following({3, 3, 3, 3, 3, 2, 2, 2, 2}); + cudf::test::fixed_width_column_wrapper preceding({1, 2, 2, 2, 3, 3, 3, 3, 3}); + cudf::test::fixed_width_column_wrapper following({3, 3, 3, 3, 3, 2, 2, 1, 0}); cudf::rolling_window(input, preceding, diff --git a/dependencies.yaml b/dependencies.yaml index 25866e85a0b..edd83e6e07d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -438,12 +438,21 @@ dependencies: matrices: - matrix: cuda: "12.*" + use_cuda_wheels: "true" packages: - nvidia-nvcomp-cu12==4.1.0.6 - matrix: cuda: "11.*" + use_cuda_wheels: "true" packages: - nvidia-nvcomp-cu11==4.1.0.6 + # if use_cuda_wheels=false is provided, do not add dependencies on any CUDA wheels + # (e.g. 
for DLFW and pip devcontainers) + - matrix: + use_cuda_wheels: "false" + packages: + # if no matching matrix selectors passed, list the unsuffixed packages + # (just as a source of documentation, as this populates pyproject.toml in source control) - matrix: packages: - nvidia-nvcomp==4.1.0.6 @@ -708,7 +717,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0 + - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0a0 + - &numba-dep numba>=0.59.1,<0.61.0a0 - nvtx>=0.2.1 - packaging - rich @@ -835,6 +845,7 @@ dependencies: - matrix: {dependencies: "latest"} packages: - *numba-cuda-dep + - *numba-dep - pandas==2.2.3 - matrix: packages: @@ -917,6 +928,7 @@ dependencies: packages: - dask-cuda==25.2.*,>=0.0.0a0 - *numba-cuda-dep + - *numba-dep specific: - output_types: [conda, requirements] matrices: diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index c07a58ed8a5..42fa871abeb 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -396,7 +396,16 @@ private HostUDFAggregation(HostUDFWrapper wrapper) { @Override long createNativeInstance() { - return Aggregation.createHostUDFAgg(wrapper.udfNativeHandle); + long udf = 0; + try { + udf = wrapper.createUDFInstance(); + return Aggregation.createHostUDFAgg(udf); + } finally { + // a new UDF is cloned in `createHostUDFAgg`, here should close the UDF instance. + if (udf != 0) { + HostUDFWrapper.closeUDFInstance(udf); + } + } } @Override diff --git a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java index 7251ce643d4..9686cf17b60 100644 --- a/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/HostMemoryBuffer.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -367,6 +367,23 @@ public final int getInt(long offset) { return UnsafeMemoryAccessor.getInt(requestedAddress); } + /** + * Copy a set of ints to an array from the buffer starting at offset. + * @param dst destination int array + * @param dstIndex starting index within the destination array + * @param srcOffset starting offset within this buffer + * @param count number of ints to copy + */ + public final void getInts(int[] dst, long dstIndex, long srcOffset, int count) { + assert count >= 0; + assert count <= dst.length - dstIndex; + assert srcOffset >= 0; + long requestedAddress = this.address + srcOffset; + addressOutOfBoundsCheck(requestedAddress, count * 4L, "getInts"); + UnsafeMemoryAccessor.getInts(dst, dstIndex, requestedAddress, count); + } + + /** * Sets the Integer value at that offset * @param offset - offset from the address diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java index 124f2c99188..0bd3ccad0d3 100644 --- a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java +++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
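The hunk below reworks `HostUDFWrapper` from an `AutoCloseable` holder of a native handle into an abstract factory: implementations now override `createUDFInstance`, `computeHashCode`, and `isEqual`, and native instances are released through the static `closeUDFInstance`. As a rough sketch of what a subclass of the new contract could look like (illustration only; the class name and the JNI factory `createMyNativeUDF` are invented for this example and are not part of the patch):

// Hypothetical subclass of the reworked HostUDFWrapper (sketch, not part of this change).
public class MyHostUDF extends HostUDFWrapper {
  @Override
  public long createUDFInstance() {
    // Return a fresh native host UDF handle; the caller is expected to release it with
    // HostUDFWrapper.closeUDFInstance(handle) once the aggregation has cloned it.
    return createMyNativeUDF(); // assumed JNI factory, not provided by cudf
  }

  @Override
  public int computeHashCode() {
    // Stateless UDF: hash on the class identity.
    return MyHostUDF.class.hashCode();
  }

  @Override
  public boolean isEqual(Object obj) {
    return obj instanceof MyHostUDF;
  }

  private static native long createMyNativeUDF();
}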
@@ -19,18 +19,56 @@
 /**
  * A wrapper around native host UDF aggregations.
  * <p>
- * This class is used to store the native handle of a host UDF aggregation and is used as
+ * This class is used to create the native handle of a host UDF aggregation and is used as
  * a proxy object to compute hash code and compare two host UDF aggregations for equality.
  * <p>
  * A new host UDF aggregation implementation must extend this class and override the
- * {@code hashCode} and {@code equals} methods for such purposes.
- * In addition, since this class implements {@code AutoCloseable}, the {@code close} method must
- * also be overridden to automatically delete the native UDF instance upon class destruction.
+ * {@code computeHashCode} and {@code isEqual} methods for such purposes.
+ *
  */
-public abstract class HostUDFWrapper implements AutoCloseable {
-  public final long udfNativeHandle;
+public abstract class HostUDFWrapper {
+
+  /**
+   * Create a derived host UDF native instance.
+   * The instance created by this function MUST be closed by `closeUDFInstance`
+   * <p>Typical usage, refer to Aggregation.java:</p>
+   * <pre>
+   * long udf = 0;
+   * try {
+   *     udf = wrapper.createUDFInstance();
+   *     return Aggregation.createHostUDFAgg(udf);
+   * } finally {
+   *     // a new UDF is cloned in `createHostUDFAgg`, so the UDF instance should be closed here.
+   *     if (udf != 0) {
+   *         HostUDFWrapper.closeUDFInstance(udf);
+   *     }
+   * }
+   * </pre>
+ * + */ + public abstract long createUDFInstance(); + + /** + * Close the derived UDF instance created by `createUDFInstance`. + * @param hostUDFInstance the UDF instance + */ + public static void closeUDFInstance(long hostUDFInstance) { + close(hostUDFInstance); + } + + public abstract int computeHashCode(); - public HostUDFWrapper(long udfNativeHandle) { - this.udfNativeHandle = udfNativeHandle; + @Override + public int hashCode() { + return computeHashCode(); } + + public abstract boolean isEqual(Object obj); + + @Override + public boolean equals(Object obj) { + return isEqual(obj); + } + + static native void close(long hostUDFInstance); } diff --git a/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java b/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java index ebc3f43ccf4..fcb3f5e40fc 100644 --- a/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java +++ b/java/src/main/java/ai/rapids/cudf/UnsafeMemoryAccessor.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -154,6 +154,19 @@ public static int getInt(long address) { return UNSAFE.getInt(address); } + /** + * Copy out an array of ints. + * @param dst where to write the data + * @param dstIndex index into values to start writing at. + * @param address src memory address + * @param count the number of ints to copy + * @throws IndexOutOfBoundsException + */ + public static void getInts(int[] dst, long dstIndex, long address, int count) { + copyMemory(null, address, + dst, UnsafeMemoryAccessor.INT_ARRAY_OFFSET + (dstIndex * 4), count * 4); + } + /** * Sets the Integer value at that address * @param address - memory address diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index bd1714aa476..3923d8b45e3 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -148,6 +148,7 @@ add_library( src/DataSourceHelperJni.cpp src/HashJoinJni.cpp src/HostMemoryBufferNativeUtilsJni.cpp + src/HostUDFWrapperJni.cpp src/NvcompJni.cpp src/NvtxRangeJni.cpp src/NvtxUniqueRangeJni.cpp diff --git a/java/src/main/native/src/HostUDFWrapperJni.cpp b/java/src/main/native/src/HostUDFWrapperJni.cpp new file mode 100644 index 00000000000..1dd1b483fac --- /dev/null +++ b/java/src/main/native/src/HostUDFWrapperJni.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "cudf_jni_apis.hpp" + +#include + +extern "C" { + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_HostUDFWrapper_close(JNIEnv* env, + jclass class_object, + jlong ptr) +{ + try { + auto to_del = reinterpret_cast(ptr); + delete to_del; + } + CATCH_STD(env, ); +} + +} // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d1a1ff2c95c..326b4251335 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -2983,7 +2983,7 @@ void testWindowStaticCounts() { @Test void testWindowDynamicNegative() { - try (ColumnVector precedingCol = ColumnVector.fromInts(3, 3, 3, 4, 4); + try (ColumnVector precedingCol = ColumnVector.fromInts(1, 2, 3, 4, 4); ColumnVector followingCol = ColumnVector.fromInts(-1, -1, -1, -1, 0)) { try (WindowOptions window = WindowOptions.builder() .minPeriods(2).window(precedingCol, followingCol).build()) { @@ -3014,7 +3014,7 @@ void testWindowLag() { @Test void testWindowDynamic() { try (ColumnVector precedingCol = ColumnVector.fromInts(1, 2, 3, 1, 2); - ColumnVector followingCol = ColumnVector.fromInts(2, 2, 2, 2, 2)) { + ColumnVector followingCol = ColumnVector.fromInts(2, 2, 2, 1, 0)) { try (WindowOptions window = WindowOptions.builder().minPeriods(2) .window(precedingCol, followingCol).build()) { try (ColumnVector v1 = ColumnVector.fromInts(5, 4, 7, 6, 8); diff --git a/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java b/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java index b7fde511c38..e0cfeff97e1 100644 --- a/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java +++ b/java/src/test/java/ai/rapids/cudf/HostMemoryBufferTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,6 +87,20 @@ public void testGetInt() { } } + @Test + public void testGetInts() { + try (HostMemoryBuffer hostMemoryBuffer = HostMemoryBuffer.allocate(16)) { + hostMemoryBuffer.setInt(0, 1); + hostMemoryBuffer.setInt(4, 2); + hostMemoryBuffer.setInt(8, 3); + hostMemoryBuffer.setInt(12, 4); + int[] expectedInts = new int[] {1, 2, 3, 4}; + int[] result = new int[expectedInts.length]; + hostMemoryBuffer.getInts(result, 0, 0, 4); + assertArrayEquals(expectedInts, result); + } + } + @Test public void testGetByte() { try (HostMemoryBuffer hostMemoryBuffer = HostMemoryBuffer.allocate(16)) { diff --git a/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java b/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java index a3684cb42b9..645d0789ef8 100644 --- a/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java +++ b/java/src/test/java/ai/rapids/cudf/UnsafeMemoryAccessorTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -66,6 +66,24 @@ public void setIntAndGetInt() { } } + @Test + public void setAndGetInts() { + int numInts = 289; + long address = UnsafeMemoryAccessor.allocate(numInts * 4); + try { + for (int i = 0; i < numInts; i++) { + UnsafeMemoryAccessor.setInt(address + i * 4, i); + } + int[] ints = new int[numInts]; + UnsafeMemoryAccessor.getInts(ints, 0, address, numInts); + for (int i = 0; i < numInts; i++) { + assertEquals(i, ints[i]); + } + } finally { + UnsafeMemoryAccessor.free(address); + } + } + @Test public void setMemoryValue() { long address = UnsafeMemoryAccessor.allocate(4); diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4c6f8a9c152..50e6b42129e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3659,6 +3659,9 @@ def sort_values( by_columns = by_in_index else: raise KeyError(by) + + if cudf.get_option("mode.pandas_compatible"): + by_columns = by_columns.nans_to_nulls() # argsort the `by` column out = self._gather( GatherMap.from_column_unchecked( diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py index 7e5ce713c7e..556a3826eb7 100644 --- a/python/cudf/cudf/tests/test_sorting.py +++ b/python/cudf/cudf/tests/test_sorting.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import string from itertools import product @@ -7,7 +7,7 @@ import pandas as pd import pytest -from cudf import DataFrame, Series +from cudf import DataFrame, Series, option_context from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION from cudf.core.column import NumericalColumn from cudf.testing import assert_eq @@ -46,6 +46,14 @@ def test_dataframe_sort_values(nelem, dtype): assert_eq(sorted_df["b"].values, bb[sorted_index]) +def test_sort_values_nans_pandas_compat(): + data = {"a": [0, 0, 2, -1], "b": [1, 3, 2, None]} + with option_context("mode.pandas_compatible", True): + result = DataFrame(data).sort_values("b", na_position="first") + expected = pd.DataFrame(data).sort_values("b", na_position="first") + assert_eq(result, expected) + + @pytest.mark.parametrize("ignore_index", [True, False]) @pytest.mark.parametrize("index", ["a", "b", ["a", "b"]]) def test_dataframe_sort_values_ignore_index(index, ignore_index): diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index c6a5887f85d..2b03f515657 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,7 +24,8 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0", + "numba-cuda>=0.2.0,<0.3.0a0", + "numba>=0.59.1,<0.61.0a0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 15079f7f06a..3725722a8ae 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -47,7 +47,8 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.2.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0", + "numba-cuda>=0.2.0,<0.3.0a0", + "numba>=0.59.1,<0.61.0a0", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index ac85298bc40..9fe930d754c 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -75,7 +75,7 @@ regex = "(?P.*)" [tool.rapids-build-backend] build-backend = "scikit_build_core.build" dependencies-file = "../../dependencies.yaml" -matrix-entry = "cuda_suffixed=true" +matrix-entry = "cuda_suffixed=true;use_cuda_wheels=true" requires = [ "cmake>=3.26.4,!=3.30.0", "libkvikio==25.2.*,>=0.0.0a0",
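As a closing illustration of the window-bounds changes exercised by the updated rolling tests above (rolling_test.cpp, streams/rolling_test.cpp, and ColumnVectorTest.java), the sketch below clamps per-row preceding/following sizes so that no window reaches outside the column, mirroring the clamping formulas used in the updated C++ tests. The class name and the fixed requested sizes are invented for the example.

import ai.rapids.cudf.ColumnVector;

// Sketch: build in-bounds preceding/following window columns for a column of numRows rows.
public class ClampedWindowsExample {
  // Requested preceding size p for row i, clamped so the window start stays inside the column.
  static int clampPreceding(int p, int i, int numRows) {
    return Math.min(i + 1, Math.max(p, i + 1 - numRows));
  }

  // Requested following size f for row i, clamped so the window end stays inside the column.
  static int clampFollowing(int f, int i, int numRows) {
    return Math.max(-i - 1, Math.min(f, numRows - i - 1));
  }

  public static void main(String[] args) {
    int numRows = 5;
    int[] preceding = new int[numRows];
    int[] following = new int[numRows];
    for (int i = 0; i < numRows; i++) {
      preceding[i] = clampPreceding(3, i, numRows); // request up to 3 rows ending at row i
      following[i] = clampFollowing(2, i, numRows); // request up to 2 rows after row i
    }
    try (ColumnVector precedingCol = ColumnVector.fromInts(preceding);
         ColumnVector followingCol = ColumnVector.fromInts(following)) {
      // precedingCol/followingCol can now be passed to WindowOptions.builder().window(...).
    }
  }
}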