Compute and use the initial string offset when building nested large string cols with chunked parquet reader #17702

Open: mhaseeb123 wants to merge 35 commits into branch-25.02 from fix/str_offset-nested-large-str-cols.

Changes shown are from 22 of the 35 commits.

Commits:
1ce570c  Compute and use str_offset for large nested string cols. (mhaseeb123, Jan 9, 2025)
e73432d  Clean up, add docstrings (mhaseeb123, Jan 10, 2025)
a146cd6  Fix copyright year (mhaseeb123, Jan 10, 2025)
9f4ede3  Fix comment (mhaseeb123, Jan 10, 2025)
edaff09  Revert comment (mhaseeb123, Jan 10, 2025)
2a279de  Merge branch 'branch-25.02' into fix/str_offset-nested-large-str-cols (mhaseeb123, Jan 10, 2025)
0d2317a  Minor optimization. Sync stream only for `str_offsets` vector (mhaseeb123, Jan 10, 2025)
9622620  Remove leftover cout (mhaseeb123, Jan 14, 2025)
38652c0  Remove code duplication with utility function (mhaseeb123, Jan 14, 2025)
a57ccab  fix copyright header (mhaseeb123, Jan 14, 2025)
3cffe1c  Remove explicit inline and simplify branch (mhaseeb123, Jan 14, 2025)
29c1754  Refactor offset computing to avoid ambiguous use of util function. (mhaseeb123, Jan 14, 2025)
28835c3  Change initial_offset type to int64 and subtract from last_elem (mhaseeb123, Jan 15, 2025)
46ba4ab  Reuse code with a util function (mhaseeb123, Jan 15, 2025)
ebea0cd  Merge branch 'branch-25.02' into fix/str_offset-nested-large-str-cols (mhaseeb123, Jan 15, 2025)
73ced83  Minor optimization. Make const ptr to const page_state (mhaseeb123, Jan 15, 2025)
b0acb4c  Merge branch 'branch-25.02' into fix/str_offset-nested-large-str-cols (mhaseeb123, Jan 15, 2025)
52cd41a  Revert subtraction of initial offset (mhaseeb123, Jan 15, 2025)
dde3285  Merge branch 'branch-25.02' into fix/str_offset-nested-large-str-cols (mhaseeb123, Jan 15, 2025)
694ec53  Merge conflicts (mhaseeb123, Jan 15, 2025)
5012bc7  Revert changes (mhaseeb123, Jan 15, 2025)
094a02b  Merge conflicts with branch25.02 (mhaseeb123, Jan 15, 2025)
fbe52d6  Trivial refactoring. Separate offset functions for small and large st… (mhaseeb123, Jan 16, 2025)
399d8b0  Revert `unsetenv` in merge_tests (mhaseeb123, Jan 16, 2025)
b7564d8  Revert copyrights year (mhaseeb123, Jan 16, 2025)
c9ed1fc  Fix copyrights year (mhaseeb123, Jan 16, 2025)
b5ff95f  Merge branch 'branch-25.02' into fix/str_offset-nested-large-str-cols (mhaseeb123, Jan 17, 2025)
16b9843  Address review comments (mhaseeb123, Jan 22, 2025)
e838e72  Merge branch 'fix/str_offset-nested-large-str-cols' of https://github… (mhaseeb123, Jan 22, 2025)
1a34807  Remove unnecessary stream sync for `h_initial_str_offsets` (mhaseeb123, Jan 22, 2025)
7990f7c  Merge branch 'branch-25.02' into fix/str_offset-nested-large-str-cols (mhaseeb123, Jan 22, 2025)
d8231ca  Address review comments (mhaseeb123, Jan 24, 2025)
c668fc6  Merge branch 'branch-25.02' into fix/str_offset-nested-large-str-cols (mhaseeb123, Jan 24, 2025)
218782d  Fill host vector with std instead of thrust (mhaseeb123, Jan 24, 2025)
3cfbe24  Fix the sanity check for offsets (mhaseeb123, Jan 24, 2025)
10 changes: 7 additions & 3 deletions cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -255,12 +255,14 @@ static sizes_to_offsets_iterator<ScanIterator, LastType> make_sizes_to_offsets_i
* @param begin Input iterator for scan
* @param end End of the input iterator
* @param result Output iterator for scan result
+ * @param initial_offset Initial offset to add to scan
* @return The last element of the scan
*/
template <typename SizesIterator, typename OffsetsIterator>
auto sizes_to_offsets(SizesIterator begin,
SizesIterator end,
OffsetsIterator result,
+                      int64_t initial_offset,
rmm::cuda_stream_view stream)
{
using SizeType = typename thrust::iterator_traits<SizesIterator>::value_type;
@@ -273,7 +275,8 @@ auto sizes_to_offsets(SizesIterator begin,
make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data());
// This function uses the type of the initialization parameter as the accumulator type
// when computing the individual scan output elements.
- thrust::exclusive_scan(rmm::exec_policy(stream), begin, end, output_itr, LastType{0});
+ thrust::exclusive_scan(
+   rmm::exec_policy(stream), begin, end, output_itr, static_cast<LastType>(initial_offset));
return last_element.value(stream);
}

@@ -319,7 +322,8 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
});
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
// Use the sizes-to-offsets iterator to compute the total number of elements
- auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
+ auto const total_elements =
+   sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, 0, stream);
CUDF_EXPECTS(
total_elements <= static_cast<decltype(total_elements)>(std::numeric_limits<size_type>::max()),
"Size of output exceeds the column size limit",
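Note: to make the new `initial_offset` parameter concrete, here is a minimal standalone sketch (illustrative only, not part of this PR). Seeding the exclusive scan with a non-zero value yields offsets that continue from bytes already written by an earlier decode pass, which is what the chunked reader needs when it resumes a large string column mid-column.

```cpp
// Standalone illustration of an exclusive scan seeded with an initial offset,
// mirroring what sizes_to_offsets now does. All names are local to this sketch.
#include <thrust/device_vector.h>
#include <thrust/scan.h>

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int64_t> const h_sizes{3, 5, 2, 4};  // per-row string lengths
  thrust::device_vector<int64_t> sizes(h_sizes.begin(), h_sizes.end());
  thrust::device_vector<int64_t> offsets(sizes.size());

  // Bytes already emitted for this column by an earlier pass.
  int64_t const initial_offset = 100;

  // Exclusive scan seeded with initial_offset: offsets become {100, 103, 108, 110}.
  // With the previously hard-coded seed of 0 they would be {0, 3, 8, 10}.
  thrust::exclusive_scan(sizes.begin(), sizes.end(), offsets.begin(), initial_offset);

  for (int64_t const off : offsets) { std::printf("%lld\n", static_cast<long long>(off)); }
  return 0;
}
```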
7 changes: 4 additions & 3 deletions cpp/include/cudf/strings/detail/strings_children.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -140,7 +140,7 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
// Use the sizes-to-offsets iterator to compute the total number of elements
auto const total_bytes =
-   cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
+   cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, 0, stream);

auto const threshold = cudf::strings::get_offset64_threshold();
CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold),
@@ -151,7 +151,8 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
offsets_column = make_numeric_column(
data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
-   cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
+   cudf::detail::sizes_to_offsets(
+     input_itr, input_itr + strings_count + 1, d_offsets64, 0, stream);
}

return std::pair(std::move(offsets_column), total_bytes);
16 changes: 9 additions & 7 deletions cpp/src/io/parquet/decode_fixed.cu
@@ -942,6 +942,7 @@ constexpr bool is_split_decode()
* @param chunks List of column chunks
* @param min_row Row index to start reading at
* @param num_rows Maximum number of rows to read
+ * @param initial_str_offsets A vector to store the initial str_offset for large strings
* @param error_code Error code to set if an error is encountered
*/
template <typename level_t, int decode_block_size_t, decode_kernel_mask kernel_mask_t>
@@ -950,6 +951,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
device_span<ColumnChunkDesc const> chunks,
size_t min_row,
size_t num_rows,
+                        size_t* initial_str_offsets,
kernel_error::pointer error_code)
{
constexpr bool has_dict_t = has_dict<kernel_mask_t>();
@@ -1161,12 +1163,11 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
valid_count = next_valid_count;
}

- // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
- // latter case, offsets will be computed during string column creation.
+ // Convert string sizes to offsets if this is not a large string column. Otherwise, update the
+ // initial string buffer offset to be used during large string column construction
  if constexpr (has_strings_t) {
-   if (!s->col.is_large_string_col) {
-     convert_small_string_lengths_to_offsets<decode_block_size_t, has_lists_t>(s);
-   }
+   convert_string_lengths_to_offsets<decode_block_size_t>(
+     s, initial_str_offsets, pages[page_idx].chunk_idx, has_lists_t);
}
if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
}
@@ -1185,6 +1186,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
size_t min_row,
int level_type_size,
decode_kernel_mask kernel_mask,
+                             size_t* initial_str_offsets,
kernel_error::pointer error_code,
rmm::cuda_stream_view stream)
{
@@ -1199,11 +1201,11 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
if (level_type_size == 1) {
gpuDecodePageDataGeneric<uint8_t, decode_block_size, mask>
<<<dim_grid, dim_block, 0, stream.value()>>>(
-         pages.device_ptr(), chunks, min_row, num_rows, error_code);
+         pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
} else {
gpuDecodePageDataGeneric<uint16_t, decode_block_size, mask>
<<<dim_grid, dim_block, 0, stream.value()>>>(
-         pages.device_ptr(), chunks, min_row, num_rows, error_code);
+         pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
}
};

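Note: the decode kernels above only receive a raw device pointer with one slot per column chunk. What follows is a hedged sketch of the host-side setup this implies; `make_initial_str_offsets` is an assumed helper name for illustration, not code from this PR. Each slot is pre-filled with the maximum sentinel so that the device-side `fetch_min` (see page_string_utils.cuh below) can record the smallest page `str_offset` observed for each chunk.

```cpp
// Hypothetical host-side allocation of the per-chunk initial string offsets.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/fill.h>

#include <cuda/std/limits>

#include <cstddef>

rmm::device_uvector<std::size_t> make_initial_str_offsets(std::size_t num_chunks,
                                                          rmm::cuda_stream_view stream)
{
  rmm::device_uvector<std::size_t> offsets(num_chunks, stream);
  // Sentinel: any real str_offset is smaller, so the first fetch_min always wins.
  thrust::fill(rmm::exec_policy(stream),
               offsets.begin(),
               offsets.end(),
               cuda::std::numeric_limits<std::size_t>::max());
  return offsets;
}
```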
46 changes: 17 additions & 29 deletions cpp/src/io/parquet/page_delta_decode.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -435,6 +435,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
device_span<ColumnChunkDesc const> chunks,
size_t min_row,
size_t num_rows,
+                        size_t* initial_str_offsets,
kernel_error::pointer error_code)
{
using cudf::detail::warp_size;
@@ -579,18 +580,10 @@
__syncthreads();
}

- // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
- // latter case, offsets will be computed during string column creation.
- if (not s->col.is_large_string_col) {
-   int value_count = nesting_info_base[leaf_level_index].value_count;
-
-   // if no repetition we haven't calculated start/end bounds and instead just skipped
-   // values until we reach first_row. account for that here.
-   if (!has_repetition) { value_count -= s->first_row; }
-
-   auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-   block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
- }
+ // Convert string sizes to offsets if this is not a large string column. Otherwise, update the
+ // initial string buffer offset to be used during large string column construction
+ convert_string_lengths_to_offsets<decode_block_size>(
+   s, initial_str_offsets, pages[page_idx].chunk_idx, has_repetition);

if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
}
@@ -603,6 +596,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
device_span<ColumnChunkDesc const> chunks,
size_t min_row,
size_t num_rows,
+                        size_t* initial_str_offsets,
kernel_error::pointer error_code)
{
using cudf::detail::warp_size;
@@ -741,18 +735,10 @@
__syncthreads();
}

- // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
- // latter case, offsets will be computed during string column creation.
- if (not s->col.is_large_string_col) {
-   int value_count = nesting_info_base[leaf_level_index].value_count;
-
-   // if no repetition we haven't calculated start/end bounds and instead just skipped
-   // values until we reach first_row. account for that here.
-   if (!has_repetition) { value_count -= s->first_row; }
-
-   auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-   block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
- }
+ // Convert string sizes to offsets if this is not a large string column. Otherwise, update the
+ // initial string buffer offset to be used during large string column construction
+ convert_string_lengths_to_offsets<decode_block_size>(
+   s, initial_str_offsets, pages[page_idx].chunk_idx, has_repetition);

// finally, copy the string data into place
auto const dst = nesting_info_base[leaf_level_index].string_out;
@@ -797,6 +783,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
size_t num_rows,
size_t min_row,
int level_type_size,
+                          size_t* initial_str_offsets,
kernel_error::pointer error_code,
rmm::cuda_stream_view stream)
{
@@ -807,10 +794,10 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,

if (level_type_size == 1) {
gpuDecodeDeltaByteArray<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-     pages.device_ptr(), chunks, min_row, num_rows, error_code);
+     pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
} else {
gpuDecodeDeltaByteArray<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-     pages.device_ptr(), chunks, min_row, num_rows, error_code);
+     pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
}
}

@@ -822,6 +809,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
size_t num_rows,
size_t min_row,
int level_type_size,
+                                size_t* initial_str_offsets,
kernel_error::pointer error_code,
rmm::cuda_stream_view stream)
{
@@ -832,10 +820,10 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,

if (level_type_size == 1) {
gpuDecodeDeltaLengthByteArray<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-     pages.device_ptr(), chunks, min_row, num_rows, error_code);
+     pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
} else {
gpuDecodeDeltaLengthByteArray<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-     pages.device_ptr(), chunks, min_row, num_rows, error_code);
+     pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
}
}

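Note: the removed `block_excl_sum` calls above (now folded into `convert_string_lengths_to_offsets`) perform an in-place, block-wide exclusive scan over the array of string lengths, seeded with the page's starting byte offset. Below is a minimal sketch of that technique using CUB; it illustrates the idiom and is not cudf's actual `block_excl_sum` implementation.

```cuda
// Sketch: convert per-row string lengths to offsets in place with a block-wide
// exclusive scan seeded by an initial byte offset. Illustrative only.
#include <cub/block/block_scan.cuh>

template <int block_size>
__device__ void lengths_to_offsets(int* arr, int length, int initial_value)
{
  using BlockScan = cub::BlockScan<int, block_size>;
  __shared__ typename BlockScan::TempStorage temp;

  int running_total = initial_value;
  for (int start = 0; start < length; start += block_size) {
    int const idx = start + static_cast<int>(threadIdx.x);
    int const len = (idx < length) ? arr[idx] : 0;
    int offset, block_total;
    BlockScan(temp).ExclusiveSum(len, offset, block_total);
    if (idx < length) { arr[idx] = running_total + offset; }
    running_total += block_total;
    __syncthreads();  // temp storage is reused on the next iteration
  }
}
```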
37 changes: 29 additions & 8 deletions cpp/src/io/parquet/page_string_utils.cuh
@@ -20,6 +20,8 @@

#include <cudf/strings/detail/gather.cuh>

+#include <cuda/atomic>

namespace cudf::io::parquet::detail {

// stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp.
@@ -98,21 +100,40 @@ __device__ inline void block_excl_sum(size_type* arr, size_type length, size_typ
}
}

Review comment from mhaseeb123 (Member, Author): has_lists can't be a tparam anymore as it is not known at compile time when called from page_delta_decode.cu. Also, we are only using it minimally at L120

- template <int block_size, bool has_lists>
- __device__ inline void convert_small_string_lengths_to_offsets(page_state_s* s)
+ /**
+  * @brief Converts string sizes to offsets if this is not a large string column. Otherwise,
+  * atomically update the initial string offset to be used during large string column construction
+  */
+ template <int block_size>
+ __device__ void convert_string_lengths_to_offsets(page_state_s const* const state,
+                                                   size_t* initial_str_offsets,
+                                                   int32_t chunk_idx,
+                                                   bool has_lists)
{
- // If this is a large string column. In the
- // latter case, offsets will be computed during string column creation.
- auto& ni = s->nesting_info[s->col.max_nesting_depth - 1];
+ auto& ni = state->nesting_info[state->col.max_nesting_depth - 1];
int value_count = ni.value_count;

// if no repetition we haven't calculated start/end bounds and instead just skipped
// values until we reach first_row. account for that here.
- if constexpr (!has_lists) { value_count -= s->first_row; }
+ if (not has_lists) { value_count -= state->first_row; }

- auto const offptr = reinterpret_cast<size_type*>(ni.data_out);
- auto const initial_value = s->page.str_offset;
- block_excl_sum<block_size>(offptr, value_count, initial_value);
+ auto const initial_value = state->page.str_offset;

+ if (value_count > 0) {
+   // Convert the array of lengths into offsets if this not a large string column.
+   if (not state->col.is_large_string_col) {
+     auto const offptr = reinterpret_cast<size_type*>(ni.data_out);
+     block_excl_sum<block_size>(offptr, value_count, initial_value);
+   }  // Atomically update the initial string offset if this is a large string column. This initial
+      // offset will be used to compute (64-bit) offsets during large string column construction.
+   else if (!threadIdx.x) {
+     cuda::atomic_ref<size_t, cuda::std::thread_scope_device> initial_str_offsets_ref{
+       initial_str_offsets[chunk_idx]};
+     initial_str_offsets_ref.fetch_min(initial_value, cuda::std::memory_order_relaxed);
+   }
+ }
}

template <int block_size>
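Note: `fetch_min` is used above because the pages of a chunk are decoded by independent thread blocks in no particular order, and the chunk's initial offset must be the minimum `str_offset` over all of its decoded pages. A small standalone program demonstrating that the result is order-independent (illustrative names, not cudf code):

```cuda
// Each block stands in for one decoded page of a single chunk; the slot ends
// up holding the smallest page offset no matter which block runs first.
#include <cuda/atomic>

#include <cstdio>

__global__ void record_page_offsets(size_t* chunk_slot, size_t const* page_offsets)
{
  if (threadIdx.x == 0) {  // one representative thread per page/block
    cuda::atomic_ref<size_t, cuda::std::thread_scope_device> slot{*chunk_slot};
    slot.fetch_min(page_offsets[blockIdx.x], cuda::std::memory_order_relaxed);
  }
}

int main()
{
  size_t h_pages[] = {4096, 0, 1024};  // str_offset of each page in the chunk
  size_t h_slot    = ~size_t{0};       // sentinel: numeric max
  size_t *d_pages, *d_slot;
  cudaMalloc(&d_pages, sizeof(h_pages));
  cudaMalloc(&d_slot, sizeof(h_slot));
  cudaMemcpy(d_pages, h_pages, sizeof(h_pages), cudaMemcpyHostToDevice);
  cudaMemcpy(d_slot, &h_slot, sizeof(h_slot), cudaMemcpyHostToDevice);

  record_page_offsets<<<3, 32>>>(d_slot, d_pages);

  cudaMemcpy(&h_slot, d_slot, sizeof(h_slot), cudaMemcpyDeviceToHost);
  std::printf("initial str offset for chunk = %zu\n", h_slot);  // prints 0
  cudaFree(d_pages);
  cudaFree(d_slot);
  return 0;
}
```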
6 changes: 6 additions & 0 deletions cpp/src/io/parquet/parquet_gpu.hpp
@@ -870,6 +870,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
size_t num_rows,
size_t min_row,
int level_type_size,
+                          size_t* initial_str_offsets,
kernel_error::pointer error_code,
rmm::cuda_stream_view stream);

@@ -884,6 +885,8 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
* @param[in] num_rows Total number of rows to read
* @param[in] min_row Minimum number of rows to read
* @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[out] initial_str_offsets A vector to store the initial str_offset for large nested
+ * string cols
* @param[out] error_code Error code for kernel failures
* @param[in] stream CUDA stream to use
*/
@@ -892,6 +895,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
size_t num_rows,
size_t min_row,
int level_type_size,
+                                size_t* initial_str_offsets,
kernel_error::pointer error_code,
rmm::cuda_stream_view stream);

@@ -907,6 +911,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
* @param[in] min_row Minimum number of rows to read
* @param[in] level_type_size Size in bytes of the type for level decoding
* @param[in] kernel_mask Mask indicating the type of decoding kernel to launch.
+ * @param[out] initial_str_offsets A vector to store the initial str_offset for large strings
* @param[out] error_code Error code for kernel failures
* @param[in] stream CUDA stream to use
*/
@@ -916,6 +921,7 @@ void DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
size_t min_row,
int level_type_size,
decode_kernel_mask kernel_mask,
+                     size_t* initial_str_offsets,
kernel_error::pointer error_code,
rmm::cuda_stream_view stream);

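Note: per the commit log ("Minor optimization. Sync stream only for `str_offsets` vector", "Remove unnecessary stream sync for `h_initial_str_offsets`"), the caller copies the per-chunk minima back to the host before seeding the INT64 offsets scan. A hedged sketch of that retrieval using only plain CUDA runtime calls follows; the actual reader_impl wiring is not part of this diff, so the function name and signature here are assumptions.

```cpp
// Assumed caller-side retrieval of the per-chunk initial string offsets.
#include <cuda_runtime.h>

#include <cstddef>
#include <vector>

std::vector<std::size_t> fetch_initial_str_offsets(std::size_t const* d_initial_str_offsets,
                                                   std::size_t num_chunks,
                                                   cudaStream_t stream)
{
  std::vector<std::size_t> host(num_chunks);
  cudaMemcpyAsync(host.data(),
                  d_initial_str_offsets,
                  num_chunks * sizeof(std::size_t),
                  cudaMemcpyDeviceToHost,
                  stream);
  // Synchronize only for this vector, as the commit notes describe.
  cudaStreamSynchronize(stream);
  return host;
}
```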