diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
index 32bd945d57c..83e6c35216a 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -121,6 +121,10 @@ void BM_parquet_read_long_strings(nvbench::state& state)
     cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile);  // THIS
   auto const view = tbl->view();
 
+  // set smaller threshold to reduce file size and execution time
+  auto const threshold = 1;
+  setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1);
+
   cudf::io::parquet_writer_options write_opts =
     cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
       .compression(compression);
@@ -129,6 +133,7 @@ void BM_parquet_read_long_strings(nvbench::state& state)
   }();
 
   parquet_read_common(num_rows_written, num_cols, source_sink, state);
+  unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
 }
 
 template
diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
index 88ec0c07dc5..358170f76db 100644
--- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
+++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -255,12 +255,14 @@ static sizes_to_offsets_iterator make_sizes_to_offsets_iterator
  * @param begin Input iterator for scan
  * @param end End of the input iterator
  * @param result Output iterator for scan result
+ * @param initial_offset Initial offset to add to scan
  * @return The last element of the scan
  */
 template <typename SizesIterator, typename OffsetsIterator>
 auto sizes_to_offsets(SizesIterator begin,
                       SizesIterator end,
                       OffsetsIterator result,
+                      int64_t initial_offset,
                       rmm::cuda_stream_view stream)
 {
   using SizeType = typename thrust::iterator_traits<SizesIterator>::value_type;
@@ -273,7 +275,8 @@ auto sizes_to_offsets(SizesIterator begin,
     make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data());
   // This function uses the type of the initialization parameter as the accumulator type
   // when computing the individual scan output elements.
-  thrust::exclusive_scan(rmm::exec_policy(stream), begin, end, output_itr, LastType{0});
+  thrust::exclusive_scan(
+    rmm::exec_policy_nosync(stream), begin, end, output_itr, static_cast<LastType>(initial_offset));
   return last_element.value(stream);
 }
@@ -319,7 +322,8 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
   });
   auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
   // Use the sizes-to-offsets iterator to compute the total number of elements
-  auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream);
+  auto const total_elements =
+    sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, 0, stream);
   CUDF_EXPECTS(
     total_elements <= static_cast(std::numeric_limits::max()),
     "Size of output exceeds the column size limit",
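Editor's note (not part of the patch): the new `initial_offset` parameter only seeds the exclusive scan that turns per-row sizes into offsets, so a later chunk's offsets can continue from bytes already written by earlier chunks. A minimal host-side sketch of the same idea using `std::exclusive_scan`; the sizes, the trailing zero "end" entry, and the starting value are invented for the example.

```cpp
#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
  // Per-row string sizes, plus a trailing 0 so the scan also emits the end offset
  // (the "sizes + 1" convention used by sizes_to_offsets above).
  std::vector<int64_t> sizes{3, 1, 4, 1, 5, 0};

  // Bytes already accounted for by previously decoded chunks of the same column.
  int64_t const initial_offset = 100;

  // Exclusive scan seeded with the initial offset: each output element is the starting
  // byte of the corresponding row, continuing from the prior chunks.
  std::vector<int64_t> offsets(sizes.size());
  std::exclusive_scan(sizes.begin(), sizes.end(), offsets.begin(), initial_offset);

  for (auto o : offsets) { std::cout << o << ' '; }  // prints: 100 103 104 108 109 114
  std::cout << '\n';
  return 0;
}
```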
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index de2f1770e28..f8f7f12fc3f 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -140,7 +140,7 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
   auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
   // Use the sizes-to-offsets iterator to compute the total number of elements
   auto const total_bytes =
-    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
+    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, 0, stream);
 
   auto const threshold = cudf::strings::get_offset64_threshold();
   CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold),
@@ -151,7 +151,8 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
     offsets_column = make_numeric_column(
       data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
     auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
-    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
+    cudf::detail::sizes_to_offsets(
+      input_itr, input_itr + strings_count + 1, d_offsets64, 0, stream);
   }
 
   return std::pair(std::move(offsets_column), total_bytes);
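Editor's note (not part of the patch): `make_offsets_child_column` keeps 32-bit offsets while the total character bytes stay under `get_offset64_threshold()` (the threshold the benchmark and test changes in this patch override via the `LIBCUDF_LARGE_STRINGS_THRESHOLD` environment variable) and builds an INT64 offsets child otherwise. A hypothetical, minimal chooser that mirrors that decision; the names and default value are assumptions, not libcudf API.

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Hypothetical names for illustration only.
enum class offset_kind { INT32, INT64 };

// Below the threshold a strings column keeps 32-bit offsets; at or above it the
// offsets child is built as INT64, as in make_offsets_child_column() above.
offset_kind choose_offset_kind(int64_t total_bytes, int64_t threshold)
{
  return total_bytes < threshold ? offset_kind::INT32 : offset_kind::INT64;
}

int main()
{
  // Assume the default threshold is the 32-bit size limit; libcudf's actual value is
  // configurable through LIBCUDF_LARGE_STRINGS_THRESHOLD.
  int64_t const threshold = std::numeric_limits<int32_t>::max();
  std::cout << (choose_offset_kind(3'000'000'000, threshold) == offset_kind::INT64) << '\n';  // 1
  return 0;
}
```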
diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index f63a4fb79b9..84f751dea6b 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -942,6 +942,7 @@ constexpr bool is_split_decode()
  * @param chunks List of column chunks
  * @param min_row Row index to start reading at
  * @param num_rows Maximum number of rows to read
+ * @param initial_str_offsets Vector to store the initial offsets for large nested string cols
  * @param error_code Error code to set if an error is encountered
  */
 template
@@ -950,6 +951,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
     device_span<ColumnChunkDesc const> chunks,
     size_t min_row,
     size_t num_rows,
+    cudf::device_span<size_t> initial_str_offsets,
     kernel_error::pointer error_code)
 {
   constexpr bool has_dict_t = has_dict();
@@ -1161,11 +1163,14 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
     valid_count = next_valid_count;
   }
 
-  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
-  // latter case, offsets will be computed during string column creation.
   if constexpr (has_strings_t) {
-    if (!s->col.is_large_string_col) {
-      convert_small_string_lengths_to_offsets(s);
+    // For large strings, update the initial string buffer offset to be used during large string
+    // column construction. Otherwise, convert string sizes to final offsets.
+    if (s->col.is_large_string_col) {
+      compute_initial_large_strings_offset(
+        s, initial_str_offsets[pages[page_idx].chunk_idx], has_lists_t);
+    } else {
+      convert_small_string_lengths_to_offsets(s, has_lists_t);
     }
   }
   if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
@@ -1185,6 +1190,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
                              size_t min_row,
                              int level_type_size,
                              decode_kernel_mask kernel_mask,
+                             cudf::device_span<size_t> initial_str_offsets,
                              kernel_error::pointer error_code,
                              rmm::cuda_stream_view stream)
 {
@@ -1199,11 +1205,11 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
     if (level_type_size == 1) {
       gpuDecodePageDataGeneric
         <<>>(
-          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+          pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
     } else {
       gpuDecodePageDataGeneric
         <<>>(
-          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+          pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
     }
   };
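Editor's note (not part of the patch): for large string columns the kernel above no longer writes final offsets; each page only records where its strings begin inside the chunk's character buffer, and the smallest such value per column chunk becomes the chunk's initial offset. A self-contained CUDA sketch of that `fetch_min` pattern, assuming one thread per "page"; all names and data here are illustrative, not libcudf's.

```cpp
#include <cuda/atomic>
#include <cuda_runtime.h>
#include <cstdio>
#include <limits>

__global__ void record_initial_offsets(size_t const* page_str_offsets,
                                       int const* page_chunk_idx,
                                       int num_pages,
                                       size_t* initial_chunk_offsets)
{
  int const p = blockIdx.x * blockDim.x + threadIdx.x;
  if (p >= num_pages) { return; }
  // Take the atomic minimum over all pages that belong to the same chunk.
  cuda::atomic_ref<size_t, cuda::thread_scope_device> ref{
    initial_chunk_offsets[page_chunk_idx[p]]};
  ref.fetch_min(page_str_offsets[p], cuda::std::memory_order_relaxed);
}

int main()
{
  // Three pages, all in chunk 0, whose strings start at bytes 4096, 0, and 2048.
  size_t const h_offsets[] = {4096, 0, 2048};
  int const h_chunk[]      = {0, 0, 0};
  // Slot pre-filled with a sentinel (numeric max) so the first fetch_min always wins.
  size_t h_initial[1] = {std::numeric_limits<size_t>::max()};

  size_t *d_offsets, *d_initial;
  int* d_chunk;
  cudaMalloc(&d_offsets, sizeof(h_offsets));
  cudaMalloc(&d_chunk, sizeof(h_chunk));
  cudaMalloc(&d_initial, sizeof(h_initial));
  cudaMemcpy(d_offsets, h_offsets, sizeof(h_offsets), cudaMemcpyHostToDevice);
  cudaMemcpy(d_chunk, h_chunk, sizeof(h_chunk), cudaMemcpyHostToDevice);
  cudaMemcpy(d_initial, h_initial, sizeof(h_initial), cudaMemcpyHostToDevice);

  record_initial_offsets<<<1, 32>>>(d_offsets, d_chunk, 3, d_initial);

  cudaMemcpy(h_initial, d_initial, sizeof(h_initial), cudaMemcpyDeviceToHost);
  printf("initial offset for chunk 0: %zu\n", h_initial[0]);  // prints 0
  cudaFree(d_offsets);
  cudaFree(d_chunk);
  cudaFree(d_initial);
  return 0;
}
```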
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index 0c9d4e77f0c..4c98a08006c 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -435,6 +435,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     device_span<ColumnChunkDesc const> chunks,
     size_t min_row,
     size_t num_rows,
+    cudf::device_span<size_t> initial_str_offsets,
     kernel_error::pointer error_code)
 {
   using cudf::detail::warp_size;
@@ -579,17 +580,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
-  // latter case, offsets will be computed during string column creation.
-  if (not s->col.is_large_string_col) {
-    int value_count = nesting_info_base[leaf_level_index].value_count;
-
-    // if no repetition we haven't calculated start/end bounds and instead just skipped
-    // values until we reach first_row. account for that here.
-    if (!has_repetition) { value_count -= s->first_row; }
-
-    auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out);
-    block_excl_sum(offptr, value_count, s->page.str_offset);
+  // For large strings, update the initial string buffer offset to be used during large string
+  // column construction. Otherwise, convert string sizes to final offsets.
+  if (s->col.is_large_string_col) {
+    compute_initial_large_strings_offset(
+      s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition);
+  } else {
+    convert_small_string_lengths_to_offsets(s, has_repetition);
   }
 
   if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
@@ -603,6 +600,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     device_span<ColumnChunkDesc const> chunks,
     size_t min_row,
     size_t num_rows,
+    cudf::device_span<size_t> initial_str_offsets,
     kernel_error::pointer error_code)
 {
   using cudf::detail::warp_size;
@@ -741,17 +739,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
-  // latter case, offsets will be computed during string column creation.
-  if (not s->col.is_large_string_col) {
-    int value_count = nesting_info_base[leaf_level_index].value_count;
-
-    // if no repetition we haven't calculated start/end bounds and instead just skipped
-    // values until we reach first_row. account for that here.
-    if (!has_repetition) { value_count -= s->first_row; }
-
-    auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out);
-    block_excl_sum(offptr, value_count, s->page.str_offset);
+  // For large strings, update the initial string buffer offset to be used during large string
+  // column construction. Otherwise, convert string sizes to final offsets.
+  if (s->col.is_large_string_col) {
+    compute_initial_large_strings_offset(
+      s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition);
+  } else {
+    convert_small_string_lengths_to_offsets(s, has_repetition);
   }
 
   // finally, copy the string data into place
@@ -797,6 +791,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
                           size_t num_rows,
                           size_t min_row,
                           int level_type_size,
+                          cudf::device_span<size_t> initial_str_offsets,
                           kernel_error::pointer error_code,
                           rmm::cuda_stream_view stream)
 {
@@ -807,10 +802,10 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
   if (level_type_size == 1) {
     gpuDecodeDeltaByteArray<<>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+      pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
   } else {
     gpuDecodeDeltaByteArray<<>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+      pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
   }
 }
 
@@ -822,6 +817,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
                                 size_t num_rows,
                                 size_t min_row,
                                 int level_type_size,
+                                cudf::device_span<size_t> initial_str_offsets,
                                 kernel_error::pointer error_code,
                                 rmm::cuda_stream_view stream)
 {
@@ -832,10 +828,10 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
   if (level_type_size == 1) {
     gpuDecodeDeltaLengthByteArray<<>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+      pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
   } else {
     gpuDecodeDeltaLengthByteArray<<>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+      pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code);
  }
 }
diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh
index dc4140d0a44..ba627e73625 100644
--- a/cpp/src/io/parquet/page_string_utils.cuh
+++ b/cpp/src/io/parquet/page_string_utils.cuh
@@ -20,6 +20,8 @@
 #include
 
+#include <cuda/atomic>
+
 namespace cudf::io::parquet::detail {
 
 // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp.
@@ -98,21 +100,54 @@ __device__ inline void block_excl_sum(size_type* arr, size_type length, size_typ
   }
 }
 
-template
-__device__ inline void convert_small_string_lengths_to_offsets(page_state_s* s)
+/**
+ * @brief Converts string sizes in the page output buffer to offsets; only used for strings
+ * columns with 32-bit offsets. Large string columns instead compute their final offsets during
+ * string column construction.
+ */
+template
+__device__ void convert_small_string_lengths_to_offsets(page_state_s const* const state,
+                                                        bool has_lists)
 {
   // If this is a large string column. In the
   // latter case, offsets will be computed during string column creation.
-  auto& ni = s->nesting_info[s->col.max_nesting_depth - 1];
+  auto& ni = state->nesting_info[state->col.max_nesting_depth - 1];
   int value_count = ni.value_count;
 
   // if no repetition we haven't calculated start/end bounds and instead just skipped
   // values until we reach first_row. account for that here.
-  if constexpr (!has_lists) { value_count -= s->first_row; }
+  if (not has_lists) { value_count -= state->first_row; }
+
+  // Convert the array of lengths into offsets
+  if (value_count > 0) {
+    auto const offptr = reinterpret_cast(ni.data_out);
+    auto const initial_value = state->page.str_offset;
+    block_excl_sum(offptr, value_count, initial_value);
+  }
+}
 
-  auto const offptr = reinterpret_cast(ni.data_out);
-  auto const initial_value = s->page.str_offset;
-  block_excl_sum(offptr, value_count, initial_value);
+/**
+ * @brief Atomically update the initial string offset to be used during large string column
+ * construction
+ */
+inline __device__ void compute_initial_large_strings_offset(page_state_s const* const state,
+                                                            size_t& initial_str_offset,
+                                                            bool has_lists)
+{
+  // Values decoded by this page.
+  int value_count = state->nesting_info[state->col.max_nesting_depth - 1].value_count;
+
+  // if no repetition we haven't calculated start/end bounds and instead just skipped
+  // values until we reach first_row. account for that here.
+  if (not has_lists) { value_count -= state->first_row; }
+
+  // Atomically update the initial string offset if this is a large string column. This initial
+  // offset will be used to compute (64-bit) offsets during large string column construction.
+  if (value_count > 0 and threadIdx.x == 0) {
+    auto const initial_value = state->page.str_offset;
+    cuda::atomic_ref<size_t, cuda::thread_scope_device> initial_str_offsets_ref{
+      initial_str_offset};
+    initial_str_offsets_ref.fetch_min(initial_value, cuda::std::memory_order_relaxed);
+  }
 }
 
 template
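Editor's note (not part of the patch): `convert_small_string_lengths_to_offsets` relies on `block_excl_sum`, a block-wide exclusive scan seeded with the page's starting string offset. A minimal stand-alone equivalent using `cub::BlockScan`, assuming the whole array fits in one thread block (the real helper tiles over longer arrays); names and data are illustrative only.

```cpp
#include <cub/block/block_scan.cuh>
#include <cuda_runtime.h>
#include <cstdio>

constexpr int block_size = 128;

// Rewrite an array of string lengths into offsets, seeded with the page's starting offset.
__global__ void lengths_to_offsets(int* data, int length, int initial_offset)
{
  using block_scan = cub::BlockScan<int, block_size>;
  __shared__ typename block_scan::TempStorage temp;

  int const t   = threadIdx.x;
  int const len = (t < length) ? data[t] : 0;

  int offset = 0;
  block_scan(temp).ExclusiveSum(len, offset);  // offset of each element within the page

  if (t < length) { data[t] = offset + initial_offset; }
}

int main()
{
  int h[5] = {3, 1, 4, 1, 5};
  int* d   = nullptr;
  cudaMalloc(&d, sizeof(h));
  cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);

  lengths_to_offsets<<<1, block_size>>>(d, 5, 100);

  cudaMemcpy(h, d, sizeof(h), cudaMemcpyDeviceToHost);
  for (int v : h) { printf("%d ", v); }  // prints: 100 103 104 108 109
  printf("\n");
  cudaFree(d);
  return 0;
}
```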
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 3c8d32572f8..2d947f9d4af 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -862,6 +862,7 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -870,6 +871,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
                           size_t num_rows,
                           size_t min_row,
                           int level_type_size,
+                          cudf::device_span<size_t> initial_str_offsets,
                           kernel_error::pointer error_code,
                           rmm::cuda_stream_view stream);
 
@@ -884,6 +886,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -892,6 +895,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
                                 size_t num_rows,
                                 size_t min_row,
                                 int level_type_size,
+                                cudf::device_span<size_t> initial_str_offsets,
                                 kernel_error::pointer error_code,
                                 rmm::cuda_stream_view stream);
 
@@ -907,6 +911,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
  * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch.
+ * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -916,6 +921,7 @@ void DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
                     size_t min_row,
                     int level_type_size,
                     decode_kernel_mask kernel_mask,
+                    cudf::device_span<size_t> initial_str_offsets,
                     kernel_error::pointer error_code,
                     rmm::cuda_stream_view stream);
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index dff1f3f0c0e..9dd4e19de52 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -27,6 +27,7 @@
 #include
 #include
+#include
 #include
 
 namespace cudf::io::parquet::detail {
@@ -210,10 +211,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
     }
   }
 
+  // Create an empty device vector to store the initial string offsets for large string columns
+  // from the string decoders.
+  auto initial_str_offsets = rmm::device_uvector<size_t>{0, _stream, _mr};
+
   pass.chunks.host_to_device_async(_stream);
   chunk_nested_valids.host_to_device_async(_stream);
   chunk_nested_data.host_to_device_async(_stream);
-  if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); }
+  if (has_strings) {
+    // Host vector to initialize the initial string offsets
+    auto host_offsets_vector =
+      cudf::detail::make_host_vector<size_t>(_input_columns.size(), _stream);
+    std::fill(
+      host_offsets_vector.begin(), host_offsets_vector.end(), std::numeric_limits<size_t>::max());
+    // Initialize the initial string offsets vector from the host vector
+    initial_str_offsets =
+      cudf::detail::make_device_uvector_async(host_offsets_vector, _stream, _mr);
+    chunk_nested_str_data.host_to_device_async(_stream);
+  }
 
   // create this before we fork streams
   kernel_error error_code(_stream);
@@ -231,6 +246,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                    skip_rows,
                    level_type_size,
                    decoder_mask,
+                   initial_str_offsets,
                    error_code.data(),
                    streams[s_idx++]);
   };
@@ -287,6 +303,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                          num_rows,
                          skip_rows,
                          level_type_size,
+                         initial_str_offsets,
                          error_code.data(),
                          streams[s_idx++]);
   }
@@ -298,6 +315,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                                num_rows,
                                skip_rows,
                                level_type_size,
+                               initial_str_offsets,
                                error_code.data(),
                                streams[s_idx++]);
   }
@@ -402,6 +420,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
   page_nesting.device_to_host_async(_stream);
   page_nesting_decode.device_to_host_async(_stream);
 
+  // Copy over initial string offsets from device
+  auto h_initial_str_offsets = cudf::detail::make_host_vector_async(initial_str_offsets, _stream);
+
   if (auto const error = error_code.value_sync(_stream); error != 0) {
     CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error));
   }
@@ -440,6 +461,12 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
         out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size);
         final_offsets.emplace_back(static_cast(col_string_sizes[idx]));
       }
+      // Nested large strings column
+      else if (input_col.nesting_depth() > 0) {
+        CUDF_EXPECTS(h_initial_str_offsets[idx] != std::numeric_limits<size_t>::max(),
+                     "Encountered invalid initial offset for large string column");
+        out_buf.set_initial_string_offset(h_initial_str_offsets[idx]);
+      }
     }
   }
 }
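Editor's note (not part of the patch): the reader pre-fills one slot per input column with `std::numeric_limits<size_t>::max()` as a sentinel; a slot still holding the sentinel after decode means no page recorded an offset for that column, which the `CUDF_EXPECTS` above treats as an error. A host-only sketch of that convention; the per-column flag and recorded offsets are invented for the example.

```cpp
#include <cstddef>
#include <limits>
#include <stdexcept>
#include <vector>

int main()
{
  auto constexpr sentinel = std::numeric_limits<std::size_t>::max();

  // One slot per input column, pre-filled with the sentinel before the decode kernels run
  // (a stand-in for the device vector the kernels update with fetch_min).
  std::vector<std::size_t> initial_str_offsets(3, sentinel);

  // Pretend the decoders recorded offsets for columns 0 and 2, while column 1 is not a
  // large string column and was never touched.
  initial_str_offsets[0] = 0;
  initial_str_offsets[2] = 2048;

  for (std::size_t idx = 0; idx < initial_str_offsets.size(); ++idx) {
    bool const is_large_string_col = (idx != 1);  // assumed per-column flag
    if (!is_large_string_col) { continue; }
    if (initial_str_offsets[idx] == sentinel) {
      throw std::logic_error("Encountered invalid initial offset for large string column");
    }
    // ...hand the offset to the column buffer, as set_initial_string_offset() does above.
  }
  return 0;
}
```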
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index da19539f509..5a8e3081681 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -249,6 +249,8 @@ class inline_column_buffer : public column_buffer_base {
   void create_string_data(size_t num_bytes,
                           bool is_large_strings_col,
                           rmm::cuda_stream_view stream);
+  void set_initial_string_offset(size_t offset) { initial_string_offset = offset; }
+
   void* string_data() { return _string_data.data(); }
   [[nodiscard]] void const* string_data() const { return _string_data.data(); }
   [[nodiscard]] size_t string_size() const { return _string_data.size(); }
@@ -257,6 +259,7 @@ class inline_column_buffer : public column_buffer_base {
  private:
   rmm::device_buffer _string_data{};
   bool _is_large_strings_col{};
+  size_t initial_string_offset{0};
 };
 
 using column_buffer = gather_column_buffer;
diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu
index 66d0a644c12..6befc078bb2 100644
--- a/cpp/src/io/utilities/column_buffer_strings.cu
+++ b/cpp/src/io/utilities/column_buffer_strings.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,7 +37,8 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu
       data_type{type_id::INT64}, size + 1, mask_state::UNALLOCATED, stream, _mr);
     auto d_offsets64 = offsets_col->mutable_view().template data<int64_t>();
     // it's safe to call with size + 1 because _data is also sized that large
-    cudf::detail::sizes_to_offsets(offsets_ptr, offsets_ptr + size + 1, d_offsets64, stream);
+    cudf::detail::sizes_to_offsets(
+      offsets_ptr, offsets_ptr + size + 1, d_offsets64, initial_string_offset, stream);
     return make_strings_column(
       size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask));
   } else {
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index 4b50bf626f2..a98f3021da5 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -167,7 +167,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
   thrust::copy_n(rmm::exec_policy(stream), sizes_input_it, sizes.size(), offsets_begin);
   auto const n_elements = cudf::detail::sizes_to_offsets(
-    offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, stream);
+    offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, 0, stream);
   CUDF_EXPECTS(n_elements <= std::numeric_limits::max(),
                "Size of output exceeds the column size limit",
                std::overflow_error);
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 247440212d0..58c94b60718 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -348,7 +348,7 @@ std::pair, rmm::device_uvector> hash_subs
   count_substrings_kernel<<>>(
     *d_strings, width, offsets.data());
   auto const total_hashes =
-    cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream);
+    cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), 0, stream);
 
   // hash substrings
   rmm::device_uvector hashes(total_hashes, stream);
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
index 39cd783de00..5d2db84ae2e 100644
--- a/cpp/tests/large_strings/parquet_tests.cpp
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 #include "large_strings_fixture.hpp"
 
+#include
+#include
 #include
 #include
@@ -143,3 +145,96 @@ TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings)
   // Verify that we read exactly two table chunks
   EXPECT_EQ(tables.size(), 2);
 }
+
+TEST_F(ParquetStringsTest, ChunkedReadNestedLargeStrings)
+{
+  using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
+  using strings_col = cudf::test::strings_column_wrapper;
+  using structs_col = cudf::test::structs_column_wrapper;
+
+  auto constexpr num_rows = 100'000;
+
+  std::vector<std::unique_ptr<cudf::column>> input_columns;
+  auto const int_iter = thrust::make_counting_iterator(0);
+  input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release());
+
+  auto const str_iter = cudf::detail::make_counting_transform_iterator(
+    0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); });
+  input_columns.emplace_back(strings_col{str_iter, str_iter + num_rows}.release());
+
+  auto offsets = std::vector<cudf::size_type>{};
+  offsets.reserve(num_rows * 2);
+  cudf::size_type num_structs = 0;
+  for (int i = 0; i < num_rows; ++i) {
+    offsets.push_back(num_structs);
+    auto const new_list_size = i % 4;
+    num_structs += new_list_size;
+  }
+  offsets.push_back(num_structs);
+
+  auto const make_structs_col = [=] {
+    auto child1 = int32s_col(int_iter, int_iter + num_structs);
+    auto child2 = int32s_col(int_iter + num_structs, int_iter + num_structs * 2);
+    auto child3 = strings_col{str_iter, str_iter + num_structs};
+
+    return structs_col{{child1, child2, child3}}.release();
+  };
+
+  input_columns.emplace_back(
+    cudf::make_lists_column(static_cast<cudf::size_type>(offsets.size() - 1),
+                            int32s_col(offsets.begin(), offsets.end()).release(),
+                            make_structs_col(),
+                            0,
+                            rmm::device_buffer{}));
+
+  // Input table
+  auto const table = cudf::table{std::move(input_columns)};
+  auto const expected = table.view();
+
+  auto const child3_view = expected.column(2).child(1).child(2);  // list<struct<int, int, string>>
+  auto const column_size =
+    cudf::strings_column_view(child3_view).chars_size(cudf::get_default_stream());
+  // set smaller threshold to reduce file size and execution time
+  auto const threshold =
+    column_size / 16;  // Empirically set to get a mix of 32 and 64 bit string col chunks.
+  setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1);
+
+  // Host buffer to write Parquet
+  auto buffer = std::vector<char>{};
+  // Writer options
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected)
+      .max_page_size_bytes(512 * 1024)
+      .max_page_size_rows(20000)
+      .dictionary_policy(cudf::io::dictionary_policy::ALWAYS)
+      .write_v2_headers(false);
+
+  // Write to Parquet
+  cudf::io::write_parquet(out_opts);
+
+  // Reader options
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size()));
+
+  auto constexpr chunk_read_limit = size_t{1} * 1024 * 1024;
+  auto constexpr pass_read_limit = 0;
+  // Chunked parquet reader
+  auto reader = cudf::io::chunked_parquet_reader(chunk_read_limit, pass_read_limit, in_opts);
+
+  // Read chunked
+  auto tables = std::vector<std::unique_ptr<cudf::table>>{};
+  while (reader.has_next()) {
+    tables.emplace_back(reader.read_chunk().tbl);
+  }
+  auto table_views = std::vector<cudf::table_view>{};
+  std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) {
+    return tbl->view();
+  });
+  auto result = cudf::concatenate(table_views);
+
+  // Verify tables to be equal
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected);
+
+  // go back to normal threshold
+  unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
+}