diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 39869b67547..d99b74506e4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,4 +1,13 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
+
+ci:
+  autofix_commit_msg: "[pre-commit.ci] auto code formatting"
+  autofix_prs: false
+  autoupdate_branch: ""
+  autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
+  autoupdate_schedule: quarterly
+  skip: ["verify-alpha-spec"]
+  submodules: false

 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
diff --git a/ci/run_cudf_memcheck_ctests.sh b/ci/run_cudf_memcheck_ctests.sh
index 653829db419..391579b6c59 100755
--- a/ci/run_cudf_memcheck_ctests.sh
+++ b/ci/run_cudf_memcheck_ctests.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.

 set -uo pipefail

@@ -10,6 +10,7 @@ trap "EXITCODE=1" ERR
 cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/gtests/libcudf/";

 export GTEST_CUDF_RMM_MODE=cuda
+export GTEST_BRIEF=1 # compute-sanitizer bug 4553815
 export LIBCUDF_MEMCHECK_ENABLED=1
 for gt in ./*_TEST ; do
@@ -18,6 +19,7 @@ for gt in ./*_TEST ; do
   echo "Running compute-sanitizer on $test_name"
   compute-sanitizer --tool memcheck ${gt} "$@"
 done
+unset GTEST_BRIEF
 unset GTEST_CUDF_RMM_MODE
 unset LIBCUDF_MEMCHECK_ENABLED
diff --git a/cpp/benchmarks/ast/transform.cpp b/cpp/benchmarks/ast/transform.cpp
index 2533ea9611c..06ec7543213 100644
--- a/cpp/benchmarks/ast/transform.cpp
+++ b/cpp/benchmarks/ast/transform.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,54 +56,46 @@ static void BM_ast_transform(nvbench::state& state)
   auto const tree_levels = static_cast<cudf::size_type>(state.get_int64("tree_levels"));

   // Create table data
-  auto const n_cols       = reuse_columns ? 1 : tree_levels + 1;
+  auto const num_columns  = reuse_columns ? 1 : tree_levels + 1;
   auto const source_table =
-    create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, n_cols),
+    create_sequence_table(cycle_dtypes({cudf::type_to_id<key_type>()}, num_columns),
                           row_count{num_rows},
                           Nullable ? std::optional{0.5} : std::nullopt);
   auto table = source_table->view();

+  cudf::ast::tree tree;
+
   // Create column references
-  auto column_refs = std::vector<cudf::ast::column_reference>();
-  std::transform(thrust::make_counting_iterator(0),
-                 thrust::make_counting_iterator(n_cols),
-                 std::back_inserter(column_refs),
-                 [](auto const& column_id) {
-                   return cudf::ast::column_reference(reuse_columns ? 0 : column_id);
-                 });
+  std::for_each(
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(num_columns),
+    [&](int column_id) { tree.push(cudf::ast::column_reference(reuse_columns ? 0 : column_id)); });

   // Create expression trees
-  // Note that a std::list is required here because of its guarantees against reference invalidation
-  // when items are added or removed. References to items in a std::vector are not safe if the
-  // vector must re-allocate.
-  auto expressions = std::list<cudf::ast::operation>();
-
   // Construct tree that chains additions like (((a + b) + c) + d)
   auto const op = cudf::ast::ast_operator::ADD;
   if (reuse_columns) {
-    expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(0)));
+    tree.push(cudf::ast::operation(op, tree.at(0), tree.at(0)));
     for (cudf::size_type i = 0; i < tree_levels - 1; i++) {
-      expressions.push_back(cudf::ast::operation(op, expressions.back(), column_refs.at(0)));
+      tree.push(cudf::ast::operation(op, tree.back(), tree.at(0)));
     }
   } else {
-    expressions.push_back(cudf::ast::operation(op, column_refs.at(0), column_refs.at(1)));
-    std::transform(std::next(column_refs.cbegin(), 2),
-                   column_refs.cend(),
-                   std::back_inserter(expressions),
-                   [&](auto const& column_ref) {
-                     return cudf::ast::operation(op, expressions.back(), column_ref);
-                   });
+    tree.push(cudf::ast::operation(op, tree.at(0), tree.at(1)));
+    std::for_each(
+      thrust::make_counting_iterator(2),
+      thrust::make_counting_iterator(num_columns),
+      [&](int col_id) { tree.push(cudf::ast::operation(op, tree.back(), tree.at(col_id))); });
   }

-  auto const& expression_tree_root = expressions.back();
+  auto const& root_expression = tree.back();

   // Use the number of bytes read from global memory
   state.add_global_memory_reads<key_type>(static_cast<std::size_t>(num_rows) * (tree_levels + 1));
   state.add_global_memory_writes<key_type>(num_rows);

   state.exec(nvbench::exec_tag::sync,
-             [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); });
+             [&](nvbench::launch&) { cudf::compute_column(table, root_expression); });
 }

 template <cudf::ast::ast_operator cmp_op, cudf::ast::ast_operator reduce_op>
@@ -117,10 +109,10 @@ static void BM_string_compare_ast_transform(nvbench::state& state)
   CUDF_EXPECTS(tree_levels > 0, "benchmarks require 1 or more comparisons");

   // Create table data
-  auto const num_cols = tree_levels * 2;
+  auto const num_columns = tree_levels * 2;
   std::vector<std::unique_ptr<cudf::column>> columns;
   std::for_each(
-    thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_cols), [&](size_t) {
+    thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_columns), [&](size_t) {
       columns.emplace_back(create_string_column(num_rows, string_width, hit_rate));
     });

@@ -135,38 +127,36 @@ static void BM_string_compare_ast_transform(nvbench::state& state)
       return size + cudf::strings_column_view{column}.chars_size(cudf::get_default_stream());
     });

-  // Create column references
-  auto column_refs = std::vector<cudf::ast::column_reference>();
-  std::transform(thrust::make_counting_iterator(0),
-                 thrust::make_counting_iterator(num_cols),
-                 std::back_inserter(column_refs),
-                 [](auto const& column_id) { return cudf::ast::column_reference(column_id); });
+  // Create expression tree
+  cudf::ast::tree tree;

-  // Create expression trees
-  std::list<cudf::ast::operation> expressions;
+  // Create column references
+  std::for_each(thrust::make_counting_iterator(0),
+                thrust::make_counting_iterator(num_columns),
+                [&](int column_id) { tree.push(cudf::ast::column_reference{column_id}); });

   // Construct AST tree (a == b && c == d && e == f && ...)
- expressions.emplace_back(cudf::ast::operation(cmp_op, column_refs[0], column_refs[1])); + tree.push(cudf::ast::operation(cmp_op, tree[0], tree[1])); std::for_each(thrust::make_counting_iterator(1), thrust::make_counting_iterator(tree_levels), [&](size_t idx) { - auto const& lhs = expressions.back(); - auto const& rhs = expressions.emplace_back( - cudf::ast::operation(cmp_op, column_refs[idx * 2], column_refs[idx * 2 + 1])); - expressions.emplace_back(cudf::ast::operation(reduce_op, lhs, rhs)); + auto const& lhs = tree.back(); + auto const& rhs = + tree.push(cudf::ast::operation(cmp_op, tree[idx * 2], tree[idx * 2 + 1])); + tree.push(cudf::ast::operation(reduce_op, lhs, rhs)); }); - auto const& expression_tree_root = expressions.back(); - // Use the number of bytes read from global memory state.add_element_count(chars_size, "chars_size"); state.add_global_memory_reads(chars_size); state.add_global_memory_writes(num_rows); + auto const& expression = tree.back(); + state.exec(nvbench::exec_tag::sync, - [&](nvbench::launch&) { cudf::compute_column(table, expression_tree_root); }); + [&](nvbench::launch&) { cudf::compute_column(table, expression); }); } #define AST_TRANSFORM_BENCHMARK_DEFINE(name, key_type, tree_type, reuse_columns, nullable) \ diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index 7516d4e8d5d..8e89b461e30 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. cmake_minimum_required(VERSION 3.26.4) @@ -6,7 +6,6 @@ include(../set_cuda_architecture.cmake) # initialize cuda architecture rapids_cuda_init_architectures(basic_example) -rapids_cuda_set_architectures(RAPIDS) project( basic_example diff --git a/cpp/examples/billion_rows/CMakeLists.txt b/cpp/examples/billion_rows/CMakeLists.txt index 87149ac821c..603c8d0b457 100644 --- a/cpp/examples/billion_rows/CMakeLists.txt +++ b/cpp/examples/billion_rows/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. cmake_minimum_required(VERSION 3.26.4) @@ -6,7 +6,6 @@ include(../set_cuda_architecture.cmake) # initialize cuda architecture rapids_cuda_init_architectures(billion_rows) -rapids_cuda_set_architectures(RAPIDS) project( billion_rows diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index 6fbc1b50212..6f1249beaaa 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -1,11 +1,10 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. cmake_minimum_required(VERSION 3.26.4) include(../set_cuda_architecture.cmake) rapids_cuda_init_architectures(interop_example) -rapids_cuda_set_architectures(RAPIDS) project( interop_example diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt index 88e891f892a..e7972d1531b 100644 --- a/cpp/examples/nested_types/CMakeLists.txt +++ b/cpp/examples/nested_types/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
 cmake_minimum_required(VERSION 3.26.4)

@@ -6,7 +6,6 @@ include(../set_cuda_architecture.cmake)

 # initialize cuda architecture
 rapids_cuda_init_architectures(nested_types)
-rapids_cuda_set_architectures(RAPIDS)

 project(
   nested_types
diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt
index b7d8fc14b6c..17f86fdf5e0 100644
--- a/cpp/examples/parquet_io/CMakeLists.txt
+++ b/cpp/examples/parquet_io/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.

 cmake_minimum_required(VERSION 3.26.4)

@@ -6,7 +6,6 @@ include(../set_cuda_architecture.cmake)

 # initialize cuda architecture
 rapids_cuda_init_architectures(parquet_io)
-rapids_cuda_set_architectures(RAPIDS)

 project(
   parquet_io
diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
index 47e63f319ad..9010d495715 100644
--- a/cpp/examples/strings/CMakeLists.txt
+++ b/cpp/examples/strings/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.

 cmake_minimum_required(VERSION 3.26.4)

@@ -6,7 +6,6 @@ include(../set_cuda_architecture.cmake)

 # initialize cuda architecture
 rapids_cuda_init_architectures(strings_examples)
-rapids_cuda_set_architectures(RAPIDS)

 project(
   strings_examples
diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp
index 85289a52831..40609a0b64e 100644
--- a/cpp/include/cudf/ast/expressions.hpp
+++ b/cpp/include/cudf/ast/expressions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -556,7 +556,7 @@ class column_name_reference : public expression {

 /**
  * @brief An AST expression tree. It owns and contains multiple dependent expressions. All the
- * expressions are destroyed when the tree is destructed.
+ * expressions are destroyed when the tree is destroyed.
  */
 class tree {
  public:
@@ -588,12 +588,11 @@ class tree {
   * @returns a reference to the added expression
   */
  template <typename Expr, typename... Args>
-  Expr const& emplace(Args&&... args)
+  std::enable_if_t<std::is_base_of_v<expression, Expr>, Expr const&> emplace(Args&&... args)
   {
-    static_assert(std::is_base_of_v<expression, Expr>);
-    auto expr            = std::make_shared<Expr>(std::forward<Args>(args)...);
+    auto expr            = std::make_unique<Expr>(std::forward<Args>(args)...);
     Expr const& expr_ref = *expr;
-    expressions.emplace_back(std::static_pointer_cast<expression>(std::move(expr)));
+    expressions.emplace_back(std::move(expr));
     return expr_ref;
   }

@@ -603,7 +602,7 @@ class tree {
   * @returns a reference to the added expression
   */
  template <typename Expr>
-  Expr const& push(Expr expr)
+  decltype(auto) push(Expr expr)
   {
     return emplace<Expr>(std::move(expr));
   }
@@ -641,9 +640,9 @@ class tree {
   expression const& operator[](size_t index) const { return *expressions[index]; }

  private:
-  // TODO: use better ownership semantics, the shared_ptr here is redundant. Consider using a bump
+  // TODO: use better ownership semantics, the unique_ptr here is redundant. Consider using a bump
   // allocator with type-erased deleters.
-  std::vector<std::shared_ptr<expression>> expressions;
+  std::vector<std::unique_ptr<expression>> expressions;
 };

 /** @} */  // end of group
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index 5685b50c322..d8f8e13a164 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -19,6 +19,7 @@
 #include
 #include
+#include <cuda/std/functional>
 #include
 #include
 #include
@@ -897,7 +898,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
   __syncthreads();

   using OffsetPrefixScanCallbackOpT_ =
-    cub::TilePrefixCallbackOp<OutOffsetT, cub::Sum, OutOffsetScanTileState>;
+    cub::TilePrefixCallbackOp<OutOffsetT, cuda::std::plus<>, OutOffsetScanTileState>;

   using OutOffsetBlockScan = cub::BlockScan;

@@ -915,7 +916,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
         .ExclusiveScan(count_chars_callback_op.out_count,
                        thread_out_offset,
                        static_cast<OutOffsetT>(0),
-                       cub::Sum{},
+                       cuda::std::plus{},
                        block_aggregate);
       tile_out_count = block_aggregate;
       if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) {
@@ -927,10 +928,11 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
       }
     } else {
       auto prefix_op = OffsetPrefixScanCallbackOpT_(
-        offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx);
+        offset_tile_state, prefix_callback_temp_storage, cuda::std::plus{}, tile_idx);

       OutOffsetBlockScan(scan_temp_storage)
-        .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op);
+        .ExclusiveScan(
+          count_chars_callback_op.out_count, thread_out_offset, cuda::std::plus{}, prefix_op);
       tile_out_offset = prefix_op.GetExclusivePrefix();
       tile_out_count  = prefix_op.GetBlockAggregate();
       if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) {
diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh
index 7f728db5afa..aaab7a142f7 100644
--- a/cpp/src/io/fst/in_reg_array.cuh
+++ b/cpp/src/io/fst/in_reg_array.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,12 +66,8 @@ class MultiFragmentInRegArray {
                                                   uint32_t bit_start,
                                                   uint32_t num_bits) const
   {
-#if CUB_PTX_ARCH > 0
-    return cub::BFE(data, bit_start, num_bits);
-#else
     uint32_t const MASK = (1 << num_bits) - 1;
     return (data >> bit_start) & MASK;
-#endif
   }

   /**
@@ -83,15 +79,11 @@ class MultiFragmentInRegArray {
                                         uint32_t bit_start,
                                         uint32_t num_bits) const
   {
-#if CUB_PTX_ARCH > 0
-    cub::BFI(data, data, bits, bit_start, num_bits);
-#else
     uint32_t x      = bits << bit_start;
     uint32_t y      = data;
     uint32_t MASK_X = ((1 << num_bits) - 1) << bit_start;
     uint32_t MASK_Y = ~MASK_X;
     data            = (y & MASK_Y) | (x & MASK_X);
-#endif
   }

   BackingFragmentT data[FRAGMENTS_PER_ITEM];
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index a1edd21f8a2..7dc1255af6f 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@
 #include
 #include
+#include <cuda/std/functional>
 #include
 #include
 #include
@@ -341,7 +342,7 @@ void __device__ generate_def_level_histogram(uint32_t* hist,
   }
 }

-// operator to use with warp_reduce. stolen from cub::Sum
+// operator to use with warp_reduce. stolen from cuda::std::plus
 struct BitwiseOr {
   /// Binary OR operator, returns a | b
   template <typename T>
@@ -2456,7 +2457,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
     // calculate offsets into output
     size_type s_off, total;
     block_scan(temp_storage.scan_storage)
-      .ExclusiveScan(suff_len, s_off, str_data_len, cub::Sum(), total);
+      .ExclusiveScan(suff_len, s_off, str_data_len, cuda::std::plus(), total);

     if (t_idx < s->page.num_valid) {
       auto const dst = strings_ptr + s_off;
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 6a7c8ea45e9..bd7c1cd8bd1 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -292,7 +292,7 @@ CUDF_KERNEL void has_multibytes_kernel(char const* d_input_chars,
     u_char const chr = static_cast<u_char>(d_input_chars[i]);
     mb_count += ((chr & 0x80) > 0);
   }
-  auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cub::Sum());
+  auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cuda::std::plus());
   if ((lane_idx == 0) && (mb_total > 0)) {
     cuda::atomic_ref ref{*d_output};
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 88f343926c9..9ccdcfda82b 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -280,7 +280,7 @@ CUDF_KERNEL void count_targets(replace_multi_parallel_fn fn, int64_t chars_bytes
   for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
     count += fn.has_target(i, chars_bytes);
   }
-  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+  auto const total = block_reduce(temp_storage).Reduce(count, cuda::std::plus());
   if ((lane_idx == 0) && (total > 0)) {
     cuda::atomic_ref ref{*d_output};
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 52ddef76c1a..60c387acab1 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -255,7 +255,7 @@ CUDF_KERNEL void count_targets_kernel(replace_parallel_chars_fn fn,
   for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
     count += fn.has_target(i);
   }
-  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+  auto const total = block_reduce(temp_storage).Reduce(count, cuda::std::plus());
   if ((lane_idx == 0) && (total > 0)) {
     cuda::atomic_ref ref{*d_output};
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 4b777be9d5b..620a2753826 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -325,7 +326,7 @@ CUDF_KERNEL void count_delimiters_kernel(Tokenizer tokenizer, for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { count += tokenizer.is_delimiter(i, d_offsets, chars_bytes); } - auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + auto const total = block_reduce(temp_storage).Reduce(count, cuda::std::plus()); if ((lane_idx == 0) && (total > 0)) { cuda::atomic_ref ref{*d_output}; diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index 89ca8a089d6..77a01d555db 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -119,7 +120,7 @@ CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, i for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) { count += cudf::strings::detail::is_begin_utf8_char(d_chars[i]); } - auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum()); + auto const total = block_reduce(temp_storage).Reduce(count, cuda::std::plus()); if ((lane_idx == 0) && (total > 0)) { cuda::atomic_ref ref{*d_output}; diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index caf2b1d8b30..34cc8362c82 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -283,7 +284,7 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings, __syncwarp(); // add up the counts from the other threads to compute the total token count for this string - auto const total_count = warp_reduce(warp_storage).Reduce(count, cub::Sum()); + auto const total_count = warp_reduce(warp_storage).Reduce(count, cuda::std::plus()); if (lane_idx == 0) { d_counts[str_idx] = total_count; } } diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 995f7748167..43dc211c472 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -359,18 +359,15 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) constexpr int64_t left_depth_level = 100; constexpr int64_t right_depth_level = 75; - auto generate_ast_expr = [](int64_t depth_level, - cudf::ast::column_reference const& col_ref, - cudf::ast::ast_operator root_operator, - cudf::ast::ast_operator arithmetic_operator, - bool nested_left_tree) { - // Note that a std::list is required here because of its guarantees against reference - // invalidation when items are added or removed. References to items in a std::vector are not - // safe if the vector must re-allocate. 
- auto expressions = std::list(); + cudf::ast::tree tree; + auto generate_ast_expr = [&](int64_t depth_level, + cudf::ast::column_reference const& col_ref, + cudf::ast::ast_operator root_operator, + cudf::ast::ast_operator arithmetic_operator, + bool nested_left_tree) -> cudf::ast::expression const& { auto op = arithmetic_operator; - expressions.emplace_back(op, col_ref, col_ref); + tree.push(cudf::ast::operation{op, col_ref, col_ref}); for (int64_t i = 0; i < depth_level - 1; i++) { if (i == depth_level - 2) { @@ -379,34 +376,35 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) op = arithmetic_operator; } if (nested_left_tree) { - expressions.emplace_back(op, expressions.back(), col_ref); + tree.push(cudf::ast::operation{op, tree.back(), col_ref}); } else { - expressions.emplace_back(op, col_ref, expressions.back()); + tree.push(cudf::ast::operation{op, col_ref, tree.back()}); } } - return expressions; + + return tree.back(); }; auto c_0 = column_wrapper{0, 0, 0}; auto c_1 = column_wrapper{0, 0, 0}; auto table = cudf::table_view{{c_0, c_1}}; - auto col_ref_0 = cudf::ast::column_reference(0); - auto col_ref_1 = cudf::ast::column_reference(1); + auto const& col_ref_0 = tree.push(cudf::ast::column_reference(0)); + auto const& col_ref_1 = tree.push(cudf::ast::column_reference(1)); - auto left_expression = generate_ast_expr(left_depth_level, - col_ref_0, - cudf::ast::ast_operator::LESS, - cudf::ast::ast_operator::ADD, - false); - auto right_expression = generate_ast_expr(right_depth_level, - col_ref_1, - cudf::ast::ast_operator::EQUAL, - cudf::ast::ast_operator::SUB, - true); + auto const& left_expression = generate_ast_expr(left_depth_level, + col_ref_0, + cudf::ast::ast_operator::LESS, + cudf::ast::ast_operator::ADD, + false); + auto const& right_expression = generate_ast_expr(right_depth_level, + col_ref_1, + cudf::ast::ast_operator::EQUAL, + cudf::ast::ast_operator::SUB, + true); - auto expression_tree = cudf::ast::operation( - cudf::ast::ast_operator::LOGICAL_OR, left_expression.back(), right_expression.back()); + auto const& expression = tree.push( + cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_OR, left_expression, right_expression)); // Expression: // OR(<(+(+(+(+($0, $0), $0), $0), $0), $0), ==($1, -($1, -($1, -($1, -($1, $1)))))) @@ -421,7 +419,7 @@ TEST_F(TransformTest, DeeplyNestedArithmeticLogicalExpression) // If all $1 values and $R values are zeros, the result is true because of the equality check // combined with the OR operator in OR(<($L, $0), ==($1, $R)). 
- auto result = cudf::compute_column(table, expression_tree); + auto result = cudf::compute_column(table, expression); auto expected = column_wrapper{true, true, true}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity); diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 1ade2143494..362a6df913d 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -2198,7 +2198,7 @@ TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite) // Tests whether Y dimension of grid sizes depends on the number of row groups // Disabled because of the high execution time (especially compared to the likelihood of regression) -TEST_F(OrcReaderTest, DISABLE_Over65kRowGroups) +TEST_F(OrcReaderTest, DISABLED_Over65kRowGroups) { auto constexpr row_group_size = 512; constexpr auto num_rows = (1 << 16) * row_group_size + 1; @@ -2222,7 +2222,7 @@ TEST_F(OrcReaderTest, DISABLE_Over65kRowGroups) // Tests whether Y dimension of grid sizes depends on the number of stripes // Disabled because of the high execution time (especially compared to the likelihood of regression) -TEST_F(OrcReaderTest, DISABLE_Over65kStripes) +TEST_F(OrcReaderTest, DISABLED_Over65kStripes) { auto constexpr stripe_size = 512; constexpr auto num_rows = (1 << 16) * stripe_size + 1; @@ -2247,7 +2247,7 @@ TEST_F(OrcReaderTest, DISABLE_Over65kStripes) // Tests whether Y dimension of grid sizes depends on the number of columns // Disabled because of the high execution time (especially compared to the likelihood of regression) -TEST_F(OrcWriterTest, DISABLE_Over65kColumns) +TEST_F(OrcWriterTest, DISABLED_Over65kColumns) { auto vals_col = random_values(8); dec64_col col{vals_col.begin(), vals_col.end(), numeric::scale_type{2}}; diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index ec44a6aa8c5..0ec9350e6ee 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. 
# ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx) +set(cython_sources column.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 114991dbe3e..00ecd53e70d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -33,7 +33,11 @@ from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from pylibcudf cimport DataType as plc_DataType, Column as plc_Column +from pylibcudf cimport ( + DataType as plc_DataType, + Column as plc_Column, + Scalar as plc_Scalar, +) cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -45,8 +49,6 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar -from cudf._lib.scalar cimport DeviceScalar - cdef get_element(column_view col_view, size_type index): @@ -55,10 +57,8 @@ cdef get_element(column_view col_view, size_type index): c_output = move( cpp_copying.get_element(col_view, index) ) - - return DeviceScalar.from_unique_ptr( - move(c_output), dtype=dtype_from_column_view(col_view) - ) + plc_scalar = plc_Scalar.from_libcudf(move(c_output)) + return pylibcudf.interop.to_arrow(plc_scalar).as_py() def dtype_from_pylibcudf_column(plc_Column col not None): @@ -767,7 +767,7 @@ cdef class Column: base_nbytes = 0 else: chars_size = get_element( - offset_child_column, offset_child_column.size()-1).value + offset_child_column, offset_child_column.size()-1) base_nbytes = chars_size if data_ptr: @@ -908,6 +908,6 @@ cdef class Column: def from_scalar(py_val, size_type size): return Column.from_pylibcudf( pylibcudf.Column.from_scalar( - py_val.device_value.c_value, size + py_val.device_value, size ) ) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd deleted file mode 100644 index a3a8a14e70f..00000000000 --- a/python/cudf/cudf/_lib/scalar.pxd +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource - - -cdef class DeviceScalar: - cdef public object c_value - - cdef object _dtype - - cdef const scalar* get_raw_ptr(self) except * - - @staticmethod - cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*) - - cdef void _set_dtype(self, dtype=*) - - cpdef bool is_valid(DeviceScalar s) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx deleted file mode 100644 index 65607c91302..00000000000 --- a/python/cudf/cudf/_lib/scalar.pyx +++ /dev/null @@ -1,243 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
- -import copy - -import numpy as np -import pandas as pd -import pyarrow as pa - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -import pylibcudf as plc - -import cudf -from cudf.core.dtypes import ListDtype, StructDtype -from cudf.core.missing import NA, NaT -from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES - -# We currently need this cimport because some of the implementations here -# access the c_obj of the scalar, and because we need to be able to call -# pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until -# DeviceScalar is phased out entirely from cuDF Cython (at which point -# cudf.Scalar will be directly backed by pylibcudf.Scalar). -from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport scalar - - -def _replace_nested(obj, check, replacement): - if isinstance(obj, list): - for i, item in enumerate(obj): - if check(item): - obj[i] = replacement - elif isinstance(item, (dict, list)): - _replace_nested(item, check, replacement) - elif isinstance(obj, dict): - for k, v in obj.items(): - if check(v): - obj[k] = replacement - elif isinstance(v, (dict, list)): - _replace_nested(v, check, replacement) - - -def gather_metadata(dtypes): - """Convert a dict of dtypes to a list of ColumnMetadata objects. - - The metadata is constructed recursively so that nested types are - represented as nested ColumnMetadata objects. - - Parameters - ---------- - dtypes : dict - A dict mapping column names to dtypes. - - Returns - ------- - List[ColumnMetadata] - A list of ColumnMetadata objects. - """ - out = [] - for name, dtype in dtypes.items(): - v = plc.interop.ColumnMetadata(name) - if isinstance(dtype, cudf.StructDtype): - v.children_meta = gather_metadata(dtype.fields) - elif isinstance(dtype, cudf.ListDtype): - # Offsets column is unnamed and has no children - v.children_meta.append(plc.interop.ColumnMetadata("")) - v.children_meta.extend( - gather_metadata({"": dtype.element_type}) - ) - out.append(v) - return out - - -cdef class DeviceScalar: - - # TODO: I think this should be removable, except that currently the way - # that from_unique_ptr is implemented is probably dereferencing this in an - # invalid state. See what the best way to fix that is. - def __cinit__(self, *args, **kwargs): - self.c_value = plc.Scalar.__new__(plc.Scalar) - - def __init__(self, value, dtype): - """ - Type representing an *immutable* scalar value on the device - - Parameters - ---------- - value : scalar - An object of scalar type, i.e., one for which - `np.isscalar()` returns `True`. Can also be `None`, - to represent a "null" scalar. In this case, - dtype *must* be provided. - dtype : dtype - A NumPy dtype. - """ - dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - - if cudf.utils.utils.is_na_like(value): - value = None - else: - # TODO: For now we always deepcopy the input value to avoid - # overwriting the input values when replacing nulls. Since it's - # just host values it's not that expensive, but we could consider - # alternatives. 
-            value = copy.deepcopy(value)
-            _replace_nested(value, cudf.utils.utils.is_na_like, None)
-
-        if isinstance(dtype, cudf.core.dtypes._BaseDtype):
-            pa_type = dtype.to_arrow()
-        elif pd.api.types.is_string_dtype(dtype):
-            # Have to manually convert object types, which we use internally
-            # for strings but pyarrow only supports as unicode 'U'
-            pa_type = pa.string()
-        else:
-            pa_type = pa.from_numpy_dtype(dtype)
-
-        if isinstance(pa_type, pa.ListType) and value is None:
-            # pyarrow doesn't correctly handle None values for list types, so
-            # we have to create this one manually.
-            # https://github.com/apache/arrow/issues/40319
-            pa_array = pa.array([None], type=pa_type)
-        else:
-            pa_array = pa.array([pa.scalar(value, type=pa_type)])
-
-        pa_table = pa.Table.from_arrays([pa_array], names=[""])
-        table = plc.interop.from_arrow(pa_table)
-
-        column = table.columns()[0]
-        if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
-            if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
-                column = plc.unary.cast(
-                    column, plc.DataType(plc.TypeId.DECIMAL32, -dtype.scale)
-                )
-            elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
-                column = plc.unary.cast(
-                    column, plc.DataType(plc.TypeId.DECIMAL64, -dtype.scale)
-                )
-
-        self.c_value = plc.copying.get_element(column, 0)
-        self._dtype = dtype
-
-    def _to_host_scalar(self):
-        is_datetime = self.dtype.kind == "M"
-        is_timedelta = self.dtype.kind == "m"
-
-        null_type = NaT if is_datetime or is_timedelta else NA
-
-        metadata = gather_metadata({"": self.dtype})[0]
-        ps = plc.interop.to_arrow(self.c_value, metadata)
-        if not ps.is_valid:
-            return null_type
-
-        # TODO: The special handling of specific types below does not currently
-        # extend to nested types containing those types (e.g. List[timedelta]
-        # where the timedelta would overflow). We should eventually account for
-        # those cases, but that will require more careful consideration of how
-        # to traverse the contents of the nested data.
-        if is_datetime or is_timedelta:
-            time_unit, _ = np.datetime_data(self.dtype)
-            # Cast to int64 to avoid overflow
-            ps_cast = ps.cast('int64').as_py()
-            out_type = np.datetime64 if is_datetime else np.timedelta64
-            ret = out_type(ps_cast, time_unit)
-        elif cudf.api.types.is_numeric_dtype(self.dtype):
-            ret = ps.type.to_pandas_dtype()(ps.as_py())
-        else:
-            ret = ps.as_py()
-
-        _replace_nested(ret, lambda item: item is None, NA)
-        return ret
-
-    @property
-    def dtype(self):
-        """
-        The NumPy dtype corresponding to the data type of the underlying
-        device scalar.
-        """
-        return self._dtype
-
-    @property
-    def value(self):
-        """
-        Returns a host copy of the underlying device scalar.
-        """
-        return self._to_host_scalar()
-
-    cdef const scalar* get_raw_ptr(self) except *:
-        return (<plc_Scalar> self.c_value).c_obj.get()
-
-    cpdef bool is_valid(self):
-        """
-        Returns if the Scalar is valid or not(i.e., <NA>).
-        """
-        return self.c_value.is_valid()
-
-    def __repr__(self):
-        if cudf.utils.utils.is_na_like(self.value):
-            return (
-                f"{self.__class__.__name__}"
-                f"({self.value}, {repr(self.dtype)})"
-            )
-        else:
-            return f"{self.__class__.__name__}({repr(self.value)})"
-
-    @staticmethod
-    cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=None):
-        """
-        Construct a Scalar object from a unique_ptr.
- """ - cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) - # Note: This line requires pylibcudf to be cimported - s.c_value = plc_Scalar.from_libcudf(move(ptr)) - s._set_dtype(dtype) - return s - - @staticmethod - def from_pylibcudf(pscalar, dtype=None): - cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) - s.c_value = pscalar - s._set_dtype(dtype) - return s - - cdef void _set_dtype(self, dtype=None): - cdtype_id = self.c_value.type().id() - if dtype is not None: - self._dtype = dtype - elif cdtype_id in { - plc.TypeID.DECIMAL32, - plc.TypeID.DECIMAL64, - plc.TypeID.DECIMAL128, - }: - raise TypeError( - "Must pass a dtype when constructing from a fixed-point scalar" - ) - elif cdtype_id == plc.TypeID.STRUCT: - self._dtype = StructDtype.from_arrow( - plc.interop.to_arrow(self.c_value).type - ) - elif cdtype_id == plc.TypeID.LIST: - self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) - else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index cad4b1aa72c..35eb25e2a32 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -16,6 +16,8 @@ import pyarrow as pa from pandas.api import types as pd_types +import pylibcudf as plc + import cudf from cudf.core._compat import PANDAS_LT_300 from cudf.core.dtypes import ( # noqa: F401 @@ -143,8 +145,8 @@ def is_scalar(val): val, ( cudf.Scalar, - cudf._lib.scalar.DeviceScalar, cudf.core.tools.datetimes.DateOffset, + plc.Scalar, pa.Scalar, ), ) or ( diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 5f90439f86f..657985dab5e 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -12,12 +12,7 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_scalar -from cudf.core._internals import copying -from cudf.core._internals.stream_compaction import ( - apply_boolean_mask, - drop_duplicates, - drop_nulls, -) +from cudf.core._internals import copying, stream_compaction from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column from cudf.core.copy_types import GatherMap @@ -1945,7 +1940,7 @@ def drop_duplicates( # This utilizes the fact that all `Index` is also a `Frame`. # Except RangeIndex. 
return self._from_columns_like_self( - drop_duplicates( + stream_compaction.drop_duplicates( list(self._columns), keep=keep, nulls_are_equal=nulls_are_equal, @@ -2032,7 +2027,7 @@ def dropna(self, how="any"): data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( - drop_nulls( + stream_compaction.drop_nulls( data_columns, how=how, ), @@ -2103,7 +2098,9 @@ def _apply_boolean_mask(self, boolean_mask): raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( - apply_boolean_mask(list(self._columns), boolean_mask), + stream_compaction.apply_boolean_mask( + list(self._columns), boolean_mask + ), column_names=self._column_names, ) diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py index a9023f8fd59..4ad873b9825 100644 --- a/python/cudf/cudf/core/_internals/binaryop.py +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -50,10 +50,10 @@ def binaryop( plc.binaryop.binary_operation( lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column) - else lhs.device_value.c_value, + else lhs.device_value, rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column) - else rhs.device_value.c_value, + else rhs.device_value, plc.binaryop.BinaryOperator[op], dtype_to_pylibcudf_type(dtype), ) diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py index 34c1850cb72..76122f89445 100644 --- a/python/cudf/cudf/core/_internals/copying.py +++ b/python/cudf/cudf/core/_internals/copying.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING @@ -67,7 +67,7 @@ def scatter( plc_tbl = plc.copying.scatter( plc.Table([col.to_pylibcudf(mode="read") for col in sources]) # type: ignore[union-attr] if isinstance(sources[0], cudf._lib.column.Column) - else [slr.device_value.c_value for slr in sources], # type: ignore[union-attr] + else [slr.device_value for slr in sources], # type: ignore[union-attr] scatter_map.to_pylibcudf(mode="read"), plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]), ) diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py deleted file mode 100644 index c45c4a1b5cf..00000000000 --- a/python/cudf/cudf/core/_internals/unary.py +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
-from __future__ import annotations - -from typing import TYPE_CHECKING - -import pylibcudf as plc - -from cudf.api.types import is_decimal_dtype -from cudf.core.buffer import acquire_spill_lock -from cudf.utils.dtypes import dtype_to_pylibcudf_type - -if TYPE_CHECKING: - from cudf._typing import Dtype - from cudf.core.column import ColumnBase - - -@acquire_spill_lock() -def unary_operation( - col: ColumnBase, op: plc.unary.UnaryOperator -) -> ColumnBase: - return type(col).from_pylibcudf( - plc.unary.unary_operation(col.to_pylibcudf(mode="read"), op) - ) - - -@acquire_spill_lock() -def is_null(col: ColumnBase) -> ColumnBase: - return type(col).from_pylibcudf( - plc.unary.is_null(col.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_valid(col: ColumnBase) -> ColumnBase: - return type(col).from_pylibcudf( - plc.unary.is_valid(col.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def cast(col: ColumnBase, dtype: Dtype) -> ColumnBase: - result = type(col).from_pylibcudf( - plc.unary.cast( - col.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype) - ) - ) - - if is_decimal_dtype(result.dtype): - result.dtype.precision = dtype.precision # type: ignore[union-attr] - return result - - -@acquire_spill_lock() -def is_nan(col: ColumnBase) -> ColumnBase: - return type(col).from_pylibcudf( - plc.unary.is_nan(col.to_pylibcudf(mode="read")) - ) - - -@acquire_spill_lock() -def is_non_nan(col: ColumnBase) -> ColumnBase: - return type(col).from_pylibcudf( - plc.unary.is_not_nan(col.to_pylibcudf(mode="read")) - ) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b9d6c0e7f08..6f241a50a4e 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -12,7 +12,6 @@ from typing_extensions import Self import cudf -from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -126,7 +125,7 @@ def __init__(self, parent: SeriesOrSingleColumnIndex): super().__init__(parent=parent) @property - def categories(self) -> "cudf.core.index.Index": + def categories(self) -> cudf.Index: """ The categories of this categorical. """ @@ -608,7 +607,7 @@ def children(self) -> tuple[NumericalColumn]: @property def categories(self) -> ColumnBase: - return self.dtype.categories._values + return self.dtype.categories._column @property def codes(self) -> NumericalColumn: @@ -1010,12 +1009,12 @@ def isnull(self) -> ColumnBase: """ Identify missing values in a CategoricalColumn. """ - result = unary.is_null(self) + result = super().isnull() if self.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = unary.is_nan(self.categories) + categories = self.categories.isnan() if categories.any(): code = self._encode(np.nan) result = result | (self.codes == cudf.Scalar(code)) @@ -1026,12 +1025,12 @@ def notnull(self) -> ColumnBase: """ Identify non-missing values in a CategoricalColumn. 
""" - result = unary.is_valid(self) + result = super().is_valid() if self.categories.dtype.kind == "f": # Need to consider `np.nan` values in case # of an underlying float column - categories = unary.is_nan(self.categories) + categories = self.categories.isnan() if categories.any(): code = self._encode(np.nan) result = result & (self.codes != cudf.Scalar(code)) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 7438d526919..7c9ed0a911e 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -23,22 +23,23 @@ import rmm import cudf -from cudf import _lib as libcudf from cudf._lib.column import Column from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, + is_decimal_dtype, is_dtype_equal, is_scalar, is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 -from cudf.core._internals import aggregation, copying, sorting, unary -from cudf.core._internals.stream_compaction import ( - apply_boolean_mask, - drop_duplicates, - drop_nulls, +from cudf.core._internals import ( + aggregation, + copying, + search, + sorting, + stream_compaction, ) from cudf.core._internals.timezones import get_compatible_timezone from cudf.core.abc import Serializable @@ -294,7 +295,9 @@ def any(self, skipna: bool = True) -> bool: def dropna(self) -> Self: if self.has_nulls(): - return drop_nulls([self])[0]._with_type_metadata(self.dtype) # type: ignore[return-value] + return stream_compaction.drop_nulls([self])[0]._with_type_metadata( + self.dtype + ) # type: ignore[return-value] else: return self.copy() @@ -437,7 +440,7 @@ def _fill( self.to_pylibcudf(mode="read"), begin, end, - slr.device_value.c_value, + slr.device_value, ) ) if is_string_dtype(self.dtype): @@ -457,7 +460,7 @@ def _fill( self.to_pylibcudf(mode="write"), begin, end, - slr.device_value.c_value, + slr.device_value, ) return self @@ -468,7 +471,7 @@ def shift(self, offset: int, fill_value: ScalarLike) -> Self: plc_col = plc.copying.shift( self.to_pylibcudf(mode="read"), offset, - fill_value.device_value.c_value, + fill_value.device_value, ) return type(self).from_pylibcudf(plc_col) # type: ignore[return-value] @@ -584,14 +587,11 @@ def element_indexing(self, index: int): if idx > len(self) - 1 or idx < 0: raise IndexError("single positional indexer is out-of-bounds") with acquire_spill_lock(): - dscalar = libcudf.scalar.DeviceScalar.from_pylibcudf( - plc.copying.get_element( - self.to_pylibcudf(mode="read"), - idx, - ), - dtype=self.dtype, + plc_scalar = plc.copying.get_element( + self.to_pylibcudf(mode="read"), + idx, ) - return dscalar.value + return cudf.Scalar.from_pylibcudf(plc_scalar).value def slice(self, start: int, stop: int, stride: int | None = None) -> Self: stride = 1 if stride is None else stride @@ -706,6 +706,7 @@ def _scatter_by_column( self, key: cudf.core.column.NumericalColumn, value: cudf.core.scalar.Scalar | ColumnBase, + bounds_check: bool = True, ) -> Self: if key.dtype.kind == "b": # `key` is boolean mask @@ -737,7 +738,7 @@ def _scatter_by_column( plc_table = plc.copying.boolean_mask_scatter( plc.Table([value.to_pylibcudf(mode="read")]) if isinstance(value, Column) - else [value.device_value.c_value], + else [value.device_value], plc.Table([self.to_pylibcudf(mode="read")]), key.to_pylibcudf(mode="read"), ) @@ -747,9 +748,9 @@ def _scatter_by_column( ._with_type_metadata(self.dtype) ) else: - return copying.scatter([value], key, [self])[ - 0 - ]._with_type_metadata(self.dtype) 
+ return copying.scatter( + [value], key, [self], bounds_check=bounds_check + )[0]._with_type_metadata(self.dtype) def _check_scatter_key_length( self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase @@ -817,7 +818,7 @@ def fillna( else plc.replace.ReplacePolicy.FOLLOWING ) elif is_scalar(fill_value): - plc_replace = cudf.Scalar(fill_value).device_value.c_value + plc_replace = cudf.Scalar(fill_value).device_value else: plc_replace = fill_value.to_pylibcudf(mode="read") plc_column = plc.replace.replace_nulls( @@ -827,17 +828,45 @@ def fillna( result = type(self).from_pylibcudf(plc_column) return result._with_type_metadata(self.dtype) # type: ignore[return-value] + @acquire_spill_lock() + def is_valid(self) -> ColumnBase: + """Identify non-null values""" + return type(self).from_pylibcudf( + plc.unary.is_valid(self.to_pylibcudf(mode="read")) + ) + + def isnan(self) -> ColumnBase: + """Identify NaN values in a Column.""" + if self.dtype.kind != "f": + return as_column(False, length=len(self)) + with acquire_spill_lock(): + return type(self).from_pylibcudf( + plc.unary.is_nan(self.to_pylibcudf(mode="read")) + ) + + def notnan(self) -> ColumnBase: + """Identify non-NaN values in a Column.""" + if self.dtype.kind != "f": + return as_column(True, length=len(self)) + with acquire_spill_lock(): + return type(self).from_pylibcudf( + plc.unary.is_not_nan(self.to_pylibcudf(mode="read")) + ) + def isnull(self) -> ColumnBase: """Identify missing values in a Column.""" if not self.has_nulls(include_nan=self.dtype.kind == "f"): return as_column(False, length=len(self)) - result = unary.is_null(self) + with acquire_spill_lock(): + result = type(self).from_pylibcudf( + plc.unary.is_null(self.to_pylibcudf(mode="read")) + ) if self.dtype.kind == "f": # Need to consider `np.nan` values in case # of a float column - result = result | unary.is_nan(self) + result = result | self.isnan() return result @@ -846,15 +875,22 @@ def notnull(self) -> ColumnBase: if not self.has_nulls(include_nan=self.dtype.kind == "f"): return as_column(True, length=len(self)) - result = unary.is_valid(self) + with acquire_spill_lock(): + result = type(self).from_pylibcudf( + plc.unary.is_valid(self.to_pylibcudf(mode="read")) + ) if self.dtype.kind == "f": # Need to consider `np.nan` values in case # of a float column - result = result & unary.is_non_nan(self) + result = result & self.notnan() return result + @cached_property + def nan_count(self) -> int: + return 0 + def indices_of( self, value: ScalarLike ) -> cudf.core.column.NumericalColumn: @@ -875,9 +911,9 @@ def indices_of( else: value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) - return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask - )[0] + return as_column( + range(len(self)), dtype=SIZE_TYPE_DTYPE + ).apply_boolean_mask(mask) # type: ignore[return-value] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: indices = self.indices_of(value) @@ -1124,6 +1160,17 @@ def distinct_count(self, dropna: bool = True) -> int: def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() + @acquire_spill_lock() + def cast(self, dtype: Dtype) -> ColumnBase: + result = type(self).from_pylibcudf( + plc.unary.cast( + self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype) + ) + ) + if is_decimal_dtype(result.dtype): + result.dtype.precision = dtype.precision # type: ignore[union-attr] + return result + def astype(self, dtype: Dtype, copy: bool = 
bool = False) -> ColumnBase:
         if len(self) == 0:
             dtype = cudf.dtype(dtype)
@@ -1257,9 +1304,9 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
         if mask.dtype.kind != "b":
             raise ValueError("boolean_mask is not boolean type.")

-        return apply_boolean_mask([self], mask)[0]._with_type_metadata(
-            self.dtype
-        )
+        return stream_compaction.apply_boolean_mask([self], mask)[
+            0
+        ]._with_type_metadata(self.dtype)

     def argsort(
         self,
@@ -1332,7 +1379,7 @@ def searchsorted(
             raise ValueError(
                 "Column searchsorted expects values to be column of same dtype"
             )
-        return cudf.core._internals.search.search_sorted(  # type: ignore[return-value]
+        return search.search_sorted(  # type: ignore[return-value]
             [self],
             [value],
             side=side,
@@ -1347,7 +1394,7 @@ def unique(self) -> Self:
         if self.is_unique:
             return self.copy()
         else:
-            return drop_duplicates([self], keep="first")[  # type: ignore[return-value]
+            return stream_compaction.drop_duplicates([self], keep="first")[  # type: ignore[return-value]
                 0
             ]._with_type_metadata(self.dtype)
@@ -1597,7 +1644,7 @@ def copy_if_else(
         return type(self).from_pylibcudf(  # type: ignore[return-value]
             plc.copying.copy_if_else(
                 self.to_pylibcudf(mode="read"),
-                other.device_value.c_value
+                other.device_value
                 if isinstance(other, cudf.Scalar)
                 else other.to_pylibcudf(mode="read"),
                 boolean_mask.to_pylibcudf(mode="read"),
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 1bde7d27700..8b28c372d2f 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -21,8 +21,7 @@
 import cudf.core.column.column as column
 from cudf import _lib as libcudf
 from cudf.core._compat import PANDAS_GE_220
-from cudf.core._internals import binaryop, unary
-from cudf.core._internals.search import search_sorted
+from cudf.core._internals import binaryop
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
     get_compatible_timezone,
@@ -574,7 +573,7 @@ def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
                 "Cannot use .astype to convert from timezone-naive dtype to timezone-aware dtype. "
                 "Use tz_localize instead."
) - return unary.cast(self, dtype=dtype) # type: ignore[return-value] + return self.cast(dtype=dtype) # type: ignore[return-value] def as_timedelta_column(self, dtype: Dtype) -> None: # type: ignore[override] raise TypeError( @@ -958,7 +957,7 @@ def tz_localize( localized.dtype ) indices = ( - search_sorted([transition_times_local], [localized], "right") - 1 + transition_times_local.searchsorted(localized, side="right") - 1 ) offsets_to_utc = offsets.take(indices, nullify=True) gmt_data = localized - offsets_to_utc @@ -1043,8 +1042,14 @@ def _utc_time(self): def _local_time(self): """Return the local time as naive timestamps.""" transition_times, offsets = get_tz_data(str(self.dtype.tz)) - transition_times = transition_times.astype(_get_base_dtype(self.dtype)) - indices = search_sorted([transition_times], [self], "right") - 1 + base_dtype = _get_base_dtype(self.dtype) + transition_times = transition_times.astype(base_dtype) + indices = ( + transition_times.searchsorted( + self.astype(base_dtype), side="right" + ) + - 1 + ) offsets_from_utc = offsets.take(indices, nullify=True) return self + offsets_from_utc diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 09941665ba2..0f233b5bdc4 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -14,7 +14,7 @@ import cudf from cudf.api.types import is_scalar -from cudf.core._internals import binaryop, unary +from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase from cudf.core.column.numerical_base import NumericalBaseColumn @@ -84,7 +84,7 @@ def as_decimal_column( if dtype == self.dtype: return self - return unary.cast(self, dtype) # type: ignore[return-value] + return self.cast(dtype=dtype) # type: ignore[return-value] def as_string_column(self) -> cudf.core.column.StringColumn: if len(self) > 0: @@ -223,8 +223,8 @@ def normalize_binop_value(self, other) -> Self | cudf.Scalar: def as_numerical_column( self, dtype: Dtype - ) -> "cudf.core.column.NumericalColumn": - return unary.cast(self, dtype) # type: ignore[return-value] + ) -> cudf.core.column.NumericalColumn: + return self.cast(dtype=dtype) # type: ignore[return-value] class Decimal32Column(DecimalBaseColumn): diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index e7e69961db4..2b834a20726 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -111,6 +111,13 @@ def memory_usage(self): ) return n + def element_indexing(self, index: int) -> list: + result = super().element_indexing(index) + if isinstance(result, list): + return self.dtype._recursively_replace_fields(result) + else: + return result + def __setitem__(self, key, value): if isinstance(value, list): value = cudf.Scalar(value) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 3713d64f4aa..f901b5d735e 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -16,7 +16,7 @@ import cudf import cudf.core.column.column as column from cudf.api.types import is_integer, is_scalar -from cudf.core._internals import binaryop, unary +from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer 
from cudf.core.column.column import ColumnBase, as_column from cudf.core.column.numerical_base import NumericalBaseColumn @@ -43,6 +43,7 @@ ScalarLike, ) from cudf.core.buffer import Buffer + from cudf.core.column import DecimalBaseColumn _unaryop_map = { "ASIN": "ARCSIN", @@ -130,8 +131,7 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn: and self.dtype.kind in {"c", "f"} and np.isnan(value) ): - nan_col = unary.is_nan(self) - return nan_col.indices_of(True) + return self.isnan().indices_of(True) else: return super().indices_of(value) @@ -203,7 +203,12 @@ def unary_operator(self, unaryop: str | Callable) -> ColumnBase: unaryop = unaryop.upper() unaryop = _unaryop_map.get(unaryop, unaryop) unaryop = plc.unary.UnaryOperator[unaryop] - return unary.unary_operation(self, unaryop) + with acquire_spill_lock(): + return type(self).from_pylibcudf( + plc.unary.unary_operation( + self.to_pylibcudf(mode="read"), unaryop + ) + ) def __invert__(self): if self.dtype.kind in "ui": @@ -421,16 +426,14 @@ def as_timedelta_column( size=self.size, ) - def as_decimal_column( - self, dtype: Dtype - ) -> "cudf.core.column.DecimalBaseColumn": - return unary.cast(self, dtype) # type: ignore[return-value] + def as_decimal_column(self, dtype: Dtype) -> DecimalBaseColumn: + return self.cast(dtype=dtype) # type: ignore[return-value] def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: dtype = cudf.dtype(dtype) if dtype == self.dtype: return self - return unary.cast(self, dtype) # type: ignore[return-value] + return self.cast(dtype=dtype) # type: ignore[return-value] def all(self, skipna: bool = True) -> bool: # If all entries are null the result is True, including when the column @@ -446,9 +449,8 @@ def any(self, skipna: bool = True) -> bool: @functools.cached_property def nan_count(self) -> int: if self.dtype.kind != "f": - return 0 - nan_col = unary.is_nan(self) - return nan_col.sum() + return super().nan_count + return self.isnan().sum() def _process_values_for_isin( self, values: Sequence diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 689d5132d45..d8c316a4c8f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
"""Define an interface for columns that can perform numerical operations.""" from __future__ import annotations @@ -10,7 +10,6 @@ import pylibcudf as plc import cudf -from cudf.core._internals import sorting from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.missing import NA @@ -145,9 +144,11 @@ def quantile( else: no_nans = self.nans_to_nulls() # get sorted indices and exclude nulls - indices = sorting.order_by( - [no_nans], [True], "first", stable=True - ).slice(no_nans.null_count, len(no_nans)) + indices = ( + no_nans.argsort(ascending=True, na_position="first") + .slice(no_nans.null_count, len(no_nans)) + .astype(np.dtype(np.int32)) + ) with acquire_spill_lock(): plc_column = plc.quantiles.quantile( no_nans.to_pylibcudf(mode="read"), diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d3cbf3bd695..54b42b1f6de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1068,6 +1068,10 @@ def replace( if regex and isinstance(pat, re.Pattern): pat = pat.pattern + pa_repl = pa.scalar(repl) + if not pa.types.is_string(pa_repl.type): + raise TypeError(f"repl must be a str, not {type(repl).__name__}.") + # Pandas forces non-regex replace when pat is a single-character with acquire_spill_lock(): if regex is True and len(pat) > 1: @@ -1076,14 +1080,14 @@ def replace( plc.strings.regex_program.RegexProgram.create( pat, plc.strings.regex_flags.RegexFlags.DEFAULT ), - pa_scalar_to_plc_scalar(pa.scalar(repl)), + pa_scalar_to_plc_scalar(pa_repl), n, ) else: plc_result = plc.strings.replace.replace( self._column.to_pylibcudf(mode="read"), pa_scalar_to_plc_scalar(pa.scalar(pat)), - pa_scalar_to_plc_scalar(pa.scalar(repl)), + pa_scalar_to_plc_scalar(pa_repl), n, ) result = Column.from_pylibcudf(plc_result) @@ -2416,13 +2420,19 @@ def get(self, i: int = 0) -> SeriesOrIndex: 2 f dtype: object """ + str_lens = self.len() if i < 0: next_index = i - 1 step = -1 + to_mask = str_lens < abs(i) # type: ignore[operator] else: next_index = i + 1 step = 1 - return self.slice(i, next_index, step) + to_mask = str_lens <= i # type: ignore[operator] + result = self.slice(i, next_index, step) + if to_mask.any(): # type: ignore[union-attr] + result[to_mask] = cudf.NA # type: ignore[index] + return result def get_json_object( self, @@ -3933,19 +3943,18 @@ def isspace(self) -> SeriesOrIndex: def _starts_ends_with( self, method: Callable[[plc.Column, plc.Column | plc.Scalar], plc.Column], - pat: str | Sequence, + pat: str | tuple[str, ...], ) -> SeriesOrIndex: - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): + if isinstance(pat, str): plc_pat = pa_scalar_to_plc_scalar(pa.scalar(pat, type=pa.string())) - else: + elif isinstance(pat, tuple) and all(isinstance(p, str) for p in pat): plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( mode="read" ) + else: + raise TypeError( + f"expected a string or tuple, not {type(pat).__name__}" + ) with acquire_spill_lock(): plc_result = method( self._column.to_pylibcudf(mode="read"), plc_pat @@ -3953,7 +3962,7 @@ def _starts_ends_with( result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) - def endswith(self, pat: str | Sequence) -> SeriesOrIndex: + def endswith(self, pat: str | tuple[str, ...]) -> SeriesOrIndex: """ Test if the end of each string element matches a pattern. 
@@ -3997,7 +4006,7 @@ def endswith(self, pat: str | Sequence) -> SeriesOrIndex: """ return self._starts_ends_with(plc.strings.find.ends_with, pat) - def startswith(self, pat: str | Sequence) -> SeriesOrIndex: + def startswith(self, pat: str | tuple[str, ...]) -> SeriesOrIndex: """ Test if the start of each string element matches a pattern. @@ -4299,6 +4308,8 @@ def index( if (result == -1).any(): raise ValueError("substring not found") + elif cudf.get_option("mode.pandas_compatible"): + return result.astype(np.dtype(np.int64)) else: return result @@ -4359,6 +4370,8 @@ def rindex( if (result == -1).any(): raise ValueError("substring not found") + elif cudf.get_option("mode.pandas_compatible"): + return result.astype(np.dtype(np.int64)) else: return result diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 052a68cec98..2e10166295b 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -120,7 +120,7 @@ def memory_usage(self) -> int: def element_indexing(self, index: int) -> dict: result = super().element_indexing(index) - return dict(zip(self.dtype.fields, result.values())) + return self.dtype._recursively_replace_fields(result) def __setitem__(self, key, value): if isinstance(value, dict): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 302178ea277..3f5aea19307 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -16,7 +16,7 @@ import cudf import cudf.core.column.column as column from cudf.api.types import is_scalar -from cudf.core._internals import binaryop, unary +from cudf.core._internals import binaryop from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.utils.dtypes import np_to_pa_dtype @@ -322,7 +322,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn: def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn: if dtype == self.dtype: return self - return unary.cast(self, dtype=dtype) # type: ignore[return-value] + return self.cast(dtype=dtype) # type: ignore[return-value] def find_and_replace( self, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d4d633f9bcc..08d3d73034f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6409,7 +6409,20 @@ def count(self, axis=0, numeric_only=False): raise NotImplementedError("Only axis=0 is currently supported.") length = len(self) return Series._from_column( - as_column([length - col.null_count for col in self._columns]), + as_column( + [ + length + - ( + col.null_count + + ( + col.nan_count + if cudf.get_option("mode.pandas_compatible") + else 0 + ) + ) + for col in self._columns + ] + ), index=cudf.Index(self._column_names), ) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index ce7fb968069..32e695b32e3 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -518,6 +518,28 @@ def deserialize(cls, header: dict, frames: list): def itemsize(self): return self.element_type.itemsize + def _recursively_replace_fields(self, result: list) -> list: + """ + Return a new list result but with the keys of any dict elements replaced by the keys in StructDtype.fields.keys(). + + Intended when result comes from pylibcudf without preserved nested field names. 
+ """ + if isinstance(self.element_type, StructDtype): + return [ + self.element_type._recursively_replace_fields(res) + if isinstance(res, dict) + else res + for res in result + ] + elif isinstance(self.element_type, ListDtype): + return [ + self.element_type._recursively_replace_fields(res) + if isinstance(res, list) + else res + for res in result + ] + return result + class StructDtype(_BaseDtype): """ @@ -677,6 +699,26 @@ def itemsize(self): for field in self._typ ) + def _recursively_replace_fields(self, result: dict) -> dict: + """ + Return a new dict result but with the keys replaced by the keys in self.fields.keys(). + + Intended when result comes from pylibcudf without preserved nested field names. + """ + new_result = {} + for (new_field, field_dtype), result_value in zip( + self.fields.items(), result.values() + ): + if isinstance(field_dtype, StructDtype) and isinstance( + result_value, dict + ): + new_result[new_field] = ( + field_dtype._recursively_replace_fields(result_value) + ) + else: + new_result[new_field] = result_value + return new_result + decimal_dtype_template = textwrap.dedent( """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index abf9f7b3686..d8373541e2a 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -22,8 +22,7 @@ from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import copying, sorting -from cudf.core._internals.search import search_sorted +from cudf.core._internals import copying, search, sorting from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( @@ -1350,7 +1349,7 @@ def searchsorted( for val, common_dtype in zip(values, common_dtype_list) ] - outcol = search_sorted( + outcol = search.search_sorted( sources, values, side, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index e69997519e4..081cbce2098 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -19,7 +19,6 @@ import pylibcudf as plc import cudf -import cudf.core._internals from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( @@ -28,7 +27,7 @@ is_string_dtype, ) from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import aggregation, sorting +from cudf.core._internals import aggregation, sorting, stream_compaction from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column, column_empty @@ -593,9 +592,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: ] ) - group_keys = cudf.core._internals.stream_compaction.drop_duplicates( - group_keys - ) + group_keys = stream_compaction.drop_duplicates(group_keys) if len(group_keys) > 1: index = cudf.MultiIndex.from_arrays(group_keys) else: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0d1bf552982..c13d62b39df 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -30,8 +30,7 @@ ) from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import copying -from cudf.core._internals.search import search_sorted +from cudf.core._internals import search from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( 
CategoricalColumn, @@ -123,13 +122,13 @@ def _lexsorted_equal_range( else: sort_inds = None sort_vals = idx - lower_bound = search_sorted( + lower_bound = search.search_sorted( list(sort_vals._columns), keys, side="left", ascending=sort_vals.is_monotonic_increasing, ).element_indexing(0) - upper_bound = search_sorted( + upper_bound = search.search_sorted( list(sort_vals._columns), keys, side="right", @@ -1366,7 +1365,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) indices = libcudf.column.Column.from_pylibcudf(right_plc) - result = copying.scatter([indices], scatter_map, [result])[0] + result = result._scatter_by_column(scatter_map, indices) result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: @@ -3386,8 +3385,8 @@ def interval_range( bin_edges = libcudf.column.Column.from_pylibcudf( plc.filling.sequence( size=periods + 1, - init=start.device_value.c_value, - step=freq.device_value.c_value, + init=start.device_value, + step=freq.device_value, ) ) return IntervalIndex.from_breaks(bin_edges, closed=closed, name=name) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 50e6b42129e..8c32da43c75 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -27,7 +27,6 @@ import cudf import cudf._lib as libcudf import cudf.core -import cudf.core._internals import cudf.core.algorithms from cudf.api.extensions import no_default from cudf.api.types import ( @@ -38,7 +37,7 @@ ) from cudf.core._base_index import BaseIndex from cudf.core._compat import PANDAS_LT_300 -from cudf.core._internals import copying +from cudf.core._internals import copying, stream_compaction from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase, NumericalColumn, as_column from cudf.core.column_accessor import ColumnAccessor @@ -3122,7 +3121,7 @@ def drop_duplicates( subset, offset_by_index_columns=not ignore_index ) return self._from_columns_like_self( - cudf.core._internals.stream_compaction.drop_duplicates( + stream_compaction.drop_duplicates( list(self._columns) if ignore_index else list(self.index._columns + self._columns), @@ -3255,12 +3254,13 @@ def duplicated( plc.types.NanEquality.ALL_EQUAL, ) distinct = libcudf.column.Column.from_pylibcudf(plc_column) - result = copying.scatter( - [cudf.Scalar(False)], + result = as_column( + True, length=len(self), dtype=bool + )._scatter_by_column( distinct, - [as_column(True, length=len(self), dtype=bool)], + cudf.Scalar(False), bounds_check=False, - )[0] + ) return cudf.Series._from_column(result, index=self.index, name=name) @_performance_tracking @@ -4381,7 +4381,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( - cudf.core._internals.stream_compaction.drop_nulls( + stream_compaction.drop_nulls( [*self.index._columns, *data_columns], how=how, keys=self._positions_from_column_names(subset), @@ -4404,7 +4404,7 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): f"{len(boolean_mask.column)} not {len(self)}" ) return self._from_columns_like_self( - cudf.core._internals.stream_compaction.apply_boolean_mask( + stream_compaction.apply_boolean_mask( list(self.index._columns + self._columns) if keep_index else list(self._columns), diff --git a/python/cudf/cudf/core/scalar.py 
b/python/cudf/cudf/core/scalar.py index 6630433c9a3..19b13a8e97d 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -1,19 +1,27 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations +import copy import decimal import functools import operator from collections import OrderedDict +from typing import TYPE_CHECKING, Any import numpy as np +import pandas as pd import pyarrow as pa import pylibcudf as plc import cudf from cudf.api.types import is_scalar -from cudf.core.dtypes import ListDtype, StructDtype +from cudf.core.dtypes import ( + Decimal32Dtype, + Decimal64Dtype, + ListDtype, + StructDtype, +) from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand from cudf.utils.dtypes import ( @@ -21,6 +29,180 @@ to_cudf_compatible_scalar, ) +if TYPE_CHECKING: + from typing_extensions import Self + + from cudf._typing import Dtype, ScalarLike + + +def _preprocess_host_value(value, dtype) -> tuple[ScalarLike, Dtype]: + """ + Preprocess a value and dtype for host-side cudf.Scalar + + Parameters + ---------- + value: Scalarlike + dtype: dtypelike or None + + Returns + ------- + tuple[ScalarLike, Dtype] + """ + valid = not cudf.utils.utils._is_null_host_scalar(value) + + if isinstance(value, list): + if dtype is not None: + raise TypeError("Lists may not be cast to a different dtype") + else: + dtype = ListDtype.from_arrow( + pa.infer_type([value], from_pandas=True) + ) + return value, dtype + elif isinstance(dtype, ListDtype): + if value not in {None, NA}: + raise ValueError(f"Cannot coerce {value} to ListDtype") + else: + return NA, dtype + + if isinstance(value, dict): + if dtype is None: + dtype = StructDtype.from_arrow( + pa.infer_type([value], from_pandas=True) + ) + return value, dtype + elif isinstance(dtype, StructDtype): + if value not in {None, NA}: + raise ValueError(f"Cannot coerce {value} to StructDtype") + else: + return NA, dtype + + if isinstance(dtype, cudf.core.dtypes.DecimalDtype): + value = pa.scalar( + value, type=pa.decimal128(dtype.precision, dtype.scale) + ).as_py() + if isinstance(value, decimal.Decimal) and dtype is None: + dtype = cudf.Decimal128Dtype._from_decimal(value) + + value = to_cudf_compatible_scalar(value, dtype=dtype) + + if dtype is None: + if not valid: + if value is NaT: + value = value.to_numpy() + + if isinstance(value, (np.datetime64, np.timedelta64)): + unit, _ = np.datetime_data(value) + if unit == "generic": + raise TypeError("Can't convert generic NaT to null scalar") + else: + dtype = value.dtype + else: + raise TypeError( + "dtype required when constructing a null scalar" + ) + else: + dtype = value.dtype + + if not isinstance(dtype, cudf.core.dtypes.DecimalDtype): + dtype = cudf.dtype(dtype) + + if not valid: + value = NaT if dtype.kind in "mM" else NA + + return value, dtype + + +def _replace_nested(obj, check, replacement): + if isinstance(obj, list): + for i, item in enumerate(obj): + if check(item): + obj[i] = replacement + elif isinstance(item, (dict, list)): + _replace_nested(item, check, replacement) + elif isinstance(obj, dict): + for k, v in obj.items(): + if check(v): + obj[k] = replacement + elif isinstance(v, (dict, list)): + _replace_nested(v, check, replacement) + + +def _maybe_nested_pa_scalar_to_py(pa_scalar: pa.Scalar) -> Any: + """ + Convert a "nested" pyarrow scalar to a Python object. + + These scalars come from pylibcudf.Scalar where field names can be + duplicate empty strings. 
+ + Parameters + ---------- + pa_scalar: pa.Scalar + + Returns + ------- + Any + Python scalar + """ + if not pa_scalar.is_valid: + return pa_scalar.as_py() + elif pa.types.is_struct(pa_scalar.type): + return { + str(i): _maybe_nested_pa_scalar_to_py(val) + for i, (_, val) in enumerate(pa_scalar.items()) + } + elif pa.types.is_list(pa_scalar.type): + return [_maybe_nested_pa_scalar_to_py(val) for val in pa_scalar] + else: + return pa_scalar.as_py() + + +def _to_plc_scalar(value: ScalarLike, dtype: Dtype) -> plc.Scalar: + """ + Convert a value and dtype to a pylibcudf Scalar for device-side cudf.Scalar + + Parameters + ---------- + value: Scalarlike + dtype: dtypelike + + Returns + ------- + plc.Scalar + """ + if cudf.utils.utils.is_na_like(value): + value = None + else: + # TODO: For now we deepcopy the input value for nested values to avoid + # overwriting the input values when replacing nulls. Since it's + # just host values it's not that expensive, but we could consider + # alternatives. + if isinstance(value, (list, dict)): + value = copy.deepcopy(value) + _replace_nested(value, cudf.utils.utils.is_na_like, None) + + if isinstance(dtype, cudf.core.dtypes._BaseDtype): + pa_type = dtype.to_arrow() + elif pd.api.types.is_string_dtype(dtype): + # Have to manually convert object types, which we use internally + # for strings but pyarrow only supports as unicode 'U' + pa_type = pa.string() + else: + pa_type = pa.from_numpy_dtype(dtype) + + pa_scalar = pa.scalar(value, type=pa_type) + plc_scalar = plc.interop.from_arrow(pa_scalar) + if isinstance(dtype, (Decimal32Dtype, Decimal64Dtype)): + # pyarrow only supports decimal128 + if isinstance(dtype, Decimal32Dtype): + plc_type = plc.DataType(plc.TypeId.DECIMAL32, -dtype.scale) + elif isinstance(dtype, Decimal64Dtype): + plc_type = plc.DataType(plc.TypeId.DECIMAL64, -dtype.scale) + plc_column = plc.unary.cast( + plc.Column.from_scalar(plc_scalar, 1), plc_type + ) + plc_scalar = plc.copying.get_element(plc_column, 0) + return plc_scalar + @functools.lru_cache(maxsize=128) def pa_scalar_to_plc_scalar(pa_scalar: pa.Scalar) -> plc.Scalar: @@ -138,7 +320,7 @@ class Scalar(BinaryOperand, metaclass=CachedScalarInstanceMeta): def __init__(self, value, dtype=None): self._host_value = None self._host_dtype = None - self._device_value = None + self._device_value: plc.Scalar | None = None if isinstance(value, Scalar): if value._is_host_value_current: @@ -147,37 +329,34 @@ def __init__(self, value, dtype=None): else: self._device_value = value._device_value else: - self._host_value, self._host_dtype = self._preprocess_host_value( + self._host_value, self._host_dtype = _preprocess_host_value( value, dtype ) @classmethod - def from_device_scalar(cls, device_scalar): - if not isinstance(device_scalar, cudf._lib.scalar.DeviceScalar): + def from_pylibcudf(cls, scalar: plc.Scalar) -> Self: + if not isinstance(scalar, plc.Scalar): raise TypeError( - "Expected an instance of DeviceScalar, " - f"got {type(device_scalar).__name__}" + "Expected an instance of pylibcudf.Scalar, " + f"got {type(scalar).__name__}" ) obj = object.__new__(cls) obj._host_value = None obj._host_dtype = None - obj._device_value = device_scalar + obj._device_value = scalar return obj @property - def _is_host_value_current(self): + def _is_host_value_current(self) -> bool: return self._host_value is not None @property - def _is_device_value_current(self): + def _is_device_value_current(self) -> bool: return self._device_value is not None @property - def device_value(self): - if self._device_value 
is None: - self._device_value = cudf._lib.scalar.DeviceScalar( - self._host_value, self._host_dtype - ) + def device_value(self) -> plc.Scalar: + self._sync() return self._device_value @property @@ -186,92 +365,55 @@ def value(self): self._device_value_to_host() return self._host_value - # todo: change to cached property + # TODO: change to @functools.cached_property @property def dtype(self): - if self._is_host_value_current: - if isinstance(self._host_value, str): - return cudf.dtype("object") - else: - return self._host_dtype - else: - return self.device_value.dtype + if self._host_dtype is not None: + return self._host_dtype + if not self._is_host_value_current: + self._device_value_to_host() + _, host_dtype = _preprocess_host_value(self._host_value, None) + self._host_dtype = host_dtype + return self._host_dtype - def is_valid(self): + def is_valid(self) -> bool: if not self._is_host_value_current: self._device_value_to_host() return not cudf.utils.utils._is_null_host_scalar(self._host_value) - def _device_value_to_host(self): - self._host_value = self._device_value._to_host_scalar() - - def _preprocess_host_value(self, value, dtype): - valid = not cudf.utils.utils._is_null_host_scalar(value) - - if isinstance(value, list): - if dtype is not None: - raise TypeError("Lists may not be cast to a different dtype") - else: - dtype = ListDtype.from_arrow( - pa.infer_type([value], from_pandas=True) - ) - return value, dtype - elif isinstance(dtype, ListDtype): - if value not in {None, NA}: - raise ValueError(f"Can not coerce {value} to ListDtype") + def _device_value_to_host(self) -> None: + ps = plc.interop.to_arrow(self._device_value) + is_datetime = pa.types.is_timestamp(ps.type) + is_timedelta = pa.types.is_duration(ps.type) + if not ps.is_valid: + if is_datetime or is_timedelta: + self._host_value = NaT else: - return NA, dtype - - if isinstance(value, dict): - if dtype is None: - dtype = StructDtype.from_arrow( - pa.infer_type([value], from_pandas=True) - ) - return value, dtype - elif isinstance(dtype, StructDtype): - if value not in {None, NA}: - raise ValueError(f"Can not coerce {value} to StructDType") - else: - return NA, dtype - - if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - value = pa.scalar( - value, type=pa.decimal128(dtype.precision, dtype.scale) - ).as_py() - if isinstance(value, decimal.Decimal) and dtype is None: - dtype = cudf.Decimal128Dtype._from_decimal(value) - - value = to_cudf_compatible_scalar(value, dtype=dtype) - - if dtype is None: - if not valid: - if value is NaT: - value = value.to_numpy() - - if isinstance(value, (np.datetime64, np.timedelta64)): - unit, _ = np.datetime_data(value) - if unit == "generic": - raise TypeError( - "Cant convert generic NaT to null scalar" - ) - else: - dtype = value.dtype - else: - raise TypeError( - "dtype required when constructing a null scalar" - ) + self._host_value = NA + else: + # TODO: The special handling of specific types below does not currently + # extend to nested types containing those types (e.g. List[timedelta] + # where the timedelta would overflow). We should eventually account for + # those cases, but that will require more careful consideration of how + # to traverse the contents of the nested data. 
+ if is_datetime or is_timedelta: + time_unit = ps.type.unit + # Cast to int64 to avoid overflow + ps_cast = ps.cast(pa.int64()).as_py() + out_type = np.datetime64 if is_datetime else np.timedelta64 + self._host_value = out_type(ps_cast, time_unit) + elif ( + pa.types.is_integer(ps.type) + or pa.types.is_floating(ps.type) + or pa.types.is_boolean(ps.type) + ): + self._host_value = ps.type.to_pandas_dtype()(ps.as_py()) else: - dtype = value.dtype - - if not isinstance(dtype, cudf.core.dtypes.DecimalDtype): - dtype = cudf.dtype(dtype) + host_value = _maybe_nested_pa_scalar_to_py(ps) + _replace_nested(host_value, lambda item: item is None, NA) + self._host_value = host_value - if not valid: - value = NaT if dtype.kind in "mM" else NA - - return value, dtype - - def _sync(self): + def _sync(self) -> None: """ If the cache is not synched, copy either the device or host value to the host or device respectively. If cache is valid, do nothing @@ -279,27 +421,27 @@ def _sync(self): if self._is_host_value_current and self._is_device_value_current: return elif self._is_host_value_current and not self._is_device_value_current: - self._device_value = cudf._lib.scalar.DeviceScalar( + self._device_value = _to_plc_scalar( self._host_value, self._host_dtype ) elif self._is_device_value_current and not self._is_host_value_current: - self._host_value = self._device_value.value + self._device_value_to_host() self._host_dtype = self._host_value.dtype else: raise ValueError("Invalid cudf.Scalar") - def __index__(self): + def __index__(self) -> int: if self.dtype.kind not in {"u", "i"}: raise TypeError("Only Integer typed scalars may be used in slices") return int(self) - def __int__(self): + def __int__(self) -> int: return int(self.value) - def __float__(self): + def __float__(self) -> float: return float(self.value) - def __bool__(self): + def __bool__(self) -> bool: return bool(self.value) def __round__(self, n): @@ -321,7 +463,7 @@ def __invert__(self): def __neg__(self): return self._scalar_unaop("__neg__") - def __repr__(self): + def __repr__(self) -> str: # str() fixes a numpy bug with NaT # https://github.com/numpy/numpy/issues/17552 return ( @@ -403,13 +545,13 @@ def _unaop_result_type_or_error(self, op): return cudf.dtype("float64") return self.dtype - def _scalar_unaop(self, op): + def _scalar_unaop(self, op) -> None | Self: out_dtype = self._unaop_result_type_or_error(op) if not self.is_valid(): - result = None + return None else: result = self._dispatch_scalar_unaop(op) - return Scalar(result, dtype=out_dtype) + return Scalar(result, dtype=out_dtype) # type: ignore[return-value] def _dispatch_scalar_unaop(self, op): if op == "__floor__": @@ -418,7 +560,7 @@ def _dispatch_scalar_unaop(self, op): return np.ceil(self.value) return getattr(self.value, op)() - def astype(self, dtype): + def astype(self, dtype) -> Self: if self.dtype == dtype: return self - return Scalar(self.value, dtype) + return Scalar(self.value, dtype) # type: ignore[return-value] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c89f0835c54..60e0cd38483 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2712,7 +2712,10 @@ def count(self): Parameters currently not supported is `level`. 
""" - return self.valid_count + valid_count = self.valid_count + if cudf.get_option("mode.pandas_compatible"): + return valid_count - self._column.nan_count + return valid_count @_performance_tracking def mode(self, dropna=True): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 8be336021b1..4ca92be2498 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import math @@ -998,7 +998,7 @@ def date_range( res = libcudf.column.Column.from_pylibcudf( plc.filling.calendrical_month_sequence( periods, - start.device_value.c_value, + start.device_value, months, ) ) diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 6d3dc2dc7d9..93bc6725210 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import warnings @@ -9,7 +9,6 @@ import cudf from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype -from cudf.core._internals import unary from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.index import ensure_index @@ -178,7 +177,7 @@ def to_numeric( downcast_dtype = cudf.dtype(t) if downcast_dtype.itemsize <= col.dtype.itemsize: if col.can_cast_safely(downcast_dtype): - col = unary.cast(col, downcast_dtype) + col = col.cast(downcast_dtype) break if isinstance(arg, (cudf.Series, pd.Series)): diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index e2c332f34f5..dc43f297416 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -14,7 +14,7 @@ import cudf from cudf import _lib as libcudf from cudf.api.types import is_integer, is_number -from cudf.core._internals.aggregation import make_aggregation +from cudf.core._internals import aggregation from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import as_column from cudf.core.mixins import Reducible @@ -307,7 +307,7 @@ def _apply_agg_column(self, source_column, agg_name): pre, fwd, min_periods, - make_aggregation( + aggregation.make_aggregation( agg_name, {"dtype": source_column.dtype} if callable(agg_name) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index a1df2c7d857..9c20a42d215 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -11,7 +11,6 @@ import cudf from cudf.api.types import is_numeric_dtype, is_string_dtype -from cudf.core._internals.unary import is_nan from cudf.core.missing import NA, NaT @@ -250,7 +249,7 @@ def assert_column_equal( left.dtype.kind == right.dtype.kind == "f" ): columns_equal = cp.all( - is_nan(left).values == is_nan(right).values + left.isnan().values == right.isnan().values ) else: columns_equal = left.equals(right) diff --git a/python/cudf/cudf/tests/input_output/test_parquet.py b/python/cudf/cudf/tests/input_output/test_parquet.py index 709b67f48bf..69355e26f70 100644 --- a/python/cudf/cudf/tests/input_output/test_parquet.py +++ b/python/cudf/cudf/tests/input_output/test_parquet.py @@ -8,7 +8,7 @@ from cudf._fuzz_testing.utils import compare_dataframe -def test_parquet_long_list(): +def test_parquet_long_list(tmpdir): # This test generates int and string list columns, where each has a row that is very large. # When generated by the cudf writer these long rows are contained on a single page, # but when generated by pyarrow they span several pages. @@ -41,7 +41,7 @@ def test_parquet_long_list(): ) # Write the table to a parquet file using pyarrow - file_name = "long_row_list_test.pq" + file_name = tmpdir.join("long_row_list_test.pq") # https://arrow.apache.org/docs/python/generated/pyarrow.parquet.write_table.html pq.write_table( generated_table, diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 0712a0de635..ef94b3cd176 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import decimal import operator @@ -3200,7 +3200,7 @@ def set_null_cases(column_l, column_r, case): "lcol,rcol,ans,case", generate_test_null_equals_columnops_data() ) def test_null_equals_columnops(lcol, rcol, ans, case): - assert lcol.equals(rcol).all() == ans + assert lcol.equals(rcol) == ans def test_add_series_to_dataframe(): diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index b1f81edfc54..359660e76a7 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -689,7 +689,6 @@ def test_list_getitem(data): def test_list_scalar_host_construction(data): slr = cudf.Scalar(data) assert slr.value == data - assert slr.device_value.value == data @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index c14fab4040b..ba2bd040c38 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
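The scalar tests below construct device scalars directly from pylibcudf objects. A minimal sketch of the new `cudf.Scalar.from_pylibcudf` entry point, assuming a hypothetical integer value:

import pyarrow as pa
import pylibcudf as plc
import cudf

plc_slr = plc.interop.from_arrow(pa.scalar(42, type=pa.int64()))
s = cudf.Scalar.from_pylibcudf(plc_slr)
assert s._is_device_value_current
assert not s._is_host_value_current
print(s.value)  # 42; the first access materializes the host value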
import datetime import re @@ -145,14 +145,11 @@ def test_scalar_host_initialization(value): def test_scalar_device_initialization(value): column = cudf.Series([value], nan_as_null=False)._column with acquire_spill_lock(): - dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf( - plc.copying.get_element( - column.to_pylibcudf(mode="read"), - 0, - ), - dtype=column.dtype, + dev_slr = plc.copying.get_element( + column.to_pylibcudf(mode="read"), + 0, ) - s = cudf.Scalar.from_device_scalar(dev_slr) + s = cudf.Scalar.from_pylibcudf(dev_slr) assert s._is_device_value_current assert not s._is_host_value_current @@ -172,14 +169,11 @@ def test_scalar_device_initialization_decimal(value, decimal_type): dtype = decimal_type._from_decimal(value) column = cudf.Series([str(value)]).astype(dtype)._column with acquire_spill_lock(): - dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf( - plc.copying.get_element( - column.to_pylibcudf(mode="read"), - 0, - ), - dtype=column.dtype, + dev_slr = plc.copying.get_element( + column.to_pylibcudf(mode="read"), + 0, ) - s = cudf.Scalar.from_device_scalar(dev_slr) + s = cudf.Scalar.from_pylibcudf(dev_slr) assert s._is_device_value_current assert not s._is_host_value_current @@ -387,34 +381,6 @@ def test_scalar_invalid_implicit_conversion(cls, dtype): cls(slr) -@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) -@pytest.mark.parametrize( - "decimal_type", - [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype], -) -def test_device_scalar_direct_construction(value, decimal_type): - value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) - - dtype = ( - value.dtype - if not isinstance(value, Decimal) - else decimal_type._from_decimal(value) - ) - - s = cudf._lib.scalar.DeviceScalar(value, dtype) - - assert s.value == value or np.isnan(s.value) and np.isnan(value) - if isinstance( - dtype, (cudf.Decimal64Dtype, cudf.Decimal128Dtype, cudf.Decimal32Dtype) - ): - assert s.dtype.precision == dtype.precision - assert s.dtype.scale == dtype.scale - elif dtype.char == "U": - assert s.dtype == "object" - else: - assert s.dtype == dtype - - @pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_construct_from_scalar(value): value = cudf.utils.dtypes.to_cudf_compatible_scalar(value) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index d0db35c034a..894f601064a 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -3019,3 +3019,18 @@ def test_roundtrip_series_plc_column(ps): expect = cudf.Series(ps) actual = cudf.Series.from_pylibcudf(*expect.to_pylibcudf()) assert_eq(expect, actual) + + +def test_series_dataframe_count_float(): + gs = cudf.Series([1, 2, 3, None, np.nan, 10], nan_as_null=False) + ps = cudf.Series([1, 2, 3, None, np.nan, 10]) + + with cudf.option_context("mode.pandas_compatible", True): + assert_eq(ps.count(), gs.count()) + assert_eq(ps.to_frame().count(), gs.to_frame().count()) + with cudf.option_context("mode.pandas_compatible", False): + assert_eq(gs.count(), gs.to_pandas(nullable=True).count()) + assert_eq( + gs.to_frame().count(), + gs.to_frame().to_pandas(nullable=True).count(), + ) diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index bdc9e695844..809fedfde7b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
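The string tests below exercise the out-of-bounds masking added to `str.get` earlier in this diff. A minimal sketch with hypothetical data, mirroring pandas `Series.str[i]`:

import cudf

s = cudf.Series(["123", "12", "1"])
print(s.str.get(2))   # ['3', <NA>, <NA>]: too-short strings are masked
print(s.str.get(-2))  # ['2', '1', <NA>]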
import json import re @@ -2048,26 +2048,26 @@ def test_string_starts_ends(data, pat): [ ( ["abc", "xyz", "a", "ab", "123", "097"], - ["abc", "x", "a", "b", "3", "7"], + ("abc", "x", "a", "b", "3", "7"), ), - (["A B", "1.5", "3,000"], ["A ", ".", ","]), - (["23", "³", "⅕", ""], ["23", "³", "⅕", ""]), - ([" ", "\t\r\n ", ""], ["d", "\n ", ""]), + (["A B", "1.5", "3,000"], ("A ", ".", ",")), + (["23", "³", "⅕", ""], ("23", "³", "⅕", "")), + ([" ", "\t\r\n ", ""], ("d", "\n ", "")), ( ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["$", "$", "a", "<", "(", "#"], + ("$", "$", "a", "<", "(", "#"), ), ( ["line to be wrapped", "another line to be wrapped"], - ["another", "wrapped"], + ("another", "wrapped"), ), ( ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["hsdjfk", None, "ll", "+", "-", "w", "-", "én"], + ("hsdjfk", "", "ll", "+", "-", "w", "-", "én"), ), ( ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ("1. Ant. ", "2. Bee!\n", "3. Cat?\t", ""), ), ], ) @@ -3539,3 +3539,39 @@ def test_string_reduction_error(): lfunc_args_and_kwargs=([], {"skipna": False}), rfunc_args_and_kwargs=([], {"skipna": False}), ) + + +def test_getitem_out_of_bounds(): + data = ["123", "12", "1"] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data) + expected = pd_ser.str[2] + result = cudf_ser.str[2] + assert_eq(result, expected) + + expected = pd_ser.str[-2] + result = cudf_ser.str[-2] + assert_eq(result, expected) + + +@pytest.mark.parametrize("method", ["startswith", "endswith"]) +@pytest.mark.parametrize("pat", [None, (1, 2), pd.Series([1])]) +def test_startsendwith_invalid_pat(method, pat): + ser = cudf.Series(["1"]) + with pytest.raises(TypeError): + getattr(ser.str, method)(pat) + + +@pytest.mark.parametrize("method", ["rindex", "index"]) +def test_index_int64_pandas_compat(method): + data = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"] + with cudf.option_context("mode.pandas_compatible", True): + result = getattr(cudf.Series(data).str, method)("E", 4, 8) + expected = getattr(pd.Series(data).str, method)("E", 4, 8) + assert_eq(result, expected) + + +def test_replace_invalid_scalar_repl(): + ser = cudf.Series(["1"]) + with pytest.raises(TypeError): + ser.str.replace("1", 2) diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index b85943626a6..e7fca63d980 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -6,7 +6,6 @@ import pytest import cudf -from cudf.core.dtypes import StructDtype from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES @@ -161,7 +160,6 @@ def test_struct_setitem(data, item): def test_struct_scalar_host_construction(data): slr = cudf.Scalar(data) assert slr.value == data - assert list(slr.device_value.value.values()) == list(data.values()) @pytest.mark.parametrize( @@ -194,12 +192,11 @@ def test_struct_scalar_host_construction_no_dtype_inference(data, dtype): # is empty. 
slr = cudf.Scalar(data, dtype=dtype) assert slr.value == data - assert list(slr.device_value.value.values()) == list(data.values()) def test_struct_scalar_null(): - slr = cudf.Scalar(cudf.NA, dtype=StructDtype) - assert slr.device_value.value is cudf.NA + slr = cudf.Scalar(cudf.NA, dtype=cudf.StructDtype) + assert cudf.Scalar.from_pylibcudf(slr.device_value).value is cudf.NA def test_struct_explode():
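Taken together, the struct, list, and dtype changes above route element extraction through `_recursively_replace_fields`, so nested results keep their original field names even though pylibcudf does not preserve them. A closing sketch with hypothetical data:

import cudf

s = cudf.Series([{"a": {"b": 1}}, {"a": {"b": 2}}])
elem = s[0]  # StructColumn.element_indexing
print(elem)  # {'a': {'b': 1}}: keys restored from StructDtype.fields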