From 51fd01871e813fde3ce3a4bb68f622812aa5fefa Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Jan 2025 16:33:42 -0800 Subject: [PATCH 1/4] Add pylibcudf.null_mask.null_count --- python/cudf/cudf/_lib/column.pxd | 3 --- python/cudf/cudf/_lib/column.pyx | 22 +++++++----------- python/pylibcudf/pylibcudf/null_mask.pxd | 4 +++- python/pylibcudf/pylibcudf/null_mask.pyi | 1 + python/pylibcudf/pylibcudf/null_mask.pyx | 29 +++++++++++++++++++++++- 5 files changed, 40 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 026c12895e8..58745d91fc0 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -13,7 +13,6 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer -cdef dtype_from_lists_column_view(column_view cv) cdef dtype_from_column_view(column_view cv) cdef class Column: @@ -42,5 +41,3 @@ cdef class Column: @staticmethod cdef Column from_column_view(column_view, object) - - cdef size_type compute_null_count(self) except? 0 diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index c59bbc0f40c..cf7122fd15b 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -43,7 +43,6 @@ from pylibcudf.libcudf.column.column_factories cimport ( ) from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar @@ -346,7 +345,14 @@ cdef class Column: @property def null_count(self): if self._null_count is None: - self._null_count = self.compute_null_count() + if not self.nullable or self.size == 0: + return 0 + with acquire_spill_lock(): + self._null_count = pylibcudf.null_mask.null_count( + self.base_mask.get_ptr(mode="read"), + self.offset, + self.offset + self.size + ) return self._null_count @property @@ -410,18 +416,6 @@ cdef class Column: else: return other_col - cdef libcudf_types.size_type compute_null_count(self) except? 0: - with acquire_spill_lock(): - if not self.nullable: - return 0 - return cpp_null_count( - ( - self.base_mask.get_ptr(mode="read") - ), - self.offset, - self.offset + self.size - ) - cdef mutable_column_view mutable_view(self) except *: if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index 9bdfaee2842..779a5aed306 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.libcudf.types cimport mask_state, size_type @@ -16,3 +16,5 @@ cpdef DeviceBuffer create_null_mask(size_type size, mask_state state = *) cpdef tuple bitmask_and(list columns) cpdef tuple bitmask_or(list columns) + +cpdef size_type null_count(Py_ssize_t bitmask, size_type start, size_type stop) diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi index 1a6d96a0822..ace18582bd1 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyi +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -12,3 +12,4 @@ def create_null_mask( ) -> DeviceBuffer: ... def bitmask_and(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... def bitmask_or(columns: list[Column]) -> tuple[DeviceBuffer, int]: ... +def null_count(bitmask: int, start: int, stop: int) -> int: ... diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index adc264e9af6..9e7c41087f4 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -1,10 +1,11 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp.memory cimport make_unique from libcpp.pair cimport pair from libcpp.utility cimport move from pylibcudf.libcudf cimport null_mask as cpp_null_mask from pylibcudf.libcudf.types cimport mask_state, size_type +from pylibcudf.utils cimport int_to_bitmask_ptr from rmm.librmm.device_buffer cimport device_buffer from rmm.pylibrmm.device_buffer cimport DeviceBuffer @@ -20,6 +21,7 @@ __all__ = [ "bitmask_or", "copy_bitmask", "create_null_mask", + "null_count", ] cdef DeviceBuffer buffer_to_python(device_buffer buf): @@ -148,3 +150,28 @@ cpdef tuple bitmask_or(list columns): c_result = cpp_null_mask.bitmask_or(c_table.view()) return buffer_to_python(move(c_result.first)), c_result.second + + +cpdef size_type null_count(Py_ssize_t bitmask, size_type start, size_type stop): + """Given a validity bitmask, counts the number of null elements. + + For details, see :cpp:func:`null_count`. + + Parameters + ---------- + bitmask : int + Integer pointer to the bitmask. + + start : int + Index of the first bit to count (inclusive). + + stop : int + Index of the last bit to count (exclusive). + + Returns + ------- + int + The number of null elements in the specified range. + """ + with nogil: + return cpp_null_mask.null_count(int_to_bitmask_ptr(bitmask), start, stop) From ec19d52f62782bf8a3af12ff30c7140a2bbe98e6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Jan 2025 09:51:18 -0800 Subject: [PATCH 2/4] cache null count for short circuit case --- python/cudf/cudf/_lib/column.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index cf7122fd15b..7e313d64789 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -346,7 +346,7 @@ cdef class Column: def null_count(self): if self._null_count is None: if not self.nullable or self.size == 0: - return 0 + self._null_count = 0 with acquire_spill_lock(): self._null_count = pylibcudf.null_mask.null_count( self.base_mask.get_ptr(mode="read"), From 171ebe3c1818707f0371c6326ad5d0ae9e7ccd80 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 10 Jan 2025 10:50:31 -0800 Subject: [PATCH 3/4] Add else: --- python/cudf/cudf/_lib/column.pyx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 7e313d64789..114991dbe3e 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -347,12 +347,13 @@ cdef class Column: if self._null_count is None: if not self.nullable or self.size == 0: self._null_count = 0 - with acquire_spill_lock(): - self._null_count = pylibcudf.null_mask.null_count( - self.base_mask.get_ptr(mode="read"), - self.offset, - self.offset + self.size - ) + else: + with acquire_spill_lock(): + self._null_count = pylibcudf.null_mask.null_count( + self.base_mask.get_ptr(mode="read"), + self.offset, + self.offset + self.size + ) return self._null_count @property From 065fe8cbbcc0447e424f88b6f240f2403ade4b62 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:53:38 -0800 Subject: [PATCH 4/4] Update python/pylibcudf/pylibcudf/null_mask.pyx Co-authored-by: Matthew Murray <41342305+Matt711@users.noreply.github.com> --- python/pylibcudf/pylibcudf/null_mask.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 9e7c41087f4..0260088c0e2 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -161,10 +161,8 @@ cpdef size_type null_count(Py_ssize_t bitmask, size_type start, size_type stop): ---------- bitmask : int Integer pointer to the bitmask. - start : int Index of the first bit to count (inclusive). - stop : int Index of the last bit to count (exclusive).