From b511f4a629a3dda31725f755ff516f4e769de8db Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 Jan 2025 18:45:25 -0800 Subject: [PATCH 01/10] Remove cudf._lib.column --- python/cudf/cudf/_lib/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/column.pxd | 43 - python/cudf/cudf/_lib/column.pyi | 82 -- python/cudf/cudf/_lib/column.pyx | 913 ------------------ python/cudf/cudf/core/_internals/binaryop.py | 9 +- python/cudf/cudf/core/_internals/copying.py | 25 +- python/cudf/cudf/core/_internals/search.py | 11 +- python/cudf/cudf/core/_internals/sorting.py | 10 +- .../cudf/core/_internals/stream_compaction.py | 15 +- python/cudf/cudf/core/_internals/timezones.py | 6 +- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 438 ++++++++- python/cudf/cudf/core/column/string.py | 104 +- python/cudf/cudf/core/cut.py | 7 +- python/cudf/cudf/core/resample.py | 6 +- python/cudf/cudf/core/reshape.py | 3 +- python/cudf/cudf/core/udf/utils.py | 5 +- python/cudf/cudf/io/avro.py | 6 +- python/cudf/cudf/io/csv.py | 4 +- python/cudf/cudf/io/json.py | 11 +- python/cudf/cudf/io/orc.py | 12 +- python/cudf/cudf/io/parquet.py | 11 +- python/cudf/cudf/io/text.py | 4 +- python/cudf/cudf/tests/test_string_udfs.py | 10 +- python/cudf/cudf/utils/dtypes.py | 29 + 25 files changed, 581 insertions(+), 1187 deletions(-) delete mode 100644 python/cudf/cudf/_lib/column.pxd delete mode 100644 python/cudf/cudf/_lib/column.pyi delete mode 100644 python/cudf/cudf/_lib/column.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index ec44a6aa8c5..94d0b3f6703 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx) +set(cython_sources scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd deleted file mode 100644 index 58745d91fc0..00000000000 --- a/python/cudf/cudf/_lib/column.pxd +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -from typing import Literal - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport ( - column_view, - mutable_column_view, -) -from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.device_buffer cimport device_buffer - -cdef dtype_from_column_view(column_view cv) - -cdef class Column: - cdef public: - cdef int _offset - cdef int _size - cdef object _dtype - cdef object _base_children - cdef object _base_data - cdef object _base_mask - cdef object _children - cdef object _data - cdef object _mask - cdef object _null_count - cdef object _distinct_count - - cdef column_view _view(self, size_type null_count) except * - cdef column_view view(self) except * - cdef mutable_column_view mutable_view(self) except * - cpdef to_pylibcudf(self, mode: Literal["read", "write"]) - - @staticmethod - cdef Column from_unique_ptr( - unique_ptr[column] c_col, bint data_ptr_exposed=* - ) - - @staticmethod - cdef Column from_column_view(column_view, object) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi deleted file mode 100644 index bdd90be45b8..00000000000 --- a/python/cudf/cudf/_lib/column.pyi +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -from __future__ import annotations - -from typing import Literal - -from typing_extensions import Self - -import pylibcudf as plc - -from cudf._typing import Dtype, DtypeObj, ScalarLike -from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase - -class Column: - _data: Buffer | None - _mask: Buffer | None - _base_data: Buffer | None - _base_mask: Buffer | None - _dtype: DtypeObj - _size: int - _offset: int - _null_count: int - _children: tuple[ColumnBase, ...] - _base_children: tuple[ColumnBase, ...] - _distinct_count: dict[bool, int] - - def __init__( - self, - data: Buffer | None, - size: int, - dtype: Dtype, - mask: Buffer | None = None, - offset: int | None = None, - null_count: int | None = None, - children: tuple[ColumnBase, ...] = (), - ) -> None: ... - @property - def base_size(self) -> int: ... - @property - def dtype(self) -> DtypeObj: ... - @property - def size(self) -> int: ... - @property - def base_data(self) -> Buffer | None: ... - @property - def data(self) -> Buffer | None: ... - @property - def data_ptr(self) -> int: ... - def set_base_data(self, value: Buffer) -> None: ... - @property - def nullable(self) -> bool: ... - def has_nulls(self, include_nan: bool = False) -> bool: ... - @property - def base_mask(self) -> Buffer | None: ... - @property - def mask(self) -> Buffer | None: ... - @property - def mask_ptr(self) -> int: ... - def set_base_mask(self, value: Buffer | None) -> None: ... - def set_mask(self, value: ColumnBase | Buffer | None) -> Self: ... - @property - def null_count(self) -> int: ... - @property - def offset(self) -> int: ... - @property - def base_children(self) -> tuple[ColumnBase, ...]: ... - @property - def children(self) -> tuple[ColumnBase, ...]: ... - def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: ... - def _mimic_inplace( - self, other_col: ColumnBase, inplace=False - ) -> Self | None: ... - - # TODO: The val parameter should be Scalar, not ScalarLike - @staticmethod - def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... - @staticmethod - def from_pylibcudf( - col: plc.Column, data_ptr_exposed: bool = False - ) -> ColumnBase: ... - def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ... diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx deleted file mode 100644 index 114991dbe3e..00000000000 --- a/python/cudf/cudf/_lib/column.pyx +++ /dev/null @@ -1,913 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - - -from typing import Literal - -import cupy as cp -import numpy as np -import pandas as pd - -import pylibcudf -import rmm - -import cudf -from cudf.core.buffer import ( - Buffer, - ExposureTrackedBuffer, - SpillableBuffer, - acquire_spill_lock, - as_buffer, - cuda_array_interface_wrapper, -) -from cudf.utils.dtypes import ( - _get_base_dtype, - dtype_to_pylibcudf_type, - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, -) - -from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t, int32_t -from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from rmm.pylibrmm.device_buffer cimport DeviceBuffer - -from pylibcudf cimport DataType as plc_DataType, Column as plc_Column -cimport pylibcudf.libcudf.copying as cpp_copying -cimport pylibcudf.libcudf.types as libcudf_types -cimport pylibcudf.libcudf.unary as libcudf_unary -from pylibcudf.libcudf.column.column cimport column, column_contents -from pylibcudf.libcudf.column.column_factories cimport ( - make_numeric_column -) -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from pylibcudf.libcudf.scalar.scalar cimport scalar - -from cudf._lib.scalar cimport DeviceScalar - - -cdef get_element(column_view col_view, size_type index): - - cdef unique_ptr[scalar] c_output - with nogil: - c_output = move( - cpp_copying.get_element(col_view, index) - ) - - return DeviceScalar.from_unique_ptr( - move(c_output), dtype=dtype_from_column_view(col_view) - ) - - -def dtype_from_pylibcudf_column(plc_Column col not None): - type_ = col.type() - tid = type_.id() - - if tid == pylibcudf.TypeId.LIST: - child = col.list_view().child() - return cudf.ListDtype(dtype_from_pylibcudf_column(child)) - elif tid == pylibcudf.TypeId.STRUCT: - fields = { - str(i): dtype_from_pylibcudf_column(col.child(i)) - for i in range(col.num_children()) - } - return cudf.StructDtype(fields) - elif tid == pylibcudf.TypeId.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-type_.scale() - ) - elif tid == pylibcudf.TypeId.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-type_.scale() - ) - elif tid == pylibcudf.TypeId.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-type_.scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - else: - return cudf.ListDtype(dtype_from_column_view(child)) - - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] - - -cdef class Column: - """ - A Column stores columnar data in device memory. - A Column may be composed of: - - * A *data* Buffer - * One or more (optional) *children* Columns - * An (optional) *mask* Buffer representing the nullmask - - The *dtype* indicates the Column's element type. - """ - def __init__( - self, - object data, - int size, - object dtype, - object mask=None, - int offset=0, - object null_count=None, - tuple children=() - ): - if size < 0: - raise ValueError("size must be >=0") - self._size = size - self._distinct_count = {} - self._dtype = dtype - self._offset = offset - self._null_count = null_count - self.set_base_children(children) - self.set_base_data(data) - self.set_base_mask(mask) - - @property - def base_size(self): - return int(self.base_data.size / self.dtype.itemsize) - - @property - def dtype(self): - return self._dtype - - @property - def size(self): - return self._size - - @property - def base_data(self): - return self._base_data - - @property - def data(self): - if self.base_data is None: - return None - if self._data is None: - start = self.offset * self.dtype.itemsize - end = start + self.size * self.dtype.itemsize - self._data = self.base_data[start:end] - return self._data - - @property - def data_ptr(self): - if self.data is None: - return 0 - else: - return self.data.get_ptr(mode="write") - - def set_base_data(self, value): - if value is not None and not isinstance(value, Buffer): - raise TypeError( - "Expected a Buffer or None for data, " - f"got {type(value).__name__}" - ) - - self._data = None - self._base_data = value - - @property - def nullable(self): - return self.base_mask is not None - - def has_nulls(self, include_nan=False): - return int(self.null_count) != 0 - - @property - def base_mask(self): - return self._base_mask - - @property - def mask(self): - if self._mask is None: - if self.base_mask is None or self.offset == 0: - self._mask = self.base_mask - else: - with acquire_spill_lock(): - self._mask = as_buffer( - pylibcudf.null_mask.copy_bitmask(self.to_pylibcudf(mode="read")) - ) - return self._mask - - @property - def mask_ptr(self): - if self.mask is None: - return 0 - else: - return self.mask.get_ptr(mode="write") - - def set_base_mask(self, value): - """ - Replaces the base mask buffer of the column inplace. This does not - modify size or offset in any way, so the passed mask is expected to be - compatible with the current offset. - """ - if value is not None and not isinstance(value, Buffer): - raise TypeError( - "Expected a Buffer or None for mask, " - f"got {type(value).__name__}" - ) - - if value is not None: - # bitmask size must be relative to offset = 0 data. - required_size = pylibcudf.null_mask.bitmask_allocation_size_bytes( - self.base_size - ) - if value.size < required_size: - error_msg = ( - "The Buffer for mask is smaller than expected, " - f"got {value.size} bytes, expected {required_size} bytes." - ) - if self.offset > 0 or self.size < self.base_size: - error_msg += ( - "\n\nNote: The mask is expected to be sized according " - "to the base allocation as opposed to the offsetted or" - " sized allocation." - ) - raise ValueError(error_msg) - - self._mask = None - self._children = None - self._base_mask = value - self._clear_cache() - - def _clear_cache(self): - self._distinct_count = {} - attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing") - for attr in attrs: - try: - delattr(self, attr) - except AttributeError: - # attr was not called yet, so ignore. - pass - self._null_count = None - - def set_mask(self, value): - """ - Replaces the mask buffer of the column and returns a new column. This - will zero the column offset, compute a new mask buffer if necessary, - and compute new data Buffers zero-copy that use pointer arithmetic to - properly adjust the pointer. - """ - mask_size = pylibcudf.null_mask.bitmask_allocation_size_bytes(self.size) - required_num_bytes = -(-self.size // 8) # ceiling divide - error_msg = ( - "The value for mask is smaller than expected, got {} bytes, " - "expected " + str(required_num_bytes) + " bytes." - ) - if value is None: - mask = None - elif hasattr(value, "__cuda_array_interface__"): - if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): - if isinstance(value, Column): - value = value.data_array_view(mode="write") - value = cp.asarray(value).view('|u1') - mask = as_buffer(value) - if mask.size < required_num_bytes: - raise ValueError(error_msg.format(str(value.size))) - if mask.size < mask_size: - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_device(value) - mask = as_buffer(dbuf) - elif hasattr(value, "__array_interface__"): - value = np.asarray(value).view("u1")[:mask_size] - if value.size < required_num_bytes: - raise ValueError(error_msg.format(str(value.size))) - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_host(value) - mask = as_buffer(dbuf) - elif PyObject_CheckBuffer(value): - value = np.asarray(value).view("u1")[:mask_size] - if value.size < required_num_bytes: - raise ValueError(error_msg.format(str(value.size))) - dbuf = rmm.DeviceBuffer(size=mask_size) - dbuf.copy_from_host(value) - mask = as_buffer(dbuf) - else: - raise TypeError( - "Expected a Buffer object or None for mask, " - f"got {type(value).__name__}" - ) - - return cudf.core.column.build_column( - data=self.data, - dtype=self.dtype, - mask=mask, - size=self.size, - offset=0, - children=self.children - ) - - @property - def null_count(self): - if self._null_count is None: - if not self.nullable or self.size == 0: - self._null_count = 0 - else: - with acquire_spill_lock(): - self._null_count = pylibcudf.null_mask.null_count( - self.base_mask.get_ptr(mode="read"), - self.offset, - self.offset + self.size - ) - return self._null_count - - @property - def offset(self): - return self._offset - - @property - def base_children(self): - return self._base_children - - @property - def children(self): - if (self.offset == 0) and (self.size == self.base_size): - self._children = self.base_children - if self._children is None: - if self.base_children == (): - self._children = () - else: - children = Column.from_unique_ptr( - move(make_unique[column](self.view())) - ).base_children - dtypes = [ - base_child.dtype for base_child in self.base_children - ] - self._children = tuple( - child._with_type_metadata(dtype) for child, dtype in zip( - children, dtypes - ) - ) - return self._children - - def set_base_children(self, value): - if not isinstance(value, tuple): - raise TypeError("Expected a tuple of Columns for children, got " + - type(value).__name__) - - for child in value: - if not isinstance(child, Column): - raise TypeError( - "Expected each of children to be a Column, got " + - type(child).__name__ - ) - - self._children = None - self._base_children = value - - def _mimic_inplace(self, other_col, inplace=False): - """ - Given another column, update the attributes of this column to mimic an - inplace operation. This does not modify the memory of Buffers, but - instead replaces the Buffers and other attributes underneath the column - object with the Buffers and attributes from the other column. - """ - if inplace: - self._offset = other_col.offset - self._size = other_col.size - self._dtype = other_col._dtype - self.set_base_data(other_col.base_data) - self.set_base_children(other_col.base_children) - self.set_base_mask(other_col.base_mask) - else: - return other_col - - cdef mutable_column_view mutable_view(self) except *: - if isinstance(self.dtype, cudf.CategoricalDtype): - col = self.base_children[0] - data_dtype = col.dtype - elif isinstance(self.dtype, pd.DatetimeTZDtype): - col = self - data_dtype = _get_base_dtype(col.dtype) - else: - col = self - data_dtype = col.dtype - - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) - cdef libcudf_types.size_type offset = self.offset - cdef vector[mutable_column_view] children - cdef void* data - - if col.base_data is None: - data = NULL - else: - data = ( - col.base_data.get_ptr(mode="write") - ) - - cdef Column child_column - if col.base_children: - for child_column in col.base_children: - children.push_back(child_column.mutable_view()) - - cdef libcudf_types.bitmask_type* mask - if self.nullable: - mask = ( - self.base_mask.get_ptr(mode="write") - ) - else: - mask = NULL - - null_count = self._null_count - - if null_count is None: - null_count = 0 - cdef libcudf_types.size_type c_null_count = null_count - - self._mask = None - self._null_count = None - self._children = None - self._data = None - - return mutable_column_view( - dtype.c_obj, - self.size, - data, - mask, - c_null_count, - offset, - children) - - cdef column_view view(self) except *: - null_count = self.null_count - if null_count is None: - null_count = 0 - cdef libcudf_types.size_type c_null_count = null_count - return self._view(c_null_count) - - cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if isinstance(self.dtype, cudf.CategoricalDtype): - col = self.base_children[0] - data_dtype = col.dtype - elif isinstance(self.dtype, pd.DatetimeTZDtype): - col = self - data_dtype = _get_base_dtype(col.dtype) - else: - col = self - data_dtype = col.dtype - - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) - cdef libcudf_types.size_type offset = self.offset - cdef vector[column_view] children - cdef void* data - - if col.base_data is None: - data = NULL - else: - data = (col.base_data.get_ptr(mode="read")) - - cdef Column child_column - if col.base_children: - for child_column in col.base_children: - children.push_back(child_column.view()) - - cdef libcudf_types.bitmask_type* mask - if self.nullable: - mask = ( - self.base_mask.get_ptr(mode="read") - ) - else: - mask = NULL - - cdef libcudf_types.size_type c_null_count = null_count - - return column_view( - dtype.c_obj, - self.size, - data, - mask, - c_null_count, - offset, - children) - - # TODO: Consider whether this function should support some sort of `copy` - # parameter. Not urgent until this functionality is moved up to the Frame - # layer and made public. This function will also need to mark the - # underlying buffers as exposed before this function can itself be exposed - # publicly. User requests to convert to pylibcudf must assume that the - # data may be modified afterwards. - cpdef to_pylibcudf(self, mode: Literal["read", "write"]): - """Convert this Column to a pylibcudf.Column. - - This function will generate a pylibcudf Column pointing to the same - data, mask, and children as this one. - - Parameters - ---------- - mode : str - Supported values are {"read", "write"} If "write", the data pointed - to may be modified by the caller. If "read", the data pointed to - must not be modified by the caller. Failure to fulfill this - contract will cause incorrect behavior. - - Returns - ------- - pylibcudf.Column - A new pylibcudf.Column referencing the same data. - """ - - # TODO: Categoricals will need to be treated differently eventually. - # There is no 1-1 correspondence between cudf and libcudf for - # categoricals because cudf supports ordered and unordered categoricals - # while libcudf supports only unordered categoricals (see - # https://github.com/rapidsai/cudf/pull/8567). - if isinstance(self.dtype, cudf.CategoricalDtype): - col = self.base_children[0] - else: - col = self - - dtype = dtype_to_pylibcudf_type(col.dtype) - - data = None - if col.base_data is not None: - cai = cuda_array_interface_wrapper( - ptr=col.base_data.get_ptr(mode=mode), - size=col.base_data.size, - owner=col.base_data, - ) - data = pylibcudf.gpumemoryview(cai) - - mask = None - if self.nullable: - # TODO: Are we intentionally use self's mask instead of col's? - # Where is the mask stored for categoricals? - cai = cuda_array_interface_wrapper( - ptr=self.base_mask.get_ptr(mode=mode), - size=self.base_mask.size, - owner=self.base_mask, - ) - mask = pylibcudf.gpumemoryview(cai) - - cdef Column child_column - children = [] - if col.base_children: - for child_column in col.base_children: - children.append(child_column.to_pylibcudf(mode=mode)) - - return pylibcudf.Column( - dtype, - self.size, - data, - mask, - self.null_count, - self.offset, - children, - ) - - @staticmethod - cdef Column from_unique_ptr( - unique_ptr[column] c_col, bint data_ptr_exposed=False - ): - """Create a Column from a column - - Typically, this is called on the result of a libcudf operation. - If the data of the libcudf result has been exposed, set - `data_ptr_exposed=True` to expose the memory of the returned Column - as well. - """ - cdef column_view view = c_col.get()[0].view() - cdef libcudf_types.type_id tid = view.type().id() - cdef libcudf_types.data_type c_dtype - cdef size_type length = view.size() - cdef libcudf_types.mask_state mask_state - if tid == libcudf_types.type_id.TIMESTAMP_DAYS: - c_dtype = libcudf_types.data_type( - libcudf_types.type_id.TIMESTAMP_SECONDS - ) - with nogil: - c_col = move(libcudf_unary.cast(view, c_dtype)) - elif tid == libcudf_types.type_id.EMPTY: - c_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8) - mask_state = libcudf_types.mask_state.ALL_NULL - with nogil: - c_col = move(make_numeric_column(c_dtype, length, mask_state)) - - size = c_col.get()[0].size() - dtype = dtype_from_column_view(c_col.get()[0].view()) - null_count = c_col.get()[0].null_count() - - # After call to release(), c_col is unusable - cdef column_contents contents = move(c_col.get()[0].release()) - - data = as_buffer( - DeviceBuffer.c_from_unique_ptr(move(contents.data)), - exposed=data_ptr_exposed - ) - - if null_count > 0: - mask = as_buffer( - DeviceBuffer.c_from_unique_ptr(move(contents.null_mask)), - exposed=data_ptr_exposed - ) - else: - mask = None - - cdef vector[unique_ptr[column]] c_children = move(contents.children) - children = [] - if c_children.size() != 0: - # Because of a bug in Cython, we cannot set the optional - # `data_ptr_exposed` argument within a comprehension. - for i in range(c_children.size()): - child = Column.from_unique_ptr( - move(c_children[i]), - data_ptr_exposed=data_ptr_exposed - ) - children.append(child) - - return cudf.core.column.build_column( - data, - dtype=dtype, - mask=mask, - size=size, - null_count=null_count, - children=tuple(children) - ) - - @staticmethod - def from_pylibcudf( - col, bint data_ptr_exposed=False - ): - """Create a Column from a pylibcudf.Column. - - This function will generate a Column pointing to the provided pylibcudf - Column. It will directly access the data and mask buffers of the - pylibcudf Column, so the newly created object is not tied to the - lifetime of the original pylibcudf.Column. - - Parameters - ---------- - col : pylibcudf.Column - The object to copy. - data_ptr_exposed : bool - Whether the data buffer is exposed. - - Returns - ------- - pylibcudf.Column - A new pylibcudf.Column referencing the same data. - """ - if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS: - col = pylibcudf.unary.cast( - col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS) - ) - elif col.type().id() == pylibcudf.TypeId.EMPTY: - new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8) - - col = pylibcudf.column_factories.make_numeric_column( - new_dtype, - col.size(), - pylibcudf.column_factories.MaskState.ALL_NULL - ) - - dtype = dtype_from_pylibcudf_column(col) - - return cudf.core.column.build_column( - data=as_buffer( - col.data().obj, exposed=data_ptr_exposed - ) if col.data() is not None else None, - dtype=dtype, - size=col.size(), - mask=as_buffer( - col.null_mask().obj, exposed=data_ptr_exposed - ) if col.null_mask() is not None else None, - offset=col.offset(), - null_count=col.null_count(), - children=tuple([ - Column.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) - for child in col.children() - ]) - ) - - @staticmethod - cdef Column from_column_view(column_view cv, object owner): - """ - Given a ``cudf::column_view``, constructs a ``cudf.Column`` from it, - along with referencing an ``owner`` Python object that owns the memory - lifetime. If ``owner`` is a ``cudf.Column``, we reach inside of it and - make the owner of each newly created ``Buffer`` the respective - ``Buffer`` from the ``owner`` ``cudf.Column``. - If ``owner`` is ``None``, we allocate new memory for the resulting - ``cudf.Column``. - """ - column_owner = isinstance(owner, Column) - mask_owner = owner - if column_owner and isinstance(owner.dtype, cudf.CategoricalDtype): - owner = owner.base_children[0] - - size = cv.size() - offset = cv.offset() - dtype = dtype_from_column_view(cv) - dtype_itemsize = getattr(dtype, "itemsize", 1) - - data_ptr = (cv.head[void]()) - data = None - base_size = size + offset - data_owner = owner - - if column_owner: - data_owner = owner.base_data - mask_owner = mask_owner.base_mask - base_size = owner.base_size - base_nbytes = base_size * dtype_itemsize - # special case for string column - is_string_column = (cv.type().id() == libcudf_types.type_id.STRING) - if is_string_column: - if cv.num_children() == 0: - base_nbytes = 0 - else: - # get the size from offset child column (device to host copy) - offsets_column_index = 0 - offset_child_column = cv.child(offsets_column_index) - if offset_child_column.size() == 0: - base_nbytes = 0 - else: - chars_size = get_element( - offset_child_column, offset_child_column.size()-1).value - base_nbytes = chars_size - - if data_ptr: - if data_owner is None: - buffer_size = ( - base_nbytes - if is_string_column - else ((size + offset) * dtype_itemsize) - ) - data = as_buffer( - rmm.DeviceBuffer(ptr=data_ptr, - size=buffer_size) - ) - elif ( - column_owner and - isinstance(data_owner, ExposureTrackedBuffer) - ): - data = as_buffer( - data=data_ptr, - size=base_nbytes, - owner=data_owner, - exposed=False, - ) - elif ( - # This is an optimization of the most common case where - # from_column_view creates a "view" that is identical to - # the owner. - column_owner and - isinstance(data_owner, SpillableBuffer) and - # We check that `data_owner` is spill locked (not spillable) - # and that it points to the same memory as `data_ptr`. - not data_owner.spillable and - data_owner.memory_info() == (data_ptr, base_nbytes, "gpu") - ): - data = data_owner - else: - # At this point we don't know the relationship between data_ptr - # and data_owner thus we mark both of them exposed. - # TODO: try to discover their relationship and create a - # SpillableBufferSlice instead. - data = as_buffer( - data=data_ptr, - size=base_nbytes, - owner=data_owner, - exposed=True, - ) - if isinstance(data_owner, ExposureTrackedBuffer): - # accessing the pointer marks it exposed permanently. - data_owner.mark_exposed() - elif isinstance(data_owner, SpillableBuffer): - if data_owner.is_spilled: - raise ValueError( - f"{data_owner} is spilled, which invalidates " - f"the exposed data_ptr ({hex(data_ptr)})" - ) - # accessing the pointer marks it exposed permanently. - data_owner.mark_exposed() - else: - data = as_buffer( - rmm.DeviceBuffer(ptr=data_ptr, size=0) - ) - - mask = None - mask_ptr = (cv.null_mask()) - if mask_ptr: - if mask_owner is None: - if column_owner: - # if we reached here, it means `owner` is a `Column` - # that does not have a null mask, but `cv` thinks it - # should have a null mask. This can happen in the - # following sequence of events: - # - # 1) `cv` is constructed as a view into a - # `cudf::column` that is nullable (i.e., it has - # a null mask), but contains no nulls. - # 2) `owner`, a `Column`, is constructed from the - # same `cudf::column`. Because `cudf::column` - # is memory owning, `owner` takes ownership of - # the memory owned by the - # `cudf::column`. Because the column has a null - # count of 0, it may choose to discard the null - # mask. - # 3) Now, `cv` points to a discarded null mask. - # - # TL;DR: we should not include a null mask in the - # result: - mask = None - else: - mask = as_buffer( - rmm.DeviceBuffer( - ptr=mask_ptr, - size=pylibcudf.null_mask.bitmask_allocation_size_bytes( - base_size - ) - ) - ) - else: - mask = as_buffer( - data=mask_ptr, - size=pylibcudf.null_mask.bitmask_allocation_size_bytes( - base_size - ), - owner=mask_owner, - exposed=True - ) - - if cv.has_nulls(): - null_count = cv.null_count() - else: - null_count = 0 - - children = [] - for child_index in range(cv.num_children()): - child_owner = owner - if column_owner: - child_owner = owner.base_children[child_index] - children.append( - Column.from_column_view( - cv.child(child_index), - child_owner - ) - ) - children = tuple(children) - - result = cudf.core.column.build_column( - data=data, - dtype=dtype, - mask=mask, - size=size, - offset=offset, - null_count=null_count, - children=tuple(children) - ) - - return result - - @staticmethod - def from_scalar(py_val, size_type size): - return Column.from_pylibcudf( - pylibcudf.Column.from_scalar( - py_val.device_value.c_value, size - ) - ) diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py index a9023f8fd59..56095e84538 100644 --- a/python/cudf/cudf/core/_internals/binaryop.py +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -5,13 +5,12 @@ import pylibcudf as plc -from cudf._lib.column import Column from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase from cudf.utils.dtypes import dtype_to_pylibcudf_type if TYPE_CHECKING: from cudf._typing import Dtype - from cudf.core.column import ColumnBase from cudf.core.scalar import Scalar @@ -46,13 +45,13 @@ def binaryop( op = op.upper() op = _op_map.get(op, op) - return Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( plc.binaryop.binary_operation( lhs.to_pylibcudf(mode="read") - if isinstance(lhs, Column) + if isinstance(lhs, ColumnBase) else lhs.device_value.c_value, rhs.to_pylibcudf(mode="read") - if isinstance(rhs, Column) + if isinstance(rhs, ColumnBase) else rhs.device_value.c_value, plc.binaryop.BinaryOperator[op], dtype_to_pylibcudf_type(dtype), diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py index 34c1850cb72..35d75c37c37 100644 --- a/python/cudf/cudf/core/_internals/copying.py +++ b/python/cudf/cudf/core/_internals/copying.py @@ -1,17 +1,17 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING import pylibcudf as plc -import cudf from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase if TYPE_CHECKING: from collections.abc import Iterable - from cudf.core.column import ColumnBase + from cudf import Scalar from cudf.core.column.numerical import NumericalColumn @@ -28,15 +28,12 @@ def gather( if nullify else plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - return [ - cudf._lib.column.Column.from_pylibcudf(col) - for col in plc_tbl.columns() - ] + return [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()] @acquire_spill_lock() def scatter( - sources: list[ColumnBase | cudf.Scalar], + sources: list[ColumnBase | Scalar], scatter_map: NumericalColumn, target_columns: list[ColumnBase], bounds_check: bool = True, @@ -66,16 +63,13 @@ def scatter( plc_tbl = plc.copying.scatter( plc.Table([col.to_pylibcudf(mode="read") for col in sources]) # type: ignore[union-attr] - if isinstance(sources[0], cudf._lib.column.Column) + if isinstance(sources[0], ColumnBase) else [slr.device_value.c_value for slr in sources], # type: ignore[union-attr] scatter_map.to_pylibcudf(mode="read"), plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]), ) - return [ - cudf._lib.column.Column.from_pylibcudf(col) - for col in plc_tbl.columns() - ] + return [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()] @acquire_spill_lock() @@ -83,10 +77,7 @@ def columns_split( input_columns: Iterable[ColumnBase], splits: list[int] ) -> list[list[ColumnBase]]: return [ - [ - cudf._lib.column.Column.from_pylibcudf(col) - for col in plc_tbl.columns() - ] + [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()] for plc_tbl in plc.copying.split( plc.Table( [col.to_pylibcudf(mode="read") for col in input_columns] diff --git a/python/cudf/cudf/core/_internals/search.py b/python/cudf/cudf/core/_internals/search.py index a0ffe078de9..aa410c36575 100644 --- a/python/cudf/cudf/core/_internals/search.py +++ b/python/cudf/cudf/core/_internals/search.py @@ -1,15 +1,12 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations -from typing import TYPE_CHECKING, Literal +from typing import Literal import pylibcudf as plc -from cudf._lib.column import Column from cudf.core.buffer import acquire_spill_lock - -if TYPE_CHECKING: - from cudf.core.column import ColumnBase +from cudf.core.column import ColumnBase @acquire_spill_lock() @@ -46,7 +43,7 @@ def search_sorted( plc.search, "lower_bound" if side == "left" else "upper_bound", ) - return Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( func( plc.Table([col.to_pylibcudf(mode="read") for col in source]), plc.Table([col.to_pylibcudf(mode="read") for col in values]), diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py index 69f9e7664b1..8dbb169bb83 100644 --- a/python/cudf/cudf/core/_internals/sorting.py +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import itertools @@ -6,14 +6,12 @@ import pylibcudf as plc -from cudf._lib.column import Column from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase if TYPE_CHECKING: from collections.abc import Iterable - from cudf.core.column import ColumnBase - @acquire_spill_lock() def is_sorted( @@ -146,7 +144,7 @@ def order_by( func = ( plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order ) - return Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( func( plc.Table( [col.to_pylibcudf(mode="read") for col in columns_from_table], @@ -195,7 +193,7 @@ def sort_by_key( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) return [ - Column.from_pylibcudf(col) + ColumnBase.from_pylibcudf(col) for col in func( plc.Table([col.to_pylibcudf(mode="read") for col in values]), plc.Table([col.to_pylibcudf(mode="read") for col in keys]), diff --git a/python/cudf/cudf/core/_internals/stream_compaction.py b/python/cudf/cudf/core/_internals/stream_compaction.py index 4ccc26c2a1c..d82671bd3f0 100644 --- a/python/cudf/cudf/core/_internals/stream_compaction.py +++ b/python/cudf/cudf/core/_internals/stream_compaction.py @@ -1,15 +1,12 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations -from typing import TYPE_CHECKING, Literal +from typing import Literal import pylibcudf as plc -from cudf._lib.column import Column from cudf.core.buffer import acquire_spill_lock - -if TYPE_CHECKING: - from cudf.core.column import ColumnBase +from cudf.core.column import ColumnBase @acquire_spill_lock() @@ -53,7 +50,7 @@ def drop_nulls( keys, keep_threshold, ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()] + return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()] @acquire_spill_lock() @@ -76,7 +73,7 @@ def apply_boolean_mask( plc.Table([col.to_pylibcudf(mode="read") for col in columns]), boolean_mask.to_pylibcudf(mode="read"), ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()] + return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()] @acquire_spill_lock() @@ -118,4 +115,4 @@ def drop_duplicates( else plc.types.NullEquality.UNEQUAL, plc.types.NanEquality.ALL_EQUAL, ) - return [Column.from_pylibcudf(col) for col in plc_table.columns()] + return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()] diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 4d001577581..0ccd517cae0 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -13,7 +13,7 @@ import pylibcudf as plc import cudf -from cudf._lib.column import Column +from cudf.core.column import ColumnBase if TYPE_CHECKING: from cudf.core.column.datetime import DatetimeColumn @@ -117,7 +117,7 @@ def _read_tzfile_as_columns( tzdir, zone_name ) transition_times_and_offsets = [ - Column.from_pylibcudf(col) for col in plc_table.columns() + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() ] if not transition_times_and_offsets: diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b9d6c0e7f08..7e4985f8819 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1162,7 +1162,7 @@ def memory_usage(self) -> int: def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False ) -> Self | None: - out = super()._mimic_inplace(other_col, inplace=inplace) + out = super()._mimic_inplace(other_col, inplace=inplace) # type: ignore[arg-type] if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col.codes return out diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 19f2802553d..0734c85e1a5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -24,7 +24,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.column import Column from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -63,6 +62,7 @@ _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_from_pylibcudf_column, dtype_to_pylibcudf_type, find_common_type, get_time_unit, @@ -86,7 +86,19 @@ NumpyExtensionArray = pd.arrays.PandasArray -class ColumnBase(Column, Serializable, BinaryOperand, Reducible): +class ColumnBase(Serializable, BinaryOperand, Reducible): + """ + A ColumnBase stores columnar data in device memory. + + A ColumnBase may be composed of: + + * A *data* Buffer + * One or more (optional) *children* Columns + * An (optional) *mask* Buffer representing the nullmask + + The *dtype* indicates the ColumnBase's element type. + """ + _VALID_REDUCTIONS = { "any", "all", @@ -96,6 +108,420 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): _PANDAS_NA_REPR = str(pd.NA) + def __init__( + self, + data: None | Buffer, + size: int, + dtype, + mask: None | Buffer = None, + offset: int = 0, + null_count: int | None = None, + children: tuple[ColumnBase, ...] = (), + ) -> None: + if size < 0: + raise ValueError("size must be >=0") + self._size = size + self._distinct_count: dict[bool, int] = {} + self._dtype = dtype + self._offset = offset + self._null_count = null_count + self.set_base_children(children) + self.set_base_data(data) + self.set_base_mask(mask) + + @property + def base_size(self) -> int: + return int(self.base_data.size / self.dtype.itemsize) # type: ignore[union-attr] + + @property + def dtype(self): + return self._dtype + + @property + def size(self) -> int: + return self._size + + @property + def base_data(self) -> None | Buffer: + return self._base_data # type: ignore[has-type] + + @property + def data(self) -> None | Buffer: + if self.base_data is None: + return None + if self._data is None: # type: ignore[has-type] + start = self.offset * self.dtype.itemsize + end = start + self.size * self.dtype.itemsize + self._data = self.base_data[start:end] + return self._data + + @property + def data_ptr(self) -> int: + if self.data is None: + return 0 + else: + return self.data.get_ptr(mode="write") + + def set_base_data(self, value: None | Buffer) -> None: + if value is not None and not isinstance(value, Buffer): + raise TypeError( + "Expected a Buffer or None for data, " + f"got {type(value).__name__}" + ) + + self._data = None # type: ignore[assignment] + self._base_data = value + + @property + def nullable(self) -> bool: + return self.base_mask is not None + + def has_nulls(self, include_nan: bool = False) -> bool: + return int(self.null_count) != 0 + + @property + def base_mask(self) -> None | Buffer: + return self._base_mask # type: ignore[has-type] + + @property + def mask(self) -> None | Buffer: + if self._mask is None: # type: ignore[has-type] + if self.base_mask is None or self.offset == 0: + self._mask = self.base_mask + else: + with acquire_spill_lock(): + self._mask = as_buffer( + plc.null_mask.copy_bitmask( + self.to_pylibcudf(mode="read") + ) + ) + return self._mask + + @property + def mask_ptr(self) -> int: + if self.mask is None: + return 0 + else: + return self.mask.get_ptr(mode="write") + + def set_base_mask(self, value: None | Buffer) -> None: + """ + Replaces the base mask buffer of the column inplace. This does not + modify size or offset in any way, so the passed mask is expected to be + compatible with the current offset. + """ + if value is not None and not isinstance(value, Buffer): + raise TypeError( + "Expected a Buffer or None for mask, " + f"got {type(value).__name__}" + ) + + if value is not None: + # bitmask size must be relative to offset = 0 data. + required_size = plc.null_mask.bitmask_allocation_size_bytes( + self.base_size + ) + if value.size < required_size: + error_msg = ( + "The Buffer for mask is smaller than expected, " + f"got {value.size} bytes, expected {required_size} bytes." + ) + if self.offset > 0 or self.size < self.base_size: + error_msg += ( + "\n\nNote: The mask is expected to be sized according " + "to the base allocation as opposed to the offsetted or" + " sized allocation." + ) + raise ValueError(error_msg) + + self._mask = None + self._children = None + self._base_mask = value + self._clear_cache() + + def _clear_cache(self) -> None: + self._distinct_count.clear() + attrs = ( + "memory_usage", + "is_monotonic_increasing", + "is_monotonic_decreasing", + ) + for attr in attrs: + try: + delattr(self, attr) + except AttributeError: + # attr was not called yet, so ignore. + pass + self._null_count = None + + def set_mask(self, value) -> Self: + """ + Replaces the mask buffer of the column and returns a new column. This + will zero the column offset, compute a new mask buffer if necessary, + and compute new data Buffers zero-copy that use pointer arithmetic to + properly adjust the pointer. + """ + mask_size = plc.null_mask.bitmask_allocation_size_bytes(self.size) + required_num_bytes = -(-self.size // 8) # ceiling divide + error_msg = ( + "The value for mask is smaller than expected, got {} bytes, " + f"expected {required_num_bytes} bytes." + ) + if value is None: + mask = None + elif hasattr(value, "__cuda_array_interface__"): + if value.__cuda_array_interface__["typestr"] not in ("|i1", "|u1"): + if isinstance(value, ColumnBase): + value = value.data_array_view(mode="write") + value = cupy.asarray(value).view("|u1") + mask = as_buffer(value) + if mask.size < required_num_bytes: + raise ValueError(error_msg.format(str(value.size))) + if mask.size < mask_size: + dbuf = rmm.DeviceBuffer(size=mask_size) + dbuf.copy_from_device(value) + mask = as_buffer(dbuf) + elif hasattr(value, "__array_interface__"): + value = np.asarray(value).view("u1")[:mask_size] + if value.size < required_num_bytes: + raise ValueError(error_msg.format(str(value.size))) + dbuf = rmm.DeviceBuffer(size=mask_size) + dbuf.copy_from_host(value) + mask = as_buffer(dbuf) + else: + try: + value = memoryview(value) + except TypeError as err: + raise TypeError( + f"Expected a Buffer object or None for mask, got {type(value).__name__}" + ) from err + else: + value = np.asarray(value).view("u1")[:mask_size] + if value.size < required_num_bytes: + raise ValueError(error_msg.format(str(value.size))) + dbuf = rmm.DeviceBuffer(size=mask_size) + dbuf.copy_from_host(value) + mask = as_buffer(dbuf) + + return cudf.core.column.build_column( # type: ignore[return-value] + data=self.data, + dtype=self.dtype, + mask=mask, + size=self.size, + offset=0, + children=self.children, + ) + + @property + def null_count(self) -> int: + if self._null_count is None: + if not self.nullable or self.size == 0: + self._null_count = 0 + else: + with acquire_spill_lock(): + self._null_count = plc.null_mask.null_count( + self.base_mask.get_ptr(mode="read"), # type: ignore[union-attr] + self.offset, + self.offset + self.size, + ) + return self._null_count + + @property + def offset(self) -> int: + return self._offset + + @property + def base_children(self) -> tuple[ColumnBase, ...]: + return self._base_children # type: ignore[has-type] + + @property + def children(self) -> tuple[ColumnBase, ...]: + if self.offset == 0 and self.size == self.base_size: + self._children = self.base_children # type: ignore[assignment] + if self._children is None: + if not self.base_children: + self._children = () # type: ignore[assignment] + else: + self._children = self.base_children # type: ignore[assignment] + # children = Column.from_unique_ptr( + # move(make_unique[column](self.view())) + # ).base_children + # dtypes = [ + # base_child.dtype for base_child in self.base_children + # ] + # self._children = tuple( + # child._with_type_metadata(dtype) for child, dtype in zip( + # children, dtypes + # ) + # ) + return self._children # type: ignore[return-value] + + def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: + if not isinstance(value, tuple): + raise TypeError( + f"Expected a tuple of Columns for children, got {type(value).__name__}" + ) + if any(not isinstance(child, ColumnBase) for child in value): + raise TypeError("All children must be Columns.") + + self._children = None + self._base_children = value + + def _mimic_inplace( + self, other_col: Self, inplace: bool = False + ) -> None | Self: + """ + Given another column, update the attributes of this column to mimic an + inplace operation. This does not modify the memory of Buffers, but + instead replaces the Buffers and other attributes underneath the column + object with the Buffers and attributes from the other column. + """ + if inplace: + self._offset = other_col.offset + self._size = other_col.size + self._dtype = other_col._dtype + self.set_base_data(other_col.base_data) + self.set_base_children(other_col.base_children) + self.set_base_mask(other_col.base_mask) + # TODO: self._clear_cache here? + return None + else: + return other_col + + # TODO: Consider whether this function should support some sort of `copy` + # parameter. Not urgent until this functionality is moved up to the Frame + # layer and made public. This function will also need to mark the + # underlying buffers as exposed before this function can itself be exposed + # publicly. User requests to convert to pylibcudf must assume that the + # data may be modified afterwards. + def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: + """Convert this Column to a pylibcudf.Column. + + This function will generate a pylibcudf Column pointing to the same + data, mask, and children as this one. + + Parameters + ---------- + mode : str + Supported values are {"read", "write"} If "write", the data pointed + to may be modified by the caller. If "read", the data pointed to + must not be modified by the caller. Failure to fulfill this + contract will cause incorrect behavior. + + Returns + ------- + pylibcudf.Column + A new pylibcudf.Column referencing the same data. + """ + + # TODO: Categoricals will need to be treated differently eventually. + # There is no 1-1 correspondence between cudf and libcudf for + # categoricals because cudf supports ordered and unordered categoricals + # while libcudf supports only unordered categoricals (see + # https://github.com/rapidsai/cudf/pull/8567). + if isinstance(self.dtype, cudf.CategoricalDtype): + col = self.base_children[0] + else: + col = self + + dtype = dtype_to_pylibcudf_type(col.dtype) + + data = None + if col.base_data is not None: + cai = cuda_array_interface_wrapper( + ptr=col.base_data.get_ptr(mode=mode), + size=col.base_data.size, + owner=col.base_data, + ) + data = plc.gpumemoryview(cai) + + mask = None + if self.nullable: + # TODO: Are we intentionally use self's mask instead of col's? + # Where is the mask stored for categoricals? + cai = cuda_array_interface_wrapper( + ptr=self.base_mask.get_ptr(mode=mode), # type: ignore[union-attr] + size=self.base_mask.size, # type: ignore[union-attr] + owner=self.base_mask, + ) + mask = plc.gpumemoryview(cai) + + children = [] + if col.base_children: + children = [ + child_column.to_pylibcudf(mode=mode) + for child_column in col.base_children + ] + + return plc.Column( + dtype, + self.size, + data, + mask, + self.null_count, + self.offset, + children, + ) + + @classmethod + def from_pylibcudf( + cls, col: plc.Column, data_ptr_exposed: bool = False + ) -> Self: + """Create a Column from a pylibcudf.Column. + + This function will generate a Column pointing to the provided pylibcudf + Column. It will directly access the data and mask buffers of the + pylibcudf Column, so the newly created object is not tied to the + lifetime of the original pylibcudf.Column. + + Parameters + ---------- + col : pylibcudf.Column + The object to copy. + data_ptr_exposed : bool + Whether the data buffer is exposed. + + Returns + ------- + pylibcudf.Column + A new pylibcudf.Column referencing the same data. + """ + if col.type().id() == plc.TypeId.TIMESTAMP_DAYS: + col = plc.unary.cast( + col, plc.DataType(plc.TypeId.TIMESTAMP_SECONDS) + ) + elif col.type().id() == plc.TypeId.EMPTY: + new_dtype = plc.DataType(plc.TypeId.INT8) + + col = plc.column_factories.make_numeric_column( + new_dtype, col.size(), plc.column_factories.MaskState.ALL_NULL + ) + + dtype = dtype_from_pylibcudf_column(col) + + return cudf.core.column.build_column( # type: ignore[return-value] + data=as_buffer(col.data().obj, exposed=data_ptr_exposed) + if col.data() is not None + else None, + dtype=dtype, + size=col.size(), + mask=as_buffer(col.null_mask().obj, exposed=data_ptr_exposed) + if col.null_mask() is not None + else None, + offset=col.offset(), + null_count=col.null_count(), + children=tuple( + cls.from_pylibcudf(child, data_ptr_exposed=data_ptr_exposed) + for child in col.children() + ), + ) + + @classmethod + def from_scalar(cls, slr: cudf.Scalar, size: int) -> Self: + return cls.from_pylibcudf( + plc.Column.from_scalar(slr.device_value.c_value, size) + ) + def data_array_view( self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": @@ -735,7 +1161,7 @@ def _scatter_by_column( with acquire_spill_lock(): plc_table = plc.copying.boolean_mask_scatter( plc.Table([value.to_pylibcudf(mode="read")]) - if isinstance(value, Column) + if isinstance(value, ColumnBase) else [value.device_value.c_value], plc.Table([self.to_pylibcudf(mode="read")]), key.to_pylibcudf(mode="read"), @@ -1080,7 +1506,7 @@ def contains(self, other: ColumnBase) -> ColumnBase: A column of values to search for """ with acquire_spill_lock(): - return Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( plc.search.contains( self.to_pylibcudf(mode="read"), other.to_pylibcudf(mode="read"), @@ -1990,7 +2416,7 @@ def as_column( """ if isinstance(arbitrary, (range, pd.RangeIndex, cudf.RangeIndex)): with acquire_spill_lock(): - column = Column.from_pylibcudf( + column = ColumnBase.from_pylibcudf( plc.filling.sequence( len(arbitrary), plc.interop.from_arrow( @@ -2560,7 +2986,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Filter out inputs that have 0 length, then concatenate. objs_with_len = [o for o in objs if len(o)] with acquire_spill_lock(): - return Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( plc.concatenate.concatenate( [col.to_pylibcudf(mode="read") for col in objs_with_len] ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2bee85cb387..cd38bfdf934 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,7 +19,6 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf._lib.column import Column from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock @@ -166,7 +165,7 @@ def len(self) -> SeriesOrIndex: plc_column = plc.strings.attributes.count_characters( self._column.to_pylibcudf(mode="read") ) - result = Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) return self._return_or_inplace(result) def byte_count(self) -> SeriesOrIndex: @@ -200,7 +199,7 @@ def byte_count(self) -> SeriesOrIndex: plc_column = plc.strings.attributes.count_bytes( self._column.to_pylibcudf(mode="read") ) - result = Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) return self._return_or_inplace(result) @overload @@ -309,7 +308,7 @@ def cat(self, others=None, sep=None, na_rep=None): pa.scalar(na_rep, type=pa.string()) ), ) - data = Column.from_pylibcudf(plc_column) + data = ColumnBase.from_pylibcudf(plc_column) else: parent_index = ( self._parent.index @@ -368,7 +367,7 @@ def cat(self, others=None, sep=None, na_rep=None): pa.scalar(na_rep, type=pa.string()) ), ) - data = Column.from_pylibcudf(plc_column) + data = ColumnBase.from_pylibcudf(plc_column) if len(data) == 1 and data.null_count == 1: data = cudf.core.column.as_column("", length=len(data)) @@ -534,7 +533,7 @@ def join( plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - data = Column.from_pylibcudf(plc_column) + data = ColumnBase.from_pylibcudf(plc_column) elif can_convert_to_column(sep): sep_column = column.as_column(sep) if len(sep_column) != len(strings_column): @@ -556,7 +555,7 @@ def join( plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) - data = Column.from_pylibcudf(plc_column) + data = ColumnBase.from_pylibcudf(plc_column) else: raise TypeError( f"sep should be an str, array-like or Series object, " @@ -653,7 +652,8 @@ def extract( ) data = dict( enumerate( - Column.from_pylibcudf(col) for col in plc_result.columns() + ColumnBase.from_pylibcudf(col) + for col in plc_result.columns() ) ) if len(data) == 1 and expand is False: @@ -800,7 +800,7 @@ def contains( plc_result = plc.strings.contains.contains_re( self._column.to_pylibcudf(mode="read"), prog ) - result_col = Column.from_pylibcudf(plc_result) + result_col = ColumnBase.from_pylibcudf(plc_result) else: if case is False: input_column = self.lower()._column # type: ignore[union-attr] @@ -813,7 +813,7 @@ def contains( input_column.to_pylibcudf(mode="read"), plc.interop.from_arrow(pa.scalar(pat_normed)), ) - result_col = Column.from_pylibcudf(plc_result) + result_col = ColumnBase.from_pylibcudf(plc_result) else: # TODO: we silently ignore the `regex=` flag here if case is False: @@ -827,7 +827,7 @@ def contains( input_column.to_pylibcudf(mode="read"), col_pat.to_pylibcudf(mode="read"), ) - result_col = Column.from_pylibcudf(plc_result) + result_col = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result_col) def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: @@ -899,7 +899,7 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: plc.interop.from_arrow(pa.scalar(pat)), plc.interop.from_arrow(pa.scalar(esc)), ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -956,7 +956,7 @@ def repeat( plc_result = plc.strings.repeat.repeat_strings( self._column.to_pylibcudf(mode="read"), repeats ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def replace( @@ -1052,7 +1052,7 @@ def replace( mode="read" ), ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) else: result = self._column.replace_multiple( cast(StringColumn, column.as_column(pat, dtype="str")), @@ -1085,7 +1085,7 @@ def replace( plc.interop.from_arrow(pa.scalar(repl)), n, ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: @@ -1126,7 +1126,7 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: ), repl, ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def slice( @@ -1206,7 +1206,7 @@ def slice( plc.interop.from_arrow(pa.scalar(stop, param_dtype)), plc.interop.from_arrow(pa.scalar(step, param_dtype)), ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def _all_characters_of_type( @@ -1218,7 +1218,7 @@ def _all_characters_of_type( plc_column = plc.strings.char_types.all_characters_of_type( self._column.to_pylibcudf(mode="read"), char_type, case_type ) - result = Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) return self._return_or_inplace(result) def isinteger(self) -> SeriesOrIndex: @@ -2183,7 +2183,7 @@ def filter_alphanum( if keep else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, ) - result = Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) return self._return_or_inplace(result) def slice_from( @@ -2230,7 +2230,7 @@ def slice_from( starts._column.to_pylibcudf(mode="read"), stops._column.to_pylibcudf(mode="read"), ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def slice_replace( @@ -2326,7 +2326,7 @@ def slice_replace( start, stop, ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: @@ -2506,7 +2506,7 @@ def get_json_object( plc.interop.from_arrow(pa.scalar(json_path)), options, ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def split( @@ -3103,7 +3103,7 @@ def pad( side, fillchar, ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def zfill(self, width: int) -> SeriesOrIndex: @@ -3174,7 +3174,7 @@ def zfill(self, width: int) -> SeriesOrIndex: plc_result = plc.strings.padding.zfill( self._column.to_pylibcudf(mode="read"), width ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def center(self, width: int, fillchar: str = " ") -> SeriesOrIndex: @@ -3323,7 +3323,7 @@ def _strip( side, plc.interop.from_arrow(pa.scalar(to_strip, type=pa.string())), ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def strip(self, to_strip: str | None = None) -> SeriesOrIndex: @@ -3568,7 +3568,7 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex: plc_result = plc.strings.wrap.wrap( self._column.to_pylibcudf(mode="read"), width ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: @@ -3642,7 +3642,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: plc_result = plc.strings.contains.count_re( self._column.to_pylibcudf(mode="read"), prog ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def _findall( @@ -3666,7 +3666,7 @@ def _findall( self._column.to_pylibcudf(mode="read"), prog, ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: @@ -3828,7 +3828,7 @@ def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series: self._column.to_pylibcudf(mode="read"), patterns_column.to_pylibcudf(mode="read"), ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return cudf.Series._from_column( result, @@ -3947,7 +3947,7 @@ def _starts_ends_with( plc_result = method( self._column.to_pylibcudf(mode="read"), plc_pat ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def endswith(self, pat: str | Sequence) -> SeriesOrIndex: @@ -4142,7 +4142,7 @@ def _find( start, end, ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def find( @@ -4418,7 +4418,7 @@ def match( plc_result = plc.strings.contains.matches_re( self._column.to_pylibcudf(mode="read"), prog ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def url_decode(self) -> SeriesOrIndex: @@ -4516,7 +4516,7 @@ def code_points(self) -> SeriesOrIndex: plc_column = plc.strings.attributes.code_points( self._column.to_pylibcudf(mode="read") ) - result = Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) return self._return_or_inplace(result, retain_index=False) def translate(self, table: dict) -> SeriesOrIndex: @@ -4564,7 +4564,7 @@ def translate(self, table: dict) -> SeriesOrIndex: plc_result = plc.strings.translate.translate( self._column.to_pylibcudf(mode="read"), table ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def filter_characters( @@ -4623,7 +4623,7 @@ def filter_characters( else plc.strings.translate.FilterType.REMOVE, plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), ) - result = Column.from_pylibcudf(plc_result) + result = ColumnBase.from_pylibcudf(plc_result) return self._return_or_inplace(result) def normalize_spaces(self) -> SeriesOrIndex: @@ -4726,7 +4726,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: """ delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delim, Column): + if isinstance(delim, ColumnBase): result = self._return_or_inplace( self._column.tokenize_column(delim), # type: ignore[arg-type] retain_index=False, @@ -4867,7 +4867,7 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: dtype: int32 """ delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delim, Column): + if isinstance(delim, ColumnBase): return self._return_or_inplace( self._column.count_tokens_column(delim) # type: ignore[arg-type] ) @@ -5770,7 +5770,9 @@ def sum( plc.interop.from_arrow(pa.scalar("")), plc.interop.from_arrow(pa.scalar(None, type=pa.string())), ) - return Column.from_pylibcudf(plc_column).element_indexing(0) + return ColumnBase.from_pylibcudf(plc_column).element_indexing( + 0 + ) else: return result_col @@ -5785,7 +5787,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: plc_column = plc.strings.attributes.count_characters( self.to_pylibcudf(mode="read") ) - result = Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) return (result > np.int8(0)).fillna(False) elif out_dtype.kind in {"i", "u"}: if not self.is_integer().all(): @@ -5835,7 +5837,7 @@ def strptime( plc_column = plc.strings.attributes.count_characters( without_nat.to_pylibcudf(mode="read") ) - char_counts = Column.from_pylibcudf(plc_column) + char_counts = ColumnBase.from_pylibcudf(plc_column) if char_counts.distinct_count(dropna=True) != 1: # Unfortunately disables OK cases like: # ["2020-01-01", "2020-01-01 00:00:00"] @@ -5892,7 +5894,7 @@ def as_decimal_column( self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype), ) - result = Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] return result # type: ignore[return-value] @@ -6058,7 +6060,7 @@ def _binaryop( pa.scalar(None, type=pa.string()) ), ) - return Column.from_pylibcudf(plc_column) + return ColumnBase.from_pylibcudf(plc_column) elif op in { "__eq__", "__ne__", @@ -6073,8 +6075,8 @@ def _binaryop( return binaryop.binaryop(lhs=lhs, rhs=rhs, op=op, dtype="bool") return NotImplemented - @copy_docstring(column.ColumnBase.view) - def view(self, dtype) -> "cudf.core.column.ColumnBase": + @copy_docstring(ColumnBase.view) + def view(self, dtype) -> ColumnBase: if self.null_count > 0: raise ValueError( "Can not produce a view of a string column with nulls" @@ -6218,7 +6220,7 @@ def normalize_spaces(self) -> Self: @acquire_spill_lock() def normalize_characters(self, do_lower: bool = True) -> Self: - return Column.from_pylibcudf( # type: ignore[return-value] + return ColumnBase.from_pylibcudf( # type: ignore[return-value] plc.nvtext.normalize.normalize_characters( self.to_pylibcudf(mode="read"), do_lower, @@ -6376,7 +6378,7 @@ def _modify_characters( Helper function for methods that modify characters e.g. to_lower """ plc_column = method(self.to_pylibcudf(mode="read")) - return cast(Self, Column.from_pylibcudf(plc_column)) + return cast(Self, ColumnBase.from_pylibcudf(plc_column)) def to_lower(self) -> Self: return self._modify_characters(plc.strings.case.to_lower) @@ -6403,7 +6405,7 @@ def replace_multiple(self, pattern: Self, replacements: Self) -> Self: pattern.to_pylibcudf(mode="read"), replacements.to_pylibcudf(mode="read"), ) - return cast(Self, Column.from_pylibcudf(plc_result)) + return cast(Self, ColumnBase.from_pylibcudf(plc_result)) @acquire_spill_lock() def is_hex(self) -> NumericalColumn: @@ -6463,7 +6465,7 @@ def _split_record_re( ), maxsplit, ) - return cast(Self, Column.from_pylibcudf(plc_column)) + return cast(Self, ColumnBase.from_pylibcudf(plc_column)) def split_record_re(self, pattern: str, maxsplit: int) -> Self: return self._split_record_re( @@ -6495,7 +6497,7 @@ def _split_re( ) return dict( enumerate( - Column.from_pylibcudf(col) # type: ignore[misc] + ColumnBase.from_pylibcudf(col) # type: ignore[misc] for col in plc_table.columns() ) ) @@ -6548,7 +6550,7 @@ def _split( ) return dict( enumerate( - Column.from_pylibcudf(col) # type: ignore[misc] + ColumnBase.from_pylibcudf(col) # type: ignore[misc] for col in plc_table.columns() ) ) @@ -6571,7 +6573,7 @@ def _partition( ) return dict( enumerate( - Column.from_pylibcudf(col) # type: ignore[misc] + ColumnBase.from_pylibcudf(col) # type: ignore[misc] for col in plc_table.columns() ) ) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 5bfea45a946..67c29dc59ed 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from collections import abc @@ -9,10 +9,9 @@ import pylibcudf as plc import cudf -from cudf._lib.column import Column from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import as_column +from cudf.core.column import ColumnBase, as_column from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.index import IntervalIndex, interval_range @@ -272,7 +271,7 @@ def cut( if right_inclusive else plc.labeling.Inclusive.NO, ) - index_labels = Column.from_pylibcudf(plc_column) + index_labels = ColumnBase.from_pylibcudf(plc_column) if labels is False: # if labels is false we return the index labels, we return them diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 391ee31f125..6218327063d 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2021-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -24,9 +24,9 @@ import pylibcudf as plc import cudf -from cudf._lib.column import Column from cudf.core.abc import Serializable from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -282,7 +282,7 @@ def _handle_frequency_grouper(self, by): if closed == "right" else plc.labeling.Inclusive.NO, ) - bin_numbers = Column.from_pylibcudf(plc_column) + bin_numbers = ColumnBase.from_pylibcudf(plc_column) if label == "right": cast_bin_labels = cast_bin_labels[1:] diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index eedd777aafe..6b3ec552d67 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -11,7 +11,6 @@ import pylibcudf as plc import cudf -from cudf._lib.column import Column from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 @@ -978,7 +977,7 @@ def _merge_sorted( ) result_columns = [ - Column.from_pylibcudf(col) for col in plc_table.columns() + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() ] return objs[0]._from_columns_like_self( diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index 94ce3001ca1..bfc5a67ab13 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -20,9 +20,8 @@ import rmm from cudf._lib import strings_udf -from cudf._lib.column import Column from cudf.api.types import is_scalar -from cudf.core.column.column import as_column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.dtypes import dtype from cudf.core.udf.masked_typing import MaskedType from cudf.core.udf.strings_typing import ( @@ -333,7 +332,7 @@ def _return_arr_from_dtype(dtype, size): def _post_process_output_col(col, retty): if retty == _cudf_str_dtype: - return Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( strings_udf.column_from_udf_string_array(col) ) return as_column(col, retty) diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py index dcbdd4423fc..7d778471b37 100644 --- a/python/cudf/cudf/io/avro.py +++ b/python/cudf/cudf/io/avro.py @@ -1,9 +1,9 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import pylibcudf as plc import cudf -from cudf._lib.column import Column +from cudf.core.column import ColumnBase from cudf.utils import ioutils @@ -47,7 +47,7 @@ def read_avro( plc_result = plc.io.avro.read_avro(options) data = { - name: Column.from_pylibcudf(col) + name: ColumnBase.from_pylibcudf(col) for name, col in zip( plc_result.column_names(include_children=False), plc_result.columns, diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 7e8468c8e8a..018520a1f5e 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -15,9 +15,9 @@ import pylibcudf as plc import cudf -from cudf._lib.column import Column from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase from cudf.utils import ioutils from cudf.utils.dtypes import ( _maybe_convert_to_default_type, @@ -255,7 +255,7 @@ def read_csv( table_w_meta = plc.io.csv.read_csv(options) data = { - name: Column.from_pylibcudf(col) + name: ColumnBase.from_pylibcudf(col) for name, col in zip( table_w_meta.column_names(include_children=False), table_w_meta.columns, diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 16c7d189dfd..bbd223c6075 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -5,7 +5,7 @@ import warnings from collections import abc from io import BytesIO, StringIO -from typing import TYPE_CHECKING, Any, Literal +from typing import Any, Literal import numpy as np import pandas as pd @@ -13,17 +13,14 @@ import pylibcudf as plc import cudf -from cudf._lib.column import Column from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase from cudf.utils import ioutils from cudf.utils.dtypes import ( _maybe_convert_to_default_type, dtype_to_pylibcudf_type, ) -if TYPE_CHECKING: - from cudf.core.column import ColumnBase - def _get_cudf_schema_element_from_dtype( dtype, @@ -180,7 +177,7 @@ def read_json( ) ) data = { - name: Column.from_pylibcudf(col) + name: ColumnBase.from_pylibcudf(col) for name, col in zip(res_col_names, res_cols, strict=True) } df = cudf.DataFrame._from_data(data) @@ -207,7 +204,7 @@ def read_json( ) ) data = { - name: Column.from_pylibcudf(col) + name: ColumnBase.from_pylibcudf(col) for name, col in zip( table_w_meta.column_names(include_children=False), table_w_meta.columns, diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index 0ac2950a22b..a818f71d924 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -3,16 +3,16 @@ import itertools import warnings -from typing import TYPE_CHECKING, Literal +from typing import Literal import pyarrow as pa import pylibcudf as plc import cudf -from cudf._lib.column import Column from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase from cudf.core.index import _index_from_data from cudf.utils import ioutils from cudf.utils.dtypes import dtype_to_pylibcudf_type @@ -22,9 +22,6 @@ except ImportError: import json -if TYPE_CHECKING: - from cudf.core.column import ColumnBase - @ioutils.doc_read_orc_metadata() def read_orc_metadata(path): @@ -328,14 +325,15 @@ def read_orc( if actual_index_names is None: index = None data = { - name: Column.from_pylibcudf(col) + name: ColumnBase.from_pylibcudf(col) for name, col in zip( result_col_names, tbl_w_meta.columns, strict=True ) } else: result_columns = [ - Column.from_pylibcudf(col) for col in tbl_w_meta.columns + ColumnBase.from_pylibcudf(col) + for col in tbl_w_meta.columns ] index = _index_from_data( dict( diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index feb6e12da8c..37f1978a262 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import io @@ -22,10 +22,9 @@ import pylibcudf as plc import cudf -from cudf._lib.column import Column from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import as_column, column_empty +from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking @@ -40,8 +39,6 @@ from typing_extensions import Self - from cudf.core.column import ColumnBase - BYTE_SIZES = { "kb": 1000, @@ -1235,7 +1232,7 @@ def _read_parquet( tbl._columns[i] = None data = { - name: Column.from_pylibcudf(col) + name: ColumnBase.from_pylibcudf(col) for name, col in zip(column_names, concatenated_columns) } df = cudf.DataFrame._from_data(data) @@ -1279,7 +1276,7 @@ def _read_parquet( tbl_w_meta = plc.io.parquet.read_parquet(options) data = { - name: Column.from_pylibcudf(col) + name: ColumnBase.from_pylibcudf(col) for name, col in zip( tbl_w_meta.column_names(include_children=False), tbl_w_meta.columns, diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 5e266c5ff55..09711bf36b0 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from io import BytesIO, StringIO, TextIOBase @@ -63,6 +63,6 @@ def read_text( byte_range=byte_range, strip_delimiters=strip_delimiters ) plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) - result = cudf._lib.column.Column.from_pylibcudf(plc_column) + result = cudf.core.column.ColumnBase.from_pylibcudf(plc_column) return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py index c1369a03031..67d214e302f 100644 --- a/python/cudf/cudf/tests/test_string_udfs.py +++ b/python/cudf/cudf/tests/test_string_udfs.py @@ -11,11 +11,11 @@ import rmm import cudf -from cudf._lib.column import Column from cudf._lib.strings_udf import ( column_from_udf_string_array, column_to_string_view_array, ) +from cudf.core.column import ColumnBase from cudf.core.udf.strings_typing import ( str_view_arg_handler, string_view, @@ -97,7 +97,9 @@ def run_udf_test(data, func, dtype): with _CUDFNumbaConfig(): sv_kernel.forall(len(data))(str_views, output) if dtype == "str": - result = Column.from_pylibcudf(column_from_udf_string_array(output)) + result = ColumnBase.from_pylibcudf( + column_from_udf_string_array(output) + ) else: result = output @@ -106,7 +108,9 @@ def run_udf_test(data, func, dtype): with _CUDFNumbaConfig(): udf_str_kernel.forall(len(data))(str_views, output) if dtype == "str": - result = Column.from_pylibcudf(column_from_udf_string_array(output)) + result = ColumnBase.from_pylibcudf( + column_from_udf_string_array(output) + ) else: result = output diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 9e932acb5fa..cb2eb80d014 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -628,6 +628,35 @@ def dtype_to_pylibcudf_type(dtype) -> plc.DataType: return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) +def dtype_from_pylibcudf_column(col: plc.Column): + type_ = col.type() + tid = type_.id() + + if tid == plc.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == plc.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == plc.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, scale=-type_.scale() + ) + elif tid == plc.TypeId.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, scale=-type_.scale() + ) + elif tid == plc.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { np.dtype("int8"): plc.types.TypeId.INT8, np.dtype("int16"): plc.types.TypeId.INT16, From 94a96f070809f150d50c0bc4c73b5b75924e6cb2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 15 Jan 2025 19:25:10 -0800 Subject: [PATCH 02/10] Start working through circular import errors --- python/cudf/cudf/core/_base_index.py | 18 +++++-- python/cudf/cudf/core/_internals/copying.py | 12 +++-- python/cudf/cudf/core/_internals/sorting.py | 36 ++++++------- .../cudf/core/_internals/stream_compaction.py | 18 ++++--- python/cudf/cudf/core/column/column.py | 34 +++++++------ .../cudf/cudf/core/column/numerical_base.py | 7 ++- python/cudf/cudf/core/frame.py | 31 ++++++----- python/cudf/cudf/core/groupby/groupby.py | 17 ++++--- python/cudf/cudf/core/indexed_frame.py | 51 +++++++++++-------- python/cudf/cudf/core/join/join.py | 24 +++++---- 10 files changed, 141 insertions(+), 107 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 5f90439f86f..9af05af8120 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2032,10 +2032,13 @@ def dropna(self, how="any"): data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( - drop_nulls( - data_columns, - how=how, - ), + [ + ColumnBase.from_pylibcudf(col) + for col in drop_nulls( + data_columns, + how=how, + ) + ], self._column_names, ) @@ -2103,7 +2106,12 @@ def _apply_boolean_mask(self, boolean_mask): raise ValueError("boolean_mask is not boolean type.") return self._from_columns_like_self( - apply_boolean_mask(list(self._columns), boolean_mask), + [ + ColumnBase.from_pylibcudf(col) + for col in apply_boolean_mask( + list(self._columns), boolean_mask + ) + ], column_names=self._column_names, ) diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py index 35d75c37c37..0e6a6d4ff41 100644 --- a/python/cudf/cudf/core/_internals/copying.py +++ b/python/cudf/cudf/core/_internals/copying.py @@ -6,12 +6,14 @@ import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase if TYPE_CHECKING: from collections.abc import Iterable from cudf import Scalar + + # ruff does not identify that there's a relative import in use + from cudf.core.column import ColumnBase # noqa: TC004 from cudf.core.column.numerical import NumericalColumn @@ -20,7 +22,7 @@ def gather( columns: Iterable[ColumnBase], gather_map: NumericalColumn, nullify: bool = False, -) -> list[ColumnBase]: +) -> list[plc.Column]: plc_tbl = plc.copying.gather( plc.Table([col.to_pylibcudf(mode="read") for col in columns]), gather_map.to_pylibcudf(mode="read"), @@ -28,7 +30,7 @@ def gather( if nullify else plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - return [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()] + return plc_tbl.columns() @acquire_spill_lock() @@ -69,13 +71,13 @@ def scatter( plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]), ) - return [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()] + return plc_tbl.columns() @acquire_spill_lock() def columns_split( input_columns: Iterable[ColumnBase], splits: list[int] -) -> list[list[ColumnBase]]: +) -> list[list[plc.Column]]: return [ [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()] for plc_tbl in plc.copying.split( diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py index 8dbb169bb83..5e6f23f1368 100644 --- a/python/cudf/cudf/core/_internals/sorting.py +++ b/python/cudf/cudf/core/_internals/sorting.py @@ -7,11 +7,12 @@ import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase if TYPE_CHECKING: from collections.abc import Iterable + from cudf.core.column import ColumnBase + @acquire_spill_lock() def is_sorted( @@ -118,7 +119,7 @@ def order_by( na_position: Literal["first", "last"], *, stable: bool, -): +) -> plc.Column: """ Get index to sort the table in ascending/descending order. @@ -144,14 +145,12 @@ def order_by( func = ( plc.sorting.stable_sorted_order if stable else plc.sorting.sorted_order ) - return ColumnBase.from_pylibcudf( - func( - plc.Table( - [col.to_pylibcudf(mode="read") for col in columns_from_table], - ), - order[0], - order[1], - ) + return func( + plc.Table( + [col.to_pylibcudf(mode="read") for col in columns_from_table], + ), + order[0], + order[1], ) @@ -163,7 +162,7 @@ def sort_by_key( na_position: list[Literal["first", "last"]], *, stable: bool, -) -> list[ColumnBase]: +) -> list[plc.Column]: """ Sort a table by given keys @@ -192,12 +191,9 @@ def sort_by_key( func = ( plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key ) - return [ - ColumnBase.from_pylibcudf(col) - for col in func( - plc.Table([col.to_pylibcudf(mode="read") for col in values]), - plc.Table([col.to_pylibcudf(mode="read") for col in keys]), - order[0], - order[1], - ).columns() - ] + return func( + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + plc.Table([col.to_pylibcudf(mode="read") for col in keys]), + order[0], + order[1], + ).columns() diff --git a/python/cudf/cudf/core/_internals/stream_compaction.py b/python/cudf/cudf/core/_internals/stream_compaction.py index d82671bd3f0..57a655688c4 100644 --- a/python/cudf/cudf/core/_internals/stream_compaction.py +++ b/python/cudf/cudf/core/_internals/stream_compaction.py @@ -1,12 +1,14 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations -from typing import Literal +from typing import TYPE_CHECKING, Literal import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase + +if TYPE_CHECKING: + from cudf.core.column import ColumnBase @acquire_spill_lock() @@ -15,7 +17,7 @@ def drop_nulls( how: Literal["any", "all"] = "any", keys: list[int] | None = None, thresh: int | None = None, -) -> list[ColumnBase]: +) -> list[plc.Column]: """ Drops null rows from cols depending on key columns. @@ -50,13 +52,13 @@ def drop_nulls( keys, keep_threshold, ) - return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()] + return plc_table.columns() @acquire_spill_lock() def apply_boolean_mask( columns: list[ColumnBase], boolean_mask: ColumnBase -) -> list[ColumnBase]: +) -> list[plc.Column]: """ Drops the rows which correspond to False in boolean_mask. @@ -73,7 +75,7 @@ def apply_boolean_mask( plc.Table([col.to_pylibcudf(mode="read") for col in columns]), boolean_mask.to_pylibcudf(mode="read"), ) - return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()] + return plc_table.columns() @acquire_spill_lock() @@ -82,7 +84,7 @@ def drop_duplicates( keys: list[int] | None = None, keep: Literal["first", "last", False] = "first", nulls_are_equal: bool = True, -) -> list[ColumnBase]: +) -> list[plc.Column]: """ Drops rows in source_table as per duplicate rows in keys. @@ -115,4 +117,4 @@ def drop_duplicates( else plc.types.NullEquality.UNEQUAL, plc.types.NanEquality.ALL_EQUAL, ) - return [ColumnBase.from_pylibcudf(col) for col in plc_table.columns()] + return plc_table.columns() diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0734c85e1a5..53319d84520 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -719,7 +719,9 @@ def any(self, skipna: bool = True) -> bool: def dropna(self) -> Self: if self.has_nulls(): - return drop_nulls([self])[0]._with_type_metadata(self.dtype) # type: ignore[return-value] + return ColumnBase.from_pylibcudf( + drop_nulls([self])[0] + )._with_type_metadata(self.dtype) # type: ignore[return-value] else: return self.copy() @@ -1300,9 +1302,9 @@ def indices_of( else: value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) - return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask - )[0] + return as_column( + range(len(self)), dtype=SIZE_TYPE_DTYPE + ).apply_boolean_mask(mask) # type: ignore[return-value] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: indices = self.indices_of(value) @@ -1682,9 +1684,9 @@ def apply_boolean_mask(self, mask) -> ColumnBase: if mask.dtype.kind != "b": raise ValueError("boolean_mask is not boolean type.") - return apply_boolean_mask([self], mask)[0]._with_type_metadata( - self.dtype - ) + return ColumnBase.from_pylibcudf( + apply_boolean_mask([self], mask)[0] + )._with_type_metadata(self.dtype) def argsort( self, @@ -1705,8 +1707,8 @@ def argsort( as_column(range(len(self) - 1, -1, -1)), ) else: - return sorting.order_by( - [self], [ascending], na_position, stable=True + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + sorting.order_by([self], [ascending], na_position, stable=True) ) def __arrow_array__(self, type=None): @@ -1772,9 +1774,11 @@ def unique(self) -> Self: if self.is_unique: return self.copy() else: - return drop_duplicates([self], keep="first")[ # type: ignore[return-value] - 0 - ]._with_type_metadata(self.dtype) + return ColumnBase.from_pylibcudf( + drop_duplicates([self], keep="first")[ # type: ignore[return-value] + 0 + ] + )._with_type_metadata(self.dtype) def serialize(self) -> tuple[dict, list]: # data model: @@ -2010,10 +2014,10 @@ def _return_sentinel_column(): del right_rows # reorder `codes` so that its values correspond to the # values of `self`: - (codes,) = sorting.sort_by_key( + plc_codes = sorting.sort_by_key( [codes], [left_gather_map], [True], ["last"], stable=True - ) - return codes.fillna(na_sentinel.value) + )[0] + return ColumnBase.from_pylibcudf(plc_codes).fillna(na_sentinel.value) @acquire_spill_lock() def copy_if_else( diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 689d5132d45..e5768f090f2 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. """Define an interface for columns that can perform numerical operations.""" from __future__ import annotations @@ -10,7 +10,6 @@ import pylibcudf as plc import cudf -from cudf.core._internals import sorting from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.missing import NA @@ -145,8 +144,8 @@ def quantile( else: no_nans = self.nans_to_nulls() # get sorted indices and exclude nulls - indices = sorting.order_by( - [no_nans], [True], "first", stable=True + indices = no_nans.argsort( + ascending=True, na_position="first" ).slice(no_nans.null_count, len(no_nans)) with acquire_spill_lock(): plc_column = plc.quantiles.quantile( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index abf9f7b3686..2502e368c3c 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -19,7 +19,6 @@ # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. -from cudf import _lib as libcudf from cudf.api.types import is_dtype_equal, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core._internals import copying, sorting @@ -965,9 +964,9 @@ def from_arrow(cls, data: pa.Table) -> Self: for name, plc_codes in zip( dict_indices_table.column_names, plc_indices.columns() ): - codes = libcudf.column.Column.from_pylibcudf(plc_codes) + codes = ColumnBase.from_pylibcudf(plc_codes) categories = cudf_dictionaries_columns[name] - codes = as_unsigned_codes(len(categories), codes) + codes = as_unsigned_codes(len(categories), codes) # type: ignore[arg-type] cudf_category_frame[name] = CategoricalColumn( data=None, size=codes.size, @@ -981,7 +980,7 @@ def from_arrow(cls, data: pa.Table) -> Self: # Handle non-dict arrays cudf_non_category_frame = { - name: libcudf.column.Column.from_pylibcudf(plc_col) + name: ColumnBase.from_pylibcudf(plc_col) for name, plc_col in zip( data.column_names, plc.interop.from_arrow(data).columns() ) @@ -1474,11 +1473,13 @@ def _get_sorted_inds( else: ascending_lst = list(ascending) - return sorting.order_by( - list(to_sort), - ascending_lst, - na_position, - stable=True, + return ColumnBase.from_pylibcudf( + sorting.order_by( + list(to_sort), + ascending_lst, + na_position, + stable=True, + ) ) @_performance_tracking @@ -1487,7 +1488,10 @@ def _split(self, splits: list[int]) -> list[Self]: Frames of length `len(splits) + 1`. """ return [ - self._from_columns_like_self(split, self._column_names) + self._from_columns_like_self( + [ColumnBase.from_pylibcudf(col) for col in split], + self._column_names, + ) for split in copying.columns_split(self._columns, splits) ] @@ -1497,10 +1501,9 @@ def _encode(self): plc.Table([col.to_pylibcudf(mode="read") for col in self._columns]) ) columns = [ - libcudf.column.Column.from_pylibcudf(col) - for col in plc_table.columns() + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() ] - indices = libcudf.column.Column.from_pylibcudf(plc_column) + indices = ColumnBase.from_pylibcudf(plc_column) keys = self._from_columns_like_self(columns) return keys, indices @@ -1951,7 +1954,7 @@ def _repeat( if isinstance(repeats, ColumnBase): repeats = repeats.to_pylibcudf(mode="read") return [ - libcudf.column.Column.from_pylibcudf(col) + ColumnBase.from_pylibcudf(col) for col in plc.filling.repeat(plc_table, repeats).columns() ] diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 7bc4b08fc49..ab6e67ab248 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -592,9 +592,12 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: ] ) - group_keys = cudf.core._internals.stream_compaction.drop_duplicates( - group_keys - ) + group_keys = [ + ColumnBase.from_pylibcudf(col) + for col in cudf.core._internals.stream_compaction.drop_duplicates( + group_keys + ) + ] if len(group_keys) > 1: index = cudf.MultiIndex.from_arrays(group_keys) else: @@ -1084,16 +1087,18 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): # want, and right order is a matching gather map for # the result table. Get the correct order by sorting # the right gather map. - (right_order,) = sorting.sort_by_key( + right_order = sorting.sort_by_key( [right_order], [left_order], [True], ["first"], stable=False, - ) + )[0] result = result._gather( GatherMap.from_column_unchecked( - right_order, len(result), nullify=False + ColumnBase.from_pylibcudf(right_order), + len(result), + nullify=False, ) ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 4c6f8a9c152..fe2498190aa 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3122,14 +3122,17 @@ def drop_duplicates( subset, offset_by_index_columns=not ignore_index ) return self._from_columns_like_self( - cudf.core._internals.stream_compaction.drop_duplicates( - list(self._columns) - if ignore_index - else list(self.index._columns + self._columns), - keys=keys, - keep=keep, - nulls_are_equal=nulls_are_equal, - ), + [ + ColumnBase.from_pylibcudf(col) + for col in cudf.core._internals.stream_compaction.drop_duplicates( + list(self._columns) + if ignore_index + else list(self.index._columns + self._columns), + keys=keys, + keep=keep, + nulls_are_equal=nulls_are_equal, + ) + ], self._column_names, self.index.names if not ignore_index else None, ) @@ -3304,7 +3307,7 @@ def _split(self, splits, keep_index: bool = True) -> list[Self]: return [ self._from_columns_like_self( - split, + [ColumnBase.from_pylibcudf(col) for col in split], self._column_names, self.index.names if keep_index else None, ) @@ -4378,12 +4381,15 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): data_columns = [col.nans_to_nulls() for col in self._columns] return self._from_columns_like_self( - cudf.core._internals.stream_compaction.drop_nulls( - [*self.index._columns, *data_columns], - how=how, - keys=self._positions_from_column_names(subset), - thresh=thresh, - ), + [ + ColumnBase.from_pylibcudf(col) + for col in cudf.core._internals.stream_compaction.drop_nulls( + [*self.index._columns, *data_columns], + how=how, + keys=self._positions_from_column_names(subset), + thresh=thresh, + ) + ], self._column_names, self.index.names, ) @@ -4401,12 +4407,15 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): f"{len(boolean_mask.column)} not {len(self)}" ) return self._from_columns_like_self( - cudf.core._internals.stream_compaction.apply_boolean_mask( - list(self.index._columns + self._columns) - if keep_index - else list(self._columns), - boolean_mask.column, - ), + [ + ColumnBase.from_pylibcudf(col) + for col in cudf.core._internals.stream_compaction.apply_boolean_mask( + list(self.index._columns + self._columns) + if keep_index + else list(self._columns), + boolean_mask.column, + ) + ], column_names=self._column_names, index_names=self.index.names if keep_index else None, ) diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 1159cd1c845..4db2eab73d0 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -10,6 +10,7 @@ from cudf import _lib as libcudf from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock +from cudf.core.column import ColumnBase from cudf.core.copy_types import GatherMap from cudf.core.join._join_helpers import ( _coerce_to_tuple, @@ -266,14 +267,17 @@ def _gather_maps(self, left_cols, right_cols): ) for map_, n, null in zip(maps, lengths, nullify) ] - return sorting.sort_by_key( - list(maps), - # If how is right, right map is primary sort key. - key_order[:: -1 if self.how == "right" else 1], - [True] * len(key_order), - ["last"] * len(key_order), - stable=True, - ) + return [ + ColumnBase.from_pylibcudf(col) + for col in sorting.sort_by_key( + list(maps), + # If how is right, right map is primary sort key. + key_order[:: -1 if self.how == "right" else 1], + [True] * len(key_order), + ["last"] * len(key_order), + stable=True, + ) + ] def perform_merge(self) -> cudf.DataFrame: left_join_cols = [] @@ -444,7 +448,9 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: stable=True, ) result = result._from_columns_like_self( - result_columns, result._column_names, index_names + [ColumnBase.from_pylibcudf(col) for col in result_columns], + result._column_names, + index_names, ) return result From 119c0fdc3a1298f21417924c974f9064cbdfd9b8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Jan 2025 09:50:53 -0800 Subject: [PATCH 03/10] Remove other access of cudf._lib.column as libcudf --- python/cudf/cudf/core/_internals/timezones.py | 12 ++++++---- python/cudf/cudf/core/column/datetime.py | 7 +++--- python/cudf/cudf/core/column/string.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 18 ++++++-------- python/cudf/cudf/core/groupby/groupby.py | 9 +++---- python/cudf/cudf/core/index.py | 7 +++--- python/cudf/cudf/core/indexed_frame.py | 20 +++++++--------- python/cudf/cudf/core/join/join.py | 24 +++++++++---------- python/cudf/cudf/core/multiindex.py | 6 ++--- python/cudf/cudf/core/tools/datetimes.py | 5 ++-- python/cudf/cudf/core/window/rolling.py | 9 ++----- 11 files changed, 52 insertions(+), 69 deletions(-) diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index 0ccd517cae0..8a1c4a6d22b 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -13,7 +13,6 @@ import pylibcudf as plc import cudf -from cudf.core.column import ColumnBase if TYPE_CHECKING: from cudf.core.column.datetime import DatetimeColumn @@ -116,9 +115,7 @@ def _read_tzfile_as_columns( plc_table = plc.io.timezone.make_timezone_transition_table( tzdir, zone_name ) - transition_times_and_offsets = [ - ColumnBase.from_pylibcudf(col) for col in plc_table.columns() - ] + transition_times_and_offsets = plc_table.columns() if not transition_times_and_offsets: from cudf.core.column.column import as_column @@ -126,7 +123,12 @@ def _read_tzfile_as_columns( # this happens for UTC-like zones min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]") return (as_column([min_date]), as_column([np.timedelta64(0, "s")])) # type: ignore[return-value] - return tuple(transition_times_and_offsets) # type: ignore[return-value] + + from cudf.core.column import ColumnBase + + return tuple( + ColumnBase.from_pylibcudf(col) for col in transition_times_and_offsets + ) # type: ignore[return-value] def check_ambiguous_and_nonexistent( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 1bde7d27700..f58ffb105b7 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -19,7 +19,6 @@ import cudf import cudf.core.column.column as column -from cudf import _lib as libcudf from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals import binaryop, unary from cudf.core._internals.search import search_sorted @@ -908,7 +907,7 @@ def _find_ambiguous_and_nonexistent( ambiguous_end.to_pylibcudf(mode="read"), plc.labeling.Inclusive.NO, ) - ambiguous = libcudf.column.Column.from_pylibcudf(plc_column) + ambiguous = ColumnBase.from_pylibcudf(plc_column) ambiguous = ambiguous.notnull() # At the start of a non-existent time period, Clock 2 reads less @@ -927,10 +926,10 @@ def _find_ambiguous_and_nonexistent( nonexistent_end.to_pylibcudf(mode="read"), plc.labeling.Inclusive.NO, ) - nonexistent = libcudf.column.Column.from_pylibcudf(plc_column) + nonexistent = ColumnBase.from_pylibcudf(plc_column) nonexistent = nonexistent.notnull() - return ambiguous, nonexistent + return ambiguous, nonexistent # type: ignore[return-value] def tz_localize( self, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index cd38bfdf934..57724938368 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4079,7 +4079,7 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex: ends_column = self.endswith(suffix)._column # type: ignore[union-attr] removed_column = self.slice(0, -len(suffix), None)._column # type: ignore[union-attr] - result = removed_column.copy_if_else(self._column, ends_column) + result = removed_column.copy_if_else(self._column, ends_column) # type: ignore[arg-type] return self._return_or_inplace(result) def removeprefix(self, prefix: str) -> SeriesOrIndex: @@ -4117,7 +4117,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: return self._return_or_inplace(self._column) starts_column = self.startswith(prefix)._column # type: ignore[union-attr] removed_column = self.slice(len(prefix), None, None)._column # type: ignore[union-attr] - result = removed_column.copy_if_else(self._column, starts_column) + result = removed_column.copy_if_else(self._column, starts_column) # type: ignore[arg-type] return self._return_or_inplace(result) def _find( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1bddf8f4553..799271c86bc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -36,7 +36,6 @@ import cudf import cudf.core.common -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -2502,8 +2501,7 @@ def scatter_by_map( map_size, ) partitioned_columns = [ - libcudf.column.Column.from_pylibcudf(col) - for col in plc_table.columns() + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() ] partitioned = self._from_columns_like_self( @@ -4127,7 +4125,7 @@ def transpose(self): ) ) result_columns = [ - libcudf.column.Column.from_pylibcudf(col, data_ptr_exposed=True) + ColumnBase.from_pylibcudf(col, data_ptr_exposed=True) for col in result_table.columns() ] @@ -5035,8 +5033,7 @@ def partition_by_hash( nparts, ) output_columns = [ - libcudf.column.Column.from_pylibcudf(col) - for col in plc_table.columns() + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() ] outdf = self._from_columns_like_self( @@ -7245,8 +7242,7 @@ def stack( self.shape[0], ) tiled_index = [ - libcudf.column.Column.from_pylibcudf(plc) - for plc in plc_table.columns() + ColumnBase.from_pylibcudf(plc) for plc in plc_table.columns() ] # Assemble the final index @@ -7325,7 +7321,7 @@ def unnamed_group_generator(): ) with acquire_spill_lock(): - interleaved_col = libcudf.column.Column.from_pylibcudf( + interleaved_col = ColumnBase.from_pylibcudf( plc.reshape.interleave_columns( plc.Table( [ @@ -7839,7 +7835,7 @@ def interleave_columns(self): "interleave_columns does not support 'category' dtype." ) with acquire_spill_lock(): - result_col = libcudf.column.Column.from_pylibcudf( + result_col = ColumnBase.from_pylibcudf( plc.reshape.interleave_columns( plc.Table( [ @@ -7860,7 +7856,7 @@ def _compute_column(self, expr: str) -> ColumnBase: ), plc.expressions.to_expression(expr, self._column_names), ) - return libcudf.column.Column.from_pylibcudf(plc_column) + return ColumnBase.from_pylibcudf(plc_column) @_performance_tracking def eval(self, expr: str, inplace: bool = False, **kwargs): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ab6e67ab248..f2f9d3e2116 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -20,7 +20,6 @@ import cudf import cudf.core._internals -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -1079,10 +1078,8 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): plc_tables[1], plc.types.NullEquality.EQUAL, ) - left_order = libcudf.column.Column.from_pylibcudf(left_plc) - right_order = libcudf.column.Column.from_pylibcudf( - right_plc - ) + left_order = ColumnBase.from_pylibcudf(left_plc) + right_order = ColumnBase.from_pylibcudf(right_plc) # left order is some permutation of the ordering we # want, and right order is a matching gather map for # the result table. Get the correct order by sorting @@ -2518,7 +2515,7 @@ def _cov_or_corr(self, func, method_name): @acquire_spill_lock() def interleave_columns(source_columns): - return libcudf.column.Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( plc.reshape.interleave_columns( plc.Table( [c.to_pylibcudf(mode="read") for c in source_columns] diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0d1bf552982..e55d18f3a81 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -18,7 +18,6 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -1364,8 +1363,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): plc.Table([rcol.to_pylibcudf(mode="read")]), plc.types.NullEquality.EQUAL, ) - scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) - indices = libcudf.column.Column.from_pylibcudf(right_plc) + scatter_map = ColumnBase.from_pylibcudf(left_plc) + indices = ColumnBase.from_pylibcudf(right_plc) result = copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result) @@ -3383,7 +3382,7 @@ def interval_range( freq = freq.astype(common_dtype) with acquire_spill_lock(): - bin_edges = libcudf.column.Column.from_pylibcudf( + bin_edges = ColumnBase.from_pylibcudf( plc.filling.sequence( size=periods + 1, init=start.device_value.c_value, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fe2498190aa..53b57392dbc 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -25,7 +25,6 @@ import pylibcudf as plc import cudf -import cudf._lib as libcudf import cudf.core import cudf.core._internals import cudf.core.algorithms @@ -2939,7 +2938,7 @@ def hash_values( plc_column = plc.hashing.sha512(plc_table) else: raise ValueError(f"Unsupported hashing algorithm {method}.") - result = libcudf.column.Column.from_pylibcudf(plc_column) + result = ColumnBase.from_pylibcudf(plc_column) return cudf.Series._from_column( result, index=self.index, @@ -3057,7 +3056,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: [start, stop], ) sliced = [ - libcudf.column.Column.from_pylibcudf(col) + ColumnBase.from_pylibcudf(col) for col in plc_tables[0].columns() ] result = self._from_columns_like_self( @@ -3257,10 +3256,10 @@ def duplicated( plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) - distinct = libcudf.column.Column.from_pylibcudf(plc_column) + distinct = ColumnBase.from_pylibcudf(plc_column) result = copying.scatter( [cudf.Scalar(False)], - distinct, + distinct, # type: ignore[arg-type] [as_column(True, length=len(self), dtype=bool)], bounds_check=False, )[0] @@ -3282,8 +3281,7 @@ def _empty_like(self, keep_index: bool = True) -> Self: ) ) columns = [ - libcudf.column.Column.from_pylibcudf(col) - for col in plc_table.columns() + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() ] result = self._from_columns_like_self( columns, @@ -5391,8 +5389,7 @@ def _explode(self, explode_column: Any, ignore_index: bool): column_index + len(idx_cols), ) exploded = [ - libcudf.column.Column.from_pylibcudf(col) - for col in plc_table.columns() + ColumnBase.from_pylibcudf(col) for col in plc_table.columns() ] # We must copy inner datatype of the exploded list column to # maintain struct dtype key names @@ -5449,8 +5446,7 @@ def tile(self, count: int): count, ) tiled = [ - libcudf.column.Column.from_pylibcudf(plc) - for plc in plc_table.columns() + ColumnBase.from_pylibcudf(plc) for plc in plc_table.columns() ] return self._from_columns_like_self( tiled, @@ -6455,7 +6451,7 @@ def rank( with acquire_spill_lock(): result_columns = [ - libcudf.column.Column.from_pylibcudf( + ColumnBase.from_pylibcudf( plc.sorting.rank( col.to_pylibcudf(mode="read"), method_enum, diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 4db2eab73d0..0f8508a5df1 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -7,7 +7,6 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ColumnBase @@ -25,10 +24,10 @@ class Merge: @staticmethod @acquire_spill_lock() def _joiner( - lhs: list[libcudf.column.Column], - rhs: list[libcudf.column.Column], + lhs: list[ColumnBase], + rhs: list[ColumnBase], how: str, - ) -> tuple[libcudf.column.Column, libcudf.column.Column]: + ) -> tuple[ColumnBase, ColumnBase]: if how == "outer": how = "full" if (join_func := getattr(plc.join, f"{how}_join", None)) is None: @@ -39,9 +38,10 @@ def _joiner( plc.Table([col.to_pylibcudf(mode="read") for col in rhs]), plc.types.NullEquality.EQUAL, ) - return libcudf.column.Column.from_pylibcudf( - left_rows - ), libcudf.column.Column.from_pylibcudf(right_rows) + return ( + ColumnBase.from_pylibcudf(left_rows), + ColumnBase.from_pylibcudf(right_rows), + ) def __init__( self, @@ -574,11 +574,11 @@ def _validate_merge_params( class MergeSemi(Merge): @staticmethod @acquire_spill_lock() - def _joiner( - lhs: list[libcudf.column.Column], - rhs: list[libcudf.column.Column], + def _joiner( # type: ignore[override] + lhs: list[ColumnBase], + rhs: list[ColumnBase], how: str, - ) -> tuple[libcudf.column.Column, None]: + ) -> tuple[ColumnBase, None]: if ( join_func := getattr( plc.join, f"{how.replace('left', 'left_')}_join", None @@ -586,7 +586,7 @@ def _joiner( ) is None: raise ValueError(f"Invalid join type {how}") - return libcudf.column.Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( join_func( plc.Table([col.to_pylibcudf(mode="read") for col in lhs]), plc.Table([col.to_pylibcudf(mode="read") for col in rhs]), diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a6cad4cc5dc..9eca77c3212 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -16,7 +16,6 @@ import pylibcudf as plc import cudf -import cudf._lib as libcudf from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -24,6 +23,7 @@ from cudf.core._internals import copying, sorting from cudf.core.algorithms import factorize from cudf.core.buffer import acquire_spill_lock +from cudf.core.column.column import ColumnBase from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame from cudf.core.index import ( @@ -1962,8 +1962,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): plc_tables[1], plc.types.NullEquality.EQUAL, ) - scatter_map = libcudf.column.Column.from_pylibcudf(left_plc) - indices = libcudf.column.Column.from_pylibcudf(right_plc) + scatter_map = ColumnBase.from_pylibcudf(left_plc) + indices = ColumnBase.from_pylibcudf(right_plc) result = copying.scatter([indices], scatter_map, [result])[0] result_series = cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 8be336021b1..06e1f0f52cd 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import math @@ -14,7 +14,6 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf from cudf.api.types import is_integer, is_scalar from cudf.core import column from cudf.core.buffer import acquire_spill_lock @@ -995,7 +994,7 @@ def date_range( "months", 0 ) with acquire_spill_lock(): - res = libcudf.column.Column.from_pylibcudf( + res = column.ColumnBase.from_pylibcudf( plc.filling.calendrical_month_sequence( periods, start.device_value.c_value, diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index e2c332f34f5..5fa130b9d05 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -2,7 +2,6 @@ from __future__ import annotations import warnings -from typing import TYPE_CHECKING import numba import numpy as np @@ -12,18 +11,14 @@ import pylibcudf as plc import cudf -from cudf import _lib as libcudf from cudf.api.types import is_integer, is_number from cudf.core._internals.aggregation import make_aggregation from cudf.core.buffer import acquire_spill_lock -from cudf.core.column.column import as_column +from cudf.core.column.column import ColumnBase, as_column from cudf.core.mixins import Reducible from cudf.utils import cudautils from cudf.utils.utils import GetAttrGetItemMixin -if TYPE_CHECKING: - from cudf.core.column.column import ColumnBase - class _RollingBase: """ @@ -301,7 +296,7 @@ def _apply_agg_column(self, source_column, agg_name): pre = window fwd = 0 - return libcudf.column.Column.from_pylibcudf( + return ColumnBase.from_pylibcudf( plc.rolling.rolling_window( source_column.to_pylibcudf(mode="read"), pre, From 00d8709146644ddf29a66c739f25031468619b11 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:29:48 -0800 Subject: [PATCH 04/10] Fix more tests --- python/cudf/cudf/api/types.py | 3 ++- python/cudf/cudf/core/_base_index.py | 7 +++++- python/cudf/cudf/core/_internals/copying.py | 4 +++- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 24 +++++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 17 +++++++++------ 6 files changed, 37 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index cad4b1aa72c..cb26a528717 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -17,6 +17,7 @@ from pandas.api import types as pd_types import cudf +from cudf._lib.scalar import DeviceScalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.dtypes import ( # noqa: F401 _BaseDtype, @@ -143,7 +144,7 @@ def is_scalar(val): val, ( cudf.Scalar, - cudf._lib.scalar.DeviceScalar, + DeviceScalar, cudf.core.tools.datetimes.DateOffset, pa.Scalar, ), diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 9af05af8120..560cf9973db 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -2057,7 +2057,12 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): GatherMap(gather_map, len(self), nullify=not check_bounds or nullify) return self._from_columns_like_self( - copying.gather(self._columns, gather_map, nullify=nullify), + [ + ColumnBase.from_pylibcudf(col) + for col in copying.gather( + self._columns, gather_map, nullify=nullify + ) + ], self._column_names, ) diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py index 0e6a6d4ff41..f0d421bfb04 100644 --- a/python/cudf/cudf/core/_internals/copying.py +++ b/python/cudf/cudf/core/_internals/copying.py @@ -63,9 +63,11 @@ def scatter( f"index out of bounds for column of size {n_rows}" ) + from cudf.core import column + plc_tbl = plc.copying.scatter( plc.Table([col.to_pylibcudf(mode="read") for col in sources]) # type: ignore[union-attr] - if isinstance(sources[0], ColumnBase) + if isinstance(sources[0], column.ColumnBase) else [slr.device_value.c_value for slr in sources], # type: ignore[union-attr] scatter_map.to_pylibcudf(mode="read"), plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]), diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 7e4985f8819..4b7cb9aef03 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -504,7 +504,7 @@ class CategoricalColumn(column.ColumnBase): """ dtype: CategoricalDtype - _children: tuple[NumericalColumn] + _children: tuple[NumericalColumn] # type: ignore[assignment] _VALID_REDUCTIONS = { "max", "min", diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 53319d84520..8ee3122d8ab 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -125,6 +125,10 @@ def __init__( self._dtype = dtype self._offset = offset self._null_count = null_count + self._mask = None + self._base_mask = None + self._data = None + self._children = None self.set_base_children(children) self.set_base_data(data) self.set_base_mask(mask) @@ -152,7 +156,7 @@ def data(self) -> None | Buffer: if self._data is None: # type: ignore[has-type] start = self.offset * self.dtype.itemsize end = start + self.size * self.dtype.itemsize - self._data = self.base_data[start:end] + self._data = self.base_data[start:end] # type: ignore[assignment] return self._data @property @@ -187,10 +191,10 @@ def base_mask(self) -> None | Buffer: def mask(self) -> None | Buffer: if self._mask is None: # type: ignore[has-type] if self.base_mask is None or self.offset == 0: - self._mask = self.base_mask + self._mask = self.base_mask # type: ignore[assignment] else: with acquire_spill_lock(): - self._mask = as_buffer( + self._mask = as_buffer( # type: ignore[assignment] plc.null_mask.copy_bitmask( self.to_pylibcudf(mode="read") ) @@ -236,7 +240,7 @@ def set_base_mask(self, value: None | Buffer) -> None: self._mask = None self._children = None - self._base_mask = value + self._base_mask = value # type: ignore[assignment] self._clear_cache() def _clear_cache(self) -> None: @@ -1174,9 +1178,9 @@ def _scatter_by_column( ._with_type_metadata(self.dtype) ) else: - return copying.scatter([value], key, [self])[ - 0 - ]._with_type_metadata(self.dtype) + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + copying.scatter([value], key, [self])[0] + )._with_type_metadata(self.dtype) def _check_scatter_key_length( self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase @@ -1385,8 +1389,10 @@ def take( if indices.dtype.kind not in {"u", "i"}: indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) - gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] - return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] + gathered = ColumnBase.from_pylibcudf( + copying.gather([self], indices, nullify=nullify)[0] # type: ignore[arg-type] + ) + return gathered._with_type_metadata(self.dtype) # type: ignore[return-value] def isin(self, values: Sequence) -> ColumnBase: """Check whether values are contained in the Column. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 53b57392dbc..2db409fc3bb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2960,13 +2960,16 @@ def _gather( if not gather_map.nullify and len(self) != gather_map.nrows: raise IndexError("Gather map is out of bounds") return self._from_columns_like_self( - copying.gather( - itertools.chain(self.index._columns, self._columns) - if keep_index - else self._columns, - gather_map.column, - nullify=gather_map.nullify, - ), + [ + ColumnBase.from_pylibcudf(col) + for col in copying.gather( + itertools.chain(self.index._columns, self._columns) + if keep_index + else self._columns, + gather_map.column, + nullify=gather_map.nullify, + ) + ], self._column_names, self.index.names if keep_index else None, ) From bff0fa58c3364d4a3ad26f7dedc0b25f7c4e4a69 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 Jan 2025 19:06:10 -0800 Subject: [PATCH 05/10] Add column_from_self_view, and fix other tests --- python/cudf/cudf/core/_base_index.py | 13 +++++++---- python/cudf/cudf/core/_internals/copying.py | 4 ++-- python/cudf/cudf/core/column/column.py | 23 +++++++++---------- .../cudf/cudf/core/column/numerical_base.py | 8 ++++--- python/cudf/cudf/core/index.py | 4 +++- python/cudf/cudf/core/indexed_frame.py | 14 ++++++----- python/cudf/cudf/core/multiindex.py | 4 +++- python/pylibcudf/pylibcudf/column.pxd | 3 ++- python/pylibcudf/pylibcudf/column.pyi | 1 + python/pylibcudf/pylibcudf/column.pyx | 13 ++++++++++- 10 files changed, 55 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 560cf9973db..570e1f669b0 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1945,11 +1945,14 @@ def drop_duplicates( # This utilizes the fact that all `Index` is also a `Frame`. # Except RangeIndex. return self._from_columns_like_self( - drop_duplicates( - list(self._columns), - keep=keep, - nulls_are_equal=nulls_are_equal, - ), + [ + ColumnBase.from_pylibcudf(col) + for col in drop_duplicates( + list(self._columns), + keep=keep, + nulls_are_equal=nulls_are_equal, + ) + ], self._column_names, ) diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py index f0d421bfb04..f9cafa2652f 100644 --- a/python/cudf/cudf/core/_internals/copying.py +++ b/python/cudf/cudf/core/_internals/copying.py @@ -13,7 +13,7 @@ from cudf import Scalar # ruff does not identify that there's a relative import in use - from cudf.core.column import ColumnBase # noqa: TC004 + from cudf.core.column import ColumnBase from cudf.core.column.numerical import NumericalColumn @@ -81,7 +81,7 @@ def columns_split( input_columns: Iterable[ColumnBase], splits: list[int] ) -> list[list[plc.Column]]: return [ - [ColumnBase.from_pylibcudf(col) for col in plc_tbl.columns()] + plc_tbl.columns() for plc_tbl in plc.copying.split( plc.Table( [col.to_pylibcudf(mode="read") for col in input_columns] diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 8ee3122d8ab..eb5e5c5903d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -346,18 +346,17 @@ def children(self) -> tuple[ColumnBase, ...]: if not self.base_children: self._children = () # type: ignore[assignment] else: - self._children = self.base_children # type: ignore[assignment] - # children = Column.from_unique_ptr( - # move(make_unique[column](self.view())) - # ).base_children - # dtypes = [ - # base_child.dtype for base_child in self.base_children - # ] - # self._children = tuple( - # child._with_type_metadata(dtype) for child, dtype in zip( - # children, dtypes - # ) - # ) + # Compute children from the column view (children factoring self.size) + children = ColumnBase.from_pylibcudf( + self.to_pylibcudf(mode="read").column_from_self_view() + ).base_children + dtypes = ( + base_child.dtype for base_child in self.base_children + ) + self._children = tuple( # type: ignore[assignment] + child._with_type_metadata(dtype) + for child, dtype in zip(children, dtypes) + ) return self._children # type: ignore[return-value] def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index e5768f090f2..d8c316a4c8f 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -144,9 +144,11 @@ def quantile( else: no_nans = self.nans_to_nulls() # get sorted indices and exclude nulls - indices = no_nans.argsort( - ascending=True, na_position="first" - ).slice(no_nans.null_count, len(no_nans)) + indices = ( + no_nans.argsort(ascending=True, na_position="first") + .slice(no_nans.null_count, len(no_nans)) + .astype(np.dtype(np.int32)) + ) with acquire_spill_lock(): plc_column = plc.quantiles.quantile( no_nans.to_pylibcudf(mode="read"), diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e55d18f3a81..986306fbd72 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1365,7 +1365,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) scatter_map = ColumnBase.from_pylibcudf(left_plc) indices = ColumnBase.from_pylibcudf(right_plc) - result = copying.scatter([indices], scatter_map, [result])[0] + result = ColumnBase.from_pylibcudf( + copying.scatter([indices], scatter_map, [result])[0] + ) result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2db409fc3bb..67dea180a79 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3260,12 +3260,14 @@ def duplicated( plc.types.NanEquality.ALL_EQUAL, ) distinct = ColumnBase.from_pylibcudf(plc_column) - result = copying.scatter( - [cudf.Scalar(False)], - distinct, # type: ignore[arg-type] - [as_column(True, length=len(self), dtype=bool)], - bounds_check=False, - )[0] + result = ColumnBase.from_pylibcudf( + copying.scatter( + [cudf.Scalar(False)], + distinct, # type: ignore[arg-type] + [as_column(True, length=len(self), dtype=bool)], + bounds_check=False, + )[0] + ) return cudf.Series._from_column(result, index=self.index, name=name) @_performance_tracking diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9eca77c3212..ec961a06069 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1964,7 +1964,9 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): ) scatter_map = ColumnBase.from_pylibcudf(left_plc) indices = ColumnBase.from_pylibcudf(right_plc) - result = copying.scatter([indices], scatter_map, [result])[0] + result = ColumnBase.from_pylibcudf( + copying.scatter([indices], scatter_map, [result])[0] + ) result_series = cudf.Series._from_column(result) if method in {"ffill", "bfill", "pad", "backfill"}: diff --git a/python/pylibcudf/pylibcudf/column.pxd b/python/pylibcudf/pylibcudf/column.pxd index 92d63e4e495..224ff08d751 100644 --- a/python/pylibcudf/pylibcudf/column.pxd +++ b/python/pylibcudf/pylibcudf/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -47,6 +47,7 @@ cdef class Column: cpdef gpumemoryview null_mask(self) cpdef list children(self) cpdef Column copy(self) + cpdef Column column_from_self_view(self) cpdef Column with_mask(self, gpumemoryview, size_type) cpdef ListColumnView list_view(self) diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi index c9f70de3dbf..c472e756e18 100644 --- a/python/pylibcudf/pylibcudf/column.pyi +++ b/python/pylibcudf/pylibcudf/column.pyi @@ -27,6 +27,7 @@ class Column: def null_mask(self) -> gpumemoryview | None: ... def children(self) -> list[Column]: ... def copy(self) -> Column: ... + def column_from_self_view(self) -> Column: ... def with_mask( self, mask: gpumemoryview | None, null_count: int ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 9bb5574608e..b03f39f1dc1 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport make_unique, unique_ptr @@ -173,6 +173,10 @@ cdef class Column: children, ) + cpdef Column column_from_self_view(self): + """Return a new column from self.view().""" + return Column.from_libcudf(move(make_unique[column](self.view()))) + cpdef Column with_mask(self, gpumemoryview mask, size_type null_count): """Augment this column with a new null mask. @@ -463,3 +467,10 @@ def is_c_contiguous( return False cumulative_stride *= dim return True + + +cpdef Column column_from_column_view(Column col): + """ + Return a new Column from a Column.view(). + """ + return Column.from_libcudf(move(make_unique[column](col.view()))) From cd60209d90047b4df53861f44d93b389062d8504 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:40:05 -0800 Subject: [PATCH 06/10] Type output of dtype_from_pylibcudf_column --- python/cudf/cudf/utils/dtypes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index cb2eb80d014..6a2d3a0ce3d 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -628,7 +628,7 @@ def dtype_to_pylibcudf_type(dtype) -> plc.DataType: return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) -def dtype_from_pylibcudf_column(col: plc.Column): +def dtype_from_pylibcudf_column(col: plc.Column) -> DtypeObj: type_ = col.type() tid = type_.id() From d578f6f5b834c99a41cfe0d576644c77d0ec2c61 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:46:50 -0800 Subject: [PATCH 07/10] Replace column_from_self_view with existing copy --- python/cudf/cudf/core/column/column.py | 2 +- python/pylibcudf/pylibcudf/column.pxd | 3 +-- python/pylibcudf/pylibcudf/column.pyi | 1 - python/pylibcudf/pylibcudf/column.pyx | 13 +------------ 4 files changed, 3 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 81bf1e5dfce..15769db6269 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -349,7 +349,7 @@ def children(self) -> tuple[ColumnBase, ...]: else: # Compute children from the column view (children factoring self.size) children = ColumnBase.from_pylibcudf( - self.to_pylibcudf(mode="read").column_from_self_view() + self.to_pylibcudf(mode="read").copy() ).base_children dtypes = ( base_child.dtype for base_child in self.base_children diff --git a/python/pylibcudf/pylibcudf/column.pxd b/python/pylibcudf/pylibcudf/column.pxd index 224ff08d751..92d63e4e495 100644 --- a/python/pylibcudf/pylibcudf/column.pxd +++ b/python/pylibcudf/pylibcudf/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -47,7 +47,6 @@ cdef class Column: cpdef gpumemoryview null_mask(self) cpdef list children(self) cpdef Column copy(self) - cpdef Column column_from_self_view(self) cpdef Column with_mask(self, gpumemoryview, size_type) cpdef ListColumnView list_view(self) diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi index c472e756e18..c9f70de3dbf 100644 --- a/python/pylibcudf/pylibcudf/column.pyi +++ b/python/pylibcudf/pylibcudf/column.pyi @@ -27,7 +27,6 @@ class Column: def null_mask(self) -> gpumemoryview | None: ... def children(self) -> list[Column]: ... def copy(self) -> Column: ... - def column_from_self_view(self) -> Column: ... def with_mask( self, mask: gpumemoryview | None, null_count: int ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index b03f39f1dc1..9bb5574608e 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2025, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from cython.operator cimport dereference from libcpp.memory cimport make_unique, unique_ptr @@ -173,10 +173,6 @@ cdef class Column: children, ) - cpdef Column column_from_self_view(self): - """Return a new column from self.view().""" - return Column.from_libcudf(move(make_unique[column](self.view()))) - cpdef Column with_mask(self, gpumemoryview mask, size_type null_count): """Augment this column with a new null mask. @@ -467,10 +463,3 @@ def is_c_contiguous( return False cumulative_stride *= dim return True - - -cpdef Column column_from_column_view(Column col): - """ - Return a new Column from a Column.view(). - """ - return Column.from_libcudf(move(make_unique[column](col.view()))) From c1ff886bad8bf2fa3b87e16072cc1d1540475595 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 24 Jan 2025 12:31:19 -0800 Subject: [PATCH 08/10] Fold in some doc changes from scalar refactor --- python/cudf/cudf/core/scalar.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 19b13a8e97d..c342cde47cf 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -167,7 +167,8 @@ def _to_plc_scalar(value: ScalarLike, dtype: Dtype) -> plc.Scalar: Returns ------- - plc.Scalar + pylibcudf.Scalar + pylibcudf.Scalar for cudf.Scalar._device_value """ if cudf.utils.utils.is_na_like(value): value = None @@ -217,7 +218,8 @@ def pa_scalar_to_plc_scalar(pa_scalar: pa.Scalar) -> plc.Scalar: Returns ------- - plc.Scalar + pylibcudf.Scalar + pylibcudf.Scalar to use in pylibcudf APIs """ return plc.interop.from_arrow(pa_scalar) From b3aa5cdd2d3b9001850a6126e61b52bf81441e93 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:03:43 -0800 Subject: [PATCH 09/10] Adjust search_sorted output return --- python/cudf/cudf/core/_internals/search.py | 20 +++++++++--------- python/cudf/cudf/core/column/column.py | 14 +++++++------ python/cudf/cudf/core/frame.py | 14 +++++++------ python/cudf/cudf/core/index.py | 24 +++++++++++++--------- 4 files changed, 40 insertions(+), 32 deletions(-) diff --git a/python/cudf/cudf/core/_internals/search.py b/python/cudf/cudf/core/_internals/search.py index aa410c36575..bee198800e7 100644 --- a/python/cudf/cudf/core/_internals/search.py +++ b/python/cudf/cudf/core/_internals/search.py @@ -1,12 +1,14 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations -from typing import Literal +from typing import TYPE_CHECKING, Literal import pylibcudf as plc from cudf.core.buffer import acquire_spill_lock -from cudf.core.column import ColumnBase + +if TYPE_CHECKING: + from cudf.core.column import ColumnBase @acquire_spill_lock() @@ -16,7 +18,7 @@ def search_sorted( side: Literal["left", "right"], ascending: bool = True, na_position: Literal["first", "last"] = "last", -) -> ColumnBase: +) -> plc.Column: """Find indices where elements should be inserted to maintain order Parameters @@ -43,11 +45,9 @@ def search_sorted( plc.search, "lower_bound" if side == "left" else "upper_bound", ) - return ColumnBase.from_pylibcudf( - func( - plc.Table([col.to_pylibcudf(mode="read") for col in source]), - plc.Table([col.to_pylibcudf(mode="read") for col in values]), - column_order, - null_precedence, - ) + return func( + plc.Table([col.to_pylibcudf(mode="read") for col in source]), + plc.Table([col.to_pylibcudf(mode="read") for col in values]), + column_order, + null_precedence, ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 99c9b1133ae..d895a87054f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1812,12 +1812,14 @@ def searchsorted( raise ValueError( "Column searchsorted expects values to be column of same dtype" ) - return search.search_sorted( # type: ignore[return-value] - [self], - [value], - side=side, - ascending=ascending, - na_position=na_position, + return ColumnBase.from_pylibcudf( + search.search_sorted( # type: ignore[return-value] + [self], + [value], + side=side, + ascending=ascending, + na_position=na_position, + ) ) def unique(self) -> Self: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index acea8991f47..b9d5b0403da 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1348,12 +1348,14 @@ def searchsorted( for val, common_dtype in zip(values, common_dtype_list) ] - outcol = search.search_sorted( - sources, - values, - side, - ascending=ascending, - na_position=na_position, + outcol = ColumnBase.from_pylibcudf( + search.search_sorted( + sources, + values, + side, + ascending=ascending, + na_position=na_position, + ) ) # Return result as cupy array if the values is non-scalar diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 278c8b24e86..e883569a047 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -121,17 +121,21 @@ def _lexsorted_equal_range( else: sort_inds = None sort_vals = idx - lower_bound = search.search_sorted( - list(sort_vals._columns), - keys, - side="left", - ascending=sort_vals.is_monotonic_increasing, + lower_bound = ColumnBase.from_pylibcudf( + search.search_sorted( + list(sort_vals._columns), + keys, + side="left", + ascending=sort_vals.is_monotonic_increasing, + ) ).element_indexing(0) - upper_bound = search.search_sorted( - list(sort_vals._columns), - keys, - side="right", - ascending=sort_vals.is_monotonic_increasing, + upper_bound = ColumnBase.from_pylibcudf( + search.search_sorted( + list(sort_vals._columns), + keys, + side="right", + ascending=sort_vals.is_monotonic_increasing, + ) ).element_indexing(0) return lower_bound, upper_bound, sort_inds From 16626f2b425ea3991f1e3b85a473987e82e6b811 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Jan 2025 15:26:40 -0800 Subject: [PATCH 10/10] Remove c_value --- python/cudf/cudf/core/column/column.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index d895a87054f..d0d3eb5092a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -524,7 +524,7 @@ def from_pylibcudf( @classmethod def from_scalar(cls, slr: cudf.Scalar, size: int) -> Self: return cls.from_pylibcudf( - plc.Column.from_scalar(slr.device_value.c_value, size) + plc.Column.from_scalar(slr.device_value, size) ) def data_array_view(