From bfe5fadf466679c8e711cb4142291d26f7bdf36d Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 5 Mar 2021 18:03:17 -0800 Subject: [PATCH 01/29] Initial framework for isin_array. --- arraykit.c | 47 +++++++++++++++++++++++++ arraykit.pyi | 6 ++++ performance/reference/util.py | 65 +++++++++++++++++++++++++++++++++++ test/test_util.py | 64 ++++++++++++++++++++++++++++++++++ 4 files changed, 182 insertions(+) diff --git a/arraykit.c b/arraykit.c index 3d220b0f..73f22d0a 100644 --- a/arraykit.c +++ b/arraykit.c @@ -251,6 +251,52 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) return (PyObject *)AK_ResolveDTypeIter(arg); } +static PyObject * +isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) +{ + int array_is_unique, other_is_unique; + PyArrayObject *array, *other; + + static char *kwlist[] = {"array", "array_is_unique", "other", "other_is_unique", NULL}; + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", + kwlist, + &PyArray_Type, &array, &array_is_unique, + &PyArray_Type, &other, &other_is_unique) + ) + { + return NULL; + } + if (PyArray_NDIM(other) != 1) { + return PyErr_Format(PyExc_TypeError, "Expected other to be 1-dimensional"); + } + + if (PyDataType_ISOBJECT(PyArray_DESCR(array)) || PyDataType_ISOBJECT(PyArray_DESCR(other))) { + + int unique = array_is_unique && other_is_unique; + int ndim = PyArray_NDIM(array); + + PyArrayObject* result = NULL; + + // PyObject* numpy = PyImport_Import("numpy"); + // if (numpy == NULL) { + // PyErr_SetString(PyExc_ImportError, "numpy failed to import"); + // return NULL; + // } + + // PyObject* in1d = PyObject_GetAttrString(numpy, "in1d"); + // Py_DECREF(numpy); + // if (in1d == NULL) { + // PyErr_SetString(PyExc_AttributeError, "in1d not found"); + // return NULL; + // } + + return result; + } + + return PyBool_FromLong(1); +} + //------------------------------------------------------------------------------ // ArrayGO //------------------------------------------------------------------------------ @@ -527,6 +573,7 @@ static PyMethodDef arraykit_methods[] = { {"row_1d_filter", row_1d_filter, METH_O, NULL}, {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL}, {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL}, + {"isin_array", isin_array, METH_VARARGS | METH_KEYWORDS, NULL}, // I don't know how to deal with this warning :'( {NULL}, }; diff --git a/arraykit.pyi b/arraykit.pyi index b5a78afc..3f5bce78 100644 --- a/arraykit.pyi +++ b/arraykit.pyi @@ -27,3 +27,9 @@ def column_1d_filter(__array: np.array) -> np.ndarray: ... def row_1d_filter(__array: np.array) -> np.ndarray: ... def resolve_dtype(__d1: np.dtype, __d2: np.dtype) -> np.dtype: ... def resolve_dtype_iter(__dtypes: tp.Iterable[np.dtype]) -> np.dtype: ... +def isin_array(*, + array: np.ndarray, + array_is_unique: bool, + other: np.ndarray, + other_is_unique: bool, + ) -> np.ndarray: ... diff --git a/performance/reference/util.py b/performance/reference/util.py index 6d437b28..8203a210 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -181,3 +181,68 @@ def array_deepcopy( if memo is not None: memo[ident] = post return post + + +def _isin_1d( + array: np.ndarray, + other: tp.FrozenSet[tp.Any] + ) -> np.ndarray: + ''' + Iterate over an 1D array to build a 1D Boolean ndarray representing whether or not the original element is in the set + + Args: + array: The source array + other: The set of elements being looked for + ''' + result: np.ndarray = np.empty(array.shape, dtype=DTYPE_BOOL) + + for i, element in enumerate(array): + result[i] = element in other + + result.flags.writeable = False + return result + + +def _isin_2d( + array: np.ndarray, + other: tp.FrozenSet[tp.Any] + ) -> np.ndarray: + ''' + Iterate over an 2D array to build a 2D, immutable, Boolean ndarray representing whether or not the original element is in the set + + Args: + array: The source array + other: The set of elements being looked for + ''' + result: np.ndarray = np.empty(array.shape, dtype=DTYPE_BOOL) + + for (i, j), v in np.ndenumerate(array): + result[i, j] = v in other + + result.flags.writeable = False + return result + + +def isin_array(*, + array: np.ndarray, + array_is_unique: bool, + other: np.ndarray, + other_is_unique: bool, + ) -> np.ndarray: + '''Core isin processing after other has been converted to an array. + ''' + if array.dtype == DTYPE_OBJECT or other.dtype == DTYPE_OBJECT: + # both funcs return immutable arrays + func = _isin_1d if array.ndim == 1 else _isin_2d + try: + return func(array, frozenset(other)) + except TypeError: # only occur when something is unhashable. + pass + + assume_unique = array_is_unique and other_is_unique + func = np.in1d if array.ndim == 1 else np.isin + + result = func(array, other, assume_unique=assume_unique) #type: ignore + result.flags.writeable = False + + return result diff --git a/test/test_util.py b/test/test_util.py index 64c45cdd..91a9eb75 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -1,3 +1,4 @@ +import itertools import unittest import numpy as np # type: ignore @@ -167,6 +168,69 @@ def test_row_1d_filter_a(self) -> None: with self.assertRaises(NotImplementedError): row_1d_filter(a1.reshape(1,2,5)) + def test_isin_1d(self) -> None: + from performance.reference.util import isin_array + + T, F = True, False + arr1 = np.array([1, 2, 3, 4, 5]) + + expected = [ + (np.array([T, F, T, T, F]), [1, 3, 4]), + (np.array([F, F, F, F, F]), [7, 8]), + (np.array([T, T, T, T, T]), [1, 2, 3, 4, 5]), + ] + + for expected_result, values in expected: + for dtype in (int, object): + arr2 = np.array(values, dtype=dtype) + + for aiu, oiu in itertools.product((T, F), (T, F)): + self.assertTrue(np.array_equal(expected_result, isin_array( + array=arr1, + array_is_unique=aiu, + other=arr2, + other_is_unique=oiu, + ))) + + def test_isin_2d(self) -> None: + from performance.reference.util import isin_array + + T, F = True, False + arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + expected = [ + (np.array([[T, F, T], [T, F, F], [F, F, T]]), [1, 3, 4, 9]), + (np.array([[F, F, F], [F, F, F], [F, F, F]]), [10, 11]), + (np.array([[T, T, T], [T, T, T], [T, T, T]]), [1, 2, 3, 4, 5, 6, 7, 8, 9]), + ] + + for expected_result, values in expected: + for dtype in (int, object): + arr2 = np.array(values, dtype=dtype) + + for aiu, oiu in itertools.product((T, F), (T, F)): + self.assertTrue(np.array_equal(expected_result, isin_array( + array=arr1, + array_is_unique=aiu, + other=arr2, + other_is_unique=oiu, + ))) + + def test_risky(self) -> None: + from arraykit import isin_array + + T, F = True, False + arr1 = np.array([1, 2, 3, 4, 5]) + arr2 = np.array([1, 3, 4]) + expected = np.array([T, F, T, T, F]) + + self.assertTrue(np.array_equal(expected, isin_array( + array=arr1, + array_is_unique=T, + other=arr2, + other_is_unique=F, + ))) + if __name__ == '__main__': unittest.main() From fc068dfb2e2b90c51d8aa70b25cd2edc7f6146ef Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Tue, 9 Mar 2021 10:54:52 -0800 Subject: [PATCH 02/29] Commit to save work. Not working. --- arraykit.c | 174 +++++++++++++++++++++++++++++++++++++++++----- test/test_util.py | 17 +++-- 2 files changed, 166 insertions(+), 25 deletions(-) diff --git a/arraykit.c b/arraykit.c index 73f22d0a..e33c5f25 100644 --- a/arraykit.c +++ b/arraykit.c @@ -251,6 +251,97 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) return (PyObject *)AK_ResolveDTypeIter(arg); } +//------------------------------------------------------------------------------ +// Comparison Macros +//------------------------------------------------------------------------------ + +# define _AK_C_QSORT_COMP_FUNC(type) \ + _##type##_compare + +# define _AK_C_QSORT_COMP_REF_ARR(type) \ + _##type##_COMP_ARR + +# define _AK_BUILD_C_QSORT_COMP(type) \ + static type* _AK_C_QSORT_COMP_REF_ARR(type); \ + \ + static int \ + _AK_C_QSORT_COMP_FUNC(type)(const void * a, const void * b) { \ + int aa = *((int *) a); \ + int bb = *((int *) b); \ + \ + if (_AK_C_QSORT_COMP_REF_ARR(type)[aa] < _AK_C_QSORT_COMP_REF_ARR(type)[bb]) { \ + return -1; \ + } \ + if (_AK_C_QSORT_COMP_REF_ARR(type)[aa] == _AK_C_QSORT_COMP_REF_ARR(type)[bb]) { \ + return 0; \ + } \ + return 1; \ + } + +_AK_BUILD_C_QSORT_COMP(long) +_AK_BUILD_C_QSORT_COMP(PY_UINT32_T) + +# define AK_IDX_COMP(type, ref_array, ptr_array, size) \ + _AK_C_QSORT_COMP_REF_ARR(type) = ref_array; \ + qsort(ptr_array, size, sizeof(PY_UINT32_T), _AK_C_QSORT_COMP_FUNC(type)); \ + _AK_C_QSORT_COMP_REF_ARR(type) = NULL; \ + +//------------------------------------------------------------------------------ +// Concat Macros +//------------------------------------------------------------------------------ + +# define AK_C_ARR_CONCAT_FUNC(type) \ + _##type##_concat + +# define _AK_BUILD_C_ARR_CONCAT(type) \ + static void \ + AK_C_ARR_CONCAT_FUNC(type)(type* target, type* arr1, size_t arr1_size, type* arr2, size_t arr2_size) \ + { \ + for (size_t i = 0; i < arr1_size; ++i) { \ + target[i] = arr1[i]; \ + } \ + for (size_t i = 0; i < arr2_size; ++i) { \ + target[arr1_size + i] = arr2[i]; \ + } \ + } \ + +_AK_BUILD_C_ARR_CONCAT(long) + + +/* +static void +print_arr_long(long* arr, PY_UINT32_T* indexer, size_t size) +{ + for (size_t i = 0; i < size; ++i) { + if (indexer) { + printf("%lu ", arr[indexer[i]]); + } + else { + printf("%lu ", arr[i]); + } + } + printf("\n"); +} + +static void +print_arr_sizet(PY_UINT32_T* arr, size_t size) +{ + for (size_t i = 0; i < size; ++i) { + printf("%u ", arr[i]); + } + printf("\n"); +} + +static void +print_arr_char(char* arr, size_t size) +{ + for (size_t i = 0; i < size; ++i) { + printf("%x ", arr[i]); + } + printf("\n"); +} +*/ + static PyObject * isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { @@ -271,27 +362,78 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) return PyErr_Format(PyExc_TypeError, "Expected other to be 1-dimensional"); } - if (PyDataType_ISOBJECT(PyArray_DESCR(array)) || PyDataType_ISOBJECT(PyArray_DESCR(other))) { + PyArray_Descr* array_dtype = PyArray_DTYPE(array); + PyArray_Descr* other_dtype = PyArray_DTYPE(other); + + if (PyDataType_ISOBJECT(array_dtype) || PyDataType_ISOBJECT(other_dtype)) { + return NULL; + } + + // int unique = array_is_unique && other_is_unique; + size_t ar1_size = PyArray_SIZE(array); + size_t ar2_size = PyArray_SIZE(other); + size_t ar3_size = ar1_size + ar2_size; - int unique = array_is_unique && other_is_unique; - int ndim = PyArray_NDIM(array); + if (array_dtype->type_num == NPY_LONG && other_dtype->type_num == NPY_LONG) { + long* ar1 = (long*)PyArray_DATA(array); + long* ar2 = (long*)PyArray_DATA(other); - PyArrayObject* result = NULL; + long ar3[ar3_size]; + AK_C_ARR_CONCAT_FUNC(long)(ar3, ar1, ar1_size, ar2, ar2_size); - // PyObject* numpy = PyImport_Import("numpy"); - // if (numpy == NULL) { - // PyErr_SetString(PyExc_ImportError, "numpy failed to import"); - // return NULL; - // } + PY_UINT32_T order[ar3_size]; + PY_UINT32_T indx[ar3_size]; + for (size_t i = 0; i < ar3_size; ++i) { + order[i] = i; + indx[i] = i; + } + + AK_IDX_COMP(long, ar3, order, ar3_size) - // PyObject* in1d = PyObject_GetAttrString(numpy, "in1d"); - // Py_DECREF(numpy); - // if (in1d == NULL) { - // PyErr_SetString(PyExc_AttributeError, "in1d not found"); - // return NULL; - // } + // Set flag + char flag[ar3_size]; + for (size_t i = 1; i < ar3_size; ++i) { + flag[i-1] = (ar3[order[i]] == ar3[order[i-1]]); + } + flag[ar3_size - 1] = 0; + + AK_IDX_COMP(PY_UINT32_T, order, indx, ar3_size) + + //char *result = PyMem_Malloc(ar1_size); + char result[5]; + if (!result) { + return NULL; + } + for (size_t i = 0; i < ar1_size; ++i) { + result[i] = flag[indx[i]]; + } - return result; + // npy_intp *argsort_result; argsort_result = PyMem_Malloc(ar3_size); + // PyArrayObject* sorted = PyArray_ArgSort(array, ar1_size, NPY_MERGESORT); + // long* sorted_arr = (long*)PyArray_DATA(array); + + /* + ``*data`` : ``char *`` + ``NULL`` for creating brand-new memory. If you want this array to wrap + another memory area, then pass the pointer here. You are + responsible for deleting the memory in that case, but do not do so + until the new array object has been deleted. The best way to + handle that is to get the memory from another Python object, + ``INCREF`` that Python object after passing it's data pointer to this + routine, and set the ``->base`` member of the returned array to the + Python object. *You are responsible for* setting ``PyArray_BASE(ret)`` + to the base object. Failure to do so will create a memory leak. + */ + + return PyArray_NewFromDescr( + &PyArray_Type, // class (subtype) + PyArray_DescrFromType(NPY_BOOL), // dtype (descr) + PyArray_NDIM(array), // ndim (nd) + (npy_intp*)&ar1_size, // dims + NULL, // strides + result, // data + NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags + NULL); // sublclass (obj) } return PyBool_FromLong(1); diff --git a/test/test_util.py b/test/test_util.py index 91a9eb75..2b2c2424 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -221,15 +221,14 @@ def test_risky(self) -> None: T, F = True, False arr1 = np.array([1, 2, 3, 4, 5]) - arr2 = np.array([1, 3, 4]) - expected = np.array([T, F, T, T, F]) - - self.assertTrue(np.array_equal(expected, isin_array( - array=arr1, - array_is_unique=T, - other=arr2, - other_is_unique=F, - ))) + arr2 = np.array([1, 4, 7, 9]) + expected = np.array([T, F, F, T, F]) + + + post = isin_array(array=arr1, array_is_unique=T, other=arr2, other_is_unique=F) + print(post.tobytes()) + print(post.tobytes()) + self.assertTrue(np.array_equal(expected, post)) if __name__ == '__main__': unittest.main() From e98d0fd517e31716b3353188d4b0215c78aaad5a Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Tue, 9 Mar 2021 19:08:07 -0800 Subject: [PATCH 03/29] Finishes initial iteration for non-object, 1D or 2D, unique arrays --- arraykit.c | 285 +++++++++++++++++++------------------------- performance/main.py | 48 +++++++- test/test_util.py | 40 +++++-- 3 files changed, 197 insertions(+), 176 deletions(-) diff --git a/arraykit.c b/arraykit.c index e33c5f25..75359531 100644 --- a/arraykit.c +++ b/arraykit.c @@ -41,6 +41,11 @@ return NULL;\ } while (0) +// To simplify lines merely checking for NULL pointers +# define AK_CHECK_NULL(obj) \ + if (!obj) { \ + return NULL; \ + } # if defined __GNUC__ || defined __clang__ # define AK_LIKELY(X) __builtin_expect(!!(X), 1) @@ -237,11 +242,8 @@ static PyObject * resolve_dtype(PyObject *Py_UNUSED(m), PyObject *args) { PyArray_Descr *d1, *d2; - if (!PyArg_ParseTuple(args, "O!O!:resolve_dtype", + AK_CHECK_NULL(PyArg_ParseTuple(args, "O!O!:resolve_dtype", &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2)) - { - return NULL; - } return (PyObject *)AK_ResolveDTypes(d1, d2); } @@ -252,95 +254,126 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) } //------------------------------------------------------------------------------ -// Comparison Macros -//------------------------------------------------------------------------------ +// isin + +static PyArrayObject * +AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) +{ + PyObject *container = PyTuple_Pack(2, arr1, arr2); + AK_CHECK_NULL(container) + + PyArrayObject *array = (PyArrayObject*)PyArray_Concatenate(container, 0); + Py_DECREF(container); + return array; +} + +static PyObject * +AK_isin_array_object(PyArrayObject *Py_UNUSED(array), PyArrayObject *Py_UNUSED(other)) +{ + return NULL; +} + +static PyObject * +AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_unique) +{ + /* Algorithm: + + if not assume_unique: + array, rev_idx = np.unique(array, return_inverse=True) + other = np.unique(other) + + ar = np.concatenate((array, other)) -# define _AK_C_QSORT_COMP_FUNC(type) \ - _##type##_compare - -# define _AK_C_QSORT_COMP_REF_ARR(type) \ - _##type##_COMP_ARR - -# define _AK_BUILD_C_QSORT_COMP(type) \ - static type* _AK_C_QSORT_COMP_REF_ARR(type); \ - \ - static int \ - _AK_C_QSORT_COMP_FUNC(type)(const void * a, const void * b) { \ - int aa = *((int *) a); \ - int bb = *((int *) b); \ - \ - if (_AK_C_QSORT_COMP_REF_ARR(type)[aa] < _AK_C_QSORT_COMP_REF_ARR(type)[bb]) { \ - return -1; \ - } \ - if (_AK_C_QSORT_COMP_REF_ARR(type)[aa] == _AK_C_QSORT_COMP_REF_ARR(type)[bb]) { \ - return 0; \ - } \ - return 1; \ + order = ar.argsort(kind='mergesort') + sar = ar[order] + + flag = np.concatenate(((sar[1:] == sar[:-1]), [False])) + + ret = np.empty(ar.shape, dtype=bool) + ret[order] = flag + + if assume_unique: + return ret[:len(array)] + else: + return ret[rev_idx] + */ + + // 1. Capture original array shape for return value + int array_ndim = PyArray_NDIM(array); + npy_intp* array_dims = PyArray_DIMS(array); + size_t array_size = PyArray_SIZE(array); + + // 2. Ravel the array as we want to operate on 1D arrays only. + array = (PyArrayObject*)PyArray_Ravel(array, NPY_CORDER); + // other is guaranteed to be 1D + + if (!assume_unique) { + // TODO: Call array, rev_idx = np.unique(array, return_inverse=True) + // TODO: Call other = np.unique(other) } -_AK_BUILD_C_QSORT_COMP(long) -_AK_BUILD_C_QSORT_COMP(PY_UINT32_T) + // 3. Concatenate + PyArrayObject* ar = AK_concat_arrays(array, other); + AK_CHECK_NULL(ar) -# define AK_IDX_COMP(type, ref_array, ptr_array, size) \ - _AK_C_QSORT_COMP_REF_ARR(type) = ref_array; \ - qsort(ptr_array, size, sizeof(PY_UINT32_T), _AK_C_QSORT_COMP_FUNC(type)); \ - _AK_C_QSORT_COMP_REF_ARR(type) = NULL; \ + size_t ar_size = PyArray_SIZE(ar); -//------------------------------------------------------------------------------ -// Concat Macros -//------------------------------------------------------------------------------ + // 4: Sort + PyArrayObject *order = (PyArrayObject*)PyArray_ArgSort(ar, 0, NPY_MERGESORT); + long* order_arr = (long*)PyArray_DATA(order); -# define AK_C_ARR_CONCAT_FUNC(type) \ - _##type##_concat + // 5. Find duplicates + PyObject* sar = PyObject_GetItem((PyObject*)ar, (PyObject*)order); + AK_CHECK_NULL(sar) -# define _AK_BUILD_C_ARR_CONCAT(type) \ - static void \ - AK_C_ARR_CONCAT_FUNC(type)(type* target, type* arr1, size_t arr1_size, type* arr2, size_t arr2_size) \ - { \ - for (size_t i = 0; i < arr1_size; ++i) { \ - target[i] = arr1[i]; \ - } \ - for (size_t i = 0; i < arr2_size; ++i) { \ - target[arr1_size + i] = arr2[i]; \ - } \ - } \ + PyObject* comp_a = PySequence_GetSlice((PyObject*)sar, 1, ar_size); + AK_CHECK_NULL(comp_a) -_AK_BUILD_C_ARR_CONCAT(long) + PyObject* comp_b = PySequence_GetSlice((PyObject*)sar, 0, ar_size - 1); + AK_CHECK_NULL(comp_b) + PyObject* flag = PyObject_RichCompare(comp_a, comp_b, Py_EQ); + AK_CHECK_NULL(flag) -/* -static void -print_arr_long(long* arr, PY_UINT32_T* indexer, size_t size) -{ - for (size_t i = 0; i < size; ++i) { - if (indexer) { - printf("%lu ", arr[indexer[i]]); + npy_bool* flag_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)flag); + + // 6: Construct empty array + PyArrayObject* ret = (PyArrayObject*)PyArray_Empty( + array_ndim, // nd + array_dims, // dims + PyArray_DescrFromType(NPY_BOOL), // dtype + 1); // is_f_order + AK_CHECK_NULL(ret) + + size_t stride = 0; + if (array_ndim == 2) { + stride = (size_t)array_dims[1]; + } + + // 7: Assign into duplicates array + for (size_t i = 0; i < (size_t)PyArray_SIZE(order); ++i) { + size_t idx_0 = (size_t)order_arr[i]; + if (idx_0 >= array_size) { continue; } + + // We are guaranteed that flag_ar[i] is always a valid index + if (array_ndim == 1) { + *(npy_bool *) PyArray_GETPTR1(ret, idx_0) = flag_arr[i]; } else { - printf("%lu ", arr[i]); + size_t idx_1 = idx_0 / stride; + idx_0 = idx_0 - (stride * idx_1); + + *(npy_bool *) PyArray_GETPTR2(ret, idx_1, idx_0) = flag_arr[i]; } } - printf("\n"); -} -static void -print_arr_sizet(PY_UINT32_T* arr, size_t size) -{ - for (size_t i = 0; i < size; ++i) { - printf("%u ", arr[i]); + // 8. Return! + if (assume_unique) { + return (PyObject*)ret; } - printf("\n"); -} -static void -print_arr_char(char* arr, size_t size) -{ - for (size_t i = 0; i < size; ++i) { - printf("%x ", arr[i]); - } - printf("\n"); + return NULL; } -*/ static PyObject * isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) @@ -350,14 +383,11 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) static char *kwlist[] = {"array", "array_is_unique", "other", "other_is_unique", NULL}; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", + AK_CHECK_NULL(PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", kwlist, &PyArray_Type, &array, &array_is_unique, - &PyArray_Type, &other, &other_is_unique) - ) - { - return NULL; - } + &PyArray_Type, &other, &other_is_unique)) + if (PyArray_NDIM(other) != 1) { return PyErr_Format(PyExc_TypeError, "Expected other to be 1-dimensional"); } @@ -365,78 +395,12 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) PyArray_Descr* array_dtype = PyArray_DTYPE(array); PyArray_Descr* other_dtype = PyArray_DTYPE(other); + // 2. Handle object dtypes if (PyDataType_ISOBJECT(array_dtype) || PyDataType_ISOBJECT(other_dtype)) { - return NULL; - } - - // int unique = array_is_unique && other_is_unique; - size_t ar1_size = PyArray_SIZE(array); - size_t ar2_size = PyArray_SIZE(other); - size_t ar3_size = ar1_size + ar2_size; - - if (array_dtype->type_num == NPY_LONG && other_dtype->type_num == NPY_LONG) { - long* ar1 = (long*)PyArray_DATA(array); - long* ar2 = (long*)PyArray_DATA(other); - - long ar3[ar3_size]; - AK_C_ARR_CONCAT_FUNC(long)(ar3, ar1, ar1_size, ar2, ar2_size); - - PY_UINT32_T order[ar3_size]; - PY_UINT32_T indx[ar3_size]; - for (size_t i = 0; i < ar3_size; ++i) { - order[i] = i; - indx[i] = i; - } - - AK_IDX_COMP(long, ar3, order, ar3_size) - - // Set flag - char flag[ar3_size]; - for (size_t i = 1; i < ar3_size; ++i) { - flag[i-1] = (ar3[order[i]] == ar3[order[i-1]]); - } - flag[ar3_size - 1] = 0; - - AK_IDX_COMP(PY_UINT32_T, order, indx, ar3_size) - - //char *result = PyMem_Malloc(ar1_size); - char result[5]; - if (!result) { - return NULL; - } - for (size_t i = 0; i < ar1_size; ++i) { - result[i] = flag[indx[i]]; - } - - // npy_intp *argsort_result; argsort_result = PyMem_Malloc(ar3_size); - // PyArrayObject* sorted = PyArray_ArgSort(array, ar1_size, NPY_MERGESORT); - // long* sorted_arr = (long*)PyArray_DATA(array); - - /* - ``*data`` : ``char *`` - ``NULL`` for creating brand-new memory. If you want this array to wrap - another memory area, then pass the pointer here. You are - responsible for deleting the memory in that case, but do not do so - until the new array object has been deleted. The best way to - handle that is to get the memory from another Python object, - ``INCREF`` that Python object after passing it's data pointer to this - routine, and set the ``->base`` member of the returned array to the - Python object. *You are responsible for* setting ``PyArray_BASE(ret)`` - to the base object. Failure to do so will create a memory leak. - */ - - return PyArray_NewFromDescr( - &PyArray_Type, // class (subtype) - PyArray_DescrFromType(NPY_BOOL), // dtype (descr) - PyArray_NDIM(array), // ndim (nd) - (npy_intp*)&ar1_size, // dims - NULL, // strides - result, // data - NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags - NULL); // sublclass (obj) + return AK_isin_array_object(array, other); } - return PyBool_FromLong(1); + return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); } //------------------------------------------------------------------------------ @@ -504,13 +468,10 @@ ArrayGO_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs) int parsed = PyArg_ParseTupleAndKeywords( args, kwargs, "O|$p:ArrayGO", argnames, &iterable, &own_iterable ); - if (!parsed) { - return NULL; - } + AK_CHECK_NULL(parsed) + ArrayGOObject *self = (ArrayGOObject *)cls->tp_alloc(cls, 0); - if (!self) { - return NULL; - } + AK_CHECK_NULL(self) if (PyArray_Check(iterable)) { if (!PyDataType_ISOBJECT(PyArray_DESCR((PyArrayObject *)iterable))) { @@ -553,9 +514,8 @@ ArrayGO_append(ArrayGOObject *self, PyObject *value) { if (!self->list) { self->list = PyList_New(1); - if (!self->list) { - return NULL; - } + AK_CHECK_NULL(self->list) + Py_INCREF(value); PyList_SET_ITEM(self->list, 0, value); } @@ -571,9 +531,8 @@ ArrayGO_extend(ArrayGOObject *self, PyObject *values) { if (!self->list) { self->list = PySequence_List(values); - if (!self->list) { - return NULL; - } + AK_CHECK_NULL(self->list) + Py_RETURN_NONE; } Py_ssize_t len = PyList_Size(self->list); @@ -715,7 +674,7 @@ static PyMethodDef arraykit_methods[] = { {"row_1d_filter", row_1d_filter, METH_O, NULL}, {"resolve_dtype", resolve_dtype, METH_VARARGS, NULL}, {"resolve_dtype_iter", resolve_dtype_iter, METH_O, NULL}, - {"isin_array", isin_array, METH_VARARGS | METH_KEYWORDS, NULL}, // I don't know how to deal with this warning :'( + {"isin_array", (PyCFunction)isin_array, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL}, }; diff --git a/performance/main.py b/performance/main.py index d0bd0809..583968e1 100644 --- a/performance/main.py +++ b/performance/main.py @@ -1,6 +1,5 @@ - - - +from datetime import date, timedelta +from functools import partial import timeit import argparse @@ -15,6 +14,7 @@ from performance.reference.util import row_1d_filter as row_1d_filter_ref from performance.reference.util import resolve_dtype as resolve_dtype_ref from performance.reference.util import resolve_dtype_iter as resolve_dtype_iter_ref +from performance.reference.util import isin_array as isin_array_ref from performance.reference.array_go import ArrayGO as ArrayGOREF @@ -27,6 +27,7 @@ from arraykit import row_1d_filter as row_1d_filter_ak from arraykit import resolve_dtype as resolve_dtype_ak from arraykit import resolve_dtype_iter as resolve_dtype_iter_ak +from arraykit import isin_array as isin_array_ak from arraykit import ArrayGO as ArrayGOAK @@ -221,6 +222,47 @@ class ArrayGOPerfREF(ArrayGOPerf): entry = staticmethod(ArrayGOREF) +#------------------------------------------------------------------------------- +class IsinArrayPerf(Perf): + NUMBER = 1000 + + def pre(self): + self.arrays = [] + + v_1d = [1, 2, 3, 4, 5] + v_2d = [[1, 2, 3], [4, 5, 9]] + w_1d = [1, 4, 7, 9] + + dtype_funcs = [ + (int, int), + (float, float), + (str, str), + ('datetime64[D]', lambda x: date(2020, 1, x)), + ] + + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + self.arrays.append((arr1, arr2)) + + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + self.arrays.append((arr1, arr2)) + + def main(self): + for _ in range(25): + for arr1, arr2 in self.arrays: + self.entry(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + +class IsinArrayPerfAK(IsinArrayPerf): + entry = staticmethod(isin_array_ak) + +class IsinArrayPerfREF(IsinArrayPerf): + entry = staticmethod(isin_array_ref) + + + #------------------------------------------------------------------------------- def get_arg_parser(): diff --git a/test/test_util.py b/test/test_util.py index 2b2c2424..3111aee7 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -1,3 +1,5 @@ +from datetime import date, timedelta +from functools import partial import itertools import unittest @@ -219,19 +221,37 @@ def test_isin_2d(self) -> None: def test_risky(self) -> None: from arraykit import isin_array - T, F = True, False - arr1 = np.array([1, 2, 3, 4, 5]) - arr2 = np.array([1, 4, 7, 9]) - expected = np.array([T, F, F, T, F]) + isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) - post = isin_array(array=arr1, array_is_unique=T, other=arr2, other_is_unique=F) - print(post.tobytes()) - print(post.tobytes()) - self.assertTrue(np.array_equal(expected, post)) + v_1d = [1, 2, 3, 4, 5] + v_2d = [[1, 2, 3], [4, 5, 9]] -if __name__ == '__main__': - unittest.main() + w_1d = [1, 4, 7, 9] + + dtype_funcs = [ + (int, int), + (float, float), + (str, str), + ('datetime64[D]', lambda x: date(2020, 1, x)), + ] + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_1d, post)) + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_2d, post)) + + +if __name__ == '__main__': + unittest.main() From a0aa83ede4e25da4089cdeaa6c0690d0745dddaf Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Tue, 9 Mar 2021 21:37:00 -0800 Subject: [PATCH 04/29] Lints. --- performance/main.py | 3 +-- test/test_util.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/performance/main.py b/performance/main.py index 583968e1..8e0bee12 100644 --- a/performance/main.py +++ b/performance/main.py @@ -1,5 +1,4 @@ -from datetime import date, timedelta -from functools import partial +from datetime import date import timeit import argparse diff --git a/test/test_util.py b/test/test_util.py index 3111aee7..50e775fe 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -1,4 +1,4 @@ -from datetime import date, timedelta +from datetime import date from functools import partial import itertools import unittest From 01e0496c403d84b0c334b7241ef141e5d5cfa257 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Tue, 9 Mar 2021 21:37:50 -0800 Subject: [PATCH 05/29] Lints. --- test/test_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_util.py b/test/test_util.py index 50e775fe..98933077 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -243,14 +243,14 @@ def test_risky(self) -> None: arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) post = isin_array_func(array=arr1, other=arr2) - self.assertTrue(np.array_equal(e_1d, post)) + self.assertTrue(np.array_equal(e_1d, post), msg=(e_1d, post)) for dtype, dtype_func in dtype_funcs: arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) post = isin_array_func(array=arr1, other=arr2) - self.assertTrue(np.array_equal(e_2d, post)) + self.assertFalse(np.array_equal(e_2d, post), msg=(e_2d, post)) if __name__ == '__main__': From 329258dca742f2009fb99e02fbe79a479cb6e9bc Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Tue, 9 Mar 2021 21:39:40 -0800 Subject: [PATCH 06/29] Adds a more explicit failure message. --- test/test_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_util.py b/test/test_util.py index 98933077..33730229 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -243,14 +243,14 @@ def test_risky(self) -> None: arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) post = isin_array_func(array=arr1, other=arr2) - self.assertTrue(np.array_equal(e_1d, post), msg=(e_1d, post)) + self.assertTrue(np.array_equal(e_1d, post), msg=f'\nExpected:\n{e_1d}\nActual:\n{post}') for dtype, dtype_func in dtype_funcs: arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) post = isin_array_func(array=arr1, other=arr2) - self.assertFalse(np.array_equal(e_2d, post), msg=(e_2d, post)) + self.assertTrue(np.array_equal(e_2d, post), msg=f'\nExpected:\n{e_2d}\nActual:\n{post}') if __name__ == '__main__': From 64286d9ea2a2bb22a473f83656d9190b8328b5f8 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Tue, 9 Mar 2021 21:47:06 -0800 Subject: [PATCH 07/29] Adds more info to debug msg. --- test/test_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_util.py b/test/test_util.py index 33730229..17e695c6 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -243,14 +243,14 @@ def test_risky(self) -> None: arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) post = isin_array_func(array=arr1, other=arr2) - self.assertTrue(np.array_equal(e_1d, post), msg=f'\nExpected:\n{e_1d}\nActual:\n{post}') + self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') for dtype, dtype_func in dtype_funcs: arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) post = isin_array_func(array=arr1, other=arr2) - self.assertTrue(np.array_equal(e_2d, post), msg=f'\nExpected:\n{e_2d}\nActual:\n{post}') + self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') if __name__ == '__main__': From 2cb35fd1be3bde3cb369c6407400a732312ca8ef Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Tue, 9 Mar 2021 21:58:41 -0800 Subject: [PATCH 08/29] Adds test for failing windows. --- arraykit.c | 2 +- test/test_util.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arraykit.c b/arraykit.c index 75359531..e1053c8e 100644 --- a/arraykit.c +++ b/arraykit.c @@ -320,7 +320,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu // 4: Sort PyArrayObject *order = (PyArrayObject*)PyArray_ArgSort(ar, 0, NPY_MERGESORT); - long* order_arr = (long*)PyArray_DATA(order); + npy_intp* order_arr = (npy_intp*)PyArray_DATA(order); // 5. Find duplicates PyObject* sar = PyObject_GetItem((PyObject*)ar, (PyObject*)order); diff --git a/test/test_util.py b/test/test_util.py index 17e695c6..78cab61a 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -252,6 +252,34 @@ def test_risky(self) -> None: post = isin_array_func(array=arr1, other=arr2) self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') + def test_failing_windows(self) -> None: + from arraykit import isin_array + + isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) + + arr1 = np.array([1, 2, 3, 4, 5], dtype=int) + arr2 = np.array([1, 4, 7, 9], dtype=int) + expected = np.array([1, 0, 0, 1, 0], dtype=bool) + + post = isin_array_func(array=arr1, other=arr2) + + if not np.array_equal(expected, post): + print('arr1:') + print(arr1.shape, arr1.dtype, arr1.nbytes, arr1.itemsize) + print(arr1.tobytes()) + print(arr1) + + print('\narr2:') + print(arr2.shape, arr2.dtype, arr2.nbytes, arr2.itemsize) + print(arr2.tobytes()) + print(arr2) + + print('\npost:') + print(post.shape, post.dtype, post.nbytes, post.itemsize) + print(post.tobytes()) + print(post) + + assert False if __name__ == '__main__': unittest.main() From 996b47aab85abf1443d261ced090924a4ae2d614 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Wed, 10 Mar 2021 00:22:41 -0800 Subject: [PATCH 09/29] Adds initial support for object dtypes. --- arraykit.c | 88 +++++++++++++++++++++++++++++++++++++++++++---- test/test_util.py | 69 ++++++++++++++++++++++++++----------- 2 files changed, 130 insertions(+), 27 deletions(-) diff --git a/arraykit.c b/arraykit.c index e1053c8e..0c33ac63 100644 --- a/arraykit.c +++ b/arraykit.c @@ -103,9 +103,8 @@ PyArray_Descr* AK_ResolveDTypeIter(PyObject *dtypes) { PyObject *iterator = PyObject_GetIter(dtypes); - if (iterator == NULL) { - return NULL; - } + AK_CHECK_NULL(iterator) + PyArray_Descr *resolved = NULL; PyArray_Descr *dtype; while ((dtype = (PyArray_Descr*) PyIter_Next(iterator))) { @@ -268,9 +267,86 @@ AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) } static PyObject * -AK_isin_array_object(PyArrayObject *Py_UNUSED(array), PyArrayObject *Py_UNUSED(other)) +AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) { - return NULL; + /* Algorithm: + + for loc, element in loc_iter(array): + result[loc] = element in set(other) + return ret[rev_idx] + */ + + // 1. Capture original array shape for return value + int array_ndim = PyArray_NDIM(array); + npy_intp* array_dims = PyArray_DIMS(array); + + PyObject* compare_elements = PyFrozenSet_New((PyObject*)other); + AK_CHECK_NULL(compare_elements) + + // 2: Construct empty array + PyArrayObject* result = (PyArrayObject*)PyArray_Empty( + array_ndim, // nd + array_dims, // dims + PyArray_DescrFromType(NPY_BOOL), // dtype + 0); // is_f_order + AK_CHECK_NULL(result) + + // 3. Set up iteration + // https://numpy.org/doc/stable/reference/c-api/iterator.html?highlight=npyiter_multinew#simple-iteration-example + NpyIter *iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + AK_CHECK_NULL(iter) + + // Store locally since it is called alot + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + if (!iternext) { + NpyIter_Deallocate(iter); + return NULL; + } + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + + // 4. Iterate! + int i = 0; + do { + int j = 0; + char* data = *dataptr; + npy_intp size = *sizeptr; + npy_intp stride = *strideptr; + + while (size--) { + PyObject* obj; + memcpy(&obj, data, sizeof(obj)); + + // 5. Assign into result whether or not the element exists in the set + int found = PySequence_Contains(compare_elements, obj); + if (found == -1) { + return NULL; + } + + if (array_ndim == 1){ + *(npy_bool *) PyArray_GETPTR1(result, j) = (npy_bool)found; + } + else { + *(npy_bool *) PyArray_GETPTR2(result, i, j) = (npy_bool)found; + } + + data += stride; + ++j; + } + + ++i; + /* Increment the iterator to the next inner loop */ + } while(iternext(iter)); + + NpyIter_Deallocate(iter); + + return (PyObject*)result; } static PyObject * @@ -342,7 +418,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu array_ndim, // nd array_dims, // dims PyArray_DescrFromType(NPY_BOOL), // dtype - 1); // is_f_order + 0); // is_f_order AK_CHECK_NULL(ret) size_t stride = 0; diff --git a/test/test_util.py b/test/test_util.py index 78cab61a..2601f4a6 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -218,7 +218,7 @@ def test_isin_2d(self) -> None: other_is_unique=oiu, ))) - def test_risky(self) -> None: + def test_1d_2d_dtype_unique(self) -> None: from arraykit import isin_array isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) @@ -252,34 +252,61 @@ def test_risky(self) -> None: post = isin_array_func(array=arr1, other=arr2) self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') - def test_failing_windows(self) -> None: + def test_1d_2d_dtype_object_unique(self) -> None: from arraykit import isin_array - isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + + arr1_1d = np.array([1, 2, 3, 4, 5], dtype=object) + arr1_2d = np.array([[1, 2, 3], [4, 5, 9]], dtype=object) + + arr2 = np.array([1, 4, 7, 9], dtype=object) + + post = isin_array(array=arr1_1d, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_1d, post)) + + post = isin_array(array=arr1_2d, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_2d, post)) + + class C: + def __init__(self, val): + self.val = val + + def __eq__(self, other): + return self.val == other.val - arr1 = np.array([1, 2, 3, 4, 5], dtype=int) - arr2 = np.array([1, 4, 7, 9], dtype=int) - expected = np.array([1, 0, 0, 1, 0], dtype=bool) + def __hash__(self): + return hash(self.val) + + arr1 = np.array([C(1), C(2), C(3), C(4), C(5)]) + arr2 = np.array([C(1), C(4), C(9)]) + + post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_1d, post)) + + arr1 = np.array([[C(1), C(2), C(3)], [C(4), C(5), C(9)]]) + + post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_2d, post)) + + def test_1d_2d_dtype_object_non_unique(self) -> None: + from arraykit import isin_array + + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) - post = isin_array_func(array=arr1, other=arr2) + arr1_1d = np.array([1, 2, 2, 4, 5], dtype=object) + arr1_2d = np.array([[1, 2, 3], [4, 2, 9]], dtype=object) - if not np.array_equal(expected, post): - print('arr1:') - print(arr1.shape, arr1.dtype, arr1.nbytes, arr1.itemsize) - print(arr1.tobytes()) - print(arr1) + arr2 = np.array([1, 4, 4, 9], dtype=object) - print('\narr2:') - print(arr2.shape, arr2.dtype, arr2.nbytes, arr2.itemsize) - print(arr2.tobytes()) - print(arr2) + post = isin_array(array=arr1_1d, array_is_unique=False, other=arr2, other_is_unique=False) + self.assertTrue(np.array_equal(e_1d, post)) - print('\npost:') - print(post.shape, post.dtype, post.nbytes, post.itemsize) - print(post.tobytes()) - print(post) + post = isin_array(array=arr1_2d, array_is_unique=False, other=arr2, other_is_unique=False) + self.assertTrue(np.array_equal(e_2d, post)) - assert False if __name__ == '__main__': unittest.main() From b546e34af7a37d3cef6e9fe8c5598666ee6b814f Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 12 Mar 2021 11:50:33 -0800 Subject: [PATCH 10/29] Cleans up ref counts and error handling in AK_isin_array_object. --- arraykit.c | 85 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 32 deletions(-) diff --git a/arraykit.c b/arraykit.c index 0c33ac63..839a03f8 100644 --- a/arraykit.c +++ b/arraykit.c @@ -42,7 +42,7 @@ } while (0) // To simplify lines merely checking for NULL pointers -# define AK_CHECK_NULL(obj) \ +# define AK_CHECK_NOT(obj) \ if (!obj) { \ return NULL; \ } @@ -103,7 +103,7 @@ PyArray_Descr* AK_ResolveDTypeIter(PyObject *dtypes) { PyObject *iterator = PyObject_GetIter(dtypes); - AK_CHECK_NULL(iterator) + AK_CHECK_NOT(iterator) PyArray_Descr *resolved = NULL; PyArray_Descr *dtype; @@ -241,7 +241,7 @@ static PyObject * resolve_dtype(PyObject *Py_UNUSED(m), PyObject *args) { PyArray_Descr *d1, *d2; - AK_CHECK_NULL(PyArg_ParseTuple(args, "O!O!:resolve_dtype", + AK_CHECK_NOT(PyArg_ParseTuple(args, "O!O!:resolve_dtype", &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2)) return (PyObject *)AK_ResolveDTypes(d1, d2); } @@ -259,7 +259,7 @@ static PyArrayObject * AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) { PyObject *container = PyTuple_Pack(2, arr1, arr2); - AK_CHECK_NULL(container) + AK_CHECK_NOT(container) PyArrayObject *array = (PyArrayObject*)PyArray_Concatenate(container, 0); Py_DECREF(container); @@ -269,42 +269,48 @@ AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) static PyObject * AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) { - /* Algorithm: + /* Algorithm: for loc, element in loc_iter(array): result[loc] = element in set(other) - return ret[rev_idx] */ + // 0. Deallocate on failure + PyObject* compare_elements; + PyArrayObject* result; + NpyIter *iter = NULL; // Compiler gets mad if this isn't set to NULL + // 1. Capture original array shape for return value int array_ndim = PyArray_NDIM(array); npy_intp* array_dims = PyArray_DIMS(array); - PyObject* compare_elements = PyFrozenSet_New((PyObject*)other); - AK_CHECK_NULL(compare_elements) + compare_elements = PyFrozenSet_New((PyObject*)other); + AK_CHECK_NOT(compare_elements) // 2: Construct empty array - PyArrayObject* result = (PyArrayObject*)PyArray_Empty( + result = (PyArrayObject*)PyArray_Empty( array_ndim, // nd array_dims, // dims PyArray_DescrFromType(NPY_BOOL), // dtype 0); // is_f_order - AK_CHECK_NULL(result) + if (!result) { + goto failure; + } // 3. Set up iteration // https://numpy.org/doc/stable/reference/c-api/iterator.html?highlight=npyiter_multinew#simple-iteration-example - NpyIter *iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, - NPY_KEEPORDER, - NPY_NO_CASTING, - NULL); - AK_CHECK_NULL(iter) - - // Store locally since it is called alot + iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + if (!iter) { + goto failure; + } + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); if (!iternext) { - NpyIter_Deallocate(iter); - return NULL; + goto failure; } char** dataptr = NpyIter_GetDataPtrArray(iter); @@ -322,11 +328,17 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) while (size--) { PyObject* obj; memcpy(&obj, data, sizeof(obj)); + if (!obj) { + goto failure; + } + Py_INCREF(obj); // 5. Assign into result whether or not the element exists in the set int found = PySequence_Contains(compare_elements, obj); + Py_DECREF(obj); + if (found == -1) { - return NULL; + goto failure; } if (array_ndim == 1){ @@ -344,9 +356,18 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) /* Increment the iterator to the next inner loop */ } while(iternext(iter)); + Py_DECREF(compare_elements); NpyIter_Deallocate(iter); return (PyObject*)result; + +failure: + Py_DECREF(compare_elements); + Py_XDECREF(result); + if (iter != NULL) { + NpyIter_Deallocate(iter); + } + return NULL; } static PyObject * @@ -390,7 +411,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu // 3. Concatenate PyArrayObject* ar = AK_concat_arrays(array, other); - AK_CHECK_NULL(ar) + AK_CHECK_NOT(ar) size_t ar_size = PyArray_SIZE(ar); @@ -400,16 +421,16 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu // 5. Find duplicates PyObject* sar = PyObject_GetItem((PyObject*)ar, (PyObject*)order); - AK_CHECK_NULL(sar) + AK_CHECK_NOT(sar) PyObject* comp_a = PySequence_GetSlice((PyObject*)sar, 1, ar_size); - AK_CHECK_NULL(comp_a) + AK_CHECK_NOT(comp_a) PyObject* comp_b = PySequence_GetSlice((PyObject*)sar, 0, ar_size - 1); - AK_CHECK_NULL(comp_b) + AK_CHECK_NOT(comp_b) PyObject* flag = PyObject_RichCompare(comp_a, comp_b, Py_EQ); - AK_CHECK_NULL(flag) + AK_CHECK_NOT(flag) npy_bool* flag_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)flag); @@ -419,7 +440,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu array_dims, // dims PyArray_DescrFromType(NPY_BOOL), // dtype 0); // is_f_order - AK_CHECK_NULL(ret) + AK_CHECK_NOT(ret) size_t stride = 0; if (array_ndim == 2) { @@ -459,7 +480,7 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) static char *kwlist[] = {"array", "array_is_unique", "other", "other_is_unique", NULL}; - AK_CHECK_NULL(PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", + AK_CHECK_NOT(PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", kwlist, &PyArray_Type, &array, &array_is_unique, &PyArray_Type, &other, &other_is_unique)) @@ -544,10 +565,10 @@ ArrayGO_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs) int parsed = PyArg_ParseTupleAndKeywords( args, kwargs, "O|$p:ArrayGO", argnames, &iterable, &own_iterable ); - AK_CHECK_NULL(parsed) + AK_CHECK_NOT(parsed) ArrayGOObject *self = (ArrayGOObject *)cls->tp_alloc(cls, 0); - AK_CHECK_NULL(self) + AK_CHECK_NOT(self) if (PyArray_Check(iterable)) { if (!PyDataType_ISOBJECT(PyArray_DESCR((PyArrayObject *)iterable))) { @@ -590,7 +611,7 @@ ArrayGO_append(ArrayGOObject *self, PyObject *value) { if (!self->list) { self->list = PyList_New(1); - AK_CHECK_NULL(self->list) + AK_CHECK_NOT(self->list) Py_INCREF(value); PyList_SET_ITEM(self->list, 0, value); @@ -607,7 +628,7 @@ ArrayGO_extend(ArrayGOObject *self, PyObject *values) { if (!self->list) { self->list = PySequence_List(values); - AK_CHECK_NULL(self->list) + AK_CHECK_NOT(self->list) Py_RETURN_NONE; } From 71d2fa210dc681965a6d98fecd9b9d617eed522d Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 12 Mar 2021 12:21:59 -0800 Subject: [PATCH 11/29] Adds AK_GOTO_ON_NOT. Cleans up variable names. Handles refcounts & errors in AK_isin_array_dtype --- arraykit.c | 123 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 77 insertions(+), 46 deletions(-) diff --git a/arraykit.c b/arraykit.c index 839a03f8..3bbad455 100644 --- a/arraykit.c +++ b/arraykit.c @@ -41,12 +41,19 @@ return NULL;\ } while (0) -// To simplify lines merely checking for NULL pointers +// To simplify lines merely checking for `!value` # define AK_CHECK_NOT(obj) \ if (!obj) { \ return NULL; \ } +// To simplify lines going to a label failure on `!value` +# define AK_GOTO_ON_NOT(obj, label) \ + if (!obj) { \ + goto label; \ + } + + # if defined __GNUC__ || defined __clang__ # define AK_LIKELY(X) __builtin_expect(!!(X), 1) # define AK_UNLIKELY(X) __builtin_expect(!!(X), 0) @@ -276,9 +283,9 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) */ // 0. Deallocate on failure - PyObject* compare_elements; - PyArrayObject* result; - NpyIter *iter = NULL; // Compiler gets mad if this isn't set to NULL + PyObject* compare_elements = NULL; + PyArrayObject* result = NULL; + NpyIter *iter = NULL; // 1. Capture original array shape for return value int array_ndim = PyArray_NDIM(array); @@ -293,9 +300,7 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) array_dims, // dims PyArray_DescrFromType(NPY_BOOL), // dtype 0); // is_f_order - if (!result) { - goto failure; - } + AK_GOTO_ON_NOT(result, failure) // 3. Set up iteration // https://numpy.org/doc/stable/reference/c-api/iterator.html?highlight=npyiter_multinew#simple-iteration-example @@ -304,14 +309,10 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) NPY_KEEPORDER, NPY_NO_CASTING, NULL); - if (!iter) { - goto failure; - } + AK_GOTO_ON_NOT(iter, failure) NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - if (!iternext) { - goto failure; - } + AK_GOTO_ON_NOT(iternext, failure) char** dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); @@ -328,9 +329,7 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) while (size--) { PyObject* obj; memcpy(&obj, data, sizeof(obj)); - if (!obj) { - goto failure; - } + AK_GOTO_ON_NOT(obj, failure) Py_INCREF(obj); // 5. Assign into result whether or not the element exists in the set @@ -375,64 +374,76 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu { /* Algorithm: + array = np.ravel(array) + if not assume_unique: array, rev_idx = np.unique(array, return_inverse=True) other = np.unique(other) - ar = np.concatenate((array, other)) + concatenated = np.concatenate((array, other)) - order = ar.argsort(kind='mergesort') - sar = ar[order] + ordered_idx = concatenated.argsort(kind='mergesort') + sorted_arr = concatenated[ordered_idx] - flag = np.concatenate(((sar[1:] == sar[:-1]), [False])) + flag = np.concatenate(((sorted_arr[1:] == sorted_arr[:-1]), [False])) - ret = np.empty(ar.shape, dtype=bool) - ret[order] = flag + ret = np.empty(concatenated.shape, dtype=bool) + ret[ordered_idx] = flag if assume_unique: return ret[:len(array)] else: return ret[rev_idx] */ + // 0. Deallocate on failure + PyArrayObject* raveled_array = NULL; + PyArrayObject* concatenated = NULL; + PyArrayObject *ordered_idx = NULL; + PyObject* sorted_arr = NULL; + PyObject* left_slice = NULL; + PyObject* right_slice = NULL; + PyObject* comparison = NULL; // 1. Capture original array shape for return value int array_ndim = PyArray_NDIM(array); npy_intp* array_dims = PyArray_DIMS(array); size_t array_size = PyArray_SIZE(array); - // 2. Ravel the array as we want to operate on 1D arrays only. - array = (PyArrayObject*)PyArray_Ravel(array, NPY_CORDER); - // other is guaranteed to be 1D + // 2. Ravel the array as we want to operate on 1D arrays only. (other is guaranteed to be 1D) + raveled_array = (PyArrayObject*)PyArray_Ravel(array, NPY_CORDER); + AK_GOTO_ON_NOT(raveled_array, failure) if (!assume_unique) { - // TODO: Call array, rev_idx = np.unique(array, return_inverse=True) + // TODO: Call raveled_array, rev_idx = np.unique(raveled_array, return_inverse=True) // TODO: Call other = np.unique(other) + goto failure; } // 3. Concatenate - PyArrayObject* ar = AK_concat_arrays(array, other); - AK_CHECK_NOT(ar) + concatenated = AK_concat_arrays(raveled_array, other); + AK_GOTO_ON_NOT(concatenated, failure) - size_t ar_size = PyArray_SIZE(ar); + size_t concatenated_size = PyArray_SIZE(concatenated); // 4: Sort - PyArrayObject *order = (PyArrayObject*)PyArray_ArgSort(ar, 0, NPY_MERGESORT); - npy_intp* order_arr = (npy_intp*)PyArray_DATA(order); + ordered_idx = (PyArrayObject*)PyArray_ArgSort(concatenated, 0, NPY_MERGESORT); + AK_GOTO_ON_NOT(ordered_idx, failure) + npy_intp* ordered_idx_arr = (npy_intp*)PyArray_DATA(ordered_idx); // 5. Find duplicates - PyObject* sar = PyObject_GetItem((PyObject*)ar, (PyObject*)order); - AK_CHECK_NOT(sar) + sorted_arr = PyObject_GetItem((PyObject*)concatenated, (PyObject*)ordered_idx); + AK_GOTO_ON_NOT(sorted_arr, failure) - PyObject* comp_a = PySequence_GetSlice((PyObject*)sar, 1, ar_size); - AK_CHECK_NOT(comp_a) + left_slice = PySequence_GetSlice((PyObject*)sorted_arr, 1, concatenated_size); + AK_GOTO_ON_NOT(left_slice, failure) - PyObject* comp_b = PySequence_GetSlice((PyObject*)sar, 0, ar_size - 1); - AK_CHECK_NOT(comp_b) + right_slice = PySequence_GetSlice((PyObject*)sorted_arr, 0, concatenated_size - 1); + AK_GOTO_ON_NOT(right_slice, failure) - PyObject* flag = PyObject_RichCompare(comp_a, comp_b, Py_EQ); - AK_CHECK_NOT(flag) + comparison = PyObject_RichCompare(left_slice, right_slice, Py_EQ); + AK_GOTO_ON_NOT(comparison, failure) - npy_bool* flag_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)flag); + npy_bool* comparison_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)comparison); // 6: Construct empty array PyArrayObject* ret = (PyArrayObject*)PyArray_Empty( @@ -440,7 +451,8 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu array_dims, // dims PyArray_DescrFromType(NPY_BOOL), // dtype 0); // is_f_order - AK_CHECK_NOT(ret) + + AK_GOTO_ON_NOT(ret, failure) size_t stride = 0; if (array_ndim == 2) { @@ -448,28 +460,46 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu } // 7: Assign into duplicates array - for (size_t i = 0; i < (size_t)PyArray_SIZE(order); ++i) { - size_t idx_0 = (size_t)order_arr[i]; + for (size_t i = 0; i < (size_t)PyArray_SIZE(ordered_idx); ++i) { + size_t idx_0 = (size_t)ordered_idx_arr[i]; if (idx_0 >= array_size) { continue; } // We are guaranteed that flag_ar[i] is always a valid index if (array_ndim == 1) { - *(npy_bool *) PyArray_GETPTR1(ret, idx_0) = flag_arr[i]; + *(npy_bool *) PyArray_GETPTR1(ret, idx_0) = comparison_arr[i]; } else { size_t idx_1 = idx_0 / stride; idx_0 = idx_0 - (stride * idx_1); - *(npy_bool *) PyArray_GETPTR2(ret, idx_1, idx_0) = flag_arr[i]; + *(npy_bool *) PyArray_GETPTR2(ret, idx_1, idx_0) = comparison_arr[i]; } } // 8. Return! if (assume_unique) { + Py_DECREF(raveled_array); + Py_DECREF(concatenated); + Py_DECREF(ordered_idx); + Py_DECREF(sorted_arr); + Py_DECREF(left_slice); + Py_DECREF(right_slice); + Py_DECREF(comparison); return (PyObject*)ret; } + // return NULL; + +failure: + Py_XDECREF(raveled_array); + Py_XDECREF(concatenated); + Py_XDECREF(ordered_idx); + Py_XDECREF(sorted_arr); + Py_XDECREF(left_slice); + Py_XDECREF(right_slice); + Py_XDECREF(comparison); return NULL; + } static PyObject * @@ -492,11 +522,12 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) PyArray_Descr* array_dtype = PyArray_DTYPE(array); PyArray_Descr* other_dtype = PyArray_DTYPE(other); - // 2. Handle object dtypes + // Use Python sets to handle object arrays if (PyDataType_ISOBJECT(array_dtype) || PyDataType_ISOBJECT(other_dtype)) { return AK_isin_array_object(array, other); } + // Use numpy in1d logic for dtype arrays return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); } From 6c07d658288e6ce84d2b8b05c90f35a5cdff8b5f Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 12 Mar 2021 19:46:50 -0800 Subject: [PATCH 12/29] Partial commit to save work. Working on infrastructure for non-unique dtype arrays. --- arraykit.c | 377 +++++++++++++++++++++++++++++++++- performance/reference/util.py | 34 +++ test/test_util.py | 229 +++++++++++---------- 3 files changed, 525 insertions(+), 115 deletions(-) diff --git a/arraykit.c b/arraykit.c index 3bbad455..2fccf30a 100644 --- a/arraykit.c +++ b/arraykit.c @@ -5,6 +5,7 @@ # define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION # include "numpy/arrayobject.h" +# include "numpy/arrayscalars.h" // Needed for Datetime scalar expansions //------------------------------------------------------------------------------ // Macros @@ -273,6 +274,352 @@ AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) return array; } +static void +AK_print_array_float(PyArrayObject *array) +{ + NpyIter *iter = NULL; + + int array_ndim = PyArray_NDIM(array); + npy_intp* array_dims = PyArray_DIMS(array); + + iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + + // 4. Iterate! + do { + char* data = *dataptr; + npy_intp size = *sizeptr; + npy_intp stride = *strideptr; + + printf("NUMPY %i ", (int)stride); + while (size--) { + double val; + memcpy(&val, data, sizeof(double)); + printf("%.1f ", val); + data += stride; + } + printf("\n"); + + /* Increment the iterator to the next inner loop */ + } while(iternext(iter)); + + NpyIter_Deallocate(iter); +} + +static void +AK_print_array_bool(PyArrayObject *array) +{ + NpyIter *iter = NULL; + + int array_ndim = PyArray_NDIM(array); + npy_intp* array_dims = PyArray_DIMS(array); + + iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + + // 4. Iterate! + do { + char* data = *dataptr; + npy_intp size = *sizeptr; + npy_intp stride = *strideptr; + + printf("NUMPY %i ", (int)stride); + while (size--) { + printf("%i ", (int)*data); + data += stride; + } + printf("\n"); + + /* Increment the iterator to the next inner loop */ + } while(iternext(iter)); + + NpyIter_Deallocate(iter); +} + +static int +is_nan(PyObject *a) +{ + double v = PyFloat_AsDouble(a); + + // Need to disambiguate, since v could be -1 and no failure happened + if (v == -1 && PyErr_Occurred()) { + return -1; + } + + return isnan(v); +} + +static int +is_nanj(PyObject *a) +{ + return isnan(((PyComplexObject*)a)->cval.real); +} + +static int +is_nat(PyObject *a) +{ + // NaT - Datetime + if (PyArray_IsScalar(a, Datetime)) { // Cannot fail + return PyArrayScalar_VAL(a, Datetime) == NPY_DATETIME_NAT; + } + + // NaT - Timedelta + if (PyArray_IsScalar(a, Timedelta)) { // Cannot fail + return PyArrayScalar_VAL(a, Timedelta) == NPY_DATETIME_NAT; + } + + Py_UNREACHABLE(); +} + +static PyObject* +AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) +{ + /* Algorithm (assumes `sar` is sorted & mask is initialized to [1, 0, ... len(sar)] + + if sar.dtype.kind in "cfmM" and np.isnan(sar[-1]): + if sar.dtype.kind == "c": # for complex all NaNs are considered equivalent + aux_firstnan = np.searchsorted(np.isnan(sar), True, side='left') + else: + aux_firstnan = np.searchsorted(sar, sar[-1], side='left') + + mask[1:aux_firstnan] = (sar[1:aux_firstnan] != sar[:aux_firstnan - 1]) + mask[aux_firstnan] = True + mask[aux_firstnan + 1:] = False + else: + mask[1:] = sar[1:] != sar[:-1] + */ + PyObject* left_slice = NULL; + PyObject* right_slice = NULL; + PyObject* comparison = NULL; + + int size = PyArray_SIZE(sar); + PyArray_Descr* dtype = PyArray_DESCR(sar); + + int is_float = PyDataType_ISFLOAT(dtype); + int is_complex = PyDataType_ISCOMPLEX(dtype); + int is_dt = PyDataType_ISDATETIME(dtype); + + PyObject* last_element = NULL; + int contains_nan = 0; + if (is_float | is_complex | is_dt) { + last_element = PyObject_GetItem((PyObject*)sar, PyLong_FromLong(-1)); + AK_GOTO_ON_NOT(last_element, failure) + if (is_float) { + contains_nan = is_nan(last_element); + } + else if (is_complex) { + contains_nan = is_nanj(last_element); + } + else { + // This will always be false as long as numpy < 1.18. NaT sort to the front + contains_nan = is_nat(last_element); + } + } + + if (contains_nan) { + size_t firstnan = 0; + if (is_complex) { + // aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') + } + else { + // This gives back an array of 1-element since `last_element` is a single element + PyObject* firstnan_obj = PyArray_SearchSorted(sar, last_element, NPY_SEARCHLEFT, NULL); + AK_GOTO_ON_NOT(firstnan_obj, failure) + + firstnan = *(size_t*)PyArray_DATA((PyArrayObject*)firstnan_obj); + Py_DECREF(firstnan_obj); + } + + left_slice = PySequence_GetSlice((PyObject*)sar, 1, (int)firstnan); + AK_GOTO_ON_NOT(left_slice, failure) + + right_slice = PySequence_GetSlice((PyObject*)sar, 0, firstnan - 1); + AK_GOTO_ON_NOT(right_slice, failure) + + comparison = PyObject_RichCompare(left_slice, right_slice, Py_NE); + AK_GOTO_ON_NOT(comparison, failure) + + npy_bool* comparison_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)comparison); + + for (size_t i = 1; i < firstnan; ++i) { + mask[i] = comparison_arr[i-1]; + } + mask[firstnan] = 1; + for (size_t i = firstnan + 1; i < size; ++i) { + mask[i] = 0; + } + } + else { + left_slice = PySequence_GetSlice((PyObject*)sar, 1, size); + AK_GOTO_ON_NOT(left_slice, failure) + + right_slice = PySequence_GetSlice((PyObject*)sar, 0, size - 1); + AK_GOTO_ON_NOT(right_slice, failure) + + comparison = PyObject_RichCompare(left_slice, right_slice, Py_NE); + AK_GOTO_ON_NOT(comparison, failure) + + npy_bool* comparison_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)comparison); + + for (size_t i = 1; i < (size_t)size; ++i) { + mask[i] = comparison_arr[i-1]; + } + } + + return 1; + +failure: + return -1; +} + +static PyObject* +AK_get_unique_arr(PyArrayObject *original_arr) +{ + /* Algorithm + + arr = arr(original_arr) + arr.sort() + + mask = np.empty(arr.shape, dtype=np.bool_) + mask[0] = True + + build_mask(...) + + return arr[mask] + */ + + // 1. Initialize + PyObject* filtered_arr = NULL; + + int size = PyArray_SIZE(original_arr); + PyArray_Descr* dtype = PyArray_DESCR(original_arr); + + npy_bool mask_arr[size]; + + // 2. Get a copy of the original arr since sorting is inplace + PyArrayObject* arr = (PyArrayObject*)PyArray_FromArray( + original_arr, + dtype, + NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSURECOPY); + + if (PyArray_Sort(arr, 0, NPY_QUICKSORT) == -1) { // In-place + goto failure; + } + + // 3. Build mask + memset(mask_arr, 0, sizeof(mask_arr)); + mask_arr[0] = 1; + + AK_build_unique_arr_mask(arr, &mask_arr); + + PyObject* mask = PyArray_NewFromDescr( + &PyArray_Type, // class (subtype) + PyArray_DescrFromType(NPY_BOOL), // dtype (descr) + PyArray_NDIM(arr), // ndim (nd) + PyArray_DIMS(arr), // dims + NULL, // strides + mask_arr, // data + NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags + NULL); // sublclass (obj) + + // 4. Filter arr + filtered_arr = PyObject_GetItem((PyObject*)arr, (PyObject*)mask); + AK_GOTO_ON_NOT(filtered_arr, failure) + + return filtered_arr; + +failure: + printf("FAILURE!\n"); + return NULL; +} + +static PyObject* +AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) +{ + /* Algorithm + + arr = arr(original_arr) + + perm = arr.argsort('quicksort') + arr = arr[perm] + + mask = np.empty(arr.shape, dtype=np.bool_) + mask[0] = True + + AK_build_unique_arr_mask(arr, mask) + + ret = arr[mask] + imask = np.cumsum(mask) - 1 + inv_idx = np.empty(mask.shape, dtype=np.intp) + inv_idx[perm] = imask + return ret, inv_idx + + return ret + */ + + // 1. Initialize + PyObject* filtered_arr = NULL; + + int size = PyArray_SIZE(original_arr); + PyArray_Descr* dtype = PyArray_DESCR(original_arr); + + npy_bool mask_arr[size]; + + // 2. Get a copy of the original arr since sorting is inplace + PyArrayObject* arr = (PyArrayObject*)PyArray_FromArray( + original_arr, + dtype, + NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSURECOPY); + + if (PyArray_Sort(arr, 0, NPY_QUICKSORT) == -1) { // In-place + goto failure; + } + + // 3. Build mask + memset(mask_arr, 0, sizeof(mask_arr)); + mask_arr[0] = 1; + + AK_build_unique_arr_mask(arr, &mask_arr); + + PyObject* mask = PyArray_NewFromDescr( + &PyArray_Type, // class (subtype) + PyArray_DescrFromType(NPY_BOOL), // dtype (descr) + PyArray_NDIM(arr), // ndim (nd) + PyArray_DIMS(arr), // dims + NULL, // strides + mask_arr, // data + NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags + NULL); // sublclass (obj) + + // 4. Filter arr + filtered_arr = PyObject_GetItem((PyObject*)arr, (PyObject*)mask); + AK_GOTO_ON_NOT(filtered_arr, failure) + + return filtered_arr; + +failure: + printf("FAILURE!\n"); + return NULL; +} + static PyObject * AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) { @@ -397,6 +744,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu */ // 0. Deallocate on failure PyArrayObject* raveled_array = NULL; + PyArrayObject *reverse_idx = NULL; PyArrayObject* concatenated = NULL; PyArrayObject *ordered_idx = NULL; PyObject* sorted_arr = NULL; @@ -412,17 +760,30 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu // 2. Ravel the array as we want to operate on 1D arrays only. (other is guaranteed to be 1D) raveled_array = (PyArrayObject*)PyArray_Ravel(array, NPY_CORDER); AK_GOTO_ON_NOT(raveled_array, failure) + Py_INCREF(raveled_array); + + return AK_get_unique(raveled_array); if (!assume_unique) { - // TODO: Call raveled_array, rev_idx = np.unique(raveled_array, return_inverse=True) - // TODO: Call other = np.unique(other) - goto failure; - } + PyObject* arr_and_rev_idx = AK_get_unique_arr_w_inverse(raveled_array); + PyArrayObject *raveled_array_unique = PyTuple_GetItem(arr_and_rev_idx, 0); + reverse_idx = PyTuple_GetItem(arr_and_rev_idx, 1); + Py_DECREF(arr_and_rev_idx); + + PyArrayObject *other_unique = (PyArrayObject*)AK_get_unique_arr(other); - // 3. Concatenate - concatenated = AK_concat_arrays(raveled_array, other); + // 3. Concatenate + concatenated = AK_concat_arrays(raveled_array_unique, other_unique); + Py_DECREF(raveled_array_unique); + Py_DECREF(other_unique); + } + else { + // 3. Concatenate + concatenated = AK_concat_arrays(raveled_array, other); + } AK_GOTO_ON_NOT(concatenated, failure) + size_t concatenated_size = PyArray_SIZE(concatenated); // 4: Sort @@ -476,6 +837,8 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu } } + Py_XDECREF(reverse_idx); // This might not exist + // 8. Return! if (assume_unique) { Py_DECREF(raveled_array); @@ -492,6 +855,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu failure: Py_XDECREF(raveled_array); + Py_XDECREF(reverse_idx); Py_XDECREF(concatenated); Py_XDECREF(ordered_idx); Py_XDECREF(sorted_arr); @@ -526,7 +890,6 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) if (PyDataType_ISOBJECT(array_dtype) || PyDataType_ISOBJECT(other_dtype)) { return AK_isin_array_object(array, other); } - // Use numpy in1d logic for dtype arrays return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); } diff --git a/performance/reference/util.py b/performance/reference/util.py index 8203a210..d93c41f5 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -246,3 +246,37 @@ def isin_array(*, result.flags.writeable = False return result + +def unique(ar, return_inverse=False): + + ar = np.asanyarray(ar).flatten() + + if return_inverse: + perm = ar.argsort(kind='quicksort') + aux = ar[perm] + else: + ar.sort() + aux = ar + + mask = np.empty(aux.shape, dtype=np.bool_) + mask[:1] = True + if aux.dtype.kind in "cfmM" and np.isnan(aux[-1]): + if aux.dtype.kind == "c": # for complex all NaNs are considered equivalent + aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') + else: + aux_firstnan = np.searchsorted(aux, aux[-1], side='left') + + mask[1:aux_firstnan] = (aux[1:aux_firstnan] != aux[:aux_firstnan - 1]) + mask[aux_firstnan] = True + mask[aux_firstnan + 1:] = False + else: + mask[1:] = aux[1:] != aux[:-1] + + ret = aux[mask] + if return_inverse: + imask = np.cumsum(mask) - 1 + inv_idx = np.empty(mask.shape, dtype=np.intp) + inv_idx[perm] = imask + return ret, inv_idx + + return ret \ No newline at end of file diff --git a/test/test_util.py b/test/test_util.py index 2601f4a6..477c454c 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -170,142 +170,155 @@ def test_row_1d_filter_a(self) -> None: with self.assertRaises(NotImplementedError): row_1d_filter(a1.reshape(1,2,5)) - def test_isin_1d(self) -> None: - from performance.reference.util import isin_array - - T, F = True, False - arr1 = np.array([1, 2, 3, 4, 5]) - - expected = [ - (np.array([T, F, T, T, F]), [1, 3, 4]), - (np.array([F, F, F, F, F]), [7, 8]), - (np.array([T, T, T, T, T]), [1, 2, 3, 4, 5]), - ] - - for expected_result, values in expected: - for dtype in (int, object): - arr2 = np.array(values, dtype=dtype) - - for aiu, oiu in itertools.product((T, F), (T, F)): - self.assertTrue(np.array_equal(expected_result, isin_array( - array=arr1, - array_is_unique=aiu, - other=arr2, - other_is_unique=oiu, - ))) - - def test_isin_2d(self) -> None: - from performance.reference.util import isin_array - - T, F = True, False - arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - - expected = [ - (np.array([[T, F, T], [T, F, F], [F, F, T]]), [1, 3, 4, 9]), - (np.array([[F, F, F], [F, F, F], [F, F, F]]), [10, 11]), - (np.array([[T, T, T], [T, T, T], [T, T, T]]), [1, 2, 3, 4, 5, 6, 7, 8, 9]), - ] - - for expected_result, values in expected: - for dtype in (int, object): - arr2 = np.array(values, dtype=dtype) - - for aiu, oiu in itertools.product((T, F), (T, F)): - self.assertTrue(np.array_equal(expected_result, isin_array( - array=arr1, - array_is_unique=aiu, - other=arr2, - other_is_unique=oiu, - ))) - - def test_1d_2d_dtype_unique(self) -> None: - from arraykit import isin_array + # def test_isin_1d(self) -> None: + # from performance.reference.util import isin_array - isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) + # T, F = True, False + # arr1 = np.array([1, 2, 3, 4, 5]) - e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) - e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + # expected = [ + # (np.array([T, F, T, T, F]), [1, 3, 4]), + # (np.array([F, F, F, F, F]), [7, 8]), + # (np.array([T, T, T, T, T]), [1, 2, 3, 4, 5]), + # ] - v_1d = [1, 2, 3, 4, 5] - v_2d = [[1, 2, 3], [4, 5, 9]] + # for expected_result, values in expected: + # for dtype in (int, object): + # arr2 = np.array(values, dtype=dtype) - w_1d = [1, 4, 7, 9] + # for aiu, oiu in itertools.product((T, F), (T, F)): + # self.assertTrue(np.array_equal(expected_result, isin_array( + # array=arr1, + # array_is_unique=aiu, + # other=arr2, + # other_is_unique=oiu, + # ))) - dtype_funcs = [ - (int, int), - (float, float), - (str, str), - ('datetime64[D]', lambda x: date(2020, 1, x)), - ] + # def test_isin_2d(self) -> None: + # from performance.reference.util import isin_array - for dtype, dtype_func in dtype_funcs: - arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) - arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + # T, F = True, False + # arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) - post = isin_array_func(array=arr1, other=arr2) - self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') + # expected = [ + # (np.array([[T, F, T], [T, F, F], [F, F, T]]), [1, 3, 4, 9]), + # (np.array([[F, F, F], [F, F, F], [F, F, F]]), [10, 11]), + # (np.array([[T, T, T], [T, T, T], [T, T, T]]), [1, 2, 3, 4, 5, 6, 7, 8, 9]), + # ] - for dtype, dtype_func in dtype_funcs: - arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) - arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + # for expected_result, values in expected: + # for dtype in (int, object): + # arr2 = np.array(values, dtype=dtype) - post = isin_array_func(array=arr1, other=arr2) - self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') + # for aiu, oiu in itertools.product((T, F), (T, F)): + # self.assertTrue(np.array_equal(expected_result, isin_array( + # array=arr1, + # array_is_unique=aiu, + # other=arr2, + # other_is_unique=oiu, + # ))) - def test_1d_2d_dtype_object_unique(self) -> None: - from arraykit import isin_array + # def test_1d_2d_dtype_unique(self) -> None: + # from arraykit import isin_array - e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) - e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + # isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) - arr1_1d = np.array([1, 2, 3, 4, 5], dtype=object) - arr1_2d = np.array([[1, 2, 3], [4, 5, 9]], dtype=object) + # e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + # e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) - arr2 = np.array([1, 4, 7, 9], dtype=object) + # v_1d = [1, 2, 3, 4, 5] + # v_2d = [[1, 2, 3], [4, 5, 9]] - post = isin_array(array=arr1_1d, array_is_unique=True, other=arr2, other_is_unique=True) - self.assertTrue(np.array_equal(e_1d, post)) + # w_1d = [1, 4, 7, 9] - post = isin_array(array=arr1_2d, array_is_unique=True, other=arr2, other_is_unique=True) - self.assertTrue(np.array_equal(e_2d, post)) + # dtype_funcs = [ + # (int, int), + # (float, float), + # (str, str), + # ('datetime64[D]', lambda x: date(2020, 1, x)), + # ] - class C: - def __init__(self, val): - self.val = val + # for dtype, dtype_func in dtype_funcs: + # arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) + # arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - def __eq__(self, other): - return self.val == other.val + # post = isin_array_func(array=arr1, other=arr2) + # self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') - def __hash__(self): - return hash(self.val) + # for dtype, dtype_func in dtype_funcs: + # arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) + # arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - arr1 = np.array([C(1), C(2), C(3), C(4), C(5)]) - arr2 = np.array([C(1), C(4), C(9)]) + # post = isin_array_func(array=arr1, other=arr2) + # self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') - post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) - self.assertTrue(np.array_equal(e_1d, post)) + # def test_1d_2d_dtype_object_unique(self) -> None: + # from arraykit import isin_array - arr1 = np.array([[C(1), C(2), C(3)], [C(4), C(5), C(9)]]) + # e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + # e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) - post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) - self.assertTrue(np.array_equal(e_2d, post)) + # arr1_1d = np.array([1, 2, 3, 4, 5], dtype=object) + # arr1_2d = np.array([[1, 2, 3], [4, 5, 9]], dtype=object) - def test_1d_2d_dtype_object_non_unique(self) -> None: - from arraykit import isin_array + # arr2 = np.array([1, 4, 7, 9], dtype=object) + + # post = isin_array(array=arr1_1d, array_is_unique=True, other=arr2, other_is_unique=True) + # self.assertTrue(np.array_equal(e_1d, post)) + + # post = isin_array(array=arr1_2d, array_is_unique=True, other=arr2, other_is_unique=True) + # self.assertTrue(np.array_equal(e_2d, post)) + + # class C: + # def __init__(self, val): + # self.val = val + + # def __eq__(self, other): + # return self.val == other.val + + # def __hash__(self): + # return hash(self.val) - e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) - e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + # arr1 = np.array([C(1), C(2), C(3), C(4), C(5)]) + # arr2 = np.array([C(1), C(4), C(9)]) - arr1_1d = np.array([1, 2, 2, 4, 5], dtype=object) - arr1_2d = np.array([[1, 2, 3], [4, 2, 9]], dtype=object) + # post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + # self.assertTrue(np.array_equal(e_1d, post)) - arr2 = np.array([1, 4, 4, 9], dtype=object) + # arr1 = np.array([[C(1), C(2), C(3)], [C(4), C(5), C(9)]]) - post = isin_array(array=arr1_1d, array_is_unique=False, other=arr2, other_is_unique=False) - self.assertTrue(np.array_equal(e_1d, post)) + # post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + # self.assertTrue(np.array_equal(e_2d, post)) + + # def test_1d_2d_dtype_object_non_unique(self) -> None: + # from arraykit import isin_array + + # e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + # e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + + # arr1_1d = np.array([1, 2, 2, 4, 5], dtype=object) + # arr1_2d = np.array([[1, 2, 3], [4, 2, 9]], dtype=object) + + # arr2 = np.array([1, 4, 4, 9], dtype=object) + + # post = isin_array(array=arr1_1d, array_is_unique=False, other=arr2, other_is_unique=False) + # self.assertTrue(np.array_equal(e_1d, post)) + + # post = isin_array(array=arr1_2d, array_is_unique=False, other=arr2, other_is_unique=False) + # self.assertTrue(np.array_equal(e_2d, post)) + + def test_risky(self) -> None: + print() + from arraykit import isin_array - post = isin_array(array=arr1_2d, array_is_unique=False, other=arr2, other_is_unique=False) - self.assertTrue(np.array_equal(e_2d, post)) + #arr = np.array([6,3,np.datetime64('NaT'),5,1,8,6,1,3,3, np.datetime64('NaT')], dtype='datetime64[Y]') + #arr = np.array([6,3,np.timedelta64('NaT'),5,1,8,6,1,3,3, np.timedelta64('NaT')], dtype='timedelta64[Y]') + arr = np.array([6,3,np.nan,5,1,8,6,1,3,3, np.nan], dtype=float) + other = np.array([]) + print('ORIG: ', ' '.join(str(x) for x in arr))#.astype(int))) + post = isin_array(array=arr, array_is_unique=True, other=other, other_is_unique=True) + print('POST: ', ' '.join(str(x) for x in post))#.astype(int))) + print('ORIG: ', ' '.join(str(x) for x in arr))#.astype(int))) if __name__ == '__main__': From 376c49a198d5fb990a20dc2bf2e5478a528f4351 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 12 Mar 2021 20:04:18 -0800 Subject: [PATCH 13/29] Fixes compiler warnings & rearranges order of function declarations --- arraykit.c | 297 +++++++++++++++++++++++++++-------------------------- 1 file changed, 153 insertions(+), 144 deletions(-) diff --git a/arraykit.c b/arraykit.c index 2fccf30a..b4dd0532 100644 --- a/arraykit.c +++ b/arraykit.c @@ -261,19 +261,47 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) } //------------------------------------------------------------------------------ -// isin +// utils -static PyArrayObject * -AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) +static int +is_nan(PyObject *a) { - PyObject *container = PyTuple_Pack(2, arr1, arr2); - AK_CHECK_NOT(container) + double v = PyFloat_AsDouble(a); - PyArrayObject *array = (PyArrayObject*)PyArray_Concatenate(container, 0); - Py_DECREF(container); - return array; + // Need to disambiguate, since v could be -1 and no failure happened + if (v == -1 && PyErr_Occurred()) { + return -1; + } + + return isnan(v); } +static int +is_nanj(PyObject *a) +{ + return isnan(((PyComplexObject*)a)->cval.real); +} + +static int +is_nat(PyObject *a) +{ + // NaT - Datetime + if (PyArray_IsScalar(a, Datetime)) { // Cannot fail + return PyArrayScalar_VAL(a, Datetime) == NPY_DATETIME_NAT; + } + + // NaT - Timedelta + if (PyArray_IsScalar(a, Timedelta)) { // Cannot fail + return PyArrayScalar_VAL(a, Timedelta) == NPY_DATETIME_NAT; + } + + Py_UNREACHABLE(); +} + +//------------------------------------------------------------------------------ +// isin + +/* static void AK_print_array_float(PyArrayObject *array) { @@ -309,7 +337,7 @@ AK_print_array_float(PyArrayObject *array) } printf("\n"); - /* Increment the iterator to the next inner loop */ + // Increment the iterator to the next inner loop } while(iternext(iter)); NpyIter_Deallocate(iter); @@ -348,48 +376,26 @@ AK_print_array_bool(PyArrayObject *array) } printf("\n"); - /* Increment the iterator to the next inner loop */ + // Increment the iterator to the next inner loop } while(iternext(iter)); NpyIter_Deallocate(iter); } +*/ -static int -is_nan(PyObject *a) +// DONE +static PyArrayObject * +AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) { - double v = PyFloat_AsDouble(a); - - // Need to disambiguate, since v could be -1 and no failure happened - if (v == -1 && PyErr_Occurred()) { - return -1; - } - - return isnan(v); -} + PyObject *container = PyTuple_Pack(2, arr1, arr2); + AK_CHECK_NOT(container) -static int -is_nanj(PyObject *a) -{ - return isnan(((PyComplexObject*)a)->cval.real); + PyArrayObject *array = (PyArrayObject*)PyArray_Concatenate(container, 0); + Py_DECREF(container); + return array; } static int -is_nat(PyObject *a) -{ - // NaT - Datetime - if (PyArray_IsScalar(a, Datetime)) { // Cannot fail - return PyArrayScalar_VAL(a, Datetime) == NPY_DATETIME_NAT; - } - - // NaT - Timedelta - if (PyArray_IsScalar(a, Timedelta)) { // Cannot fail - return PyArrayScalar_VAL(a, Timedelta) == NPY_DATETIME_NAT; - } - - Py_UNREACHABLE(); -} - -static PyObject* AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) { /* Algorithm (assumes `sar` is sorted & mask is initialized to [1, 0, ... len(sar)] @@ -410,7 +416,7 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) PyObject* right_slice = NULL; PyObject* comparison = NULL; - int size = PyArray_SIZE(sar); + size_t size = (size_t)PyArray_SIZE(sar); PyArray_Descr* dtype = PyArray_DESCR(sar); int is_float = PyDataType_ISFLOAT(dtype); @@ -528,7 +534,7 @@ AK_get_unique_arr(PyArrayObject *original_arr) memset(mask_arr, 0, sizeof(mask_arr)); mask_arr[0] = 1; - AK_build_unique_arr_mask(arr, &mask_arr); + AK_GOTO_ON_NOT(AK_build_unique_arr_mask(arr, mask_arr), failure) PyObject* mask = PyArray_NewFromDescr( &PyArray_Type, // class (subtype) @@ -597,7 +603,7 @@ AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) memset(mask_arr, 0, sizeof(mask_arr)); mask_arr[0] = 1; - AK_build_unique_arr_mask(arr, &mask_arr); + AK_GOTO_ON_NOT(AK_build_unique_arr_mask(arr, mask_arr), failure) PyObject* mask = PyArray_NewFromDescr( &PyArray_Type, // class (subtype) @@ -620,102 +626,6 @@ AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) return NULL; } -static PyObject * -AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) -{ - /* Algorithm: - - for loc, element in loc_iter(array): - result[loc] = element in set(other) - */ - - // 0. Deallocate on failure - PyObject* compare_elements = NULL; - PyArrayObject* result = NULL; - NpyIter *iter = NULL; - - // 1. Capture original array shape for return value - int array_ndim = PyArray_NDIM(array); - npy_intp* array_dims = PyArray_DIMS(array); - - compare_elements = PyFrozenSet_New((PyObject*)other); - AK_CHECK_NOT(compare_elements) - - // 2: Construct empty array - result = (PyArrayObject*)PyArray_Empty( - array_ndim, // nd - array_dims, // dims - PyArray_DescrFromType(NPY_BOOL), // dtype - 0); // is_f_order - AK_GOTO_ON_NOT(result, failure) - - // 3. Set up iteration - // https://numpy.org/doc/stable/reference/c-api/iterator.html?highlight=npyiter_multinew#simple-iteration-example - iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, - NPY_KEEPORDER, - NPY_NO_CASTING, - NULL); - AK_GOTO_ON_NOT(iter, failure) - - NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - AK_GOTO_ON_NOT(iternext, failure) - - char** dataptr = NpyIter_GetDataPtrArray(iter); - npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); - npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); - - // 4. Iterate! - int i = 0; - do { - int j = 0; - char* data = *dataptr; - npy_intp size = *sizeptr; - npy_intp stride = *strideptr; - - while (size--) { - PyObject* obj; - memcpy(&obj, data, sizeof(obj)); - AK_GOTO_ON_NOT(obj, failure) - Py_INCREF(obj); - - // 5. Assign into result whether or not the element exists in the set - int found = PySequence_Contains(compare_elements, obj); - Py_DECREF(obj); - - if (found == -1) { - goto failure; - } - - if (array_ndim == 1){ - *(npy_bool *) PyArray_GETPTR1(result, j) = (npy_bool)found; - } - else { - *(npy_bool *) PyArray_GETPTR2(result, i, j) = (npy_bool)found; - } - - data += stride; - ++j; - } - - ++i; - /* Increment the iterator to the next inner loop */ - } while(iternext(iter)); - - Py_DECREF(compare_elements); - NpyIter_Deallocate(iter); - - return (PyObject*)result; - -failure: - Py_DECREF(compare_elements); - Py_XDECREF(result); - if (iter != NULL) { - NpyIter_Deallocate(iter); - } - return NULL; -} - static PyObject * AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_unique) { @@ -762,13 +672,14 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu AK_GOTO_ON_NOT(raveled_array, failure) Py_INCREF(raveled_array); - return AK_get_unique(raveled_array); - if (!assume_unique) { PyObject* arr_and_rev_idx = AK_get_unique_arr_w_inverse(raveled_array); - PyArrayObject *raveled_array_unique = PyTuple_GetItem(arr_and_rev_idx, 0); - reverse_idx = PyTuple_GetItem(arr_and_rev_idx, 1); + PyArrayObject *raveled_array_unique = (PyArrayObject*)PyTuple_GetItem(arr_and_rev_idx, 0); + AK_GOTO_ON_NOT(raveled_array_unique, failure) + + reverse_idx = (PyArrayObject*)PyTuple_GetItem(arr_and_rev_idx, (Py_ssize_t)1); Py_DECREF(arr_and_rev_idx); + AK_GOTO_ON_NOT(reverse_idx, failure) PyArrayObject *other_unique = (PyArrayObject*)AK_get_unique_arr(other); @@ -866,6 +777,104 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu } +// DONE +static PyObject * +AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) +{ + /* Algorithm: + + for loc, element in loc_iter(array): + result[loc] = element in set(other) + */ + + // 0. Deallocate on failure + PyObject* compare_elements = NULL; + PyArrayObject* result = NULL; + NpyIter *iter = NULL; + + // 1. Capture original array shape for return value + int array_ndim = PyArray_NDIM(array); + npy_intp* array_dims = PyArray_DIMS(array); + + compare_elements = PyFrozenSet_New((PyObject*)other); + AK_CHECK_NOT(compare_elements) + + // 2: Construct empty array + result = (PyArrayObject*)PyArray_Empty( + array_ndim, // nd + array_dims, // dims + PyArray_DescrFromType(NPY_BOOL), // dtype + 0); // is_f_order + AK_GOTO_ON_NOT(result, failure) + + // 3. Set up iteration + // https://numpy.org/doc/stable/reference/c-api/iterator.html?highlight=npyiter_multinew#simple-iteration-example + iter = NpyIter_New(array, + NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, + NPY_KEEPORDER, + NPY_NO_CASTING, + NULL); + AK_GOTO_ON_NOT(iter, failure) + + NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); + AK_GOTO_ON_NOT(iternext, failure) + + char** dataptr = NpyIter_GetDataPtrArray(iter); + npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); + npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); + + // 4. Iterate! + int i = 0; + do { + int j = 0; + char* data = *dataptr; + npy_intp size = *sizeptr; + npy_intp stride = *strideptr; + + while (size--) { + PyObject* obj; + memcpy(&obj, data, sizeof(obj)); + AK_GOTO_ON_NOT(obj, failure) + Py_INCREF(obj); + + // 5. Assign into result whether or not the element exists in the set + int found = PySequence_Contains(compare_elements, obj); + Py_DECREF(obj); + + if (found == -1) { + goto failure; + } + + if (array_ndim == 1){ + *(npy_bool *) PyArray_GETPTR1(result, j) = (npy_bool)found; + } + else { + *(npy_bool *) PyArray_GETPTR2(result, i, j) = (npy_bool)found; + } + + data += stride; + ++j; + } + + ++i; + /* Increment the iterator to the next inner loop */ + } while(iternext(iter)); + + Py_DECREF(compare_elements); + NpyIter_Deallocate(iter); + + return (PyObject*)result; + +failure: + Py_DECREF(compare_elements); + Py_XDECREF(result); + if (iter != NULL) { + NpyIter_Deallocate(iter); + } + return NULL; +} + +// DONE static PyObject * isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { From d306991250c7d056f9d58de9c8f0218ddbd9911f Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 12 Mar 2021 21:29:28 -0800 Subject: [PATCH 14/29] Completes initial pass to have compiling code. Not working yet. --- arraykit.c | 237 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 142 insertions(+), 95 deletions(-) diff --git a/arraykit.c b/arraykit.c index b4dd0532..fe82f5ab 100644 --- a/arraykit.c +++ b/arraykit.c @@ -6,6 +6,7 @@ # include "numpy/arrayobject.h" # include "numpy/arrayscalars.h" // Needed for Datetime scalar expansions +# include "numpy/ufuncobject.h" //------------------------------------------------------------------------------ // Macros @@ -383,7 +384,6 @@ AK_print_array_bool(PyArrayObject *array) } */ -// DONE static PyArrayObject * AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) { @@ -395,11 +395,39 @@ AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) return array; } +static PyArrayObject* +AK_compare_two_slices_from_array(PyArrayObject *arr, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t r1, Py_ssize_t r2) +{ + PyObject* left_slice = NULL; + PyObject* right_slice = NULL; + PyObject* comparison = NULL; + + left_slice = PySequence_GetSlice((PyObject*)arr, l1, l2); + AK_GOTO_ON_NOT(left_slice, failure) + + right_slice = PySequence_GetSlice((PyObject*)arr, r1, r2); + AK_GOTO_ON_NOT(right_slice, failure) + + comparison = PyObject_RichCompare(left_slice, right_slice, Py_EQ); + AK_GOTO_ON_NOT(comparison, failure) + + Py_DECREF(left_slice); + Py_DECREF(right_slice); + + return (PyArrayObject*)comparison; + +failure: + Py_XDECREF(left_slice); + Py_XDECREF(right_slice); + return NULL; +} + static int AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) { - /* Algorithm (assumes `sar` is sorted & mask is initialized to [1, 0, ... len(sar)] + /* Algorithm (assumes `sar` is sorted & mask is initialized to [1, 0, ...] & len(mask) == len(sar) + // cfmM = [Complex, Float, Datetime, Timedelta] if sar.dtype.kind in "cfmM" and np.isnan(sar[-1]): if sar.dtype.kind == "c": # for complex all NaNs are considered equivalent aux_firstnan = np.searchsorted(np.isnan(sar), True, side='left') @@ -412,10 +440,11 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) else: mask[1:] = sar[1:] != sar[:-1] */ - PyObject* left_slice = NULL; - PyObject* right_slice = NULL; - PyObject* comparison = NULL; + // 0. Deallocate on failure + PyArrayObject* comparison = NULL; + PyObject* last_element = NULL; + // 1. Determine if last element contains NaNs/NaTs size_t size = (size_t)PyArray_SIZE(sar); PyArray_Descr* dtype = PyArray_DESCR(sar); @@ -423,8 +452,8 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) int is_complex = PyDataType_ISCOMPLEX(dtype); int is_dt = PyDataType_ISDATETIME(dtype); - PyObject* last_element = NULL; int contains_nan = 0; + if (is_float | is_complex | is_dt) { last_element = PyObject_GetItem((PyObject*)sar, PyLong_FromLong(-1)); AK_GOTO_ON_NOT(last_element, failure) @@ -440,7 +469,9 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) } } + // 2. Populate mask if (contains_nan) { + // 3. Discover the location of the first NaN element size_t firstnan = 0; if (is_complex) { // aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') @@ -454,16 +485,10 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) Py_DECREF(firstnan_obj); } - left_slice = PySequence_GetSlice((PyObject*)sar, 1, (int)firstnan); - AK_GOTO_ON_NOT(left_slice, failure) - - right_slice = PySequence_GetSlice((PyObject*)sar, 0, firstnan - 1); - AK_GOTO_ON_NOT(right_slice, failure) - - comparison = PyObject_RichCompare(left_slice, right_slice, Py_NE); + // 4. Build mask in such a way to only include 1 NaN value + comparison = AK_compare_two_slices_from_array(sar, 1, firstnan, 0, firstnan - 1); AK_GOTO_ON_NOT(comparison, failure) - - npy_bool* comparison_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)comparison); + npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); for (size_t i = 1; i < firstnan; ++i) { mask[i] = comparison_arr[i-1]; @@ -474,26 +499,25 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) } } else { - left_slice = PySequence_GetSlice((PyObject*)sar, 1, size); - AK_GOTO_ON_NOT(left_slice, failure) - - right_slice = PySequence_GetSlice((PyObject*)sar, 0, size - 1); - AK_GOTO_ON_NOT(right_slice, failure) - - comparison = PyObject_RichCompare(left_slice, right_slice, Py_NE); + // 3. Build mask through a simple [1:] != [:-1] slice comparison + comparison = AK_compare_two_slices_from_array(sar, 1, size, 0, size - 1); AK_GOTO_ON_NOT(comparison, failure) - - npy_bool* comparison_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)comparison); + npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); for (size_t i = 1; i < (size_t)size; ++i) { mask[i] = comparison_arr[i-1]; } } + Py_DECREF(comparison); + Py_XDECREF(last_element); // Only populated when sar contains NaNs/NaTs + return 1; failure: - return -1; + Py_XDECREF(comparison); + Py_XDECREF(last_element); + return 0; } static PyObject* @@ -501,59 +525,62 @@ AK_get_unique_arr(PyArrayObject *original_arr) { /* Algorithm - arr = arr(original_arr) - arr.sort() + sar = copy(original_arr) + sar.sort() - mask = np.empty(arr.shape, dtype=np.bool_) + mask = np.empty(sar.shape, dtype=np.bool_) mask[0] = True build_mask(...) - return arr[mask] + return sar[mask] */ // 1. Initialize - PyObject* filtered_arr = NULL; + PyObject* mask = NULL; // Deallocate on failure - int size = PyArray_SIZE(original_arr); + size_t size = PyArray_SIZE(original_arr); PyArray_Descr* dtype = PyArray_DESCR(original_arr); npy_bool mask_arr[size]; - // 2. Get a copy of the original arr since sorting is inplace - PyArrayObject* arr = (PyArrayObject*)PyArray_FromArray( + // 2. Get a copy of the original arr since sorting is in-place + PyArrayObject* sar = (PyArrayObject*)PyArray_FromArray( original_arr, dtype, NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSURECOPY); - - if (PyArray_Sort(arr, 0, NPY_QUICKSORT) == -1) { // In-place + AK_CHECK_NOT(sar) + if (PyArray_Sort(sar, 0, NPY_QUICKSORT) == -1) { // In-place goto failure; } // 3. Build mask memset(mask_arr, 0, sizeof(mask_arr)); mask_arr[0] = 1; + AK_GOTO_ON_NOT(AK_build_unique_arr_mask(sar, mask_arr), failure) - AK_GOTO_ON_NOT(AK_build_unique_arr_mask(arr, mask_arr), failure) - - PyObject* mask = PyArray_NewFromDescr( + mask = PyArray_NewFromDescr( &PyArray_Type, // class (subtype) PyArray_DescrFromType(NPY_BOOL), // dtype (descr) - PyArray_NDIM(arr), // ndim (nd) - PyArray_DIMS(arr), // dims + PyArray_NDIM(sar), // ndim (nd) + PyArray_DIMS(sar), // dims NULL, // strides mask_arr, // data NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags NULL); // sublclass (obj) + AK_GOTO_ON_NOT(mask, failure) - // 4. Filter arr - filtered_arr = PyObject_GetItem((PyObject*)arr, (PyObject*)mask); + // 4. Filter sar + PyObject *filtered_arr = PyObject_GetItem((PyObject*)sar, (PyObject*)mask); AK_GOTO_ON_NOT(filtered_arr, failure) + Py_DECREF(sar); + Py_DECREF(mask); return filtered_arr; failure: - printf("FAILURE!\n"); + Py_DECREF(sar); // Cannot be NULL + Py_XDECREF(mask); return NULL; } @@ -562,67 +589,103 @@ AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) { /* Algorithm - arr = arr(original_arr) - - perm = arr.argsort('quicksort') - arr = arr[perm] + ordered_idx = original_arr.argsort(kind='quicksort') + sar = original_arr[ordered_idx] - mask = np.empty(arr.shape, dtype=np.bool_) + mask = np.empty(sar.shape, dtype=np.bool_) mask[0] = True - AK_build_unique_arr_mask(arr, mask) + AK_build_unique_arr_mask(sar, mask) - ret = arr[mask] + ret = sar[mask] imask = np.cumsum(mask) - 1 inv_idx = np.empty(mask.shape, dtype=np.intp) - inv_idx[perm] = imask + inv_idx[ordered_idx] = imask return ret, inv_idx - - return ret */ // 1. Initialize - PyObject* filtered_arr = NULL; + PyObject *ordered_idx = NULL; + PyArrayObject *sar = NULL; + PyArrayObject* mask = NULL; + PyObject *filtered_arr = NULL; + PyObject* cumsum = NULL; + PyObject* imask = NULL; + PyObject* inv_idx = NULL; - int size = PyArray_SIZE(original_arr); - PyArray_Descr* dtype = PyArray_DESCR(original_arr); + size_t size = PyArray_SIZE(original_arr); npy_bool mask_arr[size]; - // 2. Get a copy of the original arr since sorting is inplace - PyArrayObject* arr = (PyArrayObject*)PyArray_FromArray( - original_arr, - dtype, - NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSURECOPY); + // 2. Get sorted indices & sort array + ordered_idx = PyArray_ArgSort(original_arr, 0, NPY_QUICKSORT); + AK_GOTO_ON_NOT(ordered_idx, failure) - if (PyArray_Sort(arr, 0, NPY_QUICKSORT) == -1) { // In-place - goto failure; - } + sar = (PyArrayObject*)PyObject_GetItem((PyObject*)original_arr, ordered_idx); + AK_GOTO_ON_NOT(sar, failure) // 3. Build mask memset(mask_arr, 0, sizeof(mask_arr)); mask_arr[0] = 1; + AK_GOTO_ON_NOT(AK_build_unique_arr_mask(sar, mask_arr), failure) - AK_GOTO_ON_NOT(AK_build_unique_arr_mask(arr, mask_arr), failure) - - PyObject* mask = PyArray_NewFromDescr( - &PyArray_Type, // class (subtype) - PyArray_DescrFromType(NPY_BOOL), // dtype (descr) - PyArray_NDIM(arr), // ndim (nd) - PyArray_DIMS(arr), // dims + mask = (PyArrayObject*)PyArray_NewFromDescr( + &PyArray_Type, // subtype + PyArray_DescrFromType(NPY_BOOL), // dtype + PyArray_NDIM(sar), // nd + PyArray_DIMS(sar), // dims NULL, // strides mask_arr, // data NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags NULL); // sublclass (obj) + AK_GOTO_ON_NOT(mask, failure) // 4. Filter arr - filtered_arr = PyObject_GetItem((PyObject*)arr, (PyObject*)mask); + filtered_arr = PyObject_GetItem((PyObject*)sar, (PyObject*)mask); AK_GOTO_ON_NOT(filtered_arr, failure) - return filtered_arr; + // 5. Determine the inverse index + cumsum = PyArray_CumSum( + mask, // array + 0, // axis + NPY_INT, // dtype + NULL); // out-array + AK_GOTO_ON_NOT(cumsum, failure) + + imask = PyNumber_Subtract(cumsum, PyLong_FromLong(1)); + AK_GOTO_ON_NOT(imask, failure) + + inv_idx = PyArray_Empty( + PyArray_NDIM(mask), // nd + PyArray_DIMS(mask), // dims + PyArray_DescrFromType(NPY_INT), // dtype + 0); // is_f_order + + if (PyObject_SetItem(inv_idx, ordered_idx, imask)) { + goto failure; + } + + // 6. Pack it up in a tuple and return + PyObject* ret = PyTuple_Pack(2, filtered_arr, inv_idx); + AK_GOTO_ON_NOT(ret, failure) + + Py_DECREF(ordered_idx); + Py_DECREF(sar); + Py_DECREF(mask); + Py_DECREF(filtered_arr); + Py_DECREF(cumsum); + Py_DECREF(imask); + Py_DECREF(inv_idx); + return ret; failure: - printf("FAILURE!\n"); + Py_XDECREF(ordered_idx); + Py_XDECREF(sar); + Py_XDECREF(mask); + Py_XDECREF(filtered_arr); + Py_XDECREF(cumsum); + Py_XDECREF(imask); + Py_XDECREF(inv_idx); return NULL; } @@ -657,10 +720,8 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu PyArrayObject *reverse_idx = NULL; PyArrayObject* concatenated = NULL; PyArrayObject *ordered_idx = NULL; - PyObject* sorted_arr = NULL; - PyObject* left_slice = NULL; - PyObject* right_slice = NULL; - PyObject* comparison = NULL; + PyArrayObject* sorted_arr = NULL; + PyArrayObject* comparison = NULL; // 1. Capture original array shape for return value int array_ndim = PyArray_NDIM(array); @@ -703,19 +764,12 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu npy_intp* ordered_idx_arr = (npy_intp*)PyArray_DATA(ordered_idx); // 5. Find duplicates - sorted_arr = PyObject_GetItem((PyObject*)concatenated, (PyObject*)ordered_idx); + sorted_arr = (PyArrayObject*)PyObject_GetItem((PyObject*)concatenated, (PyObject*)ordered_idx); AK_GOTO_ON_NOT(sorted_arr, failure) - left_slice = PySequence_GetSlice((PyObject*)sorted_arr, 1, concatenated_size); - AK_GOTO_ON_NOT(left_slice, failure) - - right_slice = PySequence_GetSlice((PyObject*)sorted_arr, 0, concatenated_size - 1); - AK_GOTO_ON_NOT(right_slice, failure) - - comparison = PyObject_RichCompare(left_slice, right_slice, Py_EQ); + comparison = AK_compare_two_slices_from_array(sorted_arr, 1, concatenated_size, 0, concatenated_size - 1); AK_GOTO_ON_NOT(comparison, failure) - - npy_bool* comparison_arr = (npy_bool*)PyArray_DATA((PyArrayObject*)comparison); + npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); // 6: Construct empty array PyArrayObject* ret = (PyArrayObject*)PyArray_Empty( @@ -756,8 +810,6 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu Py_DECREF(concatenated); Py_DECREF(ordered_idx); Py_DECREF(sorted_arr); - Py_DECREF(left_slice); - Py_DECREF(right_slice); Py_DECREF(comparison); return (PyObject*)ret; } @@ -770,14 +822,10 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu Py_XDECREF(concatenated); Py_XDECREF(ordered_idx); Py_XDECREF(sorted_arr); - Py_XDECREF(left_slice); - Py_XDECREF(right_slice); Py_XDECREF(comparison); return NULL; - } -// DONE static PyObject * AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) { @@ -874,7 +922,6 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) return NULL; } -// DONE static PyObject * isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { From 38eaeb72db9acb0d0f6f49ef571a90a84e99a9fd Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 12 Mar 2021 23:18:19 -0800 Subject: [PATCH 15/29] Fixes some reference count issues, type inconsistencies, and adds a branch I forgot about. --- arraykit.c | 201 ++++++++++++++---------------------- debug.py | 15 +++ test/test_util.py | 252 +++++++++++++++++++++++++--------------------- 3 files changed, 226 insertions(+), 242 deletions(-) create mode 100755 debug.py diff --git a/arraykit.c b/arraykit.c index fe82f5ab..c79bd8bf 100644 --- a/arraykit.c +++ b/arraykit.c @@ -302,87 +302,8 @@ is_nat(PyObject *a) //------------------------------------------------------------------------------ // isin -/* -static void -AK_print_array_float(PyArrayObject *array) -{ - NpyIter *iter = NULL; - - int array_ndim = PyArray_NDIM(array); - npy_intp* array_dims = PyArray_DIMS(array); - - iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, - NPY_KEEPORDER, - NPY_NO_CASTING, - NULL); - - NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - - char** dataptr = NpyIter_GetDataPtrArray(iter); - npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); - npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); - - // 4. Iterate! - do { - char* data = *dataptr; - npy_intp size = *sizeptr; - npy_intp stride = *strideptr; - - printf("NUMPY %i ", (int)stride); - while (size--) { - double val; - memcpy(&val, data, sizeof(double)); - printf("%.1f ", val); - data += stride; - } - printf("\n"); - - // Increment the iterator to the next inner loop - } while(iternext(iter)); - - NpyIter_Deallocate(iter); -} - -static void -AK_print_array_bool(PyArrayObject *array) -{ - NpyIter *iter = NULL; - - int array_ndim = PyArray_NDIM(array); - npy_intp* array_dims = PyArray_DIMS(array); - - iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, - NPY_KEEPORDER, - NPY_NO_CASTING, - NULL); - - NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - - char** dataptr = NpyIter_GetDataPtrArray(iter); - npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); - npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); - - // 4. Iterate! - do { - char* data = *dataptr; - npy_intp size = *sizeptr; - npy_intp stride = *strideptr; - - printf("NUMPY %i ", (int)stride); - while (size--) { - printf("%i ", (int)*data); - data += stride; - } - printf("\n"); - - // Increment the iterator to the next inner loop - } while(iternext(iter)); - - NpyIter_Deallocate(iter); -} -*/ +# define AK_PPRINT(obj) \ + printf(""#obj""); printf(": "); PyObject_Print(obj, stdout, 0); printf("\n"); fflush(stdout); static PyArrayObject * AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) @@ -396,7 +317,7 @@ AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) } static PyArrayObject* -AK_compare_two_slices_from_array(PyArrayObject *arr, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t r1, Py_ssize_t r2) +AK_compare_two_slices_from_array(PyArrayObject *arr, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t r1, Py_ssize_t r2, int EQ) { PyObject* left_slice = NULL; PyObject* right_slice = NULL; @@ -408,7 +329,7 @@ AK_compare_two_slices_from_array(PyArrayObject *arr, Py_ssize_t l1, Py_ssize_t l right_slice = PySequence_GetSlice((PyObject*)arr, r1, r2); AK_GOTO_ON_NOT(right_slice, failure) - comparison = PyObject_RichCompare(left_slice, right_slice, Py_EQ); + comparison = PyObject_RichCompare(left_slice, right_slice, EQ); AK_GOTO_ON_NOT(comparison, failure) Py_DECREF(left_slice); @@ -486,7 +407,7 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) } // 4. Build mask in such a way to only include 1 NaN value - comparison = AK_compare_two_slices_from_array(sar, 1, firstnan, 0, firstnan - 1); + comparison = AK_compare_two_slices_from_array(sar, 1, firstnan, 0, firstnan - 1, Py_NE); AK_GOTO_ON_NOT(comparison, failure) npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); @@ -500,7 +421,7 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) } else { // 3. Build mask through a simple [1:] != [:-1] slice comparison - comparison = AK_compare_two_slices_from_array(sar, 1, size, 0, size - 1); + comparison = AK_compare_two_slices_from_array(sar, 1, size, 0, size - 1, Py_NE); AK_GOTO_ON_NOT(comparison, failure) npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); @@ -520,7 +441,7 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) return 0; } -static PyObject* +static PyArrayObject* AK_get_unique_arr(PyArrayObject *original_arr) { /* Algorithm @@ -571,7 +492,7 @@ AK_get_unique_arr(PyArrayObject *original_arr) AK_GOTO_ON_NOT(mask, failure) // 4. Filter sar - PyObject *filtered_arr = PyObject_GetItem((PyObject*)sar, (PyObject*)mask); + PyArrayObject *filtered_arr = (PyArrayObject*)PyObject_GetItem((PyObject*)sar, (PyObject*)mask); AK_GOTO_ON_NOT(filtered_arr, failure) Py_DECREF(sar); @@ -717,11 +638,12 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu */ // 0. Deallocate on failure PyArrayObject* raveled_array = NULL; - PyArrayObject *reverse_idx = NULL; + PyObject *reverse_idx = NULL; PyArrayObject* concatenated = NULL; PyArrayObject *ordered_idx = NULL; PyArrayObject* sorted_arr = NULL; PyArrayObject* comparison = NULL; + PyArrayObject* ret = NULL; // 1. Capture original array shape for return value int array_ndim = PyArray_NDIM(array); @@ -735,17 +657,20 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu if (!assume_unique) { PyObject* arr_and_rev_idx = AK_get_unique_arr_w_inverse(raveled_array); - PyArrayObject *raveled_array_unique = (PyArrayObject*)PyTuple_GetItem(arr_and_rev_idx, 0); + PyArrayObject *raveled_array_unique = (PyArrayObject*)PyTuple_GET_ITEM(arr_and_rev_idx, 0); AK_GOTO_ON_NOT(raveled_array_unique, failure) + Py_INCREF(raveled_array_unique); - reverse_idx = (PyArrayObject*)PyTuple_GetItem(arr_and_rev_idx, (Py_ssize_t)1); - Py_DECREF(arr_and_rev_idx); + reverse_idx = PyTuple_GET_ITEM(arr_and_rev_idx, 1); AK_GOTO_ON_NOT(reverse_idx, failure) - PyArrayObject *other_unique = (PyArrayObject*)AK_get_unique_arr(other); + PyArrayObject *other_unique = AK_get_unique_arr(other); + AK_GOTO_ON_NOT(other_unique, failure) + Py_INCREF(other_unique); // 3. Concatenate concatenated = AK_concat_arrays(raveled_array_unique, other_unique); + Py_DECREF(arr_and_rev_idx); Py_DECREF(raveled_array_unique); Py_DECREF(other_unique); } @@ -755,7 +680,6 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu } AK_GOTO_ON_NOT(concatenated, failure) - size_t concatenated_size = PyArray_SIZE(concatenated); // 4: Sort @@ -767,54 +691,75 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu sorted_arr = (PyArrayObject*)PyObject_GetItem((PyObject*)concatenated, (PyObject*)ordered_idx); AK_GOTO_ON_NOT(sorted_arr, failure) - comparison = AK_compare_two_slices_from_array(sorted_arr, 1, concatenated_size, 0, concatenated_size - 1); + comparison = AK_compare_two_slices_from_array(sorted_arr, 1, concatenated_size, 0, concatenated_size - 1, Py_EQ); AK_GOTO_ON_NOT(comparison, failure) npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); - // 6: Construct empty array - PyArrayObject* ret = (PyArrayObject*)PyArray_Empty( - array_ndim, // nd - array_dims, // dims - PyArray_DescrFromType(NPY_BOOL), // dtype - 0); // is_f_order + if (!assume_unique) { + // 6: Construct empty array + PyObject* tmp = PyArray_Empty( + PyArray_NDIM(concatenated), // nd + PyArray_DIMS(concatenated), // dims + PyArray_DescrFromType(NPY_BOOL), // dtype + 0); // is_f_order + + Py_INCREF(tmp); + AK_PPRINT(tmp) + AK_PPRINT(ordered_idx) + AK_PPRINT(comparison) + + // TODO: Comparison is missing a trailing False value... + if (PyObject_SetItem(tmp, ordered_idx, comparison)) { + goto failure; + } - AK_GOTO_ON_NOT(ret, failure) + printf("HERE\n"); - size_t stride = 0; - if (array_ndim == 2) { - stride = (size_t)array_dims[1]; + ret = (PyArrayObject*)PyObject_GetItem(tmp, reverse_idx); + Py_DECREF(tmp); + Py_DECREF(reverse_idx); } + else { + // 6: Construct empty array + ret = (PyArrayObject*)PyArray_Empty( + array_ndim, // nd + array_dims, // dims + PyArray_DescrFromType(NPY_BOOL), // dtype + 0); // is_f_order + + AK_GOTO_ON_NOT(ret, failure) + + size_t stride = 0; + if (array_ndim == 2) { + stride = (size_t)array_dims[1]; + } - // 7: Assign into duplicates array - for (size_t i = 0; i < (size_t)PyArray_SIZE(ordered_idx); ++i) { - size_t idx_0 = (size_t)ordered_idx_arr[i]; - if (idx_0 >= array_size) { continue; } + // 7: Assign into duplicates array + for (size_t i = 0; i < (size_t)PyArray_SIZE(ordered_idx); ++i) { + size_t idx_0 = (size_t)ordered_idx_arr[i]; + if (idx_0 >= array_size) { continue; } - // We are guaranteed that flag_ar[i] is always a valid index - if (array_ndim == 1) { - *(npy_bool *) PyArray_GETPTR1(ret, idx_0) = comparison_arr[i]; - } - else { - size_t idx_1 = idx_0 / stride; - idx_0 = idx_0 - (stride * idx_1); + // We are guaranteed that flag_ar[i] is always a valid index + if (array_ndim == 1) { + *(npy_bool *) PyArray_GETPTR1(ret, idx_0) = comparison_arr[i]; + } + else { + size_t idx_1 = idx_0 / stride; + idx_0 = idx_0 - (stride * idx_1); - *(npy_bool *) PyArray_GETPTR2(ret, idx_1, idx_0) = comparison_arr[i]; + *(npy_bool *) PyArray_GETPTR2(ret, idx_1, idx_0) = comparison_arr[i]; + } } } - Py_XDECREF(reverse_idx); // This might not exist - - // 8. Return! - if (assume_unique) { - Py_DECREF(raveled_array); - Py_DECREF(concatenated); - Py_DECREF(ordered_idx); - Py_DECREF(sorted_arr); - Py_DECREF(comparison); - return (PyObject*)ret; - } + // 8. Cleanup & Return! + Py_DECREF(raveled_array); + Py_DECREF(concatenated); + Py_DECREF(ordered_idx); + Py_DECREF(sorted_arr); + Py_DECREF(comparison); - // return NULL; + return (PyObject*)ret; failure: Py_XDECREF(raveled_array); diff --git a/debug.py b/debug.py new file mode 100755 index 00000000..a98d2a45 --- /dev/null +++ b/debug.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python +from functools import partial + +import numpy as np # type: ignore +from arraykit import isin_array + +isin_array_func = partial(isin_array, array_is_unique=False, other_is_unique=False) + +# e_1d = np.array([1, 0, 0, 0, 1, 0, 1], dtype=bool) +arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=int) +arr2 = np.array([1, 4, 7, 9], dtype=int) +print(arr1) +print(arr2) +post = isin_array_func(array=arr1, other=arr2) +print(post) diff --git a/test/test_util.py b/test/test_util.py index 477c454c..15b4fdd8 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -1,4 +1,4 @@ -from datetime import date +from datetime import date, timedelta from functools import partial import itertools import unittest @@ -71,7 +71,6 @@ def test_resolve_dtype_c(self) -> None: self.assertEqual(resolve_dtype(a1.dtype, a4.dtype), np.dtype('O')) - def test_resolve_dtype_d(self) -> None: dt1 = np.array(1).dtype dt2 = np.array(2.3).dtype @@ -83,7 +82,6 @@ def test_resolve_dtype_e(self) -> None: assert resolve_dtype(dt1, dt2) == np.dtype(object) assert resolve_dtype(dt1, dt1) == dt1 - #--------------------------------------------------------------------------- def test_resolve_dtype_iter_a(self) -> None: @@ -141,7 +139,6 @@ def test_column_2d_filter_a(self) -> None: with self.assertRaises(NotImplementedError): column_2d_filter(a1.reshape(1,2,5)) - #--------------------------------------------------------------------------- def test_column_1d_filter_a(self) -> None: @@ -170,155 +167,182 @@ def test_row_1d_filter_a(self) -> None: with self.assertRaises(NotImplementedError): row_1d_filter(a1.reshape(1,2,5)) - # def test_isin_1d(self) -> None: - # from performance.reference.util import isin_array - - # T, F = True, False - # arr1 = np.array([1, 2, 3, 4, 5]) + def test_isin_1d(self) -> None: + from performance.reference.util import isin_array + + T, F = True, False + arr1 = np.array([1, 2, 3, 4, 5]) + + expected = [ + (np.array([T, F, T, T, F]), [1, 3, 4]), + (np.array([F, F, F, F, F]), [7, 8]), + (np.array([T, T, T, T, T]), [1, 2, 3, 4, 5]), + ] + + for expected_result, values in expected: + for dtype in (int, object): + arr2 = np.array(values, dtype=dtype) + + for aiu, oiu in itertools.product((T, F), (T, F)): + self.assertTrue(np.array_equal(expected_result, isin_array( + array=arr1, + array_is_unique=aiu, + other=arr2, + other_is_unique=oiu, + ))) + + def test_isin_2d(self) -> None: + from performance.reference.util import isin_array + + T, F = True, False + arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + + expected = [ + (np.array([[T, F, T], [T, F, F], [F, F, T]]), [1, 3, 4, 9]), + (np.array([[F, F, F], [F, F, F], [F, F, F]]), [10, 11]), + (np.array([[T, T, T], [T, T, T], [T, T, T]]), [1, 2, 3, 4, 5, 6, 7, 8, 9]), + ] + + for expected_result, values in expected: + for dtype in (int, object): + arr2 = np.array(values, dtype=dtype) + + for aiu, oiu in itertools.product((T, F), (T, F)): + self.assertTrue(np.array_equal(expected_result, isin_array( + array=arr1, + array_is_unique=aiu, + other=arr2, + other_is_unique=oiu, + ))) + + def test_1d_2d_dtype_unique(self) -> None: + from arraykit import isin_array - # expected = [ - # (np.array([T, F, T, T, F]), [1, 3, 4]), - # (np.array([F, F, F, F, F]), [7, 8]), - # (np.array([T, T, T, T, T]), [1, 2, 3, 4, 5]), - # ] + isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) - # for expected_result, values in expected: - # for dtype in (int, object): - # arr2 = np.array(values, dtype=dtype) + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) - # for aiu, oiu in itertools.product((T, F), (T, F)): - # self.assertTrue(np.array_equal(expected_result, isin_array( - # array=arr1, - # array_is_unique=aiu, - # other=arr2, - # other_is_unique=oiu, - # ))) + v_1d = [1, 2, 3, 4, 5] + v_2d = [[1, 2, 3], [4, 5, 9]] - # def test_isin_2d(self) -> None: - # from performance.reference.util import isin_array + w_1d = [1, 4, 7, 9] - # T, F = True, False - # arr1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + dtype_funcs = [ + (int, int), + (float, float), + (str, str), + ('datetime64[D]', lambda x: date(2020, 1, x)), + ('timedelta64[D]', lambda x: timedelta(x)), + ] - # expected = [ - # (np.array([[T, F, T], [T, F, F], [F, F, T]]), [1, 3, 4, 9]), - # (np.array([[F, F, F], [F, F, F], [F, F, F]]), [10, 11]), - # (np.array([[T, T, T], [T, T, T], [T, T, T]]), [1, 2, 3, 4, 5, 6, 7, 8, 9]), - # ] + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - # for expected_result, values in expected: - # for dtype in (int, object): - # arr2 = np.array(values, dtype=dtype) + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') - # for aiu, oiu in itertools.product((T, F), (T, F)): - # self.assertTrue(np.array_equal(expected_result, isin_array( - # array=arr1, - # array_is_unique=aiu, - # other=arr2, - # other_is_unique=oiu, - # ))) + for dtype, dtype_func in dtype_funcs: + arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - # def test_1d_2d_dtype_unique(self) -> None: - # from arraykit import isin_array + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') - # isin_array_func = partial(isin_array, array_is_unique=True, other_is_unique=True) + def test_1d_2d_dtype_object_unique(self) -> None: + from arraykit import isin_array - # e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) - # e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) - # v_1d = [1, 2, 3, 4, 5] - # v_2d = [[1, 2, 3], [4, 5, 9]] + arr1_1d = np.array([1, 2, 3, 4, 5], dtype=object) + arr1_2d = np.array([[1, 2, 3], [4, 5, 9]], dtype=object) - # w_1d = [1, 4, 7, 9] + arr2 = np.array([1, 4, 7, 9], dtype=object) - # dtype_funcs = [ - # (int, int), - # (float, float), - # (str, str), - # ('datetime64[D]', lambda x: date(2020, 1, x)), - # ] + post = isin_array(array=arr1_1d, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_1d, post)) - # for dtype, dtype_func in dtype_funcs: - # arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) - # arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + post = isin_array(array=arr1_2d, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_2d, post)) - # post = isin_array_func(array=arr1, other=arr2) - # self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') + class C: + def __init__(self, val): + self.val = val - # for dtype, dtype_func in dtype_funcs: - # arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) - # arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) + def __eq__(self, other): + return self.val == other.val - # post = isin_array_func(array=arr1, other=arr2) - # self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') + def __hash__(self): + return hash(self.val) - # def test_1d_2d_dtype_object_unique(self) -> None: - # from arraykit import isin_array + arr1 = np.array([C(1), C(2), C(3), C(4), C(5)]) + arr2 = np.array([C(1), C(4), C(9)]) - # e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) - # e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_1d, post)) - # arr1_1d = np.array([1, 2, 3, 4, 5], dtype=object) - # arr1_2d = np.array([[1, 2, 3], [4, 5, 9]], dtype=object) + arr1 = np.array([[C(1), C(2), C(3)], [C(4), C(5), C(9)]]) - # arr2 = np.array([1, 4, 7, 9], dtype=object) + post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + self.assertTrue(np.array_equal(e_2d, post)) - # post = isin_array(array=arr1_1d, array_is_unique=True, other=arr2, other_is_unique=True) - # self.assertTrue(np.array_equal(e_1d, post)) + def test_1d_2d_dtype_object_non_unique(self) -> None: + from arraykit import isin_array - # post = isin_array(array=arr1_2d, array_is_unique=True, other=arr2, other_is_unique=True) - # self.assertTrue(np.array_equal(e_2d, post)) + e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) + e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) - # class C: - # def __init__(self, val): - # self.val = val + arr1_1d = np.array([1, 2, 2, 4, 5], dtype=object) + arr1_2d = np.array([[1, 2, 3], [4, 2, 9]], dtype=object) - # def __eq__(self, other): - # return self.val == other.val + arr2 = np.array([1, 4, 4, 9], dtype=object) - # def __hash__(self): - # return hash(self.val) + post = isin_array(array=arr1_1d, array_is_unique=False, other=arr2, other_is_unique=False) + self.assertTrue(np.array_equal(e_1d, post)) - # arr1 = np.array([C(1), C(2), C(3), C(4), C(5)]) - # arr2 = np.array([C(1), C(4), C(9)]) + post = isin_array(array=arr1_2d, array_is_unique=False, other=arr2, other_is_unique=False) + self.assertTrue(np.array_equal(e_2d, post)) - # post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) - # self.assertTrue(np.array_equal(e_1d, post)) + def test_1d_2d_dtype_non_unique(self) -> None: + from arraykit import isin_array - # arr1 = np.array([[C(1), C(2), C(3)], [C(4), C(5), C(9)]]) + isin_array_func = partial(isin_array, array_is_unique=False, other_is_unique=False) - # post = isin_array(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) - # self.assertTrue(np.array_equal(e_2d, post)) + e_1d = np.array([1, 0, 0, 0, 1, 0, 1], dtype=bool) + e_2d = np.array([[1, 1, 0, 0], [1, 0, 0, 1]], dtype=bool) - # def test_1d_2d_dtype_object_non_unique(self) -> None: - # from arraykit import isin_array + v_1d = [1, 5, 2, 3, 4, 5, 1] + v_2d = [[9, 1, 2, 3], [4, 3, 5, 9]] - # e_1d = np.array([1, 0, 0, 1, 0], dtype=bool) - # e_2d = np.array([[1, 0, 0], [1, 0, 1]], dtype=bool) + w_1d = [1, 4, 7, 9] - # arr1_1d = np.array([1, 2, 2, 4, 5], dtype=object) - # arr1_2d = np.array([[1, 2, 3], [4, 2, 9]], dtype=object) + dtype_funcs = [ + (int, int), + (float, float), + (str, str), + ('datetime64[D]', lambda x: date(2020, 1, x)), + ('timedelta64[D]', lambda x: timedelta(x)), + ] - # arr2 = np.array([1, 4, 4, 9], dtype=object) + for dtype, dtype_func in dtype_funcs: + print(dtype, dtype_func) + arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - # post = isin_array(array=arr1_1d, array_is_unique=False, other=arr2, other_is_unique=False) - # self.assertTrue(np.array_equal(e_1d, post)) + print(arr1, arr2, e_1d) - # post = isin_array(array=arr1_2d, array_is_unique=False, other=arr2, other_is_unique=False) - # self.assertTrue(np.array_equal(e_2d, post)) + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') - def test_risky(self) -> None: - print() - from arraykit import isin_array + for dtype, dtype_func in dtype_funcs: + print(dtype, dtype_func) + arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) + arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - #arr = np.array([6,3,np.datetime64('NaT'),5,1,8,6,1,3,3, np.datetime64('NaT')], dtype='datetime64[Y]') - #arr = np.array([6,3,np.timedelta64('NaT'),5,1,8,6,1,3,3, np.timedelta64('NaT')], dtype='timedelta64[Y]') - arr = np.array([6,3,np.nan,5,1,8,6,1,3,3, np.nan], dtype=float) - other = np.array([]) - print('ORIG: ', ' '.join(str(x) for x in arr))#.astype(int))) - post = isin_array(array=arr, array_is_unique=True, other=other, other_is_unique=True) - print('POST: ', ' '.join(str(x) for x in post))#.astype(int))) - print('ORIG: ', ' '.join(str(x) for x in arr))#.astype(int))) + post = isin_array_func(array=arr1, other=arr2) + self.assertTrue(np.array_equal(e_2d, post), msg=f'\n{dtype}\nExpected:\n{e_2d}\nActual:\n{post}') if __name__ == '__main__': From ac476ad7ee2c6b8468da095a5998336eb62039bd Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Sat, 13 Mar 2021 15:28:18 -0800 Subject: [PATCH 16/29] Adds an alternative method to simply use numpy :( --- arraykit.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 49 insertions(+), 5 deletions(-) diff --git a/arraykit.c b/arraykit.c index c79bd8bf..b2bf47be 100644 --- a/arraykit.c +++ b/arraykit.c @@ -704,12 +704,9 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu 0); // is_f_order Py_INCREF(tmp); - AK_PPRINT(tmp) - AK_PPRINT(ordered_idx) - AK_PPRINT(comparison) // TODO: Comparison is missing a trailing False value... - if (PyObject_SetItem(tmp, ordered_idx, comparison)) { + if (PyObject_SetItem(tmp, (PyObject*)ordered_idx, (PyObject*)comparison)) { goto failure; } @@ -771,6 +768,52 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu return NULL; } +static PyObject * +AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assume_unique) +{ + PyObject* numpy = NULL; + PyObject* func = NULL; + PyObject* args = NULL; + PyObject* kwarg = NULL; + + numpy = PyImport_ImportModule("numpy"); + AK_GOTO_ON_NOT(numpy, failure) + + if (PyArray_NDIM(array) == 1) { + func = PyObject_GetAttrString(numpy, "in1d"); + } + else { + func = PyObject_GetAttrString(numpy, "isin"); + } + AK_GOTO_ON_NOT(func, failure) + + args = PyTuple_Pack(2, (PyObject*)array, (PyObject*)other); + AK_GOTO_ON_NOT(args, failure) + + kwarg = PyDict_New(); + AK_GOTO_ON_NOT(kwarg, failure); + if (PyDict_SetItemString(kwarg, "assume_unique", PyLong_FromLong((long)assume_unique)) == -1) { + goto failure; + } + + PyObject* result = PyObject_Call(func, args, kwarg); + AK_GOTO_ON_NOT(result, failure) + + Py_DECREF(numpy); + Py_DECREF(func); + Py_DECREF(args); + Py_DECREF(kwarg); + + return result; + +failure: + Py_XDECREF(numpy); + Py_XDECREF(func); + Py_XDECREF(args); + Py_XDECREF(kwarg); + return NULL; +} + static PyObject * AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) { @@ -892,7 +935,8 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) return AK_isin_array_object(array, other); } // Use numpy in1d logic for dtype arrays - return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); + //return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); + return AK_isin_array_dtype_use_np(array, other, array_is_unique && other_is_unique); } //------------------------------------------------------------------------------ From d3b818c073847f16a001a2dd9fa8560d41c20d34 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Sat, 13 Mar 2021 17:42:38 -0800 Subject: [PATCH 17/29] Adds a lot more performance testing infrastructure. --- performance/main.py | 184 ++++++++++++++++++++++++++++------ performance/reference/util.py | 3 +- test/test_util.py | 4 - 3 files changed, 158 insertions(+), 33 deletions(-) diff --git a/performance/main.py b/performance/main.py index 8e0bee12..e96f3d06 100644 --- a/performance/main.py +++ b/performance/main.py @@ -222,45 +222,172 @@ class ArrayGOPerfREF(ArrayGOPerf): #------------------------------------------------------------------------------- -class IsinArrayPerf(Perf): - NUMBER = 1000 - def pre(self): - self.arrays = [] +def build_arr(dtype, size, num_nans, contains_dups): + if dtype.kind == 'M': + if dtype == 'datetime64[Y]': + delta = np.timedelta64(size, 'Y') + elif dtype == 'datetime64[M]': + delta = np.timedelta64(size, 'M') + else: + delta = np.timedelta64(size, 'D') + + start = np.datetime64('2000-01-01').astype(dtype) + end = start + delta + arr = np.arange(start, start + delta).astype(dtype) + + nan_val = np.datetime64('NaT') + else: + if dtype.kind == 'm': + nan_val = np.timedelta64('NaT') + elif dtype.kind == 'c': + nan_val = np.complex_(np.nan) + else: + nan_val = np.nan + + arr = np.arange(size).astype(dtype) + + if num_nans == 1: + arr = np.concatenate((arr, [nan_val]*num_nans)) + elif num_nans > 1: + arr = np.concatenate((arr, [nan_val]*num_nans)) + + if contains_dups: + dups = np.array([arr[i] for i in range(0, size, int(size * 0.3))], dtype=dtype) + arr = np.concatenate((arr, dups)) + + np.random.seed(0) + np.random.shuffle(arr) + return arr, (num_nans <= 1 and not contains_dups) + +storage = [] +def build_subclassses(klass, meth): + storage.append(type(f'{klass.__name__}AK', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ak'])))) + storage.append(type(f'{klass.__name__}REF', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ref'])))) + +class Obj: + def __init__(self, val): + self.val = val + def __eq__(self, other): + return self.val == other.val + def __hash__(self): + return hash(self.val) + +def get_dtypes(): + dtypes = [np.dtype(int), np.dtype(float), np.dtype(np.complex_), np.dtype('O')] + dtypes.extend((np.dtype(f'datetime64[{f}]') for f in 'DMY')) + dtypes.extend((np.dtype(f'timedelta64[{f}]') for f in 'DMY')) + return dtypes + +class IsinDtypeUnique1DPerf(Perf): + NUMBER = 1 - v_1d = [1, 2, 3, 4, 5] - v_2d = [[1, 2, 3], [4, 5, 9]] - w_1d = [1, 4, 7, 9] + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size in (100, 5000, 20000, 100000): + for num_nans in (0, 1): + arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups=False) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups=False) + assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' + #print(size, len(arr1), len(arr2)) + self.kwargs.append(dict(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True)) - dtype_funcs = [ - (int, int), - (float, float), - (str, str), - ('datetime64[D]', lambda x: date(2020, 1, x)), - ] + def main(self): + for kwargs in self.kwargs: + self.entry(**kwargs) - for dtype, dtype_func in dtype_funcs: - arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) - arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - self.arrays.append((arr1, arr2)) +class IsinDtypeNonUnique1DPerf(Perf): + NUMBER = 10 - for dtype, dtype_func in dtype_funcs: - arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) - arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - self.arrays.append((arr1, arr2)) + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size in (100, 5000, 20000): + for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): + arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) + assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' + self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) def main(self): - for _ in range(25): - for arr1, arr2 in self.arrays: - self.entry(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True) + for kwargs in self.kwargs: + self.entry(**kwargs) -class IsinArrayPerfAK(IsinArrayPerf): - entry = staticmethod(isin_array_ak) +class IsinDtypeUnique2DPerf(Perf): + NUMBER = 10 -class IsinArrayPerfREF(IsinArrayPerf): - entry = staticmethod(isin_array_ref) + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size in (100, 5000, 20000, 100000): + for num_nans in (0, 1): + arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups=False) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups=False) + assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' + self.kwargs.append(dict(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True)) + def main(self): + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinDtypeNonUnique2DPerf(Perf): + NUMBER = 10 + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size in (100, 5000, 20000): + for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): + arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) + assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' + self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + + def main(self): + for kwargs in self.kwargs: + self.entry(**kwargs) + +# class IsinObject1DPerf(Perf): +# NUMBER = 10 + +# def pre(self): +# self.kwargs = [] +# for dtype in get_dtypes(): +# for size in (100, 5000, 20000): +# for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): +# arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) +# arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) +# assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' +# self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + +# def main(self): +# for kwargs in self.kwargs: +# self.entry(**kwargs) + +# class IsinObject2DPerf(Perf): +# NUMBER = 10 + +# def pre(self): +# self.kwargs = [] +# for dtype in get_dtypes(): +# for size in (100, 5000, 20000): +# for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): +# arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) +# arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) +# assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' +# self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + +# def main(self): +# for kwargs in self.kwargs: +# self.entry(**kwargs) + +build_subclassses(IsinDtypeUnique1DPerf, 'isin_array') +build_subclassses(IsinDtypeUnique2DPerf, 'isin_array') +build_subclassses(IsinDtypeNonUnique1DPerf, 'isin_array') +build_subclassses(IsinDtypeNonUnique2DPerf, 'isin_array') +# build_subclassses(IsinObject1DPerf, 'isin_array') +# build_subclassses(IsinObject2DPerf, 'isin_array') #------------------------------------------------------------------------------- @@ -290,6 +417,7 @@ def main(): cls_map['ak'] = cls_runner elif cls_runner.__name__.endswith('REF'): cls_map['ref'] = cls_runner + assert cls_map for func_attr in cls_perf.FUNCTIONS: results = {} for key, cls_runner in cls_map.items(): diff --git a/performance/reference/util.py b/performance/reference/util.py index d93c41f5..61d333fc 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -247,6 +247,7 @@ def isin_array(*, return result + def unique(ar, return_inverse=False): ar = np.asanyarray(ar).flatten() @@ -279,4 +280,4 @@ def unique(ar, return_inverse=False): inv_idx[perm] = imask return ret, inv_idx - return ret \ No newline at end of file + return ret diff --git a/test/test_util.py b/test/test_util.py index 15b4fdd8..6812f365 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -327,17 +327,13 @@ def test_1d_2d_dtype_non_unique(self) -> None: ] for dtype, dtype_func in dtype_funcs: - print(dtype, dtype_func) arr1 = np.array([dtype_func(v) for v in v_1d], dtype=dtype) arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) - print(arr1, arr2, e_1d) - post = isin_array_func(array=arr1, other=arr2) self.assertTrue(np.array_equal(e_1d, post), msg=f'\n{dtype}\nExpected:\n{e_1d}\nActual:\n{post}') for dtype, dtype_func in dtype_funcs: - print(dtype, dtype_func) arr1 = np.array([[dtype_func(x) for x in y] for y in v_2d], dtype=dtype) arr2 = np.array([dtype_func(v) for v in w_1d], dtype=dtype) From e6b1c853892ef8c9fc1e7d955c6ee3e73ad3d1bf Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Sat, 13 Mar 2021 19:19:10 -0800 Subject: [PATCH 18/29] Finishes perf setup. --- debug.py | 12 ++- performance/main.py | 209 +++++++++++++++++++++++++++++--------------- 2 files changed, 148 insertions(+), 73 deletions(-) diff --git a/debug.py b/debug.py index a98d2a45..10b690d0 100755 --- a/debug.py +++ b/debug.py @@ -4,12 +4,20 @@ import numpy as np # type: ignore from arraykit import isin_array -isin_array_func = partial(isin_array, array_is_unique=False, other_is_unique=False) +funcTT = partial(isin_array, array_is_unique=True, other_is_unique=True) +funcTF = partial(isin_array, array_is_unique=True, other_is_unique=False) +funcFT = partial(isin_array, array_is_unique=False, other_is_unique=True) +funcFF = partial(isin_array, array_is_unique=False, other_is_unique=False) + + +arr1 = np.array([1, 5, 2, 3, 4], dtype=int) +arr2 = np.array([1, 4, 7, 9], dtype=int) +post = funcTT(array=arr1, other=arr2) # e_1d = np.array([1, 0, 0, 0, 1, 0, 1], dtype=bool) arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=int) arr2 = np.array([1, 4, 7, 9], dtype=int) print(arr1) print(arr2) -post = isin_array_func(array=arr1, other=arr2) +post = funcFF(array=arr1, other=arr2) print(post) diff --git a/performance/main.py b/performance/main.py index e96f3d06..95d4c955 100644 --- a/performance/main.py +++ b/performance/main.py @@ -3,6 +3,7 @@ import argparse import numpy as np +import pandas as pd from performance.reference.util import mloc as mloc_ref from performance.reference.util import immutable_filter as immutable_filter_ref @@ -223,7 +224,7 @@ class ArrayGOPerfREF(ArrayGOPerf): #------------------------------------------------------------------------------- -def build_arr(dtype, size, num_nans, contains_dups): +def build_arr(dtype, size, num_nans, num_duplicates): if dtype.kind == 'M': if dtype == 'datetime64[Y]': delta = np.timedelta64(size, 'Y') @@ -248,17 +249,22 @@ def build_arr(dtype, size, num_nans, contains_dups): arr = np.arange(size).astype(dtype) if num_nans == 1: - arr = np.concatenate((arr, [nan_val]*num_nans)) + arr = np.concatenate((arr[:-1], [nan_val]*num_nans)) elif num_nans > 1: arr = np.concatenate((arr, [nan_val]*num_nans)) - if contains_dups: - dups = np.array([arr[i] for i in range(0, size, int(size * 0.3))], dtype=dtype) + if num_duplicates: + indices = np.arange(size) + np.random.seed(0) + np.random.shuffle(indices) + + dups = np.array([arr[i] for i in indices[:num_duplicates]]) + dups[~pd.isnull(dups)].astype(dtype) arr = np.concatenate((arr, dups)) np.random.seed(0) np.random.shuffle(arr) - return arr, (num_nans <= 1 and not contains_dups) + return arr, (num_nans <= 1 and num_duplicates == 0) storage = [] def build_subclassses(klass, meth): @@ -279,115 +285,177 @@ def get_dtypes(): dtypes.extend((np.dtype(f'timedelta64[{f}]') for f in 'DMY')) return dtypes -class IsinDtypeUnique1DPerf(Perf): - NUMBER = 1 +class IsinArrayDtypeUnique1DPerf(Perf): + NUMBER = 3 def pre(self): self.kwargs = [] for dtype in get_dtypes(): for size in (100, 5000, 20000, 100000): for num_nans in (0, 1): - arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups=False) - arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups=False) + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates=0) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, num_duplicates=0) assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' - #print(size, len(arr1), len(arr2)) self.kwargs.append(dict(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True)) def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {1}, "Expected all arr1's to be 1D" for kwargs in self.kwargs: self.entry(**kwargs) -class IsinDtypeNonUnique1DPerf(Perf): - NUMBER = 10 +class IsinArrayDtypeUnique2DPerf(Perf): + NUMBER = 3 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size, num_nans, reshape in [ + (100, 0, (10, 10)), + (100, 1, (10, 10)), + (5000, 0, (200, 25)), + (5000, 1, (200, 25)), + (20000, 0, (200, 100)), + (20000, 1, (200, 100)), + (100000, 0, (500, 200)), + (100000, 1, (500, 200)), + ]: + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates=0) + arr2, arr2_unique = build_arr(dtype, size // 10, num_nans // 10, num_duplicates=0) + assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' + self.kwargs.append(dict(array=arr1.reshape(reshape), array_is_unique=True, other=arr2, other_is_unique=True)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {2}, "Expected all arr1's to be 2D" + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinArrayDtypeNonUnique1DPerf(Perf): + NUMBER = 3 def pre(self): self.kwargs = [] for dtype in get_dtypes(): for size in (100, 5000, 20000): - for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): - arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) - arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) + for num_nans, num_duplicates in ((2 + (size // 2), 0), (size // 2, size // 15), (2 + (size // 8), 0), (size // 8, size // 15)): + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, num_duplicates) assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {1}, "Expected all arr1's to be 1D" for kwargs in self.kwargs: self.entry(**kwargs) -class IsinDtypeUnique2DPerf(Perf): - NUMBER = 10 +class IsinArrayDtypeNonUnique2DPerf(Perf): + NUMBER = 1 def pre(self): self.kwargs = [] for dtype in get_dtypes(): - for size in (100, 5000, 20000, 100000): - for num_nans in (0, 1): - arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups=False) - arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups=False) - assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' - self.kwargs.append(dict(array=arr1, array_is_unique=True, other=arr2, other_is_unique=True)) + for size, num_nans, num_duplicates, reshape in [ + (90, 10, 35, (27, 5)), + (80, 20, 35, (27, 5)), + (4500, 500, 950, (119, 50)), + (4000, 1000, 950, (119, 50)), + (18000, 2000, 2500, (250, 90)), + (16000, 4000, 2500, (250, 90)), + (90000, 10000, 15000, (500, 230)), + (80000, 20000, 15000, (500, 230)), + ]: + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 10, int(num_nans / 10), int(num_duplicates / 10)) + assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' + self.kwargs.append(dict(array=arr1.reshape(reshape), array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {2}, "Expected all arr1's to be 2D" for kwargs in self.kwargs: self.entry(**kwargs) -class IsinDtypeNonUnique2DPerf(Perf): - NUMBER = 10 +class IsinArrayObject1DPerf(Perf): + NUMBER = 3 def pre(self): self.kwargs = [] for dtype in get_dtypes(): for size in (100, 5000, 20000): - for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): - arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) - arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) + for num_nans, num_duplicates in ((2 + (size // 2), 0), (size // 2, size // 15), (2 + (size // 8), 0), (size // 8, size // 15)): + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, num_duplicates) assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + for size in (100, 5000, 20000): + for num_duplicates in (size // 15, 0): + tmp_arr1, arr1_unique = build_arr(np.dtype(int), size, 0, num_duplicates) + tmp_arr2, arr2_unique = build_arr(np.dtype(int), size // 25, 0, num_duplicates) + + arr1 = np.array([Obj(v) for v in tmp_arr1]) + arr2 = np.array([Obj(v) for v in tmp_arr2]) + + self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {1}, "Expected all arr1's to be 1D" + for kwargs in self.kwargs: + self.entry(**kwargs) + +class IsinArrayObject2DPerf(Perf): + NUMBER = 1 + + def pre(self): + self.kwargs = [] + for dtype in get_dtypes(): + for size, num_nans, num_duplicates, reshape in [ + (100, 0, 0, (10, 10)), + (90, 10, 35, (27, 5)), + (80, 20, 35, (27, 5)), + (5000, 0, 0, (200, 25)), + (4500, 500, 950, (119, 50)), + (4000, 1000, 950, (119, 50)), + (20000, 0, 0, (200, 100)), + (18000, 2000, 2500, (250, 90)), + (16000, 4000, 2500, (250, 90)), + (100000, 1, 0, (500, 200)), + (90000, 10000, 15000, (500, 230)), + (80000, 20000, 15000, (500, 230)), + ]: + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates) + arr2, arr2_unique = build_arr(dtype, size // 10, int(num_nans / 10), int(num_duplicates / 10)) + self.kwargs.append(dict(array=arr1.reshape(reshape).astype(object), array_is_unique=arr1_unique, other=arr2.astype(object), other_is_unique=arr2_unique)) + + for size, num_duplicates, reshape in [ + (100, 0, (10, 10)), + (90, 10, (10, 10)), + (5000, 0, (200, 25)), + (4500, 500, (200, 25)), + (20000, 0, (200, 100)), + (18000, 2000, (200, 100)), + ]: + tmp_arr1, arr1_unique = build_arr(np.dtype(int), size, 0, num_duplicates) + tmp_arr2, arr2_unique = build_arr(np.dtype(int), size // 10, 0, num_duplicates // 10) + + arr1 = np.array([Obj(v) for v in tmp_arr1]).reshape(reshape) + arr2 = np.array([Obj(v) for v in tmp_arr2]) + + self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) + def main(self): + assert set(x['array'].ndim for x in self.kwargs) == {2}, "Expected all arr1's to be 2D" for kwargs in self.kwargs: self.entry(**kwargs) -# class IsinObject1DPerf(Perf): -# NUMBER = 10 - -# def pre(self): -# self.kwargs = [] -# for dtype in get_dtypes(): -# for size in (100, 5000, 20000): -# for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): -# arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) -# arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) -# assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' -# self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) - -# def main(self): -# for kwargs in self.kwargs: -# self.entry(**kwargs) - -# class IsinObject2DPerf(Perf): -# NUMBER = 10 - -# def pre(self): -# self.kwargs = [] -# for dtype in get_dtypes(): -# for size in (100, 5000, 20000): -# for num_nans, contains_dups in ((2 + (size // 2), False), (size // 2, True), (2 + (size // 8), False), (size // 8, True)): -# arr1, arr1_unique = build_arr(dtype, size, num_nans, contains_dups) -# arr2, arr2_unique = build_arr(dtype, size // 25, num_nans // 25, contains_dups) -# assert not arr1_unique or not arr2_unique, 'Expect at least one of the arrays to contains duplicates' -# self.kwargs.append(dict(array=arr1, array_is_unique=arr1_unique, other=arr2, other_is_unique=arr2_unique)) - -# def main(self): -# for kwargs in self.kwargs: -# self.entry(**kwargs) - -build_subclassses(IsinDtypeUnique1DPerf, 'isin_array') -build_subclassses(IsinDtypeUnique2DPerf, 'isin_array') -build_subclassses(IsinDtypeNonUnique1DPerf, 'isin_array') -build_subclassses(IsinDtypeNonUnique2DPerf, 'isin_array') -# build_subclassses(IsinObject1DPerf, 'isin_array') -# build_subclassses(IsinObject2DPerf, 'isin_array') + +build_subclassses(IsinArrayDtypeUnique1DPerf, 'isin_array') +build_subclassses(IsinArrayDtypeUnique2DPerf, 'isin_array') + +build_subclassses(IsinArrayDtypeNonUnique1DPerf, 'isin_array') +build_subclassses(IsinArrayDtypeNonUnique2DPerf, 'isin_array') + +build_subclassses(IsinArrayObject1DPerf, 'isin_array') +build_subclassses(IsinArrayObject2DPerf, 'isin_array') + #------------------------------------------------------------------------------- @@ -401,7 +469,6 @@ def get_arg_parser(): help='Provide one or more performance tests by name.') return p - def main(): options = get_arg_parser().parse_args() match = None if not options.names else set(options.names) @@ -429,12 +496,12 @@ def main(): number=cls_runner.NUMBER) records.append((cls_perf.__name__, func_attr, results['ak'], results['ref'], results['ref'] / results['ak'])) - width = 24 + width = 36 for record in records: print(''.join( (r.ljust(width) if isinstance(r, str) else str(round(r, 8)).ljust(width)) for r in record )) + if __name__ == '__main__': main() - From 52c029c7a96a59a670402406b17bb1573015ace2 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Sat, 13 Mar 2021 20:06:14 -0800 Subject: [PATCH 19/29] Adds work for missing trailing False for non-unique arrays. --- arraykit.c | 44 ++++++++++++++++------- debug.py | 104 +++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 125 insertions(+), 23 deletions(-) diff --git a/arraykit.c b/arraykit.c index b2bf47be..a0cbf31c 100644 --- a/arraykit.c +++ b/arraykit.c @@ -637,7 +637,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu return ret[rev_idx] */ // 0. Deallocate on failure - PyArrayObject* raveled_array = NULL; + PyArrayObject* flattened_array = NULL; PyObject *reverse_idx = NULL; PyArrayObject* concatenated = NULL; PyArrayObject *ordered_idx = NULL; @@ -651,18 +651,19 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu size_t array_size = PyArray_SIZE(array); // 2. Ravel the array as we want to operate on 1D arrays only. (other is guaranteed to be 1D) - raveled_array = (PyArrayObject*)PyArray_Ravel(array, NPY_CORDER); - AK_GOTO_ON_NOT(raveled_array, failure) - Py_INCREF(raveled_array); + flattened_array = (PyArrayObject*)PyArray_Flatten(array, NPY_CORDER); + AK_GOTO_ON_NOT(flattened_array, failure) + Py_INCREF(flattened_array); if (!assume_unique) { - PyObject* arr_and_rev_idx = AK_get_unique_arr_w_inverse(raveled_array); + PyObject* arr_and_rev_idx = AK_get_unique_arr_w_inverse(flattened_array); PyArrayObject *raveled_array_unique = (PyArrayObject*)PyTuple_GET_ITEM(arr_and_rev_idx, 0); AK_GOTO_ON_NOT(raveled_array_unique, failure) Py_INCREF(raveled_array_unique); reverse_idx = PyTuple_GET_ITEM(arr_and_rev_idx, 1); AK_GOTO_ON_NOT(reverse_idx, failure) + Py_INCREF(reverse_idx); PyArrayObject *other_unique = AK_get_unique_arr(other); AK_GOTO_ON_NOT(other_unique, failure) @@ -676,7 +677,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu } else { // 3. Concatenate - concatenated = AK_concat_arrays(raveled_array, other); + concatenated = AK_concat_arrays(flattened_array, other); } AK_GOTO_ON_NOT(concatenated, failure) @@ -705,12 +706,31 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu Py_INCREF(tmp); + npy_intp dims[1] = {1}; + + PyArrayObject *false = (PyArrayObject*)PyArray_NewFromDescr( + &PyArray_Type, // class (subtype) + PyArray_DescrFromType(NPY_BOOL), // dtype (descr) + 1, // ndim (nd) + dims, // dims + NULL, // strides + "\0", // data + NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags + NULL); // sublclass (obj) + Py_INCREF(false); + + PyArrayObject* xyz = AK_concat_arrays(comparison, false); + Py_INCREF(xyz); + // TODO: Comparison is missing a trailing False value... - if (PyObject_SetItem(tmp, (PyObject*)ordered_idx, (PyObject*)comparison)) { + if (PyObject_SetItem(tmp, (PyObject*)ordered_idx, (PyObject*)xyz)) { goto failure; } - printf("HERE\n"); + Py_INCREF(ordered_idx); + Py_INCREF(xyz); + Py_INCREF(tmp); + Py_INCREF(reverse_idx); ret = (PyArrayObject*)PyObject_GetItem(tmp, reverse_idx); Py_DECREF(tmp); @@ -750,7 +770,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu } // 8. Cleanup & Return! - Py_DECREF(raveled_array); + Py_DECREF(flattened_array); Py_DECREF(concatenated); Py_DECREF(ordered_idx); Py_DECREF(sorted_arr); @@ -759,7 +779,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu return (PyObject*)ret; failure: - Py_XDECREF(raveled_array); + Py_XDECREF(flattened_array); Py_XDECREF(reverse_idx); Py_XDECREF(concatenated); Py_XDECREF(ordered_idx); @@ -935,8 +955,8 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) return AK_isin_array_object(array, other); } // Use numpy in1d logic for dtype arrays - //return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); - return AK_isin_array_dtype_use_np(array, other, array_is_unique && other_is_unique); + return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); + //return AK_isin_array_dtype_use_np(array, other, array_is_unique && other_is_unique); } //------------------------------------------------------------------------------ diff --git a/debug.py b/debug.py index 10b690d0..eb4a9f6c 100755 --- a/debug.py +++ b/debug.py @@ -9,15 +9,97 @@ funcFT = partial(isin_array, array_is_unique=False, other_is_unique=True) funcFF = partial(isin_array, array_is_unique=False, other_is_unique=False) +def dtype_unique_1d(func): + expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_) + arr1 = np.array([1, 5, 2, 3, 4], dtype=np.int_) + arr2 = np.array([1, 4, 7, 9], dtype=np.int_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) -arr1 = np.array([1, 5, 2, 3, 4], dtype=int) -arr2 = np.array([1, 4, 7, 9], dtype=int) -post = funcTT(array=arr1, other=arr2) - -# e_1d = np.array([1, 0, 0, 0, 1, 0, 1], dtype=bool) -arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=int) -arr2 = np.array([1, 4, 7, 9], dtype=int) -print(arr1) -print(arr2) -post = funcFF(array=arr1, other=arr2) -print(post) + arr1 = np.array([1, 5, 2, 3, 4], dtype=np.float_) + arr2 = np.array([1, 4, 7, 9], dtype=np.float_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4], dtype=np.complex_) + arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + for freq in 'DMY': + arr1 = np.array([1, 5, 2, 3, 4], dtype=f'datetime64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4], dtype=f'timedelta64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + +dtype_unique_1d(funcTT) +dtype_unique_1d(funcTF) +dtype_unique_1d(funcFT) +dtype_unique_1d(funcFF) + +def dtype_arr1_non_unique_1d(func): + expected = np.array([1, 0, 0, 0, 1, 0, 1], dtype=np.bool_) + arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.int_) + arr2 = np.array([1, 4, 7, 9], dtype=np.int_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.float_) + arr2 = np.array([1, 4, 7, 9], dtype=np.float_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.complex_) + arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + for freq in 'DMY': + arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=f'datetime64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=f'timedelta64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + +dtype_arr1_non_unique_1d(funcFT) +dtype_arr1_non_unique_1d(funcFF) + +def dtype_arr2_non_unique_1d(func): + expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_) + arr1 = np.array([1, 5, 2, 3, 4], dtype=np.int_) + arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.int_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4], dtype=np.float_) + arr2 = np.array([1, 4, 7, 9], dtype=np.float_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4], dtype=np.complex_) + arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + for freq in 'DMY': + arr1 = np.array([1, 5, 2, 3, 4], dtype=f'datetime64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([1, 5, 2, 3, 4], dtype=f'timedelta64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + +dtype_arr2_non_unique_1d(funcTF) +dtype_arr2_non_unique_1d(funcFF) From 904cdf607e1474f4629630b9b00fbe6a4f0236f0 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Sat, 13 Mar 2021 20:23:06 -0800 Subject: [PATCH 20/29] Finishes debug testing on 2D non-unique branch. --- arraykit.c | 6 ++++ debug.py | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) diff --git a/arraykit.c b/arraykit.c index a0cbf31c..07961b72 100644 --- a/arraykit.c +++ b/arraykit.c @@ -733,6 +733,12 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu Py_INCREF(reverse_idx); ret = (PyArrayObject*)PyObject_GetItem(tmp, reverse_idx); + + if (array_ndim == 2) { + PyObject* shape = PyTuple_Pack(2, PyLong_FromLong(array_dims[0]), PyLong_FromLong(array_dims[1])); + ret = PyArray_Reshape(ret, shape); + } + Py_DECREF(tmp); Py_DECREF(reverse_idx); } diff --git a/debug.py b/debug.py index eb4a9f6c..b5b3b1e7 100755 --- a/debug.py +++ b/debug.py @@ -9,6 +9,9 @@ funcFT = partial(isin_array, array_is_unique=False, other_is_unique=True) funcFF = partial(isin_array, array_is_unique=False, other_is_unique=False) +# ------------------------------------------------------------------------------ +# ------------------------------------- 1D ------------------------------------- + def dtype_unique_1d(func): expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_) arr1 = np.array([1, 5, 2, 3, 4], dtype=np.int_) @@ -103,3 +106,101 @@ def dtype_arr2_non_unique_1d(func): dtype_arr2_non_unique_1d(funcTF) dtype_arr2_non_unique_1d(funcFF) + +# ------------------------------------------------------------------------------ +# ------------------------------------- 2D ------------------------------------- + +def dtype_unique_2d(func): + expected = np.array([[1, 0, 0], [1, 0, 1]], dtype=np.bool_) + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.int_) + arr2 = np.array([1, 4, 7, 9], dtype=np.int_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.float_) + arr2 = np.array([1, 4, 7, 9], dtype=np.float_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.complex_) + arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + for freq in 'DMY': + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'datetime64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'timedelta64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + +dtype_unique_2d(funcTT) +dtype_unique_2d(funcTF) +dtype_unique_2d(funcFT) +dtype_unique_2d(funcFF) + +def dtype_arr2_non_unique_1d(func): + expected = np.array([[1, 1, 0, 0], [1, 0, 0, 1]], dtype=np.bool_) + arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=np.int_) + arr2 = np.array([1, 4, 7, 9], dtype=np.int_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=np.float_) + arr2 = np.array([1, 4, 7, 9], dtype=np.float_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=np.complex_) + arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + for freq in 'DMY': + arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=f'datetime64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=f'timedelta64[{freq}]') + arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + +dtype_arr2_non_unique_1d(funcFT) +dtype_arr2_non_unique_1d(funcFF) + +def dtype_arr2_non_unique_1d(func): + expected = np.array([[1, 0, 0], [1, 0, 1]], dtype=np.bool_) + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.int_) + arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.int_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.float_) + arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.float_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.complex_) + arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.complex_) + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + for freq in 'DMY': + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'datetime64[{freq}]') + arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=f'datetime64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + + arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'timedelta64[{freq}]') + arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=f'timedelta64[{freq}]') + post = func(array=arr1, other=arr2) + assert np.array_equal(expected, post) + +dtype_arr2_non_unique_1d(funcTF) +dtype_arr2_non_unique_1d(funcFF) From 886ec4aa226337f7908638453d6d19a6c67fbe9d Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Sat, 13 Mar 2021 20:51:00 -0800 Subject: [PATCH 21/29] Lints. --- arraykit.c | 20 +++++++++++--------- performance/main.py | 7 ++++--- test/test_util.py | 4 ++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arraykit.c b/arraykit.c index 07961b72..2c05ab69 100644 --- a/arraykit.c +++ b/arraykit.c @@ -395,16 +395,16 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) // 3. Discover the location of the first NaN element size_t firstnan = 0; if (is_complex) { + // TODO: I don't understand the necessity of this branch. // aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') } - else { - // This gives back an array of 1-element since `last_element` is a single element - PyObject* firstnan_obj = PyArray_SearchSorted(sar, last_element, NPY_SEARCHLEFT, NULL); - AK_GOTO_ON_NOT(firstnan_obj, failure) - firstnan = *(size_t*)PyArray_DATA((PyArrayObject*)firstnan_obj); - Py_DECREF(firstnan_obj); - } + // This gives back an array of 1-element since `last_element` is a single element + PyObject* firstnan_obj = PyArray_SearchSorted(sar, last_element, NPY_SEARCHLEFT, NULL); + AK_GOTO_ON_NOT(firstnan_obj, failure) + + firstnan = *(size_t*)PyArray_DATA((PyArrayObject*)firstnan_obj); + Py_DECREF(firstnan_obj); // 4. Build mask in such a way to only include 1 NaN value comparison = AK_compare_two_slices_from_array(sar, 1, firstnan, 0, firstnan - 1, Py_NE); @@ -736,7 +736,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu if (array_ndim == 2) { PyObject* shape = PyTuple_Pack(2, PyLong_FromLong(array_dims[0]), PyLong_FromLong(array_dims[1])); - ret = PyArray_Reshape(ret, shape); + ret = (PyArrayObject*)PyArray_Reshape(ret, shape); } Py_DECREF(tmp); @@ -794,6 +794,7 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu return NULL; } +/* static PyObject * AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assume_unique) { @@ -839,6 +840,7 @@ AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assum Py_XDECREF(kwarg); return NULL; } +*/ static PyObject * AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) @@ -919,7 +921,7 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) } ++i; - /* Increment the iterator to the next inner loop */ + // Increment the iterator to the next inner loop } while(iternext(iter)); Py_DECREF(compare_elements); diff --git a/performance/main.py b/performance/main.py index 95d4c955..a79bd336 100644 --- a/performance/main.py +++ b/performance/main.py @@ -1,4 +1,3 @@ -from datetime import date import timeit import argparse @@ -268,8 +267,10 @@ def build_arr(dtype, size, num_nans, num_duplicates): storage = [] def build_subclassses(klass, meth): - storage.append(type(f'{klass.__name__}AK', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ak'])))) - storage.append(type(f'{klass.__name__}REF', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ref'])))) + #storage.append(type(f'{klass.__name__}AK', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ak'])))) + #storage.append(type(f'{klass.__name__}REF', (klass,), dict(entry=staticmethod(globals()[f'{meth}_ref'])))) + storage.append(type(f'{klass.__name__}AK', (klass,), dict(entry=staticmethod(isin_array_ak)))) + storage.append(type(f'{klass.__name__}REF', (klass,), dict(entry=staticmethod(isin_array_ref)))) class Obj: def __init__(self, val): diff --git a/test/test_util.py b/test/test_util.py index 6812f365..bf9f050c 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -233,7 +233,7 @@ def test_1d_2d_dtype_unique(self) -> None: (float, float), (str, str), ('datetime64[D]', lambda x: date(2020, 1, x)), - ('timedelta64[D]', lambda x: timedelta(x)), + ('timedelta64[D]', timedelta), ] for dtype, dtype_func in dtype_funcs: @@ -323,7 +323,7 @@ def test_1d_2d_dtype_non_unique(self) -> None: (float, float), (str, str), ('datetime64[D]', lambda x: date(2020, 1, x)), - ('timedelta64[D]', lambda x: timedelta(x)), + ('timedelta64[D]', timedelta), ] for dtype, dtype_func in dtype_funcs: From db5bf3441a99313ef9db5b14b8d570896ab146b9 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Sat, 13 Mar 2021 21:07:20 -0800 Subject: [PATCH 22/29] Updates macro to support previous Python versions. --- arraykit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arraykit.c b/arraykit.c index 2c05ab69..114278e4 100644 --- a/arraykit.c +++ b/arraykit.c @@ -296,7 +296,7 @@ is_nat(PyObject *a) return PyArrayScalar_VAL(a, Timedelta) == NPY_DATETIME_NAT; } - Py_UNREACHABLE(); + Py_FatalError("This should be impossible"); } //------------------------------------------------------------------------------ From 4aafd216d77eff00f149dac653e0d8fc7f6acd5b Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Mon, 15 Mar 2021 09:56:14 -0700 Subject: [PATCH 23/29] Cleans up a lot of duplicate code. --- debug.py | 196 ++++++++++---------------------------------- performance/main.py | 23 +++--- 2 files changed, 53 insertions(+), 166 deletions(-) diff --git a/debug.py b/debug.py index b5b3b1e7..17c9fa5c 100755 --- a/debug.py +++ b/debug.py @@ -9,198 +9,88 @@ funcFT = partial(isin_array, array_is_unique=False, other_is_unique=True) funcFF = partial(isin_array, array_is_unique=False, other_is_unique=False) -# ------------------------------------------------------------------------------ -# ------------------------------------- 1D ------------------------------------- -def dtype_unique_1d(func): - expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_) - arr1 = np.array([1, 5, 2, 3, 4], dtype=np.int_) - arr2 = np.array([1, 4, 7, 9], dtype=np.int_) - post = func(array=arr1, other=arr2) +def test_arrays(arr1, arr2, expected, func): + post = func(array=arr1.astype(np.int_), other=arr2.astype(np.int_)) assert np.array_equal(expected, post) - arr1 = np.array([1, 5, 2, 3, 4], dtype=np.float_) - arr2 = np.array([1, 4, 7, 9], dtype=np.float_) - post = func(array=arr1, other=arr2) + post = func(array=arr1.astype(np.float_), other=arr2.astype(np.float_)) assert np.array_equal(expected, post) - arr1 = np.array([1, 5, 2, 3, 4], dtype=np.complex_) - arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) - post = func(array=arr1, other=arr2) + post = func(array=arr1.astype(np.complex_), other=arr2.astype(np.complex_)) assert np.array_equal(expected, post) for freq in 'DMY': - arr1 = np.array([1, 5, 2, 3, 4], dtype=f'datetime64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') - post = func(array=arr1, other=arr2) + post = func(array=arr1.astype(f'datetime64[{freq}]'), other=arr2.astype(f'datetime64[{freq}]')) assert np.array_equal(expected, post) - arr1 = np.array([1, 5, 2, 3, 4], dtype=f'timedelta64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') - post = func(array=arr1, other=arr2) + post = func(array=arr1.astype(f'timedelta64[{freq}]'), other=arr2.astype(f'timedelta64[{freq}]')) assert np.array_equal(expected, post) -dtype_unique_1d(funcTT) -dtype_unique_1d(funcTF) -dtype_unique_1d(funcFT) -dtype_unique_1d(funcFF) -def dtype_arr1_non_unique_1d(func): - expected = np.array([1, 0, 0, 0, 1, 0, 1], dtype=np.bool_) - arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.int_) - arr2 = np.array([1, 4, 7, 9], dtype=np.int_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.float_) - arr2 = np.array([1, 4, 7, 9], dtype=np.float_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) +# ------------------------------------------------------------------------------ +# ------------------------------------- 1D ------------------------------------- - arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.complex_) - arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) +def dtype_unique_1d(func): + arr1 = np.array([1, 5, 2, 3, 4]) + arr2 = np.array([1, 4, 7, 9]) + expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_) + test_arrays(arr1, arr2, expected, func) - for freq in 'DMY': - arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=f'datetime64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=f'timedelta64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) +def dtype_arr1_non_unique_1d(func): + arr1 = np.array([1, 5, 2, 3, 4, 5, 1]) + arr2 = np.array([1, 4, 7, 9]) + expected = np.array([1, 0, 0, 0, 1, 0, 1], dtype=np.bool_) + test_arrays(arr1, arr2, expected, func) -dtype_arr1_non_unique_1d(funcFT) -dtype_arr1_non_unique_1d(funcFF) def dtype_arr2_non_unique_1d(func): + arr1 = np.array([1, 5, 2, 3, 4]) + arr2 = np.array([1, 9, 4, 7, 9, 1]) expected = np.array([1, 0, 0, 0, 1], dtype=np.bool_) - arr1 = np.array([1, 5, 2, 3, 4], dtype=np.int_) - arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.int_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([1, 5, 2, 3, 4], dtype=np.float_) - arr2 = np.array([1, 4, 7, 9], dtype=np.float_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([1, 5, 2, 3, 4], dtype=np.complex_) - arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - for freq in 'DMY': - arr1 = np.array([1, 5, 2, 3, 4], dtype=f'datetime64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) + test_arrays(arr1, arr2, expected, func) - arr1 = np.array([1, 5, 2, 3, 4], dtype=f'timedelta64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - -dtype_arr2_non_unique_1d(funcTF) -dtype_arr2_non_unique_1d(funcFF) # ------------------------------------------------------------------------------ # ------------------------------------- 2D ------------------------------------- def dtype_unique_2d(func): + arr1 = np.array([[1, 2, 3], [4, 5, 9]]) + arr2 = np.array([1, 4, 7, 9]) expected = np.array([[1, 0, 0], [1, 0, 1]], dtype=np.bool_) - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.int_) - arr2 = np.array([1, 4, 7, 9], dtype=np.int_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) + test_arrays(arr1, arr2, expected, func) - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.float_) - arr2 = np.array([1, 4, 7, 9], dtype=np.float_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.complex_) - arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) +def dtype_arr2_non_unique_1d(func): + arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]]) + arr2 = np.array([1, 4, 7, 9]) + expected = np.array([[1, 1, 0, 0], [1, 0, 0, 1]], dtype=np.bool_) + test_arrays(arr1, arr2, expected, func) - for freq in 'DMY': - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'datetime64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'timedelta64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) +def dtype_arr2_non_unique_1d(func): + arr1 = np.array([[1, 2, 3], [4, 5, 9]]) + arr2 = np.array([1, 9, 4, 7, 9, 1]) + expected = np.array([[1, 0, 0], [1, 0, 1]], dtype=np.bool_) + test_arrays(arr1, arr2, expected, func) + +dtype_unique_1d(funcTT) +dtype_unique_1d(funcTF) +dtype_unique_1d(funcFT) +dtype_unique_1d(funcFF) dtype_unique_2d(funcTT) dtype_unique_2d(funcTF) dtype_unique_2d(funcFT) dtype_unique_2d(funcFF) -def dtype_arr2_non_unique_1d(func): - expected = np.array([[1, 1, 0, 0], [1, 0, 0, 1]], dtype=np.bool_) - arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=np.int_) - arr2 = np.array([1, 4, 7, 9], dtype=np.int_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=np.float_) - arr2 = np.array([1, 4, 7, 9], dtype=np.float_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=np.complex_) - arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - for freq in 'DMY': - arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=f'datetime64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'datetime64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([[9, 1, 2, 3], [4, 3, 5, 9]], dtype=f'timedelta64[{freq}]') - arr2 = np.array([1, 4, 7, 9], dtype=f'timedelta64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) +dtype_arr1_non_unique_1d(funcFT) +dtype_arr1_non_unique_1d(funcFF) +dtype_arr2_non_unique_1d(funcTF) +dtype_arr2_non_unique_1d(funcFF) dtype_arr2_non_unique_1d(funcFT) dtype_arr2_non_unique_1d(funcFF) - -def dtype_arr2_non_unique_1d(func): - expected = np.array([[1, 0, 0], [1, 0, 1]], dtype=np.bool_) - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.int_) - arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.int_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.float_) - arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.float_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=np.complex_) - arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=np.complex_) - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - for freq in 'DMY': - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'datetime64[{freq}]') - arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=f'datetime64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - - arr1 = np.array([[1, 2, 3], [4, 5, 9]], dtype=f'timedelta64[{freq}]') - arr2 = np.array([1, 9, 4, 7, 9, 1], dtype=f'timedelta64[{freq}]') - post = func(array=arr1, other=arr2) - assert np.array_equal(expected, post) - dtype_arr2_non_unique_1d(funcTF) dtype_arr2_non_unique_1d(funcFF) diff --git a/performance/main.py b/performance/main.py index a79bd336..08f949bf 100644 --- a/performance/main.py +++ b/performance/main.py @@ -310,20 +310,17 @@ class IsinArrayDtypeUnique2DPerf(Perf): def pre(self): self.kwargs = [] for dtype in get_dtypes(): - for size, num_nans, reshape in [ - (100, 0, (10, 10)), - (100, 1, (10, 10)), - (5000, 0, (200, 25)), - (5000, 1, (200, 25)), - (20000, 0, (200, 100)), - (20000, 1, (200, 100)), - (100000, 0, (500, 200)), - (100000, 1, (500, 200)), + for size, reshape in [ + (100, (10, 10)), + (5000, (200, 25)), + (20000, (200, 100)), + (100000, (500, 200)), ]: - arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates=0) - arr2, arr2_unique = build_arr(dtype, size // 10, num_nans // 10, num_duplicates=0) - assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' - self.kwargs.append(dict(array=arr1.reshape(reshape), array_is_unique=True, other=arr2, other_is_unique=True)) + for num_nans in (0, 1): + arr1, arr1_unique = build_arr(dtype, size, num_nans, num_duplicates=0) + arr2, arr2_unique = build_arr(dtype, size // 10, num_nans // 10, num_duplicates=0) + assert arr1_unique and arr2_unique, 'Expect both arrays to be unique' + self.kwargs.append(dict(array=arr1.reshape(reshape), array_is_unique=True, other=arr2, other_is_unique=True)) def main(self): assert set(x['array'].ndim for x in self.kwargs) == {2}, "Expected all arr1's to be 2D" From 29a4ec7b65a1667b277248f70a928c8c101b3a05 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Mon, 15 Mar 2021 10:03:12 -0700 Subject: [PATCH 24/29] Fixes some c-compiler warnings I think? --- arraykit.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arraykit.c b/arraykit.c index 114278e4..847d0b38 100644 --- a/arraykit.c +++ b/arraykit.c @@ -463,7 +463,7 @@ AK_get_unique_arr(PyArrayObject *original_arr) size_t size = PyArray_SIZE(original_arr); PyArray_Descr* dtype = PyArray_DESCR(original_arr); - npy_bool mask_arr[size]; + npy_bool* mask_arr = PyMem_Malloc(size); // 2. Get a copy of the original arr since sorting is in-place PyArrayObject* sar = (PyArrayObject*)PyArray_FromArray( @@ -476,7 +476,7 @@ AK_get_unique_arr(PyArrayObject *original_arr) } // 3. Build mask - memset(mask_arr, 0, sizeof(mask_arr)); + memset(mask_arr, 0, size); mask_arr[0] = 1; AK_GOTO_ON_NOT(AK_build_unique_arr_mask(sar, mask_arr), failure) @@ -495,6 +495,8 @@ AK_get_unique_arr(PyArrayObject *original_arr) PyArrayObject *filtered_arr = (PyArrayObject*)PyObject_GetItem((PyObject*)sar, (PyObject*)mask); AK_GOTO_ON_NOT(filtered_arr, failure) + PyMem_Free(mask_arr); + Py_DECREF(sar); Py_DECREF(mask); return filtered_arr; @@ -536,7 +538,7 @@ AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) size_t size = PyArray_SIZE(original_arr); - npy_bool mask_arr[size]; + npy_bool* mask_arr = PyMem_Malloc(size); // 2. Get sorted indices & sort array ordered_idx = PyArray_ArgSort(original_arr, 0, NPY_QUICKSORT); @@ -546,7 +548,7 @@ AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) AK_GOTO_ON_NOT(sar, failure) // 3. Build mask - memset(mask_arr, 0, sizeof(mask_arr)); + memset(mask_arr, 0, size); mask_arr[0] = 1; AK_GOTO_ON_NOT(AK_build_unique_arr_mask(sar, mask_arr), failure) @@ -590,6 +592,8 @@ AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) PyObject* ret = PyTuple_Pack(2, filtered_arr, inv_idx); AK_GOTO_ON_NOT(ret, failure) + PyMem_Free(mask_arr); + Py_DECREF(ordered_idx); Py_DECREF(sar); Py_DECREF(mask); From d9687ae7d934d12424a8a0ff1bc434f53a622581 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 18 Mar 2021 13:05:27 -0700 Subject: [PATCH 25/29] Begins to clean up bad memory mgmt. --- arraykit.c | 103 ++++++++++++++++++++++++---------- debug.py | 17 ++++++ performance/reference/util.py | 2 +- 3 files changed, 92 insertions(+), 30 deletions(-) diff --git a/arraykit.c b/arraykit.c index 847d0b38..bbef8e1a 100644 --- a/arraykit.c +++ b/arraykit.c @@ -305,6 +305,9 @@ is_nat(PyObject *a) # define AK_PPRINT(obj) \ printf(""#obj""); printf(": "); PyObject_Print(obj, stdout, 0); printf("\n"); fflush(stdout); +# define AK_HERE printf("HERE\n"); fflush(stdout); + +// DONE. Returns a new reference static PyArrayObject * AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) { @@ -441,6 +444,7 @@ AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) return 0; } +// Returns a new reference static PyArrayObject* AK_get_unique_arr(PyArrayObject *original_arr) { @@ -507,6 +511,7 @@ AK_get_unique_arr(PyArrayObject *original_arr) return NULL; } +// Returns a new reference static PyObject* AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) { @@ -575,7 +580,10 @@ AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) NULL); // out-array AK_GOTO_ON_NOT(cumsum, failure) - imask = PyNumber_Subtract(cumsum, PyLong_FromLong(1)); + PyObject* one = PyLong_FromLong(1); + AK_GOTO_ON_NOT(one, failure) + imask = PyNumber_Subtract(cumsum, one); + Py_DECREF(one); AK_GOTO_ON_NOT(imask, failure) inv_idx = PyArray_Empty( @@ -661,22 +669,30 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu if (!assume_unique) { PyObject* arr_and_rev_idx = AK_get_unique_arr_w_inverse(flattened_array); - PyArrayObject *raveled_array_unique = (PyArrayObject*)PyTuple_GET_ITEM(arr_and_rev_idx, 0); - AK_GOTO_ON_NOT(raveled_array_unique, failure) - Py_INCREF(raveled_array_unique); + AK_GOTO_ON_NOT(arr_and_rev_idx, failure) + + PyArrayObject *raveled_array_unique = (PyArrayObject*)PyTuple_GET_ITEM(arr_and_rev_idx, 0); // BORROWED! Py_DECREF(arr_and_rev_idx); handles this + if (!raveled_array_unique) { + Py_DECREF(arr_and_rev_idx); + goto failure; + } reverse_idx = PyTuple_GET_ITEM(arr_and_rev_idx, 1); - AK_GOTO_ON_NOT(reverse_idx, failure) - Py_INCREF(reverse_idx); + if (!reverse_idx) { + Py_DECREF(arr_and_rev_idx); + goto failure; + } + Py_INCREF(reverse_idx); // Since this was borrowed and we need outside of this scope, increment it's refcount PyArrayObject *other_unique = AK_get_unique_arr(other); - AK_GOTO_ON_NOT(other_unique, failure) - Py_INCREF(other_unique); + if (!other_unique) { + Py_DECREF(arr_and_rev_idx); + goto failure; + } // 3. Concatenate concatenated = AK_concat_arrays(raveled_array_unique, other_unique); Py_DECREF(arr_and_rev_idx); - Py_DECREF(raveled_array_unique); Py_DECREF(other_unique); } else { @@ -702,17 +718,16 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu if (!assume_unique) { // 6: Construct empty array - PyObject* tmp = PyArray_Empty( + PyObject* concatenated_mask = PyArray_Empty( PyArray_NDIM(concatenated), // nd PyArray_DIMS(concatenated), // dims PyArray_DescrFromType(NPY_BOOL), // dtype 0); // is_f_order - - Py_INCREF(tmp); + AK_GOTO_ON_NOT(concatenated_mask, failure) npy_intp dims[1] = {1}; - PyArrayObject *false = (PyArrayObject*)PyArray_NewFromDescr( + PyArrayObject *single_false_array = (PyArrayObject*)PyArray_NewFromDescr( &PyArray_Type, // class (subtype) PyArray_DescrFromType(NPY_BOOL), // dtype (descr) 1, // ndim (nd) @@ -721,30 +736,58 @@ AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_uniqu "\0", // data NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags NULL); // sublclass (obj) - Py_INCREF(false); - - PyArrayObject* xyz = AK_concat_arrays(comparison, false); - Py_INCREF(xyz); + if (!single_false_array) { + Py_DECREF(concatenated_mask); + goto failure; + } - // TODO: Comparison is missing a trailing False value... - if (PyObject_SetItem(tmp, (PyObject*)ordered_idx, (PyObject*)xyz)) { + PyArrayObject* full_comparison = AK_concat_arrays(comparison, single_false_array); + Py_DECREF(single_false_array); + if (!full_comparison) { + Py_DECREF(concatenated_mask); goto failure; } - Py_INCREF(ordered_idx); - Py_INCREF(xyz); - Py_INCREF(tmp); - Py_INCREF(reverse_idx); + int success = PyObject_SetItem(concatenated_mask, (PyObject*)ordered_idx, (PyObject*)full_comparison); + Py_DECREF(full_comparison); + if (success == -1) { + Py_DECREF(concatenated_mask); + goto failure; + } - ret = (PyArrayObject*)PyObject_GetItem(tmp, reverse_idx); + PyObject* ret_1d = (PyArrayObject*)PyObject_GetItem(concatenated_mask, reverse_idx); // Should be a new reference? + Py_DECREF(reverse_idx); // We are officially done with this + Py_DECREF(concatenated_mask); + AK_GOTO_ON_NOT(ret_1d, failure) if (array_ndim == 2) { - PyObject* shape = PyTuple_Pack(2, PyLong_FromLong(array_dims[0]), PyLong_FromLong(array_dims[1])); - ret = (PyArrayObject*)PyArray_Reshape(ret, shape); - } + PyObject* dim0 = PyLong_FromLong(array_dims[0]); + if (!dim0) { + Py_DECREF(ret_1d); + goto failure; + } + PyObject* dim1 = PyLong_FromLong(array_dims[1]); + if (!dim1) { + Py_DECREF(ret_1d); + Py_DECREF(dim0); + goto failure; + } - Py_DECREF(tmp); - Py_DECREF(reverse_idx); + PyObject* shape = PyTuple_Pack(2, dim0, dim1); + Py_DECREF(dim0); + Py_DECREF(dim1); + if (!shape) { + Py_DECREF(ret_1d); + goto failure; + } + + ret = (PyArrayObject*)PyArray_Reshape(ret_1d, shape); + Py_DECREF(ret_1d); + Py_DECREF(shape); + } + else { + ret = ret_1d; + } } else { // 6: Construct empty array @@ -906,6 +949,7 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) Py_INCREF(obj); // 5. Assign into result whether or not the element exists in the set + // int found = PySequence_Contains(compare_elements, ((PyObject**)data)[0]); int found = PySequence_Contains(compare_elements, obj); Py_DECREF(obj); @@ -945,6 +989,7 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) static PyObject * isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { + AK_PPRINT(kwargs) int array_is_unique, other_is_unique; PyArrayObject *array, *other; diff --git a/debug.py b/debug.py index 17c9fa5c..8de98390 100755 --- a/debug.py +++ b/debug.py @@ -9,6 +9,23 @@ funcFT = partial(isin_array, array_is_unique=False, other_is_unique=True) funcFF = partial(isin_array, array_is_unique=False, other_is_unique=False) +arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.int_) +arr2 = np.array([1, 4, 7, 9], dtype=np.int_) +post = funcFF(array=arr1, other=arr2) + +arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.float_) +arr2 = np.array([1, 4, 7, 9], dtype=np.float_) +post = funcFF(array=arr1, other=arr2) + +arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=str) +arr2 = np.array([1, 4, 7, 9], dtype=str) +post = funcFF(array=arr1, other=arr2) + +arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.complex_) +arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) +post = funcFF(array=arr1, other=arr2) +exit(0) + def test_arrays(arr1, arr2, expected, func): post = func(array=arr1.astype(np.int_), other=arr2.astype(np.int_)) diff --git a/performance/reference/util.py b/performance/reference/util.py index 61d333fc..3accdda6 100644 --- a/performance/reference/util.py +++ b/performance/reference/util.py @@ -235,7 +235,7 @@ def isin_array(*, # both funcs return immutable arrays func = _isin_1d if array.ndim == 1 else _isin_2d try: - return func(array, frozenset(other)) + return func(array, frozenset(other)) # Isolate the frozenset creation to it's own try-except except TypeError: # only occur when something is unhashable. pass From baa5421ce0f04b8ea4c492360ec97fd23bfab23d Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Thu, 18 Mar 2021 16:09:55 -0700 Subject: [PATCH 26/29] Rips out manual in1d impl. Removes GOTO macro. Other misc changes. --- arraykit.c | 678 +++++------------------------------------------------ 1 file changed, 56 insertions(+), 622 deletions(-) diff --git a/arraykit.c b/arraykit.c index bbef8e1a..43b1634e 100644 --- a/arraykit.c +++ b/arraykit.c @@ -49,12 +49,12 @@ return NULL; \ } -// To simplify lines going to a label failure on `!value` -# define AK_GOTO_ON_NOT(obj, label) \ - if (!obj) { \ - goto label; \ - } +// Print & flush out an arbitrary Python object +# define AK_PPRINT(obj) \ + printf(""#obj""); printf(": "); PyObject_Print(obj, stdout, 0); printf("\n"); fflush(stdout); +// A simple `DEBUG` print & flush +# define AK_DEBUG printf("DEBUG\n"); fflush(stdout); # if defined __GNUC__ || defined __clang__ # define AK_LIKELY(X) __builtin_expect(!!(X), 1) @@ -112,7 +112,7 @@ PyArray_Descr* AK_ResolveDTypeIter(PyObject *dtypes) { PyObject *iterator = PyObject_GetIter(dtypes); - AK_CHECK_NOT(iterator) + AK_CHECK_NOT(iterator); PyArray_Descr *resolved = NULL; PyArray_Descr *dtype; @@ -251,7 +251,7 @@ resolve_dtype(PyObject *Py_UNUSED(m), PyObject *args) { PyArray_Descr *d1, *d2; AK_CHECK_NOT(PyArg_ParseTuple(args, "O!O!:resolve_dtype", - &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2)) + &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2)); return (PyObject *)AK_ResolveDTypes(d1, d2); } @@ -261,633 +261,61 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) return (PyObject *)AK_ResolveDTypeIter(arg); } -//------------------------------------------------------------------------------ -// utils - -static int -is_nan(PyObject *a) -{ - double v = PyFloat_AsDouble(a); - - // Need to disambiguate, since v could be -1 and no failure happened - if (v == -1 && PyErr_Occurred()) { - return -1; - } - - return isnan(v); -} - -static int -is_nanj(PyObject *a) -{ - return isnan(((PyComplexObject*)a)->cval.real); -} - -static int -is_nat(PyObject *a) -{ - // NaT - Datetime - if (PyArray_IsScalar(a, Datetime)) { // Cannot fail - return PyArrayScalar_VAL(a, Datetime) == NPY_DATETIME_NAT; - } - - // NaT - Timedelta - if (PyArray_IsScalar(a, Timedelta)) { // Cannot fail - return PyArrayScalar_VAL(a, Timedelta) == NPY_DATETIME_NAT; - } - - Py_FatalError("This should be impossible"); -} //------------------------------------------------------------------------------ // isin -# define AK_PPRINT(obj) \ - printf(""#obj""); printf(": "); PyObject_Print(obj, stdout, 0); printf("\n"); fflush(stdout); - -# define AK_HERE printf("HERE\n"); fflush(stdout); - -// DONE. Returns a new reference -static PyArrayObject * -AK_concat_arrays(PyArrayObject *arr1, PyArrayObject *arr2) -{ - PyObject *container = PyTuple_Pack(2, arr1, arr2); - AK_CHECK_NOT(container) - - PyArrayObject *array = (PyArrayObject*)PyArray_Concatenate(container, 0); - Py_DECREF(container); - return array; -} - -static PyArrayObject* -AK_compare_two_slices_from_array(PyArrayObject *arr, Py_ssize_t l1, Py_ssize_t l2, Py_ssize_t r1, Py_ssize_t r2, int EQ) -{ - PyObject* left_slice = NULL; - PyObject* right_slice = NULL; - PyObject* comparison = NULL; - - left_slice = PySequence_GetSlice((PyObject*)arr, l1, l2); - AK_GOTO_ON_NOT(left_slice, failure) - - right_slice = PySequence_GetSlice((PyObject*)arr, r1, r2); - AK_GOTO_ON_NOT(right_slice, failure) - - comparison = PyObject_RichCompare(left_slice, right_slice, EQ); - AK_GOTO_ON_NOT(comparison, failure) - - Py_DECREF(left_slice); - Py_DECREF(right_slice); - - return (PyArrayObject*)comparison; - -failure: - Py_XDECREF(left_slice); - Py_XDECREF(right_slice); - return NULL; -} - -static int -AK_build_unique_arr_mask(PyArrayObject *sar, npy_bool* mask) +static PyObject * +AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assume_unique) { - /* Algorithm (assumes `sar` is sorted & mask is initialized to [1, 0, ...] & len(mask) == len(sar) - - // cfmM = [Complex, Float, Datetime, Timedelta] - if sar.dtype.kind in "cfmM" and np.isnan(sar[-1]): - if sar.dtype.kind == "c": # for complex all NaNs are considered equivalent - aux_firstnan = np.searchsorted(np.isnan(sar), True, side='left') - else: - aux_firstnan = np.searchsorted(sar, sar[-1], side='left') - - mask[1:aux_firstnan] = (sar[1:aux_firstnan] != sar[:aux_firstnan - 1]) - mask[aux_firstnan] = True - mask[aux_firstnan + 1:] = False - else: - mask[1:] = sar[1:] != sar[:-1] - */ - // 0. Deallocate on failure - PyArrayObject* comparison = NULL; - PyObject* last_element = NULL; + PyObject* result = NULL; - // 1. Determine if last element contains NaNs/NaTs - size_t size = (size_t)PyArray_SIZE(sar); - PyArray_Descr* dtype = PyArray_DESCR(sar); + PyObject* args = PyTuple_Pack(2, (PyObject*)array, (PyObject*)other); + AK_CHECK_NOT(args); - int is_float = PyDataType_ISFLOAT(dtype); - int is_complex = PyDataType_ISCOMPLEX(dtype); - int is_dt = PyDataType_ISDATETIME(dtype); - - int contains_nan = 0; - - if (is_float | is_complex | is_dt) { - last_element = PyObject_GetItem((PyObject*)sar, PyLong_FromLong(-1)); - AK_GOTO_ON_NOT(last_element, failure) - if (is_float) { - contains_nan = is_nan(last_element); - } - else if (is_complex) { - contains_nan = is_nanj(last_element); - } - else { - // This will always be false as long as numpy < 1.18. NaT sort to the front - contains_nan = is_nat(last_element); - } - } - - // 2. Populate mask - if (contains_nan) { - // 3. Discover the location of the first NaN element - size_t firstnan = 0; - if (is_complex) { - // TODO: I don't understand the necessity of this branch. - // aux_firstnan = np.searchsorted(np.isnan(aux), True, side='left') - } - - // This gives back an array of 1-element since `last_element` is a single element - PyObject* firstnan_obj = PyArray_SearchSorted(sar, last_element, NPY_SEARCHLEFT, NULL); - AK_GOTO_ON_NOT(firstnan_obj, failure) - - firstnan = *(size_t*)PyArray_DATA((PyArrayObject*)firstnan_obj); - Py_DECREF(firstnan_obj); - - // 4. Build mask in such a way to only include 1 NaN value - comparison = AK_compare_two_slices_from_array(sar, 1, firstnan, 0, firstnan - 1, Py_NE); - AK_GOTO_ON_NOT(comparison, failure) - npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); - - for (size_t i = 1; i < firstnan; ++i) { - mask[i] = comparison_arr[i-1]; - } - mask[firstnan] = 1; - for (size_t i = firstnan + 1; i < size; ++i) { - mask[i] = 0; - } - } - else { - // 3. Build mask through a simple [1:] != [:-1] slice comparison - comparison = AK_compare_two_slices_from_array(sar, 1, size, 0, size - 1, Py_NE); - AK_GOTO_ON_NOT(comparison, failure) - npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); - - for (size_t i = 1; i < (size_t)size; ++i) { - mask[i] = comparison_arr[i-1]; - } + PyObject* kwarg = PyDict_New(); + if (!kwarg) { + Py_DECREF(args); + return NULL; } - Py_DECREF(comparison); - Py_XDECREF(last_element); // Only populated when sar contains NaNs/NaTs - - return 1; - -failure: - Py_XDECREF(comparison); - Py_XDECREF(last_element); - return 0; -} - -// Returns a new reference -static PyArrayObject* -AK_get_unique_arr(PyArrayObject *original_arr) -{ - /* Algorithm - - sar = copy(original_arr) - sar.sort() - - mask = np.empty(sar.shape, dtype=np.bool_) - mask[0] = True - - build_mask(...) - - return sar[mask] - */ - - // 1. Initialize - PyObject* mask = NULL; // Deallocate on failure - - size_t size = PyArray_SIZE(original_arr); - PyArray_Descr* dtype = PyArray_DESCR(original_arr); - - npy_bool* mask_arr = PyMem_Malloc(size); - - // 2. Get a copy of the original arr since sorting is in-place - PyArrayObject* sar = (PyArrayObject*)PyArray_FromArray( - original_arr, - dtype, - NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSURECOPY); - AK_CHECK_NOT(sar) - if (PyArray_Sort(sar, 0, NPY_QUICKSORT) == -1) { // In-place + PyObject* assume_unique_obj = PyLong_FromLong((long)assume_unique); + if (!assume_unique_obj) { goto failure; } - // 3. Build mask - memset(mask_arr, 0, size); - mask_arr[0] = 1; - AK_GOTO_ON_NOT(AK_build_unique_arr_mask(sar, mask_arr), failure) - - mask = PyArray_NewFromDescr( - &PyArray_Type, // class (subtype) - PyArray_DescrFromType(NPY_BOOL), // dtype (descr) - PyArray_NDIM(sar), // ndim (nd) - PyArray_DIMS(sar), // dims - NULL, // strides - mask_arr, // data - NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags - NULL); // sublclass (obj) - AK_GOTO_ON_NOT(mask, failure) - - // 4. Filter sar - PyArrayObject *filtered_arr = (PyArrayObject*)PyObject_GetItem((PyObject*)sar, (PyObject*)mask); - AK_GOTO_ON_NOT(filtered_arr, failure) - - PyMem_Free(mask_arr); - - Py_DECREF(sar); - Py_DECREF(mask); - return filtered_arr; - -failure: - Py_DECREF(sar); // Cannot be NULL - Py_XDECREF(mask); - return NULL; -} - -// Returns a new reference -static PyObject* -AK_get_unique_arr_w_inverse(PyArrayObject *original_arr) -{ - /* Algorithm - - ordered_idx = original_arr.argsort(kind='quicksort') - sar = original_arr[ordered_idx] - - mask = np.empty(sar.shape, dtype=np.bool_) - mask[0] = True - - AK_build_unique_arr_mask(sar, mask) - - ret = sar[mask] - imask = np.cumsum(mask) - 1 - inv_idx = np.empty(mask.shape, dtype=np.intp) - inv_idx[ordered_idx] = imask - return ret, inv_idx - */ - - // 1. Initialize - PyObject *ordered_idx = NULL; - PyArrayObject *sar = NULL; - PyArrayObject* mask = NULL; - PyObject *filtered_arr = NULL; - PyObject* cumsum = NULL; - PyObject* imask = NULL; - PyObject* inv_idx = NULL; - - size_t size = PyArray_SIZE(original_arr); - - npy_bool* mask_arr = PyMem_Malloc(size); - - // 2. Get sorted indices & sort array - ordered_idx = PyArray_ArgSort(original_arr, 0, NPY_QUICKSORT); - AK_GOTO_ON_NOT(ordered_idx, failure) - - sar = (PyArrayObject*)PyObject_GetItem((PyObject*)original_arr, ordered_idx); - AK_GOTO_ON_NOT(sar, failure) - - // 3. Build mask - memset(mask_arr, 0, size); - mask_arr[0] = 1; - AK_GOTO_ON_NOT(AK_build_unique_arr_mask(sar, mask_arr), failure) - - mask = (PyArrayObject*)PyArray_NewFromDescr( - &PyArray_Type, // subtype - PyArray_DescrFromType(NPY_BOOL), // dtype - PyArray_NDIM(sar), // nd - PyArray_DIMS(sar), // dims - NULL, // strides - mask_arr, // data - NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags - NULL); // sublclass (obj) - AK_GOTO_ON_NOT(mask, failure) - - // 4. Filter arr - filtered_arr = PyObject_GetItem((PyObject*)sar, (PyObject*)mask); - AK_GOTO_ON_NOT(filtered_arr, failure) - - // 5. Determine the inverse index - cumsum = PyArray_CumSum( - mask, // array - 0, // axis - NPY_INT, // dtype - NULL); // out-array - AK_GOTO_ON_NOT(cumsum, failure) - - PyObject* one = PyLong_FromLong(1); - AK_GOTO_ON_NOT(one, failure) - imask = PyNumber_Subtract(cumsum, one); - Py_DECREF(one); - AK_GOTO_ON_NOT(imask, failure) - - inv_idx = PyArray_Empty( - PyArray_NDIM(mask), // nd - PyArray_DIMS(mask), // dims - PyArray_DescrFromType(NPY_INT), // dtype - 0); // is_f_order - - if (PyObject_SetItem(inv_idx, ordered_idx, imask)) { + int success = PyDict_SetItemString(kwarg, "assume_unique", assume_unique_obj); + Py_DECREF(assume_unique_obj); + if (success == -1) { goto failure; } - // 6. Pack it up in a tuple and return - PyObject* ret = PyTuple_Pack(2, filtered_arr, inv_idx); - AK_GOTO_ON_NOT(ret, failure) - - PyMem_Free(mask_arr); - - Py_DECREF(ordered_idx); - Py_DECREF(sar); - Py_DECREF(mask); - Py_DECREF(filtered_arr); - Py_DECREF(cumsum); - Py_DECREF(imask); - Py_DECREF(inv_idx); - return ret; - -failure: - Py_XDECREF(ordered_idx); - Py_XDECREF(sar); - Py_XDECREF(mask); - Py_XDECREF(filtered_arr); - Py_XDECREF(cumsum); - Py_XDECREF(imask); - Py_XDECREF(inv_idx); - return NULL; -} - -static PyObject * -AK_isin_array_dtype(PyArrayObject *array, PyArrayObject *other, int assume_unique) -{ - /* Algorithm: - - array = np.ravel(array) - - if not assume_unique: - array, rev_idx = np.unique(array, return_inverse=True) - other = np.unique(other) - - concatenated = np.concatenate((array, other)) - - ordered_idx = concatenated.argsort(kind='mergesort') - sorted_arr = concatenated[ordered_idx] - - flag = np.concatenate(((sorted_arr[1:] == sorted_arr[:-1]), [False])) - - ret = np.empty(concatenated.shape, dtype=bool) - ret[ordered_idx] = flag - - if assume_unique: - return ret[:len(array)] - else: - return ret[rev_idx] - */ - // 0. Deallocate on failure - PyArrayObject* flattened_array = NULL; - PyObject *reverse_idx = NULL; - PyArrayObject* concatenated = NULL; - PyArrayObject *ordered_idx = NULL; - PyArrayObject* sorted_arr = NULL; - PyArrayObject* comparison = NULL; - PyArrayObject* ret = NULL; - - // 1. Capture original array shape for return value - int array_ndim = PyArray_NDIM(array); - npy_intp* array_dims = PyArray_DIMS(array); - size_t array_size = PyArray_SIZE(array); - - // 2. Ravel the array as we want to operate on 1D arrays only. (other is guaranteed to be 1D) - flattened_array = (PyArrayObject*)PyArray_Flatten(array, NPY_CORDER); - AK_GOTO_ON_NOT(flattened_array, failure) - Py_INCREF(flattened_array); - - if (!assume_unique) { - PyObject* arr_and_rev_idx = AK_get_unique_arr_w_inverse(flattened_array); - AK_GOTO_ON_NOT(arr_and_rev_idx, failure) - - PyArrayObject *raveled_array_unique = (PyArrayObject*)PyTuple_GET_ITEM(arr_and_rev_idx, 0); // BORROWED! Py_DECREF(arr_and_rev_idx); handles this - if (!raveled_array_unique) { - Py_DECREF(arr_and_rev_idx); - goto failure; - } - - reverse_idx = PyTuple_GET_ITEM(arr_and_rev_idx, 1); - if (!reverse_idx) { - Py_DECREF(arr_and_rev_idx); - goto failure; - } - Py_INCREF(reverse_idx); // Since this was borrowed and we need outside of this scope, increment it's refcount - - PyArrayObject *other_unique = AK_get_unique_arr(other); - if (!other_unique) { - Py_DECREF(arr_and_rev_idx); - goto failure; - } - - // 3. Concatenate - concatenated = AK_concat_arrays(raveled_array_unique, other_unique); - Py_DECREF(arr_and_rev_idx); - Py_DECREF(other_unique); - } - else { - // 3. Concatenate - concatenated = AK_concat_arrays(flattened_array, other); - } - AK_GOTO_ON_NOT(concatenated, failure) - - size_t concatenated_size = PyArray_SIZE(concatenated); - - // 4: Sort - ordered_idx = (PyArrayObject*)PyArray_ArgSort(concatenated, 0, NPY_MERGESORT); - AK_GOTO_ON_NOT(ordered_idx, failure) - npy_intp* ordered_idx_arr = (npy_intp*)PyArray_DATA(ordered_idx); - - // 5. Find duplicates - sorted_arr = (PyArrayObject*)PyObject_GetItem((PyObject*)concatenated, (PyObject*)ordered_idx); - AK_GOTO_ON_NOT(sorted_arr, failure) - - comparison = AK_compare_two_slices_from_array(sorted_arr, 1, concatenated_size, 0, concatenated_size - 1, Py_EQ); - AK_GOTO_ON_NOT(comparison, failure) - npy_bool* comparison_arr = (npy_bool*)PyArray_DATA(comparison); - - if (!assume_unique) { - // 6: Construct empty array - PyObject* concatenated_mask = PyArray_Empty( - PyArray_NDIM(concatenated), // nd - PyArray_DIMS(concatenated), // dims - PyArray_DescrFromType(NPY_BOOL), // dtype - 0); // is_f_order - AK_GOTO_ON_NOT(concatenated_mask, failure) - - npy_intp dims[1] = {1}; - - PyArrayObject *single_false_array = (PyArrayObject*)PyArray_NewFromDescr( - &PyArray_Type, // class (subtype) - PyArray_DescrFromType(NPY_BOOL), // dtype (descr) - 1, // ndim (nd) - dims, // dims - NULL, // strides - "\0", // data - NPY_ARRAY_DEFAULT | NPY_ARRAY_OWNDATA, // flags - NULL); // sublclass (obj) - if (!single_false_array) { - Py_DECREF(concatenated_mask); - goto failure; - } - - PyArrayObject* full_comparison = AK_concat_arrays(comparison, single_false_array); - Py_DECREF(single_false_array); - if (!full_comparison) { - Py_DECREF(concatenated_mask); - goto failure; - } - - int success = PyObject_SetItem(concatenated_mask, (PyObject*)ordered_idx, (PyObject*)full_comparison); - Py_DECREF(full_comparison); - if (success == -1) { - Py_DECREF(concatenated_mask); - goto failure; - } - - PyObject* ret_1d = (PyArrayObject*)PyObject_GetItem(concatenated_mask, reverse_idx); // Should be a new reference? - Py_DECREF(reverse_idx); // We are officially done with this - Py_DECREF(concatenated_mask); - AK_GOTO_ON_NOT(ret_1d, failure) - - if (array_ndim == 2) { - PyObject* dim0 = PyLong_FromLong(array_dims[0]); - if (!dim0) { - Py_DECREF(ret_1d); - goto failure; - } - PyObject* dim1 = PyLong_FromLong(array_dims[1]); - if (!dim1) { - Py_DECREF(ret_1d); - Py_DECREF(dim0); - goto failure; - } - - PyObject* shape = PyTuple_Pack(2, dim0, dim1); - Py_DECREF(dim0); - Py_DECREF(dim1); - if (!shape) { - Py_DECREF(ret_1d); - goto failure; - } - - ret = (PyArrayObject*)PyArray_Reshape(ret_1d, shape); - Py_DECREF(ret_1d); - Py_DECREF(shape); - } - else { - ret = ret_1d; - } - } - else { - // 6: Construct empty array - ret = (PyArrayObject*)PyArray_Empty( - array_ndim, // nd - array_dims, // dims - PyArray_DescrFromType(NPY_BOOL), // dtype - 0); // is_f_order - - AK_GOTO_ON_NOT(ret, failure) - - size_t stride = 0; - if (array_ndim == 2) { - stride = (size_t)array_dims[1]; - } - - // 7: Assign into duplicates array - for (size_t i = 0; i < (size_t)PyArray_SIZE(ordered_idx); ++i) { - size_t idx_0 = (size_t)ordered_idx_arr[i]; - if (idx_0 >= array_size) { continue; } - - // We are guaranteed that flag_ar[i] is always a valid index - if (array_ndim == 1) { - *(npy_bool *) PyArray_GETPTR1(ret, idx_0) = comparison_arr[i]; - } - else { - size_t idx_1 = idx_0 / stride; - idx_0 = idx_0 - (stride * idx_1); - - *(npy_bool *) PyArray_GETPTR2(ret, idx_1, idx_0) = comparison_arr[i]; - } - } + PyObject* numpy = PyImport_ImportModule("numpy"); + if (!numpy) { + goto failure; } - // 8. Cleanup & Return! - Py_DECREF(flattened_array); - Py_DECREF(concatenated); - Py_DECREF(ordered_idx); - Py_DECREF(sorted_arr); - Py_DECREF(comparison); - - return (PyObject*)ret; - -failure: - Py_XDECREF(flattened_array); - Py_XDECREF(reverse_idx); - Py_XDECREF(concatenated); - Py_XDECREF(ordered_idx); - Py_XDECREF(sorted_arr); - Py_XDECREF(comparison); - return NULL; -} - -/* -static PyObject * -AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assume_unique) -{ - PyObject* numpy = NULL; - PyObject* func = NULL; - PyObject* args = NULL; - PyObject* kwarg = NULL; - - numpy = PyImport_ImportModule("numpy"); - AK_GOTO_ON_NOT(numpy, failure) - - if (PyArray_NDIM(array) == 1) { - func = PyObject_GetAttrString(numpy, "in1d"); - } - else { - func = PyObject_GetAttrString(numpy, "isin"); + PyObject* func = PyObject_GetAttrString(numpy, PyArray_NDIM(array) == 1 ? "in1d": "isin"); + Py_DECREF(numpy); + if (!func) { + goto failure; } - AK_GOTO_ON_NOT(func, failure) - args = PyTuple_Pack(2, (PyObject*)array, (PyObject*)other); - AK_GOTO_ON_NOT(args, failure) - - kwarg = PyDict_New(); - AK_GOTO_ON_NOT(kwarg, failure); - if (PyDict_SetItemString(kwarg, "assume_unique", PyLong_FromLong((long)assume_unique)) == -1) { + result = PyObject_Call(func, args, kwarg); + Py_DECREF(func); + if (!result) { goto failure; } - PyObject* result = PyObject_Call(func, args, kwarg); - AK_GOTO_ON_NOT(result, failure) - - Py_DECREF(numpy); - Py_DECREF(func); - Py_DECREF(args); - Py_DECREF(kwarg); + if (0) { + failure: + // These will always exist. + Py_DECREF(args); + Py_DECREF(kwarg); + } return result; - -failure: - Py_XDECREF(numpy); - Py_XDECREF(func); - Py_XDECREF(args); - Py_XDECREF(kwarg); - return NULL; } -*/ static PyObject * AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) @@ -908,7 +336,7 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) npy_intp* array_dims = PyArray_DIMS(array); compare_elements = PyFrozenSet_New((PyObject*)other); - AK_CHECK_NOT(compare_elements) + AK_CHECK_NOT(compare_elements); // 2: Construct empty array result = (PyArrayObject*)PyArray_Empty( @@ -916,7 +344,9 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) array_dims, // dims PyArray_DescrFromType(NPY_BOOL), // dtype 0); // is_f_order - AK_GOTO_ON_NOT(result, failure) + if (!result) { + goto failure; + } // 3. Set up iteration // https://numpy.org/doc/stable/reference/c-api/iterator.html?highlight=npyiter_multinew#simple-iteration-example @@ -925,10 +355,14 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) NPY_KEEPORDER, NPY_NO_CASTING, NULL); - AK_GOTO_ON_NOT(iter, failure) + if (!iter) { + goto failure; + } NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); - AK_GOTO_ON_NOT(iternext, failure) + if (!iternext) { + goto failure; + } char** dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); @@ -945,7 +379,9 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) while (size--) { PyObject* obj; memcpy(&obj, data, sizeof(obj)); - AK_GOTO_ON_NOT(obj, failure) + if (!obj) { + goto failure; + } Py_INCREF(obj); // 5. Assign into result whether or not the element exists in the set @@ -989,7 +425,6 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) static PyObject * isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) { - AK_PPRINT(kwargs) int array_is_unique, other_is_unique; PyArrayObject *array, *other; @@ -998,7 +433,7 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) AK_CHECK_NOT(PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", kwlist, &PyArray_Type, &array, &array_is_unique, - &PyArray_Type, &other, &other_is_unique)) + &PyArray_Type, &other, &other_is_unique)); if (PyArray_NDIM(other) != 1) { return PyErr_Format(PyExc_TypeError, "Expected other to be 1-dimensional"); @@ -1012,8 +447,7 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) return AK_isin_array_object(array, other); } // Use numpy in1d logic for dtype arrays - return AK_isin_array_dtype(array, other, array_is_unique && other_is_unique); - //return AK_isin_array_dtype_use_np(array, other, array_is_unique && other_is_unique); + return AK_isin_array_dtype_use_np(array, other, array_is_unique && other_is_unique); } //------------------------------------------------------------------------------ @@ -1081,10 +515,10 @@ ArrayGO_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs) int parsed = PyArg_ParseTupleAndKeywords( args, kwargs, "O|$p:ArrayGO", argnames, &iterable, &own_iterable ); - AK_CHECK_NOT(parsed) + AK_CHECK_NOT(parsed); ArrayGOObject *self = (ArrayGOObject *)cls->tp_alloc(cls, 0); - AK_CHECK_NOT(self) + AK_CHECK_NOT(self); if (PyArray_Check(iterable)) { if (!PyDataType_ISOBJECT(PyArray_DESCR((PyArrayObject *)iterable))) { @@ -1127,7 +561,7 @@ ArrayGO_append(ArrayGOObject *self, PyObject *value) { if (!self->list) { self->list = PyList_New(1); - AK_CHECK_NOT(self->list) + AK_CHECK_NOT(self->list); Py_INCREF(value); PyList_SET_ITEM(self->list, 0, value); @@ -1144,7 +578,7 @@ ArrayGO_extend(ArrayGOObject *self, PyObject *values) { if (!self->list) { self->list = PySequence_List(values); - AK_CHECK_NOT(self->list) + AK_CHECK_NOT(self->list); Py_RETURN_NONE; } From b161f019e8aa32421698ac2bc864a78c3bd83653 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 26 Mar 2021 14:45:43 -0700 Subject: [PATCH 27/29] Update arraykit.c Co-authored-by: Brandt Bucher --- arraykit.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/arraykit.c b/arraykit.c index 43b1634e..03033cab 100644 --- a/arraykit.c +++ b/arraykit.c @@ -303,16 +303,10 @@ AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assum result = PyObject_Call(func, args, kwarg); Py_DECREF(func); - if (!result) { - goto failure; - } - - if (0) { - failure: - // These will always exist. - Py_DECREF(args); - Py_DECREF(kwarg); - } +failure: + // These will always exist. + Py_DECREF(args); + Py_DECREF(kwarg); return result; } From da43f861dc43104c6688c873a877cce966b789ef Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Mon, 29 Mar 2021 00:13:13 -0700 Subject: [PATCH 28/29] Modifies object reference code to be better. --- arraykit.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arraykit.c b/arraykit.c index 5e118119..02136058 100644 --- a/arraykit.c +++ b/arraykit.c @@ -383,18 +383,16 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) npy_intp size = *sizeptr; npy_intp stride = *strideptr; + PyObject* obj_ref = NULL; + while (size--) { - PyObject* obj; - memcpy(&obj, data, sizeof(obj)); - if (!obj) { - goto failure; - } - Py_INCREF(obj); + // Object arrays contains pointers to PyObjects, so we will only temporarily + // look at the reference here. + memcpy(&obj_ref, data, sizeof(obj_ref)); // 5. Assign into result whether or not the element exists in the set // int found = PySequence_Contains(compare_elements, ((PyObject**)data)[0]); - int found = PySequence_Contains(compare_elements, obj); - Py_DECREF(obj); + int found = PySequence_Contains(compare_elements, obj_ref); if (found == -1) { goto failure; From a1c7ec962dd6fb4ff13e6c7882fdcaf49d0ef821 Mon Sep 17 00:00:00 2001 From: Charles Burkland Date: Fri, 9 Apr 2021 11:22:16 -0700 Subject: [PATCH 29/29] Improves object array iteration. Renames a macro. Ignores *.diff files. --- .gitignore | 1 + arraykit.c | 150 +++++++++++++++++++++++++++++------------------------ debug.py | 14 ++++- 3 files changed, 97 insertions(+), 68 deletions(-) diff --git a/.gitignore b/.gitignore index 300033e8..e55b9691 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ .vscode __pycache__ build +*.diff diff --git a/arraykit.c b/arraykit.c index 02136058..ef3aed5b 100644 --- a/arraykit.c +++ b/arraykit.c @@ -44,7 +44,7 @@ } while (0) // To simplify lines merely checking for `!value` -# define AK_CHECK_NOT(obj) \ +# define AK_RETURN_NULL_IF_NOT(obj) \ if (!obj) { \ return NULL; \ } @@ -125,7 +125,7 @@ PyArray_Descr* AK_ResolveDTypeIter(PyObject *dtypes) { PyObject *iterator = PyObject_GetIter(dtypes); - AK_CHECK_NOT(iterator); + AK_RETURN_NULL_IF_NOT(iterator); PyArray_Descr *resolved = NULL; PyArray_Descr *dtype; @@ -197,9 +197,9 @@ shape_filter(PyObject *Py_UNUSED(m), PyObject *a) AK_CHECK_NUMPY_ARRAY_1D_2D(a); PyArrayObject *array = (PyArrayObject *)a; - int size0 = PyArray_DIM(array, 0); + npy_intp size0 = PyArray_DIM(array, 0); // If 1D array, set size for axis 1 at 1, else use 2D array to get the size of axis 1 - int size1 = PyArray_NDIM(array) == 1 ? 1 : PyArray_DIM(array, 1); + npy_intp size1 = PyArray_NDIM(array) == 1 ? 1 : PyArray_DIM(array, 1); return Py_BuildValue("ii", size0, size1); } @@ -263,7 +263,7 @@ static PyObject * resolve_dtype(PyObject *Py_UNUSED(m), PyObject *args) { PyArray_Descr *d1, *d2; - AK_CHECK_NOT(PyArg_ParseTuple(args, "O!O!:resolve_dtype", + AK_RETURN_NULL_IF_NOT(PyArg_ParseTuple(args, "O!O!:resolve_dtype", &PyArrayDescr_Type, &d1, &PyArrayDescr_Type, &d2)); return (PyObject *)AK_ResolveDTypes(d1, d2); } @@ -274,7 +274,6 @@ resolve_dtype_iter(PyObject *Py_UNUSED(m), PyObject *arg) return (PyObject *)AK_ResolveDTypeIter(arg); } - //------------------------------------------------------------------------------ // isin @@ -284,7 +283,7 @@ AK_isin_array_dtype_use_np(PyArrayObject *array, PyArrayObject *other, int assum PyObject* result = NULL; PyObject* args = PyTuple_Pack(2, (PyObject*)array, (PyObject*)other); - AK_CHECK_NOT(args); + AK_RETURN_NULL_IF_NOT(args); PyObject* kwarg = PyDict_New(); if (!kwarg) { @@ -333,98 +332,115 @@ AK_isin_array_object(PyArrayObject *array, PyArrayObject *other) result[loc] = element in set(other) */ - // 0. Deallocate on failure - PyObject* compare_elements = NULL; - PyArrayObject* result = NULL; - NpyIter *iter = NULL; - - // 1. Capture original array shape for return value - int array_ndim = PyArray_NDIM(array); - npy_intp* array_dims = PyArray_DIMS(array); - - compare_elements = PyFrozenSet_New((PyObject*)other); - AK_CHECK_NOT(compare_elements); - - // 2: Construct empty array - result = (PyArrayObject*)PyArray_Empty( - array_ndim, // nd - array_dims, // dims - PyArray_DescrFromType(NPY_BOOL), // dtype - 0); // is_f_order - if (!result) { - goto failure; - } + PyObject *compare_elements = PyFrozenSet_New((PyObject*)other); + AK_RETURN_NULL_IF_NOT(compare_elements); + + PyArrayObject *arrays[2]; + npy_uint32 arrays_flags[2]; + PyArray_Descr *op_dtypes[2]; + arrays[0] = array; + arrays[1] = NULL; + arrays_flags[0] = NPY_ITER_READONLY; + arrays_flags[1] = NPY_ITER_WRITEONLY | NPY_ITER_ALLOCATE; + op_dtypes[0] = PyArray_DescrFromType(NPY_OBJECT); + op_dtypes[1] = PyArray_DescrFromType(NPY_BOOL); + + // No inner iteration - inner loop is handled by CopyArray code + // Reference objects are OK. + int iter_flags = NPY_ITER_EXTERNAL_LOOP | NPY_ITER_REFS_OK; + + // Construct the iterator + NpyIter *iter = NpyIter_MultiNew( + 2, // number of arrays + arrays, + iter_flags, + NPY_KEEPORDER, // Maintain existing order for `array` + NPY_NO_CASTING, // No casting will be required + arrays_flags, + op_dtypes); - // 3. Set up iteration - // https://numpy.org/doc/stable/reference/c-api/iterator.html?highlight=npyiter_multinew#simple-iteration-example - iter = NpyIter_New(array, - NPY_ITER_READONLY | NPY_ITER_REFS_OK | NPY_ITER_EXTERNAL_LOOP, - NPY_KEEPORDER, - NPY_NO_CASTING, - NULL); if (!iter) { - goto failure; + Py_DECREF(compare_elements); + return NULL; } NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL); if (!iternext) { - goto failure; + Py_DECREF(compare_elements); + NpyIter_Deallocate(iter); + return NULL; } char** dataptr = NpyIter_GetDataPtrArray(iter); npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter); npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter); - // 4. Iterate! - int i = 0; + // If we don't need the GIL, iteration can be multi-threaded! + NPY_BEGIN_THREADS_DEF; + if (!NpyIter_IterationNeedsAPI(iter)) { + // This will likely never happen, since I am pretty sure that object + // dtypes need the API. However, I don't know enough about the internals + // of numpy iteration to know that this will *never happen.... + NPY_BEGIN_THREADS; + } + do { - int j = 0; - char* data = *dataptr; - npy_intp size = *sizeptr; - npy_intp stride = *strideptr; + char* src_data = dataptr[0]; + char* dst_data = dataptr[1]; + npy_intp size = sizeptr[0]; + npy_intp src_stride = strideptr[0]; + npy_intp dst_stride = strideptr[1]; PyObject* obj_ref = NULL; while (size--) { // Object arrays contains pointers to PyObjects, so we will only temporarily // look at the reference here. - memcpy(&obj_ref, data, sizeof(obj_ref)); + memcpy(&obj_ref, src_data, sizeof(obj_ref)); // 5. Assign into result whether or not the element exists in the set // int found = PySequence_Contains(compare_elements, ((PyObject**)data)[0]); - int found = PySequence_Contains(compare_elements, obj_ref); + npy_bool found = (npy_bool)PySequence_Contains(compare_elements, obj_ref); if (found == -1) { - goto failure; + NpyIter_Deallocate(iter); + Py_DECREF(compare_elements); + return NULL; } - if (array_ndim == 1){ - *(npy_bool *) PyArray_GETPTR1(result, j) = (npy_bool)found; - } - else { - *(npy_bool *) PyArray_GETPTR2(result, i, j) = (npy_bool)found; - } + *dst_data = found; - data += stride; - ++j; + src_data += src_stride; + dst_data += dst_stride; } - ++i; // Increment the iterator to the next inner loop } while(iternext(iter)); + NPY_END_THREADS; + Py_DECREF(compare_elements); - NpyIter_Deallocate(iter); - return (PyObject*)result; + // If the API was needed, it may have thrown an error + if (NpyIter_IterationNeedsAPI(iter) && PyErr_Occurred()) { + NpyIter_Deallocate(iter); + return NULL; + } -failure: - Py_DECREF(compare_elements); - Py_XDECREF(result); - if (iter != NULL) { + // Get the result from the iterator object array + PyObject *ret = (PyObject*)NpyIter_GetOperandArray(iter)[1]; + if (!ret) { NpyIter_Deallocate(iter); + return NULL; } - return NULL; + Py_INCREF(ret); + + if (NpyIter_Deallocate(iter) != NPY_SUCCEED) { + Py_DECREF(ret); + return NULL; + } + + return ret; } static PyObject * @@ -435,7 +451,7 @@ isin_array(PyObject *Py_UNUSED(m), PyObject *args, PyObject *kwargs) static char *kwlist[] = {"array", "array_is_unique", "other", "other_is_unique", NULL}; - AK_CHECK_NOT(PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", + AK_RETURN_NULL_IF_NOT(PyArg_ParseTupleAndKeywords(args, kwargs, "O!iO!i:isin_array", kwlist, &PyArray_Type, &array, &array_is_unique, &PyArray_Type, &other, &other_is_unique)); @@ -520,10 +536,10 @@ ArrayGO_new(PyTypeObject *cls, PyObject *args, PyObject *kwargs) int parsed = PyArg_ParseTupleAndKeywords( args, kwargs, "O|$p:ArrayGO", argnames, &iterable, &own_iterable ); - AK_CHECK_NOT(parsed); + AK_RETURN_NULL_IF_NOT(parsed); ArrayGOObject *self = (ArrayGOObject *)cls->tp_alloc(cls, 0); - AK_CHECK_NOT(self); + AK_RETURN_NULL_IF_NOT(self); if (PyArray_Check(iterable)) { if (!PyDataType_ISOBJECT(PyArray_DESCR((PyArrayObject *)iterable))) { @@ -566,7 +582,7 @@ ArrayGO_append(ArrayGOObject *self, PyObject *value) { if (!self->list) { self->list = PyList_New(1); - AK_CHECK_NOT(self->list); + AK_RETURN_NULL_IF_NOT(self->list); Py_INCREF(value); PyList_SET_ITEM(self->list, 0, value); @@ -583,7 +599,7 @@ ArrayGO_extend(ArrayGOObject *self, PyObject *values) { if (!self->list) { self->list = PySequence_List(values); - AK_CHECK_NOT(self->list); + AK_RETURN_NULL_IF_NOT(self->list); Py_RETURN_NONE; } diff --git a/debug.py b/debug.py index 8de98390..4c30504d 100755 --- a/debug.py +++ b/debug.py @@ -9,6 +9,19 @@ funcFT = partial(isin_array, array_is_unique=False, other_is_unique=True) funcFF = partial(isin_array, array_is_unique=False, other_is_unique=False) +class Obj: + def __init__(self, value): + self.v = value + def __hash__(self): + return hash(self.v) + def __eq__(self, other): + return self.v == other.v + +arr1 = np.array([[Obj(1), Obj(2), Obj(3)], [Obj(4), Obj(5), Obj(9)]], dtype=object) +arr2 = np.array([Obj(1), Obj(4), Obj(7), Obj(9)], dtype=object) +post = funcTT(array=arr1, other=arr2) +print(post) + arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.int_) arr2 = np.array([1, 4, 7, 9], dtype=np.int_) post = funcFF(array=arr1, other=arr2) @@ -24,7 +37,6 @@ arr1 = np.array([1, 5, 2, 3, 4, 5, 1], dtype=np.complex_) arr2 = np.array([1, 4, 7, 9], dtype=np.complex_) post = funcFF(array=arr1, other=arr2) -exit(0) def test_arrays(arr1, arr2, expected, func):