def array_equal(pda_a: pdarray, pda_b: pdarray, equal_nan: bool = False):
    """
    Compare two arrays for equality.

    The arrays are equal when they have the same shape and every pair of
    corresponding elements compares equal.  If equal_nan is False, any nan
    element in either array gives a False return.  If equal_nan is True,
    nans found at the same positions in both arrays are considered equal.

    Parameters
    ----------
    pda_a : pdarray
        First array to compare.
    pda_b : pdarray
        Second array to compare.
    equal_nan : bool, default False
        Whether nans at matching positions count as equal.  Only has an
        effect when both arrays are float64; it is ignored for every other
        dtype, because those cannot hold nan.

    Returns
    -------
    boolean
      With mismatched shapes:
         False.

      With string data:
         False if one array is type ak.str_ & the other isn't,
         True if both are ak.str_ & they match.

      With numeric data:
         True if neither array has any nan elements, and all elements are pairwise equal.

         True if equal_nan is True, all non-nans are pairwise equal & nans in pda_a
         correspond to nans in pda_b.

         False if equal_nan is False & either array has any nan element.

    Examples
    --------
    >>> a = ak.array([1.0, 2.0, 3.0])
    >>> b = ak.array([1.0, 2.0, 3.0])
    >>> ak.array_equal(a, b)
    True
    >>> b[2] = np.nan
    >>> ak.array_equal(a, b)
    False
    >>> a[2] = np.nan
    >>> ak.array_equal(a, b)
    False
    >>> ak.array_equal(a, b, True)
    True
    """
    # Different shapes can never match, and neither can a string array
    # paired with a non-string array (the xor is True when exactly one side
    # is a string array).
    if (pda_a.shape != pda_b.shape) or ((pda_a.dtype == akstr_) ^ (pda_b.dtype == akstr_)):
        return False

    # nan only exists in floating-point data.  If either side is not float64
    # a nan can never pair with another nan, so plain equality already gives
    # the right answer and isnan is never applied to non-float input.
    # NOTE(review): isnan may reject non-float arrays -- confirm.
    both_float = (pda_a.dtype == np.float64) and (pda_b.dtype == np.float64)
    if equal_nan and both_float:
        # Where pda_a is nan, require pda_b to be nan too; elsewhere compare values.
        return ak_all(where(isnan(pda_a), isnan(pda_b), pda_a == pda_b))

    return ak_all(pda_a == pda_b)
+ # nan handling is (of course) unique to floating point + # we deliberately test on matched and mismatched arrays + + @pytest.mark.parametrize("prob_size", pytest.prob_size) + @pytest.mark.parametrize("data_type", ARRAY_TYPES) + @pytest.mark.parametrize("same_size", YES_NO) + @pytest.mark.parametrize("matching", YES_NO) + @pytest.mark.parametrize("nan_handling", YES_NO) + def test_array_equal(self, prob_size, data_type, same_size, matching, nan_handling): + seed = pytest.seed if pytest.seed is not None else 8675309 + if data_type is ak.str_: # strings require special handling + np.random.seed(seed) + temp = np.random.choice(VOWELS_AND_SUCH, prob_size) + pda_a = ak.array(temp) + pda_b = ak.array(temp) + assert ak.array_equal(pda_a, pda_b) # matching string arrays + pda_c = pda_b[:-1] + assert not (ak.array_equal(pda_a,pda_c)) # matching except c is shorter by 1 + temp = np.random.choice(VOWELS_AND_SUCH, prob_size) + pda_b = ak.array(temp) + assert not (ak.array_equal(pda_a, pda_b)) # mismatching string arrays + pda_b = ak.randint(0, 100, prob_size, dtype=ak.int64) + assert not (ak.array_equal(pda_a, pda_b)) # string to int comparison + pda_b = ak.randint(0, 2, prob_size, dtype=ak.bool_) + assert not (ak.array_equal(pda_a, pda_b)) # string to bool comparison + elif data_type is ak.float64: # so do floats, because of nan + nda_a = np.random.uniform(0, 100, prob_size) + if nan_handling: + nda_a[-1] = np.nan + nda_b = nda_a.copy() if matching else np.random.uniform(0, 100, prob_size) + pda_a = ak.array(nda_a) + pda_b = ak.array(nda_b) if same_size else ak.array(nda_b[:-1]) + assert ak.array_equal(pda_a, pda_b, nan_handling) == (matching and same_size) + else: # other types have simpler tests + pda_a = ak.random.randint(0, 100, prob_size, dtype=data_type) + if matching: # known to match? + pda_b = pda_a if same_size else pda_a[:-1] + assert (ak.array_equal(pda_a, pda_b) == (matching and same_size)) + elif same_size: # not matching, but same size? 
+ pda_b = ak.random.randint(0, 100, prob_size, dtype=data_type) + assert not (ak.array_equal(pda_a, pda_b)) + else: + pda_b = ak.random.randint(0, 100, (prob_size if same_size else prob_size-1), dtype=data_type) + assert not (ak.array_equal(pda_a, pda_b)) + + # Notes about median: + # prob_size is either even or odd, so one of sample_e, sample_o will have an even + # length, and the other an odd length. Median should be tested with both even and odd + # length inputs. + + # median can be done on ints or floats + + @pytest.mark.parametrize("prob_size", pytest.prob_size) + @pytest.mark.parametrize("data_type", NUMERIC_TYPES) + def test_median(self, prob_size, data_type): + sample_e = np.random.permutation(prob_size).astype(data_type) + pda_e = ak.array(sample_e) + assert isclose(np.median(sample_e), ak.median(pda_e)) + sample_o = np.random.permutation(prob_size + 1).astype(data_type) + pda_o = ak.array(sample_o) + assert isclose(np.median(sample_o), ak.median(pda_o)) + + # test_count_nonzero doesn't use parameterization on data types, because + # the data is generated differently. + + # counts are ints, so we test for equality, not closeness. 
+ + @pytest.mark.parametrize("prob_size", pytest.prob_size) + def test_count_nonzero(self, prob_size): + # ints, floats + + for data_type in INT_FLOAT: + sample = np.random.randint(20, size=prob_size).astype(data_type) + pda = ak.array(sample) + assert np.count_nonzero(sample) == ak.count_nonzero(pda) + + # bool + + sample = np.random.randint(2, size=prob_size).astype(bool) + pda = ak.array(sample) + assert np.count_nonzero(sample) == ak.count_nonzero(pda) + + # string + + sample = sample.astype(str) + for i in range(10): + sample[np.random.randint(prob_size)] = "" # empty some strings at random + pda = ak.array(sample) + assert np.count_nonzero(sample) == ak.count_nonzero(pda) + + @pytest.mark.parametrize("prob_size", pytest.prob_size) + def test_clip(self, prob_size): + seed = pytest.seed if pytest.seed is not None else 8675309 + np.random.seed(seed) + ia = np.random.randint(1, 100, prob_size) + ilo = 25 + ihi = 75 + + dtypes = ["int64", "float64"] + + # test clip. + # array to be clipped can be integer or float + # range limits can be integer, float, or none, and can be scalars or arrays + + # Looping over all data types, the interior loop tests using lo, hi as: + + # None, Scalar + # None, Array + # Scalar, Scalar + # Scalar, Array + # Scalar, None + # Array, Scalar + # Array, Array + # Array, None + + # There is no test with lo and hi both equal to None, because that's not allowed + + for dtype1 in dtypes: + hi = np.full(ia.shape, ihi, dtype=dtype1) + akhi = ak.array(hi) + for dtype2 in dtypes: + lo = np.full(ia.shape, ilo, dtype=dtype2) + aklo = ak.array(lo) + for dtype3 in dtypes: + nd_arry = ia.astype(dtype3) + ak_arry = ak.array(nd_arry) + assert np.allclose( + np.clip(nd_arry, None, hi[0]), ak.clip(ak_arry, None, hi[0]).to_ndarray() + ) + assert np.allclose( + np.clip(nd_arry, None, hi), ak.clip(ak_arry, None, akhi).to_ndarray() + ) + assert np.allclose( + np.clip(nd_arry, lo[0], hi[0]), ak.clip(ak_arry, lo[0], hi[0]).to_ndarray() + ) + assert 
np.allclose( + np.clip(nd_arry, lo[0], hi), ak.clip(ak_arry, lo[0], akhi).to_ndarray() + ) + assert np.allclose( + np.clip(nd_arry, lo[0], None), ak.clip(ak_arry, lo[0], None).to_ndarray() + ) + assert np.allclose( + np.clip(nd_arry, lo, hi[0]), ak.clip(ak_arry, aklo, hi[0]).to_ndarray() + ) + assert np.allclose( + np.clip(nd_arry, lo, hi), ak.clip(ak_arry, aklo, akhi).to_ndarray() + ) + assert np.allclose( + np.clip(nd_arry, lo, None), ak.clip(ak_arry, aklo, None).to_ndarray() + ) + + @pytest.mark.parametrize("prob_size", pytest.prob_size) + def test_putmask(self, prob_size): + + for data_type in INT_FLOAT: + + # three things to test: values same size as data + + nda = np.random.randint(0, 10, prob_size).astype(data_type) + result = nda.copy() + np.putmask(result, result > 5, result**2) + pda = ak.array(nda) + ak.putmask(pda, pda > 5, pda**2) + assert ( + np.all(result == pda.to_ndarray()) + if data_type == ak.int64 + else np.allclose(result, pda.to_ndarray()) + ) + + # values shorter than data + + result = nda.copy() + pda = ak.array(nda) + values = np.arange(3).astype(data_type) + np.putmask(result, result > 5, values) + ak.putmask(pda, pda > 5, ak.array(values)) + assert ( + np.all(result == pda.to_ndarray()) + if data_type == ak.int64 + else np.allclose(result, pda.to_ndarray()) + ) + + # values longer than data + + result = nda.copy() + pda = ak.array(nda) + values = np.arange(prob_size + 1).astype(data_type) + np.putmask(result, result > 5, values) + ak.putmask(pda, pda > 5, ak.array(values)) + assert ( + np.all(result == pda.to_ndarray()) + if data_type == ak.int64 + else np.allclose(result, pda.to_ndarray()) + ) + + # finally try to raise the error + + pda = ak.random.randint(0, 10, 10).astype(ak.float64) + values = np.arange(10) + with pytest.raises(TypeError): + ak.putmask(pda, pda > 3, values)