Closes #3297: Adds array_equal (#3725)

* Further work
* Adds array_equal, fixes test_triu
* almost there
* Addresses docstring errors.
* Fixes flake8 warnings
* Still fixing docstring bits

---------

Co-authored-by: drculhane <drculhane@users.noreply.github.com>
Co-authored-by: ajpotts <amanda.j.potts@gmail.com>
Bears-R-Us · Sep 4, 2024 · 6de7e3d · 6de7e3d
1 parent 2205c45
commit 6de7e3d
Show file tree

Hide file tree

Showing 2 changed files with 257 additions and 1 deletion.
diff --git a/arkouda/numeric.py b/arkouda/numeric.py
@@ -17,8 +17,9 @@
     isSupportedNumber,
     numeric_scalars,
     resolve_scalar_dtype,
-    str_,
 )
+from arkouda.numpy.dtypes import str_
+from arkouda.numpy.dtypes import str_ as akstr_
 from arkouda.pdarrayclass import all as ak_all
 from arkouda.pdarrayclass import any as ak_any
 from arkouda.pdarrayclass import argmax, create_pdarray, pdarray, sum
@@ -73,6 +74,7 @@
     "rad2deg",
     "deg2rad",
     "hash",
+    "array_equal",
     "putmask",
     "where",
     "histogram",
@@ -2202,6 +2204,56 @@ def count_nonzero(pda):
         return sum((pda != "").astype(np.int64))
 
 
+def array_equal(pda_a: pdarray, pda_b: pdarray, equal_nan: bool = False) -> bool:
+    """
+    Compares two pdarrays for equality.
+
+    Arrays of different shapes are never equal, and a string array is never equal
+    to a non-string array.
+    If neither array has any nan elements, then if all elements are pairwise equal,
+    it returns True.
+    If equal_nan is False, then any nan element in either array gives a False return.
+    If equal_nan is True, then pairwise-corresponding nans are considered equal.
+
+    Parameters
+    ----------
+    pda_a : pdarray
+        First array to compare.
+    pda_b : pdarray
+        Second array to compare.
+    equal_nan : bool, default False
+        Determines how nans are handled: when True, nans at the same positions
+        in both arrays are treated as equal.
+
+    Returns
+    -------
+    bool
+      With mismatched shapes:
+         Always False.
+
+      With string data:
+         False if one array is type ak.str_ & the other isn't, True if both are ak.str_ & they match.
+
+      With numeric data:
+         True if neither array has any nan elements, and all elements pairwise equal.
+
+         True if equal_nan True, all non-nans pairwise equal & nans in pda_a correspond to nans in pda_b
+
+         False if equal_nan False, & either array has any nan element.
+
+    Examples
+    --------
+    >>> a = ak.randint(0,10,10,dtype=ak.float64)
+    >>> b = ak.array(a.to_ndarray())  # independent copy; "b = a" would only alias a
+    >>> ak.array_equal(a,b)
+    True
+    >>> b[9] = np.nan
+    >>> ak.array_equal(a,b)
+    False
+    >>> a[9] = np.nan
+    >>> ak.array_equal(a,b)
+    False
+    >>> ak.array_equal(a,b,True)
+    True
+    """
+    # Arrays of different shapes cannot be equal.
+    if pda_a.shape != pda_b.shape:
+        return False
+    # Exactly one side being ak.str_ means a string is compared to a non-string.
+    if (pda_a.dtype == akstr_) ^ (pda_b.dtype == akstr_):
+        return False
+    if equal_nan:
+        # Where pda_a is nan, require pda_b to be nan too; elsewhere compare values.
+        # NOTE(review): isnan is also applied when the inputs are not floating point
+        # (e.g. two string arrays) -- confirm isnan accepts those inputs.
+        matches = where(isnan(pda_a), isnan(pda_b), pda_a == pda_b)
+    else:
+        matches = pda_a == pda_b
+    # bool() makes every path return a plain Python bool, like the early exits above.
+    return bool(ak_all(matches))
+
+
 def putmask(pda: pdarray, mask: Union[bool, pdarray], values: pdarray):
     """
     Overwrite elements of a pdarray at indices where mask is True

diff --git a/tests/numeric_test.py b/tests/numeric_test.py
@@ -989,3 +989,207 @@ def test_vecdot(self, data_type1, data_type2, prob_size):
 
         npProduct = np.add.reduce(nda_a * nda_b)
         assert check(npProduct, akProduct.to_ndarray(), akProduct.dtype)
+
+    # Notes about array_equal:
+    #   Strings compared to non-strings are always not equal.
+    #   nan handling is (of course) unique to floating point
+    #   we deliberately test on matched and mismatched arrays
+
+    @pytest.mark.parametrize("prob_size", pytest.prob_size)
+    @pytest.mark.parametrize("data_type", ARRAY_TYPES)
+    @pytest.mark.parametrize("same_size", YES_NO)
+    @pytest.mark.parametrize("matching", YES_NO)
+    @pytest.mark.parametrize("nan_handling", YES_NO)
+    def test_array_equal(self, prob_size, data_type, same_size, matching, nan_handling):
+        """
+        Exercise ak.array_equal on string, float and other array types.
+
+        same_size and matching select whether the second array has the same length
+        and the same contents as the first; nan_handling both plants a nan in the
+        float data and is passed as equal_nan.  The string branch ignores those
+        three flags and runs a fixed set of comparisons.
+        """
+        seed = pytest.seed if pytest.seed is not None else 8675309
+        # Seed before branching so the numpy-generated float data is reproducible too,
+        # not only the string data.
+        np.random.seed(seed)
+        if data_type is ak.str_:  # strings require special handling
+            temp = np.random.choice(VOWELS_AND_SUCH, prob_size)
+            pda_a = ak.array(temp)
+            pda_b = ak.array(temp)
+            assert ak.array_equal(pda_a, pda_b)  # matching string arrays
+            pda_c = pda_b[:-1]
+            assert not ak.array_equal(pda_a, pda_c)  # matching except c is shorter by 1
+            temp = np.random.choice(VOWELS_AND_SUCH, prob_size)
+            pda_b = ak.array(temp)
+            assert not ak.array_equal(pda_a, pda_b)  # mismatching string arrays
+            pda_b = ak.randint(0, 100, prob_size, dtype=ak.int64)
+            assert not ak.array_equal(pda_a, pda_b)  # string to int comparison
+            pda_b = ak.randint(0, 2, prob_size, dtype=ak.bool_)
+            assert not ak.array_equal(pda_a, pda_b)  # string to bool comparison
+        elif data_type is ak.float64:  # so do floats, because of nan
+            nda_a = np.random.uniform(0, 100, prob_size)
+            if nan_handling:
+                nda_a[-1] = np.nan
+            nda_b = nda_a.copy() if matching else np.random.uniform(0, 100, prob_size)
+            pda_a = ak.array(nda_a)
+            pda_b = ak.array(nda_b) if same_size else ak.array(nda_b[:-1])
+            assert ak.array_equal(pda_a, pda_b, nan_handling) == (matching and same_size)
+            if nan_handling:
+                # pda_a holds a nan, so without equal_nan the arrays can never be equal
+                assert not ak.array_equal(pda_a, pda_b)
+        else:  # other types have simpler tests
+            pda_a = ak.random.randint(0, 100, prob_size, dtype=data_type)
+            if matching:  # known to match, unless the sizes differ
+                pda_b = pda_a if same_size else pda_a[:-1]
+                assert ak.array_equal(pda_a, pda_b) == same_size
+            else:  # independently generated data, with or without a size mismatch
+                size_b = prob_size if same_size else prob_size - 1
+                pda_b = ak.random.randint(0, 100, size_b, dtype=data_type)
+                assert not ak.array_equal(pda_a, pda_b)
+
+    # Notes about median:
+    #  prob_size and prob_size + 1 have opposite parity, so looping over both lengths
+    #  gives one even-length and one odd-length input.  Median should be tested with
+    #  both even and odd length inputs.
+
+    #  median can be done on ints or floats
+
+    @pytest.mark.parametrize("prob_size", pytest.prob_size)
+    @pytest.mark.parametrize("data_type", NUMERIC_TYPES)
+    def test_median(self, prob_size, data_type):
+        """Check ak.median against np.median for inputs of two consecutive lengths."""
+        for length in (prob_size, prob_size + 1):
+            nda = np.random.permutation(length).astype(data_type)
+            pda = ak.array(nda)
+            assert isclose(np.median(nda), ak.median(pda))
+
+    #  test_count_nonzero doesn't use parameterization on data types, because
+    #  the data is generated differently.
+
+    #  counts are ints, so we test for equality, not closeness.
+
+    @pytest.mark.parametrize("prob_size", pytest.prob_size)
+    def test_count_nonzero(self, prob_size):
+        """Compare ak.count_nonzero with np.count_nonzero on numeric, bool and string data."""
+        # ints, floats
+        for data_type in INT_FLOAT:
+            numbers = np.random.randint(20, size=prob_size).astype(data_type)
+            ak_numbers = ak.array(numbers)
+            assert np.count_nonzero(numbers) == ak.count_nonzero(ak_numbers)
+
+        # bool
+        flags = np.random.randint(2, size=prob_size).astype(bool)
+        ak_flags = ak.array(flags)
+        assert np.count_nonzero(flags) == ak.count_nonzero(ak_flags)
+
+        # string: the bool data converted to text, with some entries blanked out
+        words = flags.astype(str)
+        for _ in range(10):
+            words[np.random.randint(prob_size)] = ""  # empty some strings at random
+        ak_words = ak.array(words)
+        assert np.count_nonzero(words) == ak.count_nonzero(ak_words)
+
+    @pytest.mark.parametrize("prob_size", pytest.prob_size)
+    def test_clip(self, prob_size):
+        """Check ak.clip against np.clip for every supported combination of bounds."""
+        seed = pytest.seed if pytest.seed is not None else 8675309
+        np.random.seed(seed)
+        base = np.random.randint(1, 100, prob_size)
+        low_value, high_value = 25, 75
+
+        type_names = ["int64", "float64"]
+
+        # The array to be clipped can be integer or float.
+        # Each range limit can be integer, float, or None, and a scalar or an array.
+        #
+        # For every combination of data types, the innermost loop uses (lo, hi) as:
+        #
+        #   None, Scalar
+        #   None, Array
+        #   Scalar, Scalar
+        #   Scalar, Array
+        #   Scalar, None
+        #   Array, Scalar
+        #   Array, Array
+        #   Array, None
+        #
+        # There is no case with lo and hi both None, because that's not allowed.
+
+        for hi_type in type_names:
+            hi = np.full(base.shape, high_value, dtype=hi_type)
+            akhi = ak.array(hi)
+            for lo_type in type_names:
+                lo = np.full(base.shape, low_value, dtype=lo_type)
+                aklo = ak.array(lo)
+                for data_type in type_names:
+                    nd_arry = base.astype(data_type)
+                    ak_arry = ak.array(nd_arry)
+                    # (numpy lo, numpy hi, arkouda lo, arkouda hi), in the order listed above
+                    bound_cases = [
+                        (None, hi[0], None, hi[0]),
+                        (None, hi, None, akhi),
+                        (lo[0], hi[0], lo[0], hi[0]),
+                        (lo[0], hi, lo[0], akhi),
+                        (lo[0], None, lo[0], None),
+                        (lo, hi[0], aklo, hi[0]),
+                        (lo, hi, aklo, akhi),
+                        (lo, None, aklo, None),
+                    ]
+                    for np_lo, np_hi, ak_lo, ak_hi in bound_cases:
+                        assert np.allclose(
+                            np.clip(nd_arry, np_lo, np_hi),
+                            ak.clip(ak_arry, ak_lo, ak_hi).to_ndarray(),
+                        )
+
+    @pytest.mark.parametrize("prob_size", pytest.prob_size)
+    def test_putmask(self, prob_size):
+        """Check ak.putmask against np.putmask, plus the TypeError for non-pdarray values."""
+
+        def agrees(expected, actual, data_type):
+            # int64 results are compared exactly, everything else with np.allclose
+            observed = actual.to_ndarray()
+            if data_type == ak.int64:
+                return np.all(expected == observed)
+            return np.allclose(expected, observed)
+
+        for data_type in INT_FLOAT:
+
+            #  first case: values the same size as the data
+
+            nda = np.random.randint(0, 10, prob_size).astype(data_type)
+            expected = nda.copy()
+            np.putmask(expected, expected > 5, expected**2)
+            pda = ak.array(nda)
+            ak.putmask(pda, pda > 5, pda**2)
+            assert agrees(expected, pda, data_type)
+
+            # then values shorter than the data, and values longer than the data
+
+            for values_length in (3, prob_size + 1):
+                expected = nda.copy()
+                pda = ak.array(nda)
+                values = np.arange(values_length).astype(data_type)
+                np.putmask(expected, expected > 5, values)
+                ak.putmask(pda, pda > 5, ak.array(values))
+                assert agrees(expected, pda, data_type)
+
+            # finally try to raise the error: values is a numpy array, not a pdarray
+
+            pda = ak.random.randint(0, 10, 10).astype(ak.float64)
+            values = np.arange(10)
+            with pytest.raises(TypeError):
+                ak.putmask(pda, pda > 3, values)