Skip to content

Commit

Permalink
Closes #3297: Adds array_equal (#3725)
Browse files Browse the repository at this point in the history
* Further work

* Adds array_equal, fixes test_triu

* almost there

* Addresses docstring errors.

* Fixes flake8 warnings

* Still fixing docstring bits

---------

Co-authored-by: drculhane <drculhane@users.noreply.github.com>
Co-authored-by: ajpotts <amanda.j.potts@gmail.com>
  • Loading branch information
3 people committed Sep 4, 2024
1 parent 2205c45 commit 6de7e3d
Show file tree
Hide file tree
Showing 2 changed files with 257 additions and 1 deletion.
54 changes: 53 additions & 1 deletion arkouda/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,9 @@
isSupportedNumber,
numeric_scalars,
resolve_scalar_dtype,
str_,
)
from arkouda.numpy.dtypes import str_
from arkouda.numpy.dtypes import str_ as akstr_
from arkouda.pdarrayclass import all as ak_all
from arkouda.pdarrayclass import any as ak_any
from arkouda.pdarrayclass import argmax, create_pdarray, pdarray, sum
Expand Down Expand Up @@ -73,6 +74,7 @@
"rad2deg",
"deg2rad",
"hash",
"array_equal",
"putmask",
"where",
"histogram",
Expand Down Expand Up @@ -2202,6 +2204,56 @@ def count_nonzero(pda):
return sum((pda != "").astype(np.int64))


def array_equal(pda_a: pdarray, pda_b: pdarray, equal_nan: bool = False) -> bool:
    """
    Compare two pdarrays for equality.

    Two arrays are equal when they have the same shape and all of their
    elements are pairwise equal.  NaN handling is controlled by ``equal_nan``:

    * If ``equal_nan`` is False, any NaN element in either array gives a False return.
    * If ``equal_nan`` is True, pairwise-corresponding NaNs are considered equal.

    Parameters
    ----------
    pda_a : pdarray
        First array to compare.
    pda_b : pdarray
        Second array to compare.
    equal_nan : bool, default False
        Whether NaNs found at the same position in both arrays compare as equal.

    Returns
    -------
    bool
        Always False if the shapes differ.

        With string data:
            False if one array is type ak.str_ and the other isn't,
            True if both are ak.str_ and they match.

        With numeric data:
            True if neither array has any NaN elements and all elements are pairwise equal.
            True if ``equal_nan`` is True, all non-NaNs are pairwise equal, and NaNs in
            ``pda_a`` correspond to NaNs in ``pda_b``.
            False if ``equal_nan`` is False and either array has any NaN element.

    Examples
    --------
    >>> a = ak.randint(0,10,10,dtype=ak.float64)
    >>> b = a[:]  # slice to get a separate array; plain ``b = a`` would only alias ``a``
    >>> ak.array_equal(a,b)
    True
    >>> b[9] = np.nan
    >>> ak.array_equal(a,b)
    False
    >>> a[9] = np.nan
    >>> ak.array_equal(a,b)
    False
    >>> ak.array_equal(a,b,True)
    True
    """
    # Different shapes, or exactly one of the two being a string array (XOR),
    # can never compare equal.
    if (pda_a.shape != pda_b.shape) or ((pda_a.dtype == akstr_) ^ (pda_b.dtype == akstr_)):
        return False

    # NaN only exists in floating point data, so the NaN-aware comparison is
    # needed only when both arrays are float64.  In every other case there is
    # no position where both sides are NaN, and the plain elementwise
    # comparison gives the same answer without calling isnan on non-float data.
    if equal_nan and pda_a.dtype == np.float64 and pda_b.dtype == np.float64:
        # Where pda_a is NaN the arrays agree only if pda_b is NaN as well;
        # everywhere else ordinary equality applies.
        return bool(ak_all(where(isnan(pda_a), isnan(pda_b), pda_a == pda_b)))

    return bool(ak_all(pda_a == pda_b))


def putmask(pda: pdarray, mask: Union[bool, pdarray], values: pdarray):
"""
Overwrite elements of a pdarray at indices where mask is True
Expand Down
204 changes: 204 additions & 0 deletions tests/numeric_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,3 +989,207 @@ def test_vecdot(self, data_type1, data_type2, prob_size):

npProduct = np.add.reduce(nda_a * nda_b)
assert check(npProduct, akProduct.to_ndarray(), akProduct.dtype)

# Notes about array_equal:
# Strings compared to non-strings are always not equal.
# nan handling is (of course) unique to floating point
# we deliberately test on matched and mismatched arrays

@pytest.mark.parametrize("prob_size", pytest.prob_size)
@pytest.mark.parametrize("data_type", ARRAY_TYPES)
@pytest.mark.parametrize("same_size", YES_NO)
@pytest.mark.parametrize("matching", YES_NO)
@pytest.mark.parametrize("nan_handling", YES_NO)
def test_array_equal(self, prob_size, data_type, same_size, matching, nan_handling):
    """
    Exercise ak.array_equal on string, float and other array types.

    Strings are compared against matching, shorter, mismatching, int and bool
    arrays.  Floats are tested with and without a trailing NaN, passing
    ``nan_handling`` as ``equal_nan``.  All other types are tested on matched
    and mismatched arrays of the same and of different sizes.
    """
    # Seed numpy once, up front, so that every branch drawing from np.random
    # (strings and floats) is reproducible, not just the string branch.
    seed = pytest.seed if pytest.seed is not None else 8675309
    np.random.seed(seed)

    if data_type is ak.str_:  # strings require special handling
        temp = np.random.choice(VOWELS_AND_SUCH, prob_size)
        pda_a = ak.array(temp)
        pda_b = ak.array(temp)
        assert ak.array_equal(pda_a, pda_b)  # matching string arrays
        pda_c = pda_b[:-1]
        assert not ak.array_equal(pda_a, pda_c)  # matching except c is shorter by 1
        temp = np.random.choice(VOWELS_AND_SUCH, prob_size)
        pda_b = ak.array(temp)
        assert not ak.array_equal(pda_a, pda_b)  # mismatching string arrays
        pda_b = ak.randint(0, 100, prob_size, dtype=ak.int64)
        assert not ak.array_equal(pda_a, pda_b)  # string to int comparison
        pda_b = ak.randint(0, 2, prob_size, dtype=ak.bool_)
        assert not ak.array_equal(pda_a, pda_b)  # string to bool comparison
    elif data_type is ak.float64:  # so do floats, because of nan
        nda_a = np.random.uniform(0, 100, prob_size)
        if nan_handling:
            nda_a[-1] = np.nan
        nda_b = nda_a.copy() if matching else np.random.uniform(0, 100, prob_size)
        pda_a = ak.array(nda_a)
        pda_b = ak.array(nda_b) if same_size else ak.array(nda_b[:-1])
        # The NaN is only inserted when equal_nan (nan_handling) is True, so
        # equality is expected exactly when contents and sizes both match.
        assert ak.array_equal(pda_a, pda_b, nan_handling) == (matching and same_size)
    else:  # other types have simpler tests
        pda_a = ak.random.randint(0, 100, prob_size, dtype=data_type)
        if matching:
            # Identical contents: equal unless pda_b is truncated by one element.
            pda_b = pda_a if same_size else pda_a[:-1]
            assert ak.array_equal(pda_a, pda_b) == same_size
        else:
            # Independently drawn data, either the same length or one shorter.
            size_b = prob_size if same_size else prob_size - 1
            pda_b = ak.random.randint(0, 100, size_b, dtype=data_type)
            assert not ak.array_equal(pda_a, pda_b)

# Notes about median:
# prob_size is either even or odd, so one of sample_e, sample_o will have an even
# length, and the other an odd length. Median should be tested with both even and odd
# length inputs.

# median can be done on ints or floats

@pytest.mark.parametrize("prob_size", pytest.prob_size)
@pytest.mark.parametrize("data_type", NUMERIC_TYPES)
def test_median(self, prob_size, data_type):
    """
    Compare ak.median with np.median on random permutations.

    Two consecutive lengths are used so that one input has an even number of
    elements and the other an odd number, whatever the parity of prob_size.
    """
    for length in (prob_size, prob_size + 1):
        nd_sample = np.random.permutation(length).astype(data_type)
        ak_sample = ak.array(nd_sample)
        assert isclose(np.median(nd_sample), ak.median(ak_sample))

# test_count_nonzero doesn't use parameterization on data types, because
# the data is generated differently.

# counts are ints, so we test for equality, not closeness.

@pytest.mark.parametrize("prob_size", pytest.prob_size)
def test_count_nonzero(self, prob_size):
    """
    Compare ak.count_nonzero with np.count_nonzero on ints, floats, bools and strings.

    Counts are integers, so results are checked for exact equality.
    """
    # Numeric data: values drawn from [0, 20), cast to each numeric type in turn.
    for numeric_type in INT_FLOAT:
        nd_numbers = np.random.randint(20, size=prob_size).astype(numeric_type)
        ak_numbers = ak.array(nd_numbers)
        assert np.count_nonzero(nd_numbers) == ak.count_nonzero(ak_numbers)

    # Boolean data: random 0/1 values cast to bool.
    nd_flags = np.random.randint(2, size=prob_size).astype(bool)
    ak_flags = ak.array(nd_flags)
    assert np.count_nonzero(nd_flags) == ak.count_nonzero(ak_flags)

    # String data: the boolean sample rendered as text, with ten randomly
    # chosen positions (repeats possible) overwritten by the empty string.
    nd_text = nd_flags.astype(str)
    for _ in range(10):
        nd_text[np.random.randint(prob_size)] = ""
    ak_text = ak.array(nd_text)
    assert np.count_nonzero(nd_text) == ak.count_nonzero(ak_text)

@pytest.mark.parametrize("prob_size", pytest.prob_size)
def test_clip(self, prob_size):
    """
    Compare ak.clip with np.clip over every supported combination of bounds.

    The array to be clipped can be integer or float.  Each bound can be an
    integer or float, given as a scalar, an array, or None.  There is no case
    with both bounds equal to None, because that is not allowed.
    """
    seed = pytest.seed if pytest.seed is not None else 8675309
    np.random.seed(seed)
    ia = np.random.randint(1, 100, prob_size)
    ilo = 25
    ihi = 75

    dtypes = ["int64", "float64"]

    for hi_dtype in dtypes:
        hi = np.full(ia.shape, ihi, dtype=hi_dtype)
        akhi = ak.array(hi)
        for lo_dtype in dtypes:
            lo = np.full(ia.shape, ilo, dtype=lo_dtype)
            aklo = ak.array(lo)

            # Each entry is (numpy lo, numpy hi, arkouda lo, arkouda hi).
            # Scalars are shared by both libraries; array bounds use the
            # numpy array on one side and its arkouda counterpart on the other.
            bound_cases = [
                (None, hi[0], None, hi[0]),  # None, Scalar
                (None, hi, None, akhi),  # None, Array
                (lo[0], hi[0], lo[0], hi[0]),  # Scalar, Scalar
                (lo[0], hi, lo[0], akhi),  # Scalar, Array
                (lo[0], None, lo[0], None),  # Scalar, None
                (lo, hi[0], aklo, hi[0]),  # Array, Scalar
                (lo, hi, aklo, akhi),  # Array, Array
                (lo, None, aklo, None),  # Array, None
            ]

            for arr_dtype in dtypes:
                nd_arry = ia.astype(arr_dtype)
                ak_arry = ak.array(nd_arry)
                for np_lo, np_hi, ak_lo, ak_hi in bound_cases:
                    expected = np.clip(nd_arry, np_lo, np_hi)
                    actual = ak.clip(ak_arry, ak_lo, ak_hi).to_ndarray()
                    assert np.allclose(expected, actual)

@pytest.mark.parametrize("prob_size", pytest.prob_size)
def test_putmask(self, prob_size):
    """
    Compare ak.putmask with np.putmask, then check the TypeError path.

    For each numeric type, three shapes of ``values`` are tried: the same size
    as the data, shorter than the data, and longer than the data.
    """
    for data_type in INT_FLOAT:
        nda = np.random.randint(0, 10, prob_size).astype(data_type)

        # Case 1: values same size as data (the squared data itself).
        expected = nda.copy()
        np.putmask(expected, expected > 5, expected**2)
        pda = ak.array(nda)
        ak.putmask(pda, pda > 5, pda**2)
        if data_type == ak.int64:
            assert np.all(expected == pda.to_ndarray())
        else:
            assert np.allclose(expected, pda.to_ndarray())

        # Cases 2 and 3: values shorter than the data, then longer than the data.
        for n_values in (3, prob_size + 1):
            expected = nda.copy()
            pda = ak.array(nda)
            values = np.arange(n_values).astype(data_type)
            np.putmask(expected, expected > 5, values)
            ak.putmask(pda, pda > 5, ak.array(values))
            if data_type == ak.int64:
                assert np.all(expected == pda.to_ndarray())
            else:
                assert np.allclose(expected, pda.to_ndarray())

    # Finally try to raise the error: values is passed as a numpy array
    # rather than a pdarray, and a TypeError is expected.
    pda = ak.random.randint(0, 10, 10).astype(ak.float64)
    values = np.arange(10)
    with pytest.raises(TypeError):
        ak.putmask(pda, pda > 3, values)

0 comments on commit 6de7e3d

Please sign in to comment.