Skip to content

Commit

Permalink
3231 unique unit tests (Bears-R-Us#3258)
Browse files Browse the repository at this point in the history
* Adds PROTOs unit test for ak.unique

* Adds PROTOs unit test for ak.unique

* 3231 addresses feedback, splits 1 assert into 2

* addresses feedback re asserts

* 3231 incorporates all feedback.

* 3231 updates comments.

* adds test that keys are always sorted for int64

---------

Co-authored-by: drculhane <drculhane@users.noreply.github.com>
Co-authored-by: ajpotts <amanda.j.potts@gmail.com>
  • Loading branch information
3 people authored Jun 27, 2024
1 parent dd93f86 commit 714d587
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 2 deletions.
100 changes: 100 additions & 0 deletions PROTO_tests/tests/groupby_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,39 @@
from arkouda import sum as aksum
from arkouda.groupbyclass import GroupByReductionType
from arkouda.scipy import chisquare as akchisquare
from arkouda.dtypes import npstr

# block of variables and functions used in test_unique

# Kinds of data that test_unique is parametrized over; each entry is also
# used there as a key into the dictionary of generated sample arrays.
UNIQUE_TYPES = [
    ak.categorical,
    ak.int64,
    ak.float64,
    npstr,
]

# Pool of mixed values (strings, ints and a float) that test_unique samples
# from for the npstr case.
VOWELS_AND_SUCH = [
    "a",
    "e",
    "i",
    "o",
    "u",
    "AB",
    47,
    2,
    3.14159,
]

# Ten labels, "base 0" through "base 9", that test_unique samples from for
# the categorical case.
PICKS = np.array(
    [f"base {i}" for i in range(10)],
)

def isSorted(x):
    """Return whether ``x[i] <= x[i+1]`` holds for every adjacent pair in ``x``.

    An empty or single-element input has no adjacent pairs, so the check
    passes trivially.
    """
    return np.all(x[:-1] <= x[1:])


def make_sorted_and_unsorted_data(sample):
    """Return a ``(sorted, unsorted)`` pair of arrangements of a 1-d array.

    This (almost) guarantees both a sorted and an unsorted version of
    ``sample``. The exceptions are an empty sample and a sample whose values
    are all identical: neither has a distinct unsorted arrangement, so
    ``sample`` is returned for both halves of the pair.

    Otherwise, if ``sample`` is already sorted, a non-sorted random
    permutation of it is generated; if it is not sorted, a sorted copy is
    created with ``np.sort``.

    Parameters
    ----------
    sample : 1-d array
        Data to arrange.

    Returns
    -------
    tuple
        ``(s_a, us_a)``: the sorted arrangement followed by the unsorted one.
    """
    # Guard the empty case first: sample[0] would raise IndexError otherwise.
    if len(sample) == 0 or np.all(sample == sample[0]):
        return sample, sample
    if isSorted(sample):
        s_a = sample[:]
        us_a = np.random.permutation(sample)
        # A random shuffle can happen to come out sorted; redraw until it
        # does not.
        while isSorted(us_a):
            us_a = np.random.permutation(us_a)
    else:
        s_a = np.sort(sample)
        us_a = sample[:]
    return s_a, us_a


# end of block


def to_tuple_dict(labels, values):
Expand Down Expand Up @@ -757,6 +790,73 @@ def test_large_mean_aggregation(self):
for m in means.to_list():
assert np.isclose(float(a[0]), m)

# ak.unique takes one pda argument and three booleans. Not all eight
# combinations of the booleans are needed to cover the test space:
# combinations TTF and TFF are supersets of all other possible
# combinations, so only those two are tested below.

@pytest.mark.parametrize("data_type", UNIQUE_TYPES)
@pytest.mark.parametrize("prob_size", pytest.prob_size)
def test_unique(self, data_type, prob_size):
    """Check ak.unique against np.unique on sorted and unsorted data.

    A random sample of the requested kind is generated, arranged into a
    sorted and an unsorted version, and passed to ak.unique with the flag
    combinations (True, True, False) and (True, False, False) respectively.
    The returned keys are compared with np.unique, and the second and third
    returned items are checked by rebuilding the input with ak.broadcast.
    """
    seed = 8675309 if pytest.seed is None else pytest.seed
    np.random.seed(seed)

    # Every sample is drawn, in this order, before one is selected, so the
    # random stream consumed is the same for each data_type.
    samples = {
        npstr: np.random.choice(VOWELS_AND_SUCH, prob_size),
        ak.int64: np.random.randint(0, prob_size // 3, prob_size),
        ak.float64: np.random.uniform(0, prob_size // 3, prob_size),
        ak.categorical: np.random.choice(PICKS, prob_size),
    }
    host_data = samples[data_type]

    # Reference unique keys, computed by numpy.
    expected_keys = np.unique(host_data)

    sorted_host, unsorted_host = make_sorted_and_unsorted_data(host_data)
    sorted_pda = ak.array(sorted_host)
    unsorted_pda = ak.array(unsorted_host)

    # Categorical requires another step to make the pdarrays categorical.
    # NOTE(review): data_type comes from UNIQUE_TYPES, which lists
    # ak.categorical rather than the string "categorical" -- confirm that
    # this comparison can ever be true.
    if data_type == "categorical":
        sorted_pda = ak.Categorical(sorted_pda)
        unsorted_pda = ak.Categorical(unsorted_pda)

    # TTF on the sorted data, TFF on the unsorted data.
    from_sorted = ak.unique(sorted_pda, True, True, False)
    from_unsorted = ak.unique(unsorted_pda, True, False, False)

    # Both calls must yield the same set of keys as numpy.
    assert np.all(expected_keys == np.sort(from_unsorted[0].to_ndarray()))
    assert np.all(expected_keys == np.sort(from_sorted[0].to_ndarray()))

    # Keys should always be returned sorted if the data is int64.
    if data_type == ak.int64:
        assert isSorted(from_unsorted[0].to_ndarray())

    # Sorted input: the second returned item should simply be
    # 0..len-1, and broadcasting the keys over the third returned item
    # should rebuild the sorted array.
    assert np.all(np.arange(len(host_data)) == from_sorted[1].to_ndarray())
    sorted_segments = from_sorted[2]
    rebuilt_sorted = ak.broadcast(sorted_segments, from_sorted[0], len(sorted_host))
    assert ak.all(sorted_pda == rebuilt_sorted)

    # Unsorted input: keys from a plain ak.unique call are checked for
    # order in the int64 case.
    plain_keys = ak.unique(unsorted_pda).to_ndarray()
    if data_type == ak.int64:
        assert isSorted(plain_keys)

    # Reorder the unsorted array by the returned permutation; broadcasting
    # the keys over the third returned item should reproduce that
    # reordered array.
    permutation = from_unsorted[1]
    regrouped = unsorted_pda[permutation]
    unsorted_segments = from_unsorted[2]
    rebuilt_unsorted = ak.broadcast(unsorted_segments, from_unsorted[0], len(unsorted_host))
    assert ak.all(regrouped == rebuilt_unsorted)

def test_unique_aggregation(self):
keys = ak.array([0, 1, 0, 1, 0, 1, 0, 1])
vals = ak.array([4, 3, 5, 3, 5, 2, 6, 2])
Expand Down
4 changes: 2 additions & 2 deletions arkouda/groupbyclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@ def unique(
Input array.
return_groups : bool, optional
If True, also return grouping information for the array.
assume_sorted : bool, optional
If True, assume pda is sorted and skip sorting step
return_indices: bool, optional
Only applicable if return_groups is True.
If True, return unique key indices along with other groups
assume_sorted : bool, optional
If True, assume pda is sorted and skip sorting step
Returns
-------
Expand Down

0 comments on commit 714d587

Please sign in to comment.