From 2c8061b816e5d4ddebe5a799882d3cec50b04106 Mon Sep 17 00:00:00 2001 From: jaketrookman <114928862+jaketrookman@users.noreply.github.com> Date: Wed, 6 Sep 2023 16:10:30 -0400 Subject: [PATCH] Closed #2651: `series_test.py` conversion for new framework (#2671) * conversion of series test to new framework * update to prob_size parameterization * fix prob_size scaling issue * left over prob_size variable * requested changes, clean up --------- Co-authored-by: jaketrookman --- PROTO_tests/tests/series_test.py | 187 +++++++++++++++++++++++++++++++ arkouda/index.py | 5 +- src/KExtremeMsg.chpl | 1 - 3 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 PROTO_tests/tests/series_test.py diff --git a/PROTO_tests/tests/series_test.py b/PROTO_tests/tests/series_test.py new file mode 100644 index 0000000000..de9683c14d --- /dev/null +++ b/PROTO_tests/tests/series_test.py @@ -0,0 +1,187 @@ +import numpy as np +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +import arkouda as ak + +DTYPES = [ak.int64, ak.uint64, ak.bool, ak.float64, ak.bigint, ak.str_] +NO_STRING = [ak.int64, ak.uint64, ak.bool, ak.float64, ak.bigint] +NUMERICAL_TYPES = [ak.int64, ak.uint64, ak.float64, ak.bigint] +INTEGRAL_TYPES = [ak.int64, ak.uint64, ak.bool, ak.bigint] + + +class TestSeries: + @pytest.mark.parametrize("dtype", DTYPES) + def test_series_creation(self, dtype): + idx = ak.arange(3, dtype=dtype) + for val in idx, ak.array(["A", "B", "C"]): + ans = ak.Series(data=val, index=idx).to_list() + for series in ( + ak.Series(data=val, index=idx), + ak.Series(data=val), + ak.Series(val, idx), + ak.Series(val), + ak.Series((idx, val)), + ): + assert isinstance(series, ak.Series) + assert isinstance(series.index, ak.Index) + assert series.to_list() == ans + + with pytest.raises(TypeError): + ak.Series(index=idx) + + with pytest.raises(TypeError): + ak.Series((ak.arange(3),)) + + with pytest.raises(TypeError): + ak.Series() + + with pytest.raises(ValueError): + ak.Series(data=ak.arange(3), index=ak.arange(6)) + + @pytest.mark.parametrize("dtype", INTEGRAL_TYPES) + @pytest.mark.parametrize("dtype_index", [ak.int64, ak.uint64]) + def test_lookup(self, dtype, dtype_index): + pda = ak.arange(3, dtype=dtype) + for val in pda, ak.array(["A", "B", "C"]): + s = ak.Series(data=val, index=ak.arange(3, dtype=dtype_index)) + + for key in ( + 1, + ak.Index(ak.array([1], dtype=dtype_index)), + ak.Index(ak.array([0, 2], dtype=dtype_index)), + ): + lk = s.locate(key) + assert isinstance(lk, ak.Series) + key = ak.array(key) if not isinstance(key, int) else key + assert (lk.index == s.index[key]).all() + assert (lk.values == s.values[key]).all() + + # testing multi-index lookup + mi = ak.MultiIndex([pda, pda[::-1]]) + s = ak.Series(data=val, index=mi) + lk = s.locate(mi[0]) + assert isinstance(lk, ak.Series) + assert lk.index.index == mi[0].index + assert lk.values[0] == val[0] + + # ensure error with scalar and multi-index + with pytest.raises(TypeError): + s.locate(0) + + with pytest.raises(TypeError): + s.locate([0, 2]) + + @pytest.mark.parametrize("prob_size", pytest.prob_size) + @pytest.mark.parametrize("dtype", NO_STRING) + def test_add(self, prob_size, dtype): + added = ak.Series(ak.arange(prob_size // 2, dtype=dtype)).add( + ak.Series( + data=ak.arange(prob_size // 2, prob_size, dtype=dtype), + index=ak.arange(prob_size // 2, prob_size), + ) + ) + assert (added.index == ak.arange(prob_size)).all() + assert (added.values == ak.arange(prob_size, dtype=dtype)).all() + + @pytest.mark.parametrize("dtype", [ak.int64, ak.uint64, ak.float64]) + def test_topn(self, dtype): + top = ak.Series(ak.arange(100, dtype=dtype)).topn(50) + assert top.values.to_list() == list(range(99, 49, -1)) + assert top.index.to_list() == list(range(99, 49, -1)) + + @pytest.mark.parametrize("dtype", NUMERICAL_TYPES) + @pytest.mark.parametrize("dtype_index", NUMERICAL_TYPES) + def test_sort(self, dtype, dtype_index): + def gen_perm(n): + idx_left = set(range(n)) + perm = np.arange(n) + while len(idx_left) >= 2: + inds = np.random.choice(list(idx_left), 2, replace=False) + tmp = perm[inds[0]] + perm[inds[0]] = perm[inds[1]] + perm[inds[1]] = tmp + idx_left.remove(inds[0]) + idx_left.remove(inds[1]) + return perm + + ordered = ak.arange(100, dtype=dtype) + perm = ak.array(gen_perm(100), dtype=dtype_index) + + idx_sort = ak.Series(data=ordered, index=perm).sort_index() + assert idx_sort.index.to_list() == ordered.to_list() + assert idx_sort.values.to_list() == perm.to_list() + + val_sort = ak.Series(data=perm, index=ordered).sort_values() + assert val_sort.index.to_pandas().tolist() == perm.to_list() + assert val_sort.values.to_list() == ordered.to_list() + + @pytest.mark.parametrize("dtype", DTYPES) + def test_head_tail(self, dtype): + n = 10 + s = ak.Series(ak.arange(n, dtype=dtype)) + for i in range(n): + + head = s.head(i) + assert head.index.to_list() == list(range(i)) + assert head.values.to_list() == ak.arange(i, dtype=dtype).to_list() + + tail = s.tail(i) + assert tail.index.to_list() == ak.arange(n)[-i:n].to_list() + assert tail.values.to_list() == ak.arange(n, dtype=dtype)[-i:n].to_list() + + def test_value_counts(self): + s = ak.Series(ak.array([1, 2, 0, 2, 0])) + + c = s.value_counts() + assert c.index.to_list() == [0, 2, 1] + assert c.values.to_list() == [2, 2, 1] + + c = s.value_counts(sort=False) + assert c.index.to_list() == list(range(3)) + assert c.values.to_list() == [2, 1, 2] + + def test_concat(self): + s = ak.Series(ak.arange(5)) + s2 = ak.Series(ak.arange(5, 11), ak.arange(5, 11)) + s3 = ak.Series(ak.arange(5, 10), ak.arange(5, 10)) + + df = ak.Series.concat([s, s2], axis=1) + assert isinstance(df, ak.DataFrame) + + ref_df = pd.DataFrame( + { + "idx": list(range(11)), + "val_0": [0, 1, 2, 3, 4, 0, 0, 0, 0, 0, 0], + "val_1": [0, 0, 0, 0, 0, 5, 6, 7, 8, 9, 10], + } + ) + assert_frame_equal(ref_df, df.to_pandas()) + + def list_helper(arr): + return arr.to_list() if isinstance(arr, (ak.pdarray, ak.Index)) else arr.tolist() + + for fname in "concat", "pdconcat": + func = getattr(ak.Series, fname) + c = func([s, s2]) + assert list_helper(c.index) == list(range(11)) + assert list_helper(c.values) == list(range(11)) + + df = func([s, s3], axis=1) + if fname == "concat": + ref_df = pd.DataFrame( + {"idx": [0, 1, 2, 3, 4], "val_0": [0, 1, 2, 3, 4], "val_1": [5, 6, 7, 8, 9]} + ) + assert isinstance(df, ak.DataFrame) + assert_frame_equal(ref_df, df.to_pandas()) + else: + ref_df = pd.DataFrame({0: [0, 1, 2, 3, 4], 1: [5, 6, 7, 8, 9]}) + assert isinstance(df, pd.DataFrame) + assert_frame_equal(ref_df, df) + + def test_index_as_index_compat(self): + # added to validate functionality for issue #1506 + df = ak.DataFrame({"a": ak.arange(10), "b": ak.arange(10), "c": ak.arange(10)}) + g = df.groupby(["a", "b"]) + g.broadcast(g.sum("c")) diff --git a/arkouda/index.py b/arkouda/index.py index 8df1895add..fb9079c1fd 100644 --- a/arkouda/index.py +++ b/arkouda/index.py @@ -9,6 +9,7 @@ from arkouda.dtypes import float64 as akfloat64 from arkouda.dtypes import int64 as akint64 from arkouda.groupbyclass import GroupBy, unique +from arkouda.numeric import cast as akcast from arkouda.pdarrayclass import RegistrationError, pdarray from arkouda.pdarraycreation import arange, array, create_pdarray, ones from arkouda.pdarraysetops import argsort, in1d @@ -806,5 +807,7 @@ def lookup(self, key): raise TypeError("MultiIndex lookup failure") # if individual vals convert to pdarrays if not isinstance(key[0], pdarray): - key = [array([x]) for x in key] + dt = self.values[0].dtype if isinstance(self.values[0], pdarray) else akint64 + key = [akcast(array([x]), dt) for x in key] + return in1d(self.index, key) diff --git a/src/KExtremeMsg.chpl b/src/KExtremeMsg.chpl index b38526049d..c23d57fc7b 100644 --- a/src/KExtremeMsg.chpl +++ b/src/KExtremeMsg.chpl @@ -163,7 +163,6 @@ module KExtremeMsg } } - otherwise { var errorMsg = notImplementedError("maxk",gEnt.dtype); keLogger.error(getModuleName(),getRoutineName(),getLineNumber(),errorMsg);