From 444cbf685472c830e1150a2d1668698ccc8babf5 Mon Sep 17 00:00:00 2001 From: Erin Sheldon Date: Tue, 20 Aug 2024 11:10:44 -0400 Subject: [PATCH 1/6] make numpy_util.match work for non-integer inputs --- esutil/numpy_util.py | 52 ++++++++++++++++-------------- esutil/tests/test_numpy_util.py | 56 +++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 24 deletions(-) diff --git a/esutil/numpy_util.py b/esutil/numpy_util.py index 608a9cf..8717549 100644 --- a/esutil/numpy_util.py +++ b/esutil/numpy_util.py @@ -1509,40 +1509,44 @@ def rem_dup(arr, flag, values=False): def match(arr1input, arr2input, presorted=False): """ - NAME: - match + Match two arrays, returning the indicies of matches for each array, or + empty arrays if no matches are found. This means arr1[ind1] == arr2[ind2] + is true for all corresponding pairs. - CALLING SEQUENCE: - ind1,ind2 = match(arr1, arr2, presorted=False) + arr1 must contain only unique inputs, but arr2 may be non-unique. - PURPOSE: - Match two numpy arrays. Return the indices of the matches or empty - arrays if no matches are found. This means arr1[ind1] == arr2[ind2] is - true for all corresponding pairs. arr1 must contain only unique - inputs, but arr2 may be non-unique. - If you know arr1 is sorted, set presorted=True and it will run - even faster + If you know arr1 is sorted, set presorted=True and it will run even faster - METHOD: - uses searchsorted with some sugar. Much faster than old version - based on IDL code. - REVISION HISTORY: - Created 2015, Eli Rykoff, SLAC. + Parameters + ---------- + arr1: array + The first array, which must have unique elements. + arr2: array + The second array. + presorted: bool, optional + If set to True, the first array is assumed to be sorted. + Returns + ------- + ind1, ind2: array, array + The index arrays of matches for each array + + Revision history + ----------------- + Created 2015, Eli Rykoff, SLAC. """ # make sure 1D arr1 = np.atleast_1d(arr1input) arr2 = np.atleast_1d(arr2input) - # check for integer data... - if not issubclass(arr1.dtype.type, np.integer) or not issubclass( - arr2.dtype.type, np.integer - ): - mess = "Error: only works with integer types, got %s %s" - mess = mess % (arr1.dtype.type, arr2.dtype.type) - raise ValueError(mess) + el = arr1input[0] + + if isinstance(el, str) or isinstance(el, bytes): + is_string = True + else: + is_string = False if (arr1.size == 0) or (arr2.size == 0): mess = "Error: arr1 and arr2 must each be non-zero length" @@ -1563,7 +1567,7 @@ def match(arr1input, arr2input, presorted=False): sub1 = np.searchsorted(arr1, arr2, sorter=st1) # check for out-of-bounds at the high end if necessary - if arr2.max() > arr1.max(): + if is_string or arr2.max() > arr1.max(): (bad,) = np.where(sub1 == arr1.size) sub1[bad] = arr1.size - 1 diff --git a/esutil/tests/test_numpy_util.py b/esutil/tests/test_numpy_util.py index 90cebad..ff4571d 100644 --- a/esutil/tests/test_numpy_util.py +++ b/esutil/tests/test_numpy_util.py @@ -58,3 +58,59 @@ def test_split_array(): assert np.all(chunks[6] == [18, 19, 20]) assert np.all(chunks[7] == [21, 22, 23]) assert np.all(chunks[8] == [24]) + + +@pytest.mark.parametrize('presorted', [True, False]) +def test_match_int(presorted): + a1 = np.array([3, 10, 8, 4, 7]) + a2 = np.array([8, 3]) + + if not presorted: + ind = np.array([4, 1, 0, 2, 3]) + m1, m2 = eu.numpy_util.match(a1[ind], a2) + assert np.all(m1 == [3, 2]) + else: + m1, m2 = eu.numpy_util.match(a1, a2) + assert np.all(m1 == [2, 0]) + + +@pytest.mark.parametrize('presorted', [True, False]) +def test_match_float(presorted): + a1 = np.array([1.25, 6.61, 8.51, 9.91, 11.25]) + a2 = np.array([6.61, 9.91]) + + if not presorted: + ind = np.array([4, 1, 0, 2, 3]) + m1, m2 = eu.numpy_util.match(a1[ind], a2) + assert np.all(m1 == [1, 4]) + else: + m1, m2 = eu.numpy_util.match(a1, a2) + assert np.all(m1 == [1, 3]) + + +@pytest.mark.parametrize('presorted', [True, False]) +def test_match_str(presorted): + a1 = np.array(['blah', 'goodbye', 'hello', 'stuff', 'things']) + a2 = np.array(['goodbye', 'things', 'zz']) + + if not presorted: + ind = np.array([3, 4, 0, 2, 1]) + m1, m2 = eu.numpy_util.match(a1[ind], a2) + assert np.all(m1 == [4, 1]) + else: + m1, m2 = eu.numpy_util.match(a1, a2) + assert np.all(m1 == [1, 4]) + + +@pytest.mark.parametrize('presorted', [True, False]) +def test_match_none(presorted): + a1 = np.array(['blah', 'goodbye', 'hello', 'stuff', 'things']) + a2 = np.array(['zz', 'bb']) + + if not presorted: + ind = np.array([3, 4, 0, 2, 1]) + m1, m2 = eu.numpy_util.match(a1[ind], a2) + else: + m1, m2 = eu.numpy_util.match(a1, a2) + + assert m1.size == 0 and m2.size == 0 From 079c0b0b7d6192d05bbf00a1a1333060f1f606a2 Mon Sep 17 00:00:00 2001 From: Erin Sheldon Date: Tue, 20 Aug 2024 13:03:30 -0400 Subject: [PATCH 2/6] rename test to test_match_nomatch --- esutil/tests/test_numpy_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/esutil/tests/test_numpy_util.py b/esutil/tests/test_numpy_util.py index ff4571d..9f2fa60 100644 --- a/esutil/tests/test_numpy_util.py +++ b/esutil/tests/test_numpy_util.py @@ -103,7 +103,7 @@ def test_match_str(presorted): @pytest.mark.parametrize('presorted', [True, False]) -def test_match_none(presorted): +def test_match_nomatch(presorted): a1 = np.array(['blah', 'goodbye', 'hello', 'stuff', 'things']) a2 = np.array(['zz', 'bb']) From bae187d632393dedb6560fdf938c99ef0494ac3c Mon Sep 17 00:00:00 2001 From: Erin Sheldon Date: Tue, 20 Aug 2024 13:13:43 -0400 Subject: [PATCH 3/6] update doc --- esutil/numpy_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/esutil/numpy_util.py b/esutil/numpy_util.py index 8717549..3e454ed 100644 --- a/esutil/numpy_util.py +++ b/esutil/numpy_util.py @@ -1511,7 +1511,8 @@ def match(arr1input, arr2input, presorted=False): """ Match two arrays, returning the indicies of matches for each array, or empty arrays if no matches are found. This means arr1[ind1] == arr2[ind2] - is true for all corresponding pairs. + is true for all corresponding pairs. For floating-point data this implies + exact matching with no floating-point tolerance. arr1 must contain only unique inputs, but arr2 may be non-unique. From 95abee44c106ddca013a3afbcf784dc2f50652c5 Mon Sep 17 00:00:00 2001 From: Erin Sheldon Date: Tue, 20 Aug 2024 13:15:48 -0400 Subject: [PATCH 4/6] update release notes, bump version --- RELEASE_NOTES | 7 +++++++ esutil/__init__.py | 2 +- esutil/numpy_util.py | 3 ++- setup.py | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index d22f059..55105f6 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,3 +1,10 @@ +0.6.15 (not yet released) +------ + +Enhancements + + - numpy_util.match works for data types other than int + 0.6.14 ------ diff --git a/esutil/__init__.py b/esutil/__init__.py index bf7de63..b0287c8 100644 --- a/esutil/__init__.py +++ b/esutil/__init__.py @@ -83,7 +83,7 @@ class for gauss-legendre integration, which relies on the gauleg C++ extension. import sys -__version__ = "0.6.14" +__version__ = "0.6.15" def version(): return __version__ diff --git a/esutil/numpy_util.py b/esutil/numpy_util.py index 3e454ed..fcf8702 100644 --- a/esutil/numpy_util.py +++ b/esutil/numpy_util.py @@ -1512,7 +1512,8 @@ def match(arr1input, arr2input, presorted=False): Match two arrays, returning the indicies of matches for each array, or empty arrays if no matches are found. This means arr1[ind1] == arr2[ind2] is true for all corresponding pairs. For floating-point data this implies - exact matching with no floating-point tolerance. + exact matching with no floating-point tolerance. The data type can be + string or bytes. arr1 must contain only unique inputs, but arr2 may be non-unique. diff --git a/setup.py b/setup.py index 1e5fe6d..c5dae01 100644 --- a/setup.py +++ b/setup.py @@ -244,7 +244,7 @@ def build_extensions(self): setup( name="esutil", - version="0.6.14", + version="0.6.15", author="Erin Scott Sheldon", author_email="erin.sheldon@gmail.com", classifiers=classifiers, From 19551bd06cd51235184ba12797215bf6d8b56273 Mon Sep 17 00:00:00 2001 From: Erin Sheldon Date: Tue, 20 Aug 2024 13:46:39 -0400 Subject: [PATCH 5/6] better note --- RELEASE_NOTES | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 55105f6..2819459 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -3,7 +3,7 @@ Enhancements - - numpy_util.match works for data types other than int + - numpy_util.match works for non-integer data types 0.6.14 ------ From 6273689e00161e9716bf8f7eed076fa1dd16fbc4 Mon Sep 17 00:00:00 2001 From: Erin Sheldon Date: Tue, 20 Aug 2024 14:01:46 -0400 Subject: [PATCH 6/6] docs --- esutil/numpy_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/esutil/numpy_util.py b/esutil/numpy_util.py index fcf8702..d19ebc8 100644 --- a/esutil/numpy_util.py +++ b/esutil/numpy_util.py @@ -1512,8 +1512,9 @@ def match(arr1input, arr2input, presorted=False): Match two arrays, returning the indicies of matches for each array, or empty arrays if no matches are found. This means arr1[ind1] == arr2[ind2] is true for all corresponding pairs. For floating-point data this implies - exact matching with no floating-point tolerance. The data type can be - string or bytes. + exact matching with no floating-point tolerance. + + The data type can be int, float, string or bytes. arr1 must contain only unique inputs, but arr2 may be non-unique.