From 271b6ed42f68ea1909adb311b04f32b55bab7b6b Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Tue, 10 Sep 2024 10:42:08 -0500 Subject: [PATCH 1/6] permit numpy >= 2 --- pyproject.toml | 2 +- sharrow/dataset.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 83f7f36..31d28cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ name = "sharrow" requires-python = ">=3.9" dynamic = ["version"] dependencies = [ - "numpy >= 1.19, <2", + "numpy >= 1.19", "pandas >= 1.2", "pyarrow", "xarray", diff --git a/sharrow/dataset.py b/sharrow/dataset.py index c1907ea..bce4837 100755 --- a/sharrow/dataset.py +++ b/sharrow/dataset.py @@ -1182,6 +1182,8 @@ def _to_ast_literal(x): return _to_ast_literal(x.to_list()) elif isinstance(x, np.ndarray): return _to_ast_literal(list(x)) + elif isinstance(x, np.str_): + return repr(str(x)) else: return repr(x) From ffdfd9df728d5b9db101d85a2e0d48584e6a4390 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 13 Sep 2024 12:43:57 -0500 Subject: [PATCH 2/6] add iloc to DataArray --- sharrow/dataset.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/sharrow/dataset.py b/sharrow/dataset.py index bce4837..3d7039c 100755 --- a/sharrow/dataset.py +++ b/sharrow/dataset.py @@ -91,7 +91,7 @@ def construct(source): Parameters ---------- source : pandas.DataFrame, pyarrow.Table, xarray.Dataset, or Sequence[str] - The source from which to create a Dataset. DataFrames and Tables + The source from which to create a Dataset. DataFrame and Table objects are converted to Datasets that have one dimension (the rows) and separate variables for each of the columns. A list of strings creates a dataset with those named empty variables. @@ -1065,6 +1065,38 @@ def __getitem__(self, key: Mapping[Hashable, Any]) -> Dataset: return self.dataset.isel(key) +@xr.register_dataarray_accessor("iloc") +class _iLocArrayIndexer: + """ + Purely integer-location based indexing for selection by position on 1-d DataArrays. + + In many ways, a dataset with a single dimensions is like a pandas DataFrame, + with the one dimension giving the rows, and the variables as columns. This + analogy eventually breaks down (DataFrame columns are ordered, Dataset + variables are not) but the similarities are enough that it’s sometimes + convenient to have iloc functionality enabled. This only works for indexing + on the rows, but if there’s only the one dimension the complexity of isel + is not needed. + """ + + __slots__ = ("dataarray",) + + def __init__(self, dataarray: DataArray): + self.dataarray = dataarray + + def __getitem__(self, key: Mapping[Hashable, Any]) -> DataArray: + if not is_dict_like(key): + if len(self.dataarray.dims) == 1: + dim_name = self.dataarray.dims.__iter__().__next__() + key = {dim_name: key} + else: + raise TypeError( + "can only lookup dictionaries from DataArray.iloc, " + "unless there is only one dimension" + ) + return self.dataarray.isel(key) + + xr.Dataset.rename_dims_and_coords = xr.Dataset.rename From 1fed7718c17f43dec69d00e63cfe41c9a165cd23 Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 13 Sep 2024 12:45:04 -0500 Subject: [PATCH 3/6] fix from_named_objects when using Series to init --- sharrow/dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sharrow/dataset.py b/sharrow/dataset.py index 3d7039c..43e7de2 100755 --- a/sharrow/dataset.py +++ b/sharrow/dataset.py @@ -1482,7 +1482,10 @@ def from_named_objects(*args): raise ValueError(f"argument {n} has no name") from None if name is None: raise ValueError(f"the name for argument {n} is None") - objs[name] = a + if isinstance(a, pd.Series): + objs[name] = a.values + else: + objs[name] = a return xr.Dataset(objs) From 7dc3db4bc02f0cdc527ff46371405da259df545d Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 13 Sep 2024 13:00:59 -0500 Subject: [PATCH 4/6] test_from_named_objects --- sharrow/dataset.py | 5 +---- sharrow/tests/test_datasets.py | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/sharrow/dataset.py b/sharrow/dataset.py index 43e7de2..d4695df 100755 --- a/sharrow/dataset.py +++ b/sharrow/dataset.py @@ -1482,10 +1482,7 @@ def from_named_objects(*args): raise ValueError(f"argument {n} has no name") from None if name is None: raise ValueError(f"the name for argument {n} is None") - if isinstance(a, pd.Series): - objs[name] = a.values - else: - objs[name] = a + objs[name] = np.asarray(a) return xr.Dataset(objs) diff --git a/sharrow/tests/test_datasets.py b/sharrow/tests/test_datasets.py index 72d2104..e271270 100644 --- a/sharrow/tests/test_datasets.py +++ b/sharrow/tests/test_datasets.py @@ -5,6 +5,7 @@ import numpy as np import openmatrix import pandas as pd +import pytest import xarray as xr from pytest import approx @@ -133,3 +134,21 @@ def test_deferred_load_to_shared_memory(): xr.testing.assert_equal(d0, d1) d2 = xr.Dataset.shm.from_shared_memory(token) xr.testing.assert_equal(d0, d2) + + +def test_from_named_objects(): + from sharrow.dataset import from_named_objects + + s1 = pd.Series([1, 4, 9, 16], name="Squares") + s2 = pd.Series([2, 3, 5, 7, 11], name="Primes") + i1 = pd.Index([1, 4, 9, 16], name="Squares") + a1 = xr.DataArray([1, 4, 9, 16], name="Squares") + + for obj in [s1, i1, a1]: + ds = from_named_objects(obj, s2) + assert "Squares" in ds.dims + assert "Primes" in ds.dims + assert ds.sizes == {'Squares': 4, 'Primes': 5} + + with pytest.raises(ValueError): + from_named_objects([1,4,9,16], s2) From 7b35c6310c5c24aaa18a82d183aa5d57f55a7aaf Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 13 Sep 2024 13:20:10 -0500 Subject: [PATCH 5/6] test_dataarray_iloc --- sharrow/tests/test_datasets.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/sharrow/tests/test_datasets.py b/sharrow/tests/test_datasets.py index e271270..7b8610f 100644 --- a/sharrow/tests/test_datasets.py +++ b/sharrow/tests/test_datasets.py @@ -152,3 +152,30 @@ def test_from_named_objects(): with pytest.raises(ValueError): from_named_objects([1,4,9,16], s2) + + +def test_dataarray_iloc(): + + arr = xr.DataArray([1, 4, 9, 16, 25, 36], name="Squares", dims="s") + + assert arr.iloc[1] == 4 + xr.testing.assert_equal(arr.iloc[1:], xr.DataArray([4, 9, 16, 25, 36], dims="s")) + xr.testing.assert_equal(arr.iloc[:2], xr.DataArray([1, 4], dims="s")) + xr.testing.assert_equal(arr.iloc[2:4], xr.DataArray([9, 16], dims="s")) + xr.testing.assert_equal(arr.iloc[:-2], xr.DataArray([1, 4, 9, 16], dims="s")) + xr.testing.assert_equal(arr.iloc[-2:], xr.DataArray([25, 36], dims="s")) + + with pytest.raises(TypeError): + arr.iloc[1] = 5 # assignment not allowed + + arr2 = xr.DataArray([2, 3, 5, 7, 11], name="Primes", dims="p") + arr2d = arr * arr2 + + with pytest.raises(TypeError): + _tmp = arr2d.iloc[1] # not allowed for 2D arrays + + assert arr2d.iloc[dict(s=1, p=2)] == 20 + + z = arr2d.iloc[dict(s=slice(1,2), p=slice(2,4))] + + xr.testing.assert_equal(z, xr.DataArray([[20, 28]], dims=["s", "p"])) \ No newline at end of file From 2fda1c105e5350bd38b6c8dc0e2d7bbd1604916f Mon Sep 17 00:00:00 2001 From: Jeff Newman Date: Fri, 13 Sep 2024 13:22:35 -0500 Subject: [PATCH 6/6] ruffen --- pyproject.toml | 1 - sharrow/tests/test_datasets.py | 9 ++++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 31d28cf..79043bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,6 @@ select = [ "B", # flake8-bugbear ] ignore = ["B905", "D1"] -ignore-init-module-imports = true per-file-ignores = { "*.ipynb" = [ "E402", # allow imports to appear anywhere in Jupyter Notebooks "E501", # allow long lines in Jupyter Notebooks diff --git a/sharrow/tests/test_datasets.py b/sharrow/tests/test_datasets.py index 7b8610f..18c721b 100644 --- a/sharrow/tests/test_datasets.py +++ b/sharrow/tests/test_datasets.py @@ -148,14 +148,13 @@ def test_from_named_objects(): ds = from_named_objects(obj, s2) assert "Squares" in ds.dims assert "Primes" in ds.dims - assert ds.sizes == {'Squares': 4, 'Primes': 5} + assert ds.sizes == {"Squares": 4, "Primes": 5} with pytest.raises(ValueError): - from_named_objects([1,4,9,16], s2) + from_named_objects([1, 4, 9, 16], s2) def test_dataarray_iloc(): - arr = xr.DataArray([1, 4, 9, 16, 25, 36], name="Squares", dims="s") assert arr.iloc[1] == 4 @@ -176,6 +175,6 @@ def test_dataarray_iloc(): assert arr2d.iloc[dict(s=1, p=2)] == 20 - z = arr2d.iloc[dict(s=slice(1,2), p=slice(2,4))] + z = arr2d.iloc[dict(s=slice(1, 2), p=slice(2, 4))] - xr.testing.assert_equal(z, xr.DataArray([[20, 28]], dims=["s", "p"])) \ No newline at end of file + xr.testing.assert_equal(z, xr.DataArray([[20, 28]], dims=["s", "p"]))