From 31bc3f6b1016bdbe6d408e6f0ae169ae9eedb570 Mon Sep 17 00:00:00 2001 From: Jeffrey Newman Date: Fri, 13 Sep 2024 15:45:18 -0500 Subject: [PATCH] Patch small items (#63) * permit numpy >= 2 * add iloc to DataArray * fix from_named_objects when using Series to init * test_from_named_objects * test_dataarray_iloc * ruffen --- pyproject.toml | 3 +-- sharrow/dataset.py | 38 ++++++++++++++++++++++++++-- sharrow/tests/test_datasets.py | 45 ++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 83f7f36..79043bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ name = "sharrow" requires-python = ">=3.9" dynamic = ["version"] dependencies = [ - "numpy >= 1.19, <2", + "numpy >= 1.19", "pandas >= 1.2", "pyarrow", "xarray", @@ -59,7 +59,6 @@ select = [ "B", # flake8-bugbear ] ignore = ["B905", "D1"] -ignore-init-module-imports = true per-file-ignores = { "*.ipynb" = [ "E402", # allow imports to appear anywhere in Jupyter Notebooks "E501", # allow long lines in Jupyter Notebooks diff --git a/sharrow/dataset.py b/sharrow/dataset.py index c1907ea..d4695df 100755 --- a/sharrow/dataset.py +++ b/sharrow/dataset.py @@ -91,7 +91,7 @@ def construct(source): Parameters ---------- source : pandas.DataFrame, pyarrow.Table, xarray.Dataset, or Sequence[str] - The source from which to create a Dataset. DataFrames and Tables + The source from which to create a Dataset. DataFrame and Table objects are converted to Datasets that have one dimension (the rows) and separate variables for each of the columns. A list of strings creates a dataset with those named empty variables. @@ -1065,6 +1065,38 @@ def __getitem__(self, key: Mapping[Hashable, Any]) -> Dataset: return self.dataset.isel(key) +@xr.register_dataarray_accessor("iloc") +class _iLocArrayIndexer: + """ + Purely integer-location based indexing for selection by position on 1-d DataArrays. 
+ + In many ways, a dataset with a single dimensions is like a pandas DataFrame, + with the one dimension giving the rows, and the variables as columns. This + analogy eventually breaks down (DataFrame columns are ordered, Dataset + variables are not) but the similarities are enough that it’s sometimes + convenient to have iloc functionality enabled. This only works for indexing + on the rows, but if there’s only the one dimension the complexity of isel + is not needed. + """ + + __slots__ = ("dataarray",) + + def __init__(self, dataarray: DataArray): + self.dataarray = dataarray + + def __getitem__(self, key: Mapping[Hashable, Any]) -> DataArray: + if not is_dict_like(key): + if len(self.dataarray.dims) == 1: + dim_name = self.dataarray.dims.__iter__().__next__() + key = {dim_name: key} + else: + raise TypeError( + "can only lookup dictionaries from DataArray.iloc, " + "unless there is only one dimension" + ) + return self.dataarray.isel(key) + + xr.Dataset.rename_dims_and_coords = xr.Dataset.rename @@ -1182,6 +1214,8 @@ def _to_ast_literal(x): return _to_ast_literal(x.to_list()) elif isinstance(x, np.ndarray): return _to_ast_literal(list(x)) + elif isinstance(x, np.str_): + return repr(str(x)) else: return repr(x) @@ -1448,7 +1482,7 @@ def from_named_objects(*args): raise ValueError(f"argument {n} has no name") from None if name is None: raise ValueError(f"the name for argument {n} is None") - objs[name] = a + objs[name] = np.asarray(a) return xr.Dataset(objs) diff --git a/sharrow/tests/test_datasets.py b/sharrow/tests/test_datasets.py index 72d2104..18c721b 100644 --- a/sharrow/tests/test_datasets.py +++ b/sharrow/tests/test_datasets.py @@ -5,6 +5,7 @@ import numpy as np import openmatrix import pandas as pd +import pytest import xarray as xr from pytest import approx @@ -133,3 +134,47 @@ def test_deferred_load_to_shared_memory(): xr.testing.assert_equal(d0, d1) d2 = xr.Dataset.shm.from_shared_memory(token) xr.testing.assert_equal(d0, d2) + + +def 
test_from_named_objects(): + from sharrow.dataset import from_named_objects + + s1 = pd.Series([1, 4, 9, 16], name="Squares") + s2 = pd.Series([2, 3, 5, 7, 11], name="Primes") + i1 = pd.Index([1, 4, 9, 16], name="Squares") + a1 = xr.DataArray([1, 4, 9, 16], name="Squares") + + for obj in [s1, i1, a1]: + ds = from_named_objects(obj, s2) + assert "Squares" in ds.dims + assert "Primes" in ds.dims + assert ds.sizes == {"Squares": 4, "Primes": 5} + + with pytest.raises(ValueError): + from_named_objects([1, 4, 9, 16], s2) + + +def test_dataarray_iloc(): + arr = xr.DataArray([1, 4, 9, 16, 25, 36], name="Squares", dims="s") + + assert arr.iloc[1] == 4 + xr.testing.assert_equal(arr.iloc[1:], xr.DataArray([4, 9, 16, 25, 36], dims="s")) + xr.testing.assert_equal(arr.iloc[:2], xr.DataArray([1, 4], dims="s")) + xr.testing.assert_equal(arr.iloc[2:4], xr.DataArray([9, 16], dims="s")) + xr.testing.assert_equal(arr.iloc[:-2], xr.DataArray([1, 4, 9, 16], dims="s")) + xr.testing.assert_equal(arr.iloc[-2:], xr.DataArray([25, 36], dims="s")) + + with pytest.raises(TypeError): + arr.iloc[1] = 5 # assignment not allowed + + arr2 = xr.DataArray([2, 3, 5, 7, 11], name="Primes", dims="p") + arr2d = arr * arr2 + + with pytest.raises(TypeError): + _tmp = arr2d.iloc[1] # not allowed for 2D arrays + + assert arr2d.iloc[dict(s=1, p=2)] == 20 + + z = arr2d.iloc[dict(s=slice(1, 2), p=slice(2, 4))] + + xr.testing.assert_equal(z, xr.DataArray([[20, 28]], dims=["s", "p"]))