From 7c500f586f728e990b8f25c1b6e19f53264cc87b Mon Sep 17 00:00:00 2001 From: Jeffrey Newman Date: Thu, 25 Aug 2022 21:55:24 -0500 Subject: [PATCH] Fix OMX construction with arbitrary zone ids (#27) * fallback to dynamic version * version infer * fix dataset from_omx for arbitrary zone ids --- sharrow/__init__.py | 2 +- sharrow/_infer_version.py | 13 ++++++++++++ sharrow/dataset.py | 34 ++++++++++++++++++++++++++++--- sharrow/flows.py | 2 +- sharrow/tests/test_datasets.py | 37 ++++++++++++++++++++++++++++++++++ 5 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 sharrow/_infer_version.py create mode 100644 sharrow/tests/test_datasets.py diff --git a/sharrow/__init__.py b/sharrow/__init__.py index 040cd54..100580c 100644 --- a/sharrow/__init__.py +++ b/sharrow/__init__.py @@ -1,7 +1,7 @@ from xarray import DataArray from . import dataset, example_data, selectors, shared_memory, sparse -from ._version import version as __version__ +from ._infer_version import __version__, __version_tuple__ from .dataset import Dataset from .digital_encoding import array_decode, array_encode from .flows import Flow diff --git a/sharrow/_infer_version.py b/sharrow/_infer_version.py new file mode 100644 index 0000000..c5d4d40 --- /dev/null +++ b/sharrow/_infer_version.py @@ -0,0 +1,13 @@ +try: + from ._version import __version__, __version_tuple__ +except ImportError: + # Package is not "installed", parse git tag at runtime + from importlib.metadata import PackageNotFoundError, version + + try: + __version__ = version(__package__) + except PackageNotFoundError: + # package is not installed + __version__ = "999.999" + + __version_tuple__ = tuple(__version__.split(".")) diff --git a/sharrow/dataset.py b/sharrow/dataset.py index 6c6c6fc..7918603 100755 --- a/sharrow/dataset.py +++ b/sharrow/dataset.py @@ -230,16 +230,18 @@ def from_omx( ---------- omx : openmatrix.File or larch.OMX An OMX-format file, opened for reading. 
- index_names : tuple, default ("otaz", "dtaz", "time_period") + index_names : tuple, default ("otaz", "dtaz") Should be a tuple of length 3, giving the names of the three dimensions. The first two names are the native dimensions from the open matrix file, the last is the name of the implicit dimension that is created by parsing array names. - indexes : str, optional + indexes : str or tuple[str], optional The name of a 'lookup' in the OMX file, which will be used to populate the coordinates for the two native dimensions. Or, specify "one-based" or "zero-based" to assume sequential and - consecutive numbering starting with 1 or 0 respectively. + consecutive numbering starting with 1 or 0 respectively. For + non-square OMX data, this must be given as a tuple, relating + indexes as above for each dimension of `index_names`. renames : Mapping or Collection, optional Limit the import only to these data elements. If given as a mapping, the keys will be the names of variables in the resulting @@ -256,9 +258,11 @@ def from_omx( # handle both larch.OMX and openmatrix.open_file versions if "lar" in type(omx).__module__: omx_data = omx.data + omx_lookup = omx.lookup omx_shape = omx.shape else: omx_data = omx.root["data"] + omx_lookup = omx.root["lookup"] omx_shape = omx.shape() arrays = {} @@ -285,6 +289,30 @@ def from_omx( index_names[0]: zero_based(omx_shape[0]), index_names[1]: zero_based(omx_shape[1]), } + elif isinstance(indexes, str): + if indexes in omx_lookup: + if omx_shape[0] != omx_shape[1]: + raise ValueError("singleton arbitrary coordinates on non-square arrays") + ixs = np.asarray(omx_lookup[indexes]) + indexes = { + index_names[0]: ixs, + index_names[1]: ixs, + } + else: + raise KeyError(f"{indexes} not found in OMX lookups") + elif isinstance(indexes, tuple): + indexes_ = {} + for n, (name, i) in enumerate(zip(index_names, indexes)): + if i == "one-based": + indexes_[name] = one_based(omx_shape[n]) + elif i == "zero-based": + indexes_[name] = 
zero_based(omx_shape[n]) + elif isinstance(i, str): + if i in omx_lookup: + indexes_[name] = np.asarray(omx_lookup[i]) + else: + raise KeyError(f"{i} not found in OMX lookups") + indexes = indexes_ if indexes is not None: d["coords"] = { index_name: {"dims": index_name, "data": index} diff --git a/sharrow/flows.py b/sharrow/flows.py index da5d184..68e3e8b 100644 --- a/sharrow/flows.py +++ b/sharrow/flows.py @@ -16,7 +16,7 @@ import pandas as pd import xarray as xr -from . import __version__ +from ._infer_version import __version__ from .aster import expression_for_numba, extract_all_name_tokens, extract_names_2 from .filewrite import blacken, rewrite from .relationships import DataTree diff --git a/sharrow/tests/test_datasets.py b/sharrow/tests/test_datasets.py new file mode 100644 index 0000000..f2b8b09 --- /dev/null +++ b/sharrow/tests/test_datasets.py @@ -0,0 +1,37 @@ +import tempfile +from pathlib import Path + +import numpy as np +import openmatrix +from pytest import approx + +import sharrow as sh + + +def test_dataset_construct_with_zoneids(): + tempdir = tempfile.TemporaryDirectory() + t = Path(tempdir.name) + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="w") as out: + out.create_carray("/data", "Eye", obj=np.eye(5, dtype=np.float32)) + out.create_carray("/lookup", "Zone", obj=np.asarray([11, 22, 33, 44, 55])) + shp = np.empty(2, dtype=int) + shp[0] = 5 + shp[1] = 5 + out.root._v_attrs.SHAPE = shp + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back: + ds = sh.dataset.from_omx(back, indexes="Zone") + + assert sorted(ds.coords) == ["dtaz", "otaz"] + assert ds.coords["otaz"].values == approx(np.asarray([11, 22, 33, 44, 55])) + assert sorted(ds.variables) == ["Eye", "dtaz", "otaz"] + assert ds["Eye"].data == approx(np.eye(5, dtype=np.float32)) + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back: + ds0 = sh.dataset.from_omx(back, indexes="zero-based") + assert ds0.coords["otaz"].values == 
approx(np.asarray([0, 1, 2, 3, 4])) + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back: + ds1 = sh.dataset.from_omx(back, indexes="one-based") + assert ds1.coords["otaz"].values == approx(np.asarray([1, 2, 3, 4, 5]))