From 7c500f586f728e990b8f25c1b6e19f53264cc87b Mon Sep 17 00:00:00 2001 From: Jeffrey Newman Date: Thu, 25 Aug 2022 21:55:24 -0500 Subject: [PATCH] Fix OMX construction with arbitrary zone ids (#27) * fallback to dynamic version * version infer * fix dataset from_omx for arbitrary zone ids --- sharrow/__init__.py | 2 +- sharrow/_infer_version.py | 13 ++++++++++++ sharrow/dataset.py | 34 ++++++++++++++++++++++++++++--- sharrow/flows.py | 2 +- sharrow/tests/test_datasets.py | 37 ++++++++++++++++++++++++++++++++++ 5 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 sharrow/_infer_version.py create mode 100644 sharrow/tests/test_datasets.py diff --git a/sharrow/__init__.py b/sharrow/__init__.py index 040cd54..100580c 100644 --- a/sharrow/__init__.py +++ b/sharrow/__init__.py @@ -1,7 +1,7 @@ from xarray import DataArray from . import dataset, example_data, selectors, shared_memory, sparse -from ._version import version as __version__ +from ._infer_version import __version__, __version_tuple__ from .dataset import Dataset from .digital_encoding import array_decode, array_encode from .flows import Flow diff --git a/sharrow/_infer_version.py b/sharrow/_infer_version.py new file mode 100644 index 0000000..c5d4d40 --- /dev/null +++ b/sharrow/_infer_version.py @@ -0,0 +1,13 @@ +try: + from ._version import __version__, __version_tuple__ +except ImportError: + # Package is not "installed", parse git tag at runtime + from importlib.metadata import PackageNotFoundError, version + + try: + __version__ = version(__package__) + except PackageNotFoundError: + # package is not installed + __version__ = "999.999" + + __version_tuple__ = tuple(__version__.split(".")) diff --git a/sharrow/dataset.py b/sharrow/dataset.py index 6c6c6fc..7918603 100755 --- a/sharrow/dataset.py +++ b/sharrow/dataset.py @@ -230,16 +230,18 @@ def from_omx( ---------- omx : openmatrix.File or larch.OMX An OMX-format file, opened for reading. 
- index_names : tuple, default ("otaz", "dtaz", "time_period") + index_names : tuple, default ("otaz", "dtaz") Should be a tuple of length 3, giving the names of the three dimensions. The first two names are the native dimensions from the open matrix file, the last is the name of the implicit dimension that is created by parsing array names. - indexes : str, optional + indexes : str or tuple[str], optional The name of a 'lookup' in the OMX file, which will be used to populate the coordinates for the two native dimensions. Or, specify "one-based" or "zero-based" to assume sequential and - consecutive numbering starting with 1 or 0 respectively. + consecutive numbering starting with 1 or 0 respectively. For + non-square OMX data, this must be given as a tuple, relating + indexes as above for each dimension of `index_names`. renames : Mapping or Collection, optional Limit the import only to these data elements. If given as a mapping, the keys will be the names of variables in the resulting @@ -256,9 +258,11 @@ def from_omx( # handle both larch.OMX and openmatrix.open_file versions if "lar" in type(omx).__module__: omx_data = omx.data + omx_lookup = omx.lookup omx_shape = omx.shape else: omx_data = omx.root["data"] + omx_lookup = omx.root["lookup"] omx_shape = omx.shape() arrays = {} @@ -285,6 +289,30 @@ def from_omx( index_names[0]: zero_based(omx_shape[0]), index_names[1]: zero_based(omx_shape[1]), } + elif isinstance(indexes, str): + if indexes in omx_lookup: + if omx_shape[0] != omx_shape[1]: + raise ValueError("singleton arbitrary coordinates on non-square arrays") + ixs = np.asarray(omx_lookup[indexes]) + indexes = { + index_names[0]: ixs, + index_names[1]: ixs, + } + else: + raise KeyError(f"{indexes} not found in OMX lookups") + elif isinstance(indexes, tuple): + indexes_ = {} + for n, (name, i) in enumerate(zip(index_names, indexes)): + if i == "one-based": + indexes_[name] = one_based(omx_shape[n]) + elif i == "zero-based": + indexes_[name] = 
zero_based(omx_shape[n]) + elif isinstance(i, str): + if i in omx_lookup: + indexes_[name] = np.asarray(omx_lookup[i]) + else: + raise KeyError(f"{i} not found in OMX lookups") + indexes = indexes_ if indexes is not None: d["coords"] = { index_name: {"dims": index_name, "data": index} diff --git a/sharrow/flows.py b/sharrow/flows.py index da5d184..68e3e8b 100644 --- a/sharrow/flows.py +++ b/sharrow/flows.py @@ -16,7 +16,7 @@ import pandas as pd import xarray as xr -from . import __version__ +from ._infer_version import __version__ from .aster import expression_for_numba, extract_all_name_tokens, extract_names_2 from .filewrite import blacken, rewrite from .relationships import DataTree diff --git a/sharrow/tests/test_datasets.py b/sharrow/tests/test_datasets.py new file mode 100644 index 0000000..f2b8b09 --- /dev/null +++ b/sharrow/tests/test_datasets.py @@ -0,0 +1,37 @@ +import tempfile +from pathlib import Path + +import numpy as np +import openmatrix +from pytest import approx + +import sharrow as sh + + +def test_dataset_construct_with_zoneids(): + tempdir = tempfile.TemporaryDirectory() + t = Path(tempdir.name) + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="w") as out: + out.create_carray("/data", "Eye", obj=np.eye(5, dtype=np.float32)) + out.create_carray("/lookup", "Zone", obj=np.asarray([11, 22, 33, 44, 55])) + shp = np.empty(2, dtype=int) + shp[0] = 5 + shp[1] = 5 + out.root._v_attrs.SHAPE = shp + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back: + ds = sh.dataset.from_omx(back, indexes="Zone") + + assert sorted(ds.coords) == ["dtaz", "otaz"] + assert ds.coords["otaz"].values == approx(np.asarray([11, 22, 33, 44, 55])) + assert sorted(ds.variables) == ["Eye", "dtaz", "otaz"] + assert ds["Eye"].data == approx(np.eye(5, dtype=np.float32)) + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back: + ds0 = sh.dataset.from_omx(back, indexes="zero-based") + assert ds0.coords["otaz"].values == 
approx(np.asarray([0, 1, 2, 3, 4])) + + with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back: + ds1 = sh.dataset.from_omx(back, indexes="one-based") + assert ds1.coords["otaz"].values == approx(np.asarray([1, 2, 3, 4, 5]))