Fix OMX construction with arbitrary zone ids (#27)
* fallback to dynamic version

* version infer

* fix dataset from_omx for arbitrary zone ids
jpn-- authored Aug 26, 2022
1 parent 1fe091b commit 7c500f5
Showing 5 changed files with 83 additions and 5 deletions.
2 changes: 1 addition & 1 deletion sharrow/__init__.py
@@ -1,7 +1,7 @@
 from xarray import DataArray

 from . import dataset, example_data, selectors, shared_memory, sparse
-from ._version import version as __version__
+from ._infer_version import __version__, __version_tuple__
 from .dataset import Dataset
 from .digital_encoding import array_decode, array_encode
 from .flows import Flow
13 changes: 13 additions & 0 deletions sharrow/_infer_version.py
@@ -0,0 +1,13 @@
+try:
+    from ._version import __version__, __version_tuple__
+except ImportError:
+    # Package is not "installed", parse git tag at runtime
+    from importlib.metadata import PackageNotFoundError, version
+
+    try:
+        __version__ = version(__package__)
+    except PackageNotFoundError:
+        # package is not installed
+        __version__ = "999.999"
+
+    __version_tuple__ = __version__.split(".")
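
With this fallback module in place, `sharrow.__version__` should resolve from the generated `_version.py` when it exists, otherwise from installed package metadata, and finally from the "999.999" placeholder. A minimal sketch of checking the result at runtime (illustrative, not part of the commit):

import sharrow

print(sharrow.__version__)        # a real release string, or "999.999" when no metadata is found
print(sharrow.__version_tuple__)  # version components; split from the string in the fallback path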
34 changes: 31 additions & 3 deletions sharrow/dataset.py
@@ -230,16 +230,18 @@ def from_omx(
     ----------
     omx : openmatrix.File or larch.OMX
         An OMX-format file, opened for reading.
-    index_names : tuple, default ("otaz", "dtaz", "time_period")
+    index_names : tuple, default ("otaz", "dtaz")
         Should be a tuple of length 3, giving the names of the three
         dimensions. The first two names are the native dimensions from
         the open matrix file, the last is the name of the implicit
         dimension that is created by parsing array names.
-    indexes : str, optional
+    indexes : str or tuple[str], optional
         The name of a 'lookup' in the OMX file, which will be used to
         populate the coordinates for the two native dimensions. Or,
         specify "one-based" or "zero-based" to assume sequential and
-        consecutive numbering starting with 1 or 0 respectively.
+        consecutive numbering starting with 1 or 0 respectively. For
+        non-square OMX data, this must be given as a tuple, relating
+        indexes as above for each dimension of `index_names`.
     renames : Mapping or Collection, optional
         Limit the import only to these data elements. If given as a
         mapping, the keys will be the names of variables in the resulting
@@ -256,9 +258,11 @@
     # handle both larch.OMX and openmatrix.open_file versions
     if "lar" in type(omx).__module__:
         omx_data = omx.data
+        omx_lookup = omx.lookup
         omx_shape = omx.shape
     else:
         omx_data = omx.root["data"]
+        omx_lookup = omx.root["lookup"]
         omx_shape = omx.shape()

     arrays = {}
@@ -285,6 +289,30 @@
             index_names[0]: zero_based(omx_shape[0]),
             index_names[1]: zero_based(omx_shape[1]),
         }
+    elif isinstance(indexes, str):
+        if indexes in omx_lookup:
+            if omx_shape[0] != omx_shape[1]:
+                raise ValueError("singleton arbitrary coordinates on non-square arrays")
+            ixs = np.asarray(omx_lookup[indexes])
+            indexes = {
+                index_names[0]: ixs,
+                index_names[1]: ixs,
+            }
+        else:
+            raise KeyError(f"{indexes} not found in OMX lookups")
+    elif isinstance(indexes, tuple):
+        indexes_ = {}
+        for n, (name, i) in enumerate(zip(index_names, indexes)):
+            if i == "one-based":
+                indexes_[name] = one_based(omx_shape[n])
+            elif i == "zero-based":
+                indexes_[name] = zero_based(omx_shape[n])
+            elif isinstance(i, str):
+                if i in omx_lookup:
+                    indexes_[name] = np.asarray(omx_lookup[i])
+                else:
+                    raise KeyError(f"{i} not found in OMX lookups")
+        indexes = indexes_
     if indexes is not None:
         d["coords"] = {
             index_name: {"dims": index_name, "data": index}
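
The expanded `indexes` handling documented above accepts either a single lookup name (square matrices with arbitrary zone ids) or a tuple with one entry per dimension. A usage sketch; the file names and lookup names ("skims.omx", "taz_ids", "maz_to_taz.omx", "maz_ids", "omaz") are hypothetical, not part of this commit:

import openmatrix
import sharrow as sh

# Square skim matrix: one lookup supplies the zone ids for both dimensions.
with openmatrix.open_file("skims.omx", mode="r") as f:
    skims = sh.dataset.from_omx(f, indexes="taz_ids")

# Non-square data: give one index spec per dimension, mixing a lookup name
# with "one-based"/"zero-based" as needed.
with openmatrix.open_file("maz_to_taz.omx", mode="r") as f:
    maz_taz = sh.dataset.from_omx(
        f,
        index_names=("omaz", "dtaz"),
        indexes=("maz_ids", "one-based"),
    )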
2 changes: 1 addition & 1 deletion sharrow/flows.py
@@ -16,7 +16,7 @@
 import pandas as pd
 import xarray as xr

-from . import __version__
+from ._infer_version import __version__
 from .aster import expression_for_numba, extract_all_name_tokens, extract_names_2
 from .filewrite import blacken, rewrite
 from .relationships import DataTree
37 changes: 37 additions & 0 deletions sharrow/tests/test_datasets.py
@@ -0,0 +1,37 @@
+import tempfile
+from pathlib import Path
+
+import numpy as np
+import openmatrix
+from pytest import approx
+
+import sharrow as sh
+
+
+def test_dataset_construct_with_zoneids():
+    tempdir = tempfile.TemporaryDirectory()
+    t = Path(tempdir.name)
+
+    with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="w") as out:
+        out.create_carray("/data", "Eye", obj=np.eye(5, dtype=np.float32))
+        out.create_carray("/lookup", "Zone", obj=np.asarray([11, 22, 33, 44, 55]))
+        shp = np.empty(2, dtype=int)
+        shp[0] = 5
+        shp[1] = 5
+        out.root._v_attrs.SHAPE = shp
+
+    with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back:
+        ds = sh.dataset.from_omx(back, indexes="Zone")
+
+    assert sorted(ds.coords) == ["dtaz", "otaz"]
+    assert ds.coords["otaz"].values == approx(np.asarray([11, 22, 33, 44, 55]))
+    assert sorted(ds.variables) == ["Eye", "dtaz", "otaz"]
+    assert ds["Eye"].data == approx(np.eye(5, dtype=np.float32))
+
+    with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back:
+        ds0 = sh.dataset.from_omx(back, indexes="zero-based")
+        assert ds0.coords["otaz"].values == approx(np.asarray([0, 1, 2, 3, 4]))
+
+    with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back:
+        ds1 = sh.dataset.from_omx(back, indexes="one-based")
+        assert ds1.coords["otaz"].values == approx(np.asarray([1, 2, 3, 4, 5]))
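
Because the arbitrary zone ids become real coordinates, label-based selection works directly on the resulting Dataset. A small follow-on sketch (illustrative, not part of the test) reusing the dummy5.omx fixture built above:

with openmatrix.open_file(t.joinpath("dummy5.omx"), mode="r") as back:
    ds = sh.dataset.from_omx(back, indexes="Zone")
    assert ds["Eye"].sel(otaz=22, dtaz=22).item() == 1.0  # diagonal entry, selected by zone id
    assert ds["Eye"].sel(otaz=11, dtaz=55).item() == 0.0  # off-diagonal entry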
