zarr-developers · TomNicholas · Oct 19, 2024 · Oct 17, 2024 · Oct 17, 2024 · Oct 17, 2024
diff --git a/virtualizarr/backend.py b/virtualizarr/backend.py
diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py
@@ -3,10 +3,13 @@
 
 import numpy as np
 
-from ..types.kerchunk import KerchunkArrRefs
-from ..zarr import ZArray
-from .array_api import MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, _isnan
-from .manifest import ChunkManifest
+from virtualizarr.manifests.array_api import (
+    MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS,
+    _isnan,
+)
+from virtualizarr.manifests.manifest import ChunkManifest
+from virtualizarr.types.kerchunk import KerchunkArrRefs
+from virtualizarr.zarr import ZArray
 
 
 class ManifestArray:
@@ -61,7 +64,7 @@ def __init__(
 
     @classmethod
     def _from_kerchunk_refs(cls, arr_refs: KerchunkArrRefs) -> "ManifestArray":
-        from virtualizarr.readers.kerchunk import (
+        from virtualizarr.translators.kerchunk import (
             fully_decode_arr_refs,
             parse_array_refs,
         )

diff --git a/virtualizarr/readers/common.py b/virtualizarr/readers/common.py
@@ -0,0 +1,195 @@
+import os
+import warnings
+from abc import ABC
+from collections.abc import Iterable, Mapping, MutableMapping
+from io import BufferedIOBase
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Hashable,
+    Optional,
+    cast,
+)
+
+import xarray as xr
+from xarray import Dataset
+from xarray.backends import AbstractDataStore, BackendArray
+from xarray.core.indexes import Index, PandasIndex
+from xarray.core.variable import IndexVariable, Variable
+
+from virtualizarr.manifests import ManifestArray
+from virtualizarr.utils import _FsspecFSFromFilepath
+
+XArrayOpenT = str | os.PathLike[Any] | BufferedIOBase | AbstractDataStore
+
+if TYPE_CHECKING:
+    try:
+        from xarray import DataTree  # type: ignore[attr-defined]
+    except ImportError:
+        DataTree = Any
+
+
+class ManifestBackendArray(ManifestArray, BackendArray):
+    """Using this prevents xarray from wrapping the ManifestArray in ExplicitIndexingAdapter etc."""
+
+    ...
+
+
+def open_loadable_vars_and_indexes(
+    filepath: str,
+    loadable_variables,
+    reader_options,
+    drop_variables,
+    indexes,
+    group,
+    decode_times,
+) -> tuple[Mapping[str, Variable], Mapping[str, Index]]:
+    """
+    Open selected variables and indexes using xarray, returning them as a ``(loadable_vars, indexes)`` pair.
+
+    Relies on xr.open_dataset and its auto-detection of filetypes to find the correct installed backend.
+    """
+
+    # TODO get rid of this if?
+    if indexes is None or len(loadable_variables) > 0:
+        # TODO we are reading a bunch of stuff we know we won't need here, e.g. all of the data variables...
+        # TODO it would also be nice if we could somehow consolidate this with the reading of the kerchunk references
+        # TODO really we probably want a dedicated xarray backend that iterates over all variables only once
+        fpath = _FsspecFSFromFilepath(
+            filepath=filepath, reader_options=reader_options
+        ).open_file()
+
+        # fpath can be `Any` thanks to fsspec.filesystem(...).open() returning Any.
+        # We'll (hopefully safely) cast it to what xarray is expecting, but this might let errors through.
+
+        ds = xr.open_dataset(
+            cast(XArrayOpenT, fpath),
+            drop_variables=drop_variables,
+            group=group,
+            decode_times=decode_times,
+        )
+
+        if indexes is None:
+            warnings.warn(
+                "Specifying `indexes=None` will create in-memory pandas indexes for each 1D coordinate, but concatenation of ManifestArrays backed by pandas indexes is not yet supported (see issue #18). "
+                "You almost certainly want to pass `indexes={}` to `open_virtual_dataset` instead."
+            )
+
+            # add default indexes by reading data from file
+            indexes = {name: index for name, index in ds.xindexes.items()}
+        elif indexes != {}:
+            # TODO allow manual specification of index objects
+            raise NotImplementedError()
+        else:
+            indexes = dict(**indexes)  # for type hinting: to allow mutation
+
+        # TODO we should drop these earlier by using drop_variables
+        loadable_vars = {
+            str(name): var
+            for name, var in ds.variables.items()
+            if name in loadable_variables
+        }
+
+        # if we only read the indexes we can just close the file right away as nothing is lazy
+        if loadable_vars == {}:
+            ds.close()
+    else:
+        loadable_vars = {}
+        indexes = {}
+
+    return loadable_vars, indexes
+
+
+def construct_virtual_dataset(
+    virtual_vars,
+    loadable_vars,
+    indexes,
+    coord_names,
+    attrs,
+) -> Dataset:
+    """Construct a virtual Dataset from constituent parts."""
+
+    vars = {**virtual_vars, **loadable_vars}
+
+    data_vars, coords = separate_coords(vars, indexes, coord_names)
+
+    vds = xr.Dataset(
+        data_vars,
+        coords=coords,
+        # indexes={},  # TODO should be added in a later version of xarray
+        attrs=attrs,
+    )
+
+    # TODO we should probably also use vds.set_close() to tell xarray how to close the file we opened
+
+    return vds
+
+
+def separate_coords(
+    vars: Mapping[str, xr.Variable],
+    indexes: MutableMapping[str, Index],
+    coord_names: Iterable[str] | None = None,
+) -> tuple[dict[str, xr.Variable], xr.Coordinates]:
+    """
+    Try to generate a set of coordinates that won't cause xarray to automatically build a pandas.Index for the 1D coordinates.
+
+    Currently requires this function as a workaround unless xarray PR #8124 is merged.
+
+    Will also preserve any loaded variables and indexes it is passed; `indexes` is updated in place for loaded 1D IndexVariables.
+    """
+
+    # materialise once so a one-shot iterable is not exhausted by the repeated `in` checks below
+    coord_names = [] if coord_names is None else list(coord_names)
+
+    # split data and coordinate variables (promote dimension coordinates)
+    data_vars = {}
+    coord_vars: dict[
+        str, tuple[Hashable, Any, dict[Any, Any], dict[Any, Any]] | xr.Variable
+    ] = {}
+    for name, var in vars.items():
+        if name in coord_names or var.dims == (name,):
+            # use workaround to avoid creating IndexVariables described here https://github.com/pydata/xarray/pull/8107#discussion_r1311214263
+            if len(var.dims) == 1:
+                dim1d, *_ = var.dims
+                coord_vars[name] = (dim1d, var.data, var.attrs, var.encoding)
+
+                if isinstance(var, IndexVariable):
+                    # unless variable actually already is a loaded IndexVariable,
+                    # in which case we need to keep it and add the corresponding indexes explicitly
+                    coord_vars[str(name)] = var
+                    # TODO this seems suspect - will it handle datetimes?
+                    indexes[name] = PandasIndex(var, dim1d)
+            else:
+                coord_vars[name] = var
+        else:
+            data_vars[name] = var
+
+    coords = xr.Coordinates(coord_vars, indexes=indexes)
+
+    return data_vars, coords
+
+
+class VirtualBackend(ABC):  # base interface for reader backends; both methods raise NotImplementedError unless overridden
+    @staticmethod
+    def open_virtual_dataset(
+        filepath: str,
+        group: str | None = None,
+        drop_variables: Iterable[str] | None = None,
+        loadable_variables: Iterable[str] | None = None,
+        decode_times: bool | None = None,
+        indexes: Mapping[str, Index] | None = None,
+        reader_options: Optional[dict] = None,
+    ) -> Dataset:
+        raise NotImplementedError()
+
+    @staticmethod
+    def open_virtual_datatree(
+        path: str,
+        group: str | None = None,
+        drop_variables: Iterable[str] | None = None,
+        loadable_variables: Iterable[str] | None = None,
+        decode_times: bool | None = None,
+        indexes: Mapping[str, Index] | None = None,
+        reader_options: Optional[dict] = None,
+    ) -> "DataTree":
+        raise NotImplementedError()
diff --git a/virtualizarr/readers/dmrpp.py b/virtualizarr/readers/dmrpp.py
@@ -2,18 +2,55 @@
 import warnings
 from collections import defaultdict
 from collections.abc import Mapping
-from typing import Any, Optional
+from typing import Any, Iterable, Optional
 from xml.etree import ElementTree as ET
 
 import numpy as np
-import xarray as xr
+from xarray import Coordinates, Dataset
 from xarray.core.indexes import Index
+from xarray.core.variable import Variable
 
 from virtualizarr.manifests import ChunkManifest, ManifestArray
+from virtualizarr.readers.common import VirtualBackend
 from virtualizarr.types import ChunkKey
+from virtualizarr.utils import _FsspecFSFromFilepath, check_for_collisions
 from virtualizarr.zarr import ZArray
 
 
+class DMRPPVirtualBackend(VirtualBackend):
+    @staticmethod
+    def open_virtual_dataset(
+        filepath: str,
+        group: str | None = None,
+        drop_variables: Iterable[str] | None = None,
+        loadable_variables: Iterable[str] | None = None,
+        decode_times: bool | None = None,
+        indexes: Mapping[str, Index] | None = None,
+        reader_options: Optional[dict] = None,
+    ) -> Dataset:
+        loadable_variables, drop_variables = check_for_collisions(
+            drop_variables=drop_variables,
+            loadable_variables=loadable_variables,
+        )
+
+        if loadable_variables != [] or decode_times or indexes is None:
+            raise NotImplementedError(
+                "Specifying `loadable_variables` or auto-creating indexes with `indexes=None` is not supported for dmrpp files."
+            )
+
+        if group:
+            raise NotImplementedError()
+
+        fpath = _FsspecFSFromFilepath(
+            filepath=filepath, reader_options=reader_options
+        ).open_file()
+        # removesuffix, not strip: strip(".dmrpp") trims any of the characters ".dmrp" from BOTH ends of the path
+        parser = DMRParser(fpath.read(), data_filepath=filepath.removesuffix(".dmrpp"))
+        vds = parser.parse_dataset()
+
+        return vds.drop_vars(drop_variables)
+
+
 class DMRParser:
     """
     Parser for the OPeNDAP DMR++ XML format.
@@ -69,9 +106,7 @@ def __init__(self, dmr: str, data_filepath: Optional[str] = None):
             data_filepath if data_filepath is not None else self.root.attrib["name"]
         )
 
-    def parse_dataset(
-        self, group=None, indexes: Mapping[str, Index] = {}
-    ) -> xr.Dataset:
+    def parse_dataset(self, group=None, indexes: Mapping[str, Index] = {}) -> Dataset:
         """
         Parses the given file and creates a virtual xr.Dataset with ManifestArrays.
 
@@ -128,7 +163,7 @@ def _parse_netcdf4_dataset(
         root: ET.Element,
         group: Optional[str] = None,
         indexes: Mapping[str, Index] = {},
-    ) -> xr.Dataset:
+    ) -> Dataset:
         """
         Parse the dataset from the netcdf4 based dmrpp with groups, starting at the given group.
         Set root to the given group.
@@ -201,7 +236,7 @@ def _parse_hdf5_dataset(
         root: ET.Element,
         group: Optional[str] = None,
         indexes: Mapping[str, Index] = {},
-    ) -> xr.Dataset:
+    ) -> Dataset:
         """
         Parse the dataset from the HDF5 based dmrpp with groups, starting at the given group.
         Set root to the given group.
@@ -331,7 +366,7 @@ def _split_hdf5(self, root: ET.Element) -> dict[str, ET.Element]:
 
     def _parse_dataset(
         self, root: ET.Element, indexes: Mapping[str, Index] = {}
-    ) -> xr.Dataset:
+    ) -> Dataset:
         """
         Parse the dataset using the root element of the DMR file.
 
@@ -353,8 +388,8 @@ def _parse_dataset(
         if len(coord_names) == 0 or len(coord_names) < len(dataset_dims):
             coord_names = set(dataset_dims.keys())
         # Seperate and parse coords + data variables
-        coord_vars: dict[str, xr.Variable] = {}
-        data_vars: dict[str, xr.Variable] = {}
+        coord_vars: dict[str, Variable] = {}
+        data_vars: dict[str, Variable] = {}
         for var_tag in self._find_var_tags(root):
             variable = self._parse_variable(var_tag, dataset_dims)
             if var_tag.attrib["name"] in coord_names:
@@ -365,9 +400,9 @@ def _parse_dataset(
         attrs: dict[str, str] = {}
         for attr_tag in self.root.iterfind("dap:Attribute", self._ns):
             attrs.update(self._parse_attribute(attr_tag))
-        return xr.Dataset(
+        return Dataset(
             data_vars=data_vars,
-            coords=xr.Coordinates(coords=coord_vars, indexes=indexes),
+            coords=Coordinates(coords=coord_vars, indexes=indexes),
             attrs=attrs,
         )
 
@@ -484,7 +519,7 @@ def _parse_multi_dims(
 
     def _parse_variable(
         self, var_tag: ET.Element, dataset_dims: dict[str, int]
-    ) -> xr.Variable:
+    ) -> Variable:
         """
         Parse a variable from a DMR tag.
 
@@ -542,7 +577,7 @@ def _parse_variable(
         )
         marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest)
         encoding = {k: attrs.get(k) for k in self._encoding_keys if k in attrs}
-        return xr.Variable(
+        return Variable(
             dims=dim_shapes.keys(), data=marr, attrs=attrs, encoding=encoding
         )