From c0316ace9b18455aece8d0910a33cd4791e083ce Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Wed, 23 Oct 2024 09:31:10 -0400 Subject: [PATCH] Working remote hdf tests --- kerchunk/hdf.py | 2 +- kerchunk/tests/test_hdf.py | 22 +++++++++++----------- kerchunk/utils.py | 37 ++++++++++++++++++++++++------------- kerchunk/xarray_backend.py | 4 +++- pyproject.toml | 2 +- 5 files changed, 40 insertions(+), 27 deletions(-) diff --git a/kerchunk/hdf.py b/kerchunk/hdf.py index 7cb4b5f6..1d4d0054 100644 --- a/kerchunk/hdf.py +++ b/kerchunk/hdf.py @@ -461,7 +461,7 @@ def _translator( if h5obj.attrs.get("_FillValue") is not None: fill = h5obj.attrs.get("_FillValue") fill = encode_fill_value( - h5obj.attrs.get("_FillValue"), dt or h5obj.dtype + fill, dt or h5obj.dtype ) adims = self._get_array_dims(h5obj) diff --git a/kerchunk/tests/test_hdf.py b/kerchunk/tests/test_hdf.py index 8e2117cc..f600a127 100644 --- a/kerchunk/tests/test_hdf.py +++ b/kerchunk/tests/test_hdf.py @@ -1,7 +1,12 @@ +import asyncio import fsspec import json import os.path as osp +import zarr.core +import zarr.core.buffer +import zarr.core.group + import kerchunk.hdf import numpy as np import pytest @@ -11,33 +16,28 @@ from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links from kerchunk.combine import MultiZarrToZarr, drop from kerchunk.utils import refs_as_fs, refs_as_store -from kerchunk.zarr import fs_as_store +from kerchunk.utils import fs_as_store here = osp.dirname(__file__) def test_single(): """Test creating references for a single HDF file""" - # url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" - url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc" + url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp" so = dict(anon=True, default_fill_cache=False, default_cache_type="none") with fsspec.open(url, **so) as f: - h5chunks = SingleHdf5ToZarr(f, url, storage_options=so) + h5chunks = SingleHdf5ToZarr(f, url, storage_options=so, inline_threshold=1) test_dict = h5chunks.translate() with open("test_dict.json", "w") as f: json.dump(test_dict, f) - store = refs_as_store(test_dict) - - ds = xr.open_dataset( - store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) - ) + store = refs_as_store(test_dict, remote_options=dict(asynchronous=True, anon=True)) + ds = xr.open_zarr(store, zarr_format=2, consolidated=False) with fsspec.open(url, **so) as f: expected = xr.open_dataset(f, engine="h5netcdf") - xr.testing.assert_equal(ds.drop_vars("crs"), expected.drop_vars("crs")) @@ -164,7 +164,7 @@ def test_times(times_data): h5chunks = SingleHdf5ToZarr(f, url) test_dict = h5chunks.translate() - store = refs_as_store(test_dict) + store = refs_as_store(test_dict, remote_protocol="file") result = xr.open_dataset( store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False) ) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 71cee56a..8cc2f765 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -1,6 +1,7 @@ import base64 import copy import itertools +import fsspec.asyn from packaging.version import Version from typing import Any, cast import warnings @@ -24,12 +25,23 @@ def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs): return fs -def refs_as_store(refs, remote_protocol=None, remote_options=None): +def refs_as_store(refs, mode="r", remote_protocol=None, remote_options=None): """Convert a reference set to a zarr store""" + asynchronous = False + if is_zarr3(): + asynchronous = True + if remote_options is None: + remote_options = {"asynchronous": True} + else: + remote_options["asynchronous"] = True + fs = refs_as_fs( - refs, remote_protocol=remote_protocol, remote_options=remote_options + refs, + remote_protocol=remote_protocol, + remote_options=remote_options, + asynchronous=asynchronous, ) - return fs_as_store(fs) + return fs_as_store(fs, mode=mode) def is_zarr3(): @@ -40,18 +52,17 @@ def is_zarr3(): def dict_to_store(store_dict: dict): """Create an in memory zarr store backed by the given dictionary""" if is_zarr3(): - return zarr.storage.MemoryStore(mode="a", store_dict=store_dict) + return zarr.storage.MemoryStore(mode="w", store_dict=store_dict) else: return zarr.storage.KVStore(store_dict) -def fs_as_store(fs, mode="r", remote_protocol=None, remote_options=None): +def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, mode="r"): """Open the refs as a zarr store Parameters ---------- - refs: dict-like - the references to open + fs: fsspec.async.AsyncFileSystem mode: str Returns @@ -541,18 +552,18 @@ def templateize(strings, min_length=10, template_name="u"): def translate_refs_serializable(refs: dict): - """Translate a reference set to a serializable form, given that zarr - v3 memory stores store data in buffers by default. This modifies the + """Translate a reference set to a serializable form, given that zarr + v3 memory stores store data in buffers by default. This modifies the input dictionary in place, and returns a reference to it. - It also fixes keys that have a leading slash, which is not appropriate for - zarr v3 keys + It also fixes keys that have a leading slash, which is not appropriate for + zarr v3 keys Parameters ---------- refs: dict The reference set - + Returns ------- dict @@ -568,4 +579,4 @@ def translate_refs_serializable(refs: dict): for k in keys_to_remove: del refs[k] refs.update(new_keys) - return refs \ No newline at end of file + return refs diff --git a/kerchunk/xarray_backend.py b/kerchunk/xarray_backend.py index ca377f6d..dfbbafba 100644 --- a/kerchunk/xarray_backend.py +++ b/kerchunk/xarray_backend.py @@ -43,4 +43,6 @@ def open_reference_dataset( m = fsspec.get_mapper("reference://", fo=filename_or_obj, **storage_options) - return xr.open_dataset(m, engine="zarr", consolidated=False, **open_dataset_options) + return xr.open_dataset( + m, engine="zarr", zarr_format=2, consolidated=False, **open_dataset_options + ) diff --git a/pyproject.toml b/pyproject.toml index 3c361a2d..5eb7c0c9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ dependencies = [ "numcodecs", "numpy", "ujson", - "zarr==3.0.0b0", + "zarr", ] [project.optional-dependencies]