Skip to content

Commit

Permalink
Working remote hdf tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mpiannucci committed Oct 23, 2024
1 parent 6e5741c commit c0316ac
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 27 deletions.
2 changes: 1 addition & 1 deletion kerchunk/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,7 +461,7 @@ def _translator(
if h5obj.attrs.get("_FillValue") is not None:
fill = h5obj.attrs.get("_FillValue")
fill = encode_fill_value(
h5obj.attrs.get("_FillValue"), dt or h5obj.dtype
fill, dt or h5obj.dtype
)

adims = self._get_array_dims(h5obj)
Expand Down
22 changes: 11 additions & 11 deletions kerchunk/tests/test_hdf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
import asyncio
import fsspec
import json
import os.path as osp

import zarr.core
import zarr.core.buffer
import zarr.core.group

import kerchunk.hdf
import numpy as np
import pytest
Expand All @@ -11,33 +16,28 @@
from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links
from kerchunk.combine import MultiZarrToZarr, drop
from kerchunk.utils import refs_as_fs, refs_as_store
from kerchunk.zarr import fs_as_store
from kerchunk.utils import fs_as_store

here = osp.dirname(__file__)


def test_single():
"""Test creating references for a single HDF file"""
# url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
url = "s3://noaa-nos-ofs-pds/ngofs2/netcdf/202410/ngofs2.t03z.20241001.2ds.f020.nc"
url = "s3://noaa-nwm-retro-v2.0-pds/full_physics/2017/201704010000.CHRTOUT_DOMAIN1.comp"
so = dict(anon=True, default_fill_cache=False, default_cache_type="none")

with fsspec.open(url, **so) as f:
h5chunks = SingleHdf5ToZarr(f, url, storage_options=so)
h5chunks = SingleHdf5ToZarr(f, url, storage_options=so, inline_threshold=1)
test_dict = h5chunks.translate()

with open("test_dict.json", "w") as f:
json.dump(test_dict, f)

store = refs_as_store(test_dict)

ds = xr.open_dataset(
store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)
)
store = refs_as_store(test_dict, remote_options=dict(asynchronous=True, anon=True))
ds = xr.open_zarr(store, zarr_format=2, consolidated=False)

with fsspec.open(url, **so) as f:
expected = xr.open_dataset(f, engine="h5netcdf")

xr.testing.assert_equal(ds.drop_vars("crs"), expected.drop_vars("crs"))


Expand Down Expand Up @@ -164,7 +164,7 @@ def test_times(times_data):
h5chunks = SingleHdf5ToZarr(f, url)
test_dict = h5chunks.translate()

store = refs_as_store(test_dict)
store = refs_as_store(test_dict, remote_protocol="file")
result = xr.open_dataset(
store, engine="zarr", zarr_format=2, backend_kwargs=dict(consolidated=False)
)
Expand Down
37 changes: 24 additions & 13 deletions kerchunk/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import copy
import itertools
import fsspec.asyn
from packaging.version import Version
from typing import Any, cast
import warnings
Expand All @@ -24,12 +25,23 @@ def refs_as_fs(refs, remote_protocol=None, remote_options=None, **kwargs):
return fs


def refs_as_store(refs, remote_protocol=None, remote_options=None):
def refs_as_store(refs, mode="r", remote_protocol=None, remote_options=None):
"""Convert a reference set to a zarr store"""
asynchronous = False
if is_zarr3():
asynchronous = True
if remote_options is None:
remote_options = {"asynchronous": True}
else:
remote_options["asynchronous"] = True

fs = refs_as_fs(
refs, remote_protocol=remote_protocol, remote_options=remote_options
refs,
remote_protocol=remote_protocol,
remote_options=remote_options,
asynchronous=asynchronous,
)
return fs_as_store(fs)
return fs_as_store(fs, mode=mode)


def is_zarr3():
Expand All @@ -40,18 +52,17 @@ def is_zarr3():
def dict_to_store(store_dict: dict):
"""Create an in memory zarr store backed by the given dictionary"""
if is_zarr3():
return zarr.storage.MemoryStore(mode="a", store_dict=store_dict)
return zarr.storage.MemoryStore(mode="w", store_dict=store_dict)
else:
return zarr.storage.KVStore(store_dict)


def fs_as_store(fs, mode="r", remote_protocol=None, remote_options=None):
def fs_as_store(fs: fsspec.asyn.AsyncFileSystem, mode="r"):
"""Open the refs as a zarr store
Parameters
----------
refs: dict-like
the references to open
fs: fsspec.async.AsyncFileSystem
mode: str
Returns
Expand Down Expand Up @@ -541,18 +552,18 @@ def templateize(strings, min_length=10, template_name="u"):


def translate_refs_serializable(refs: dict):
"""Translate a reference set to a serializable form, given that zarr
v3 memory stores store data in buffers by default. This modifies the
"""Translate a reference set to a serializable form, given that zarr
v3 memory stores store data in buffers by default. This modifies the
input dictionary in place, and returns a reference to it.
It also fixes keys that have a leading slash, which is not appropriate for
zarr v3 keys
It also fixes keys that have a leading slash, which is not appropriate for
zarr v3 keys
Parameters
----------
refs: dict
The reference set
Returns
-------
dict
Expand All @@ -568,4 +579,4 @@ def translate_refs_serializable(refs: dict):
for k in keys_to_remove:
del refs[k]
refs.update(new_keys)
return refs
return refs
4 changes: 3 additions & 1 deletion kerchunk/xarray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,6 @@ def open_reference_dataset(

m = fsspec.get_mapper("reference://", fo=filename_or_obj, **storage_options)

return xr.open_dataset(m, engine="zarr", consolidated=False, **open_dataset_options)
return xr.open_dataset(
m, engine="zarr", zarr_format=2, consolidated=False, **open_dataset_options
)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ dependencies = [
"numcodecs",
"numpy",
"ujson",
"zarr==3.0.0b0",
"zarr",
]

[project.optional-dependencies]
Expand Down

0 comments on commit c0316ac

Please sign in to comment.