Changes from all commits
23 commits
68d5c73
Support NETCDF4_CLASSIC in the h5engine backend
huard Sep 2, 2025
120af07
convert bytes attributes to numpy.bytes_ in NETCDF4_CLASSIC format wi…
huard Sep 2, 2025
f8f44f0
added test to confirm string attributes are stored as numpy char arra…
huard Sep 2, 2025
522d37d
Added change to whats-new
huard Sep 3, 2025
783c407
run pre-commit
huard Sep 3, 2025
2f1c781
Added test comparing CDL representation of test data written with net…
huard Sep 3, 2025
b142d38
Added global attribute to test data. Apply CLASSIC conversion to vari…
huard Sep 3, 2025
b14c373
Merge branch 'main' into fix_10676
huard Sep 3, 2025
909a96c
Use h5dump to compare file content instead of CDL. Add _nc3_strict at…
huard Sep 3, 2025
14d22a1
Merge branch 'fix_10676' of github.com:Ouranosinc/xarray into fix_10676
huard Sep 3, 2025
35b50ce
raise error if writing groups to CLASSIC file.
huard Sep 3, 2025
a187c8d
remove h5dump test. Remove _nc3_strict attribute (should go into h5ne…
huard Sep 5, 2025
0c1bc08
fix h5netcdf version check.
huard Sep 8, 2025
10707ee
Merge branch 'main' into fix_10676
huard Sep 8, 2025
03ea2de
try to fix tests
huard Sep 8, 2025
592b98b
Set default format to NETCDF4 instead of None, because passing None t…
huard Sep 8, 2025
13b60d0
Apply suggestions from code review
huard Sep 9, 2025
cf8b4be
Suggestions from review.
huard Sep 9, 2025
d0b0948
Merge branch 'fix_10676' of github.com:Ouranosinc/xarray into fix_10676
huard Sep 9, 2025
6351e66
Merge branch 'main' into fix_10676
huard Sep 22, 2025
b2a21d2
Raise an error within `get_child_store` rather that `__init__` with f…
huard Sep 22, 2025
50dbd36
Merge branch 'main' into fix_10676
huard Sep 23, 2025
6d686d4
Merge branch 'main' into fix_10676
huard Oct 2, 2025
2 changes: 2 additions & 0 deletions doc/whats-new.rst
@@ -163,6 +163,8 @@ New Features
- ``compute=False`` is now supported by :py:meth:`DataTree.to_netcdf` and
:py:meth:`DataTree.to_zarr`.
By `Stephan Hoyer <https://github.com/shoyer>`_.
- The ``h5netcdf`` engine now supports writing pseudo ``NETCDF4_CLASSIC`` files: variables and attributes are cast to types supported by the classic data model. Note that the saved files won't be recognized as genuine ``NETCDF4_CLASSIC`` files until ``h5netcdf`` adds support. (:issue:`10676`, :pull:`10686`).
By `David Huard <https://github.com/huard>`_.
- ``open_dataset`` will now correctly infer a path ending in ``.zarr/`` as zarr
By `Ian Hunt-Isaak <https://github.com/ianhi>`_.
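For context (not part of the diff above): a minimal usage sketch of the new behaviour described in the ``h5netcdf`` changelog entry, assuming the engine/format arguments handled by this PR; the dataset and filename are illustrative.

import xarray as xr

ds = xr.Dataset(
    {"t": ("x", [1.0, 2.0, 3.0])},
    attrs={"title": "example"},  # string attributes are cast to fixed-width chars
)
# Writes an HDF5 file restricted to the classic data model via the h5netcdf engine.
ds.to_netcdf("classic.nc", engine="h5netcdf", format="NETCDF4_CLASSIC")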

56 changes: 45 additions & 11 deletions xarray/backends/h5netcdf_.py
@@ -7,6 +7,7 @@
from typing import TYPE_CHECKING, Any, Self

import numpy as np
from packaging.version import Version

from xarray.backends.common import (
BACKEND_ENTRYPOINTS,
@@ -27,6 +28,7 @@
PickleableFileManager,
)
from xarray.backends.locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock
from xarray.backends.netcdf3 import encode_nc3_attr_value, encode_nc3_variable
from xarray.backends.netCDF4_ import (
BaseNetCDF4Array,
_build_and_get_enum,
@@ -124,6 +126,7 @@ def __init__(
manager: FileManager | h5netcdf.File | h5netcdf.Group,
group=None,
mode=None,
format="NETCDF4",
lock=HDF5_LOCK,
autoclose=False,
):
@@ -143,7 +146,7 @@
self._manager = manager
self._group = group
self._mode = mode
self.format = None
self.format = format or "NETCDF4"
# todo: utilizing find_root_and_group seems a bit clunky
# making filename available on h5netcdf.Group seems better
self._filename = find_root_and_group(self.ds)[0].filename
@@ -152,6 +155,9 @@
self.autoclose = autoclose

def get_child_store(self, group: str) -> Self:
if self.format == "NETCDF4_CLASSIC":
raise ValueError("Cannot create sub-groups in `NETCDF4_CLASSIC` format.")

if self._group is not None:
group = os.path.join(self._group, group)
return type(self)(
@@ -167,7 +173,7 @@ def open(
cls,
filename,
mode="r",
format=None,
format="NETCDF4",
group=None,
lock=None,
autoclose=False,
@@ -198,8 +204,8 @@
f"{magic_number!r} is not the signature of a valid netCDF4 file"
)

if format not in [None, "NETCDF4"]:
raise ValueError("invalid format for h5netcdf backend")
if format not in [None, "NETCDF4", "NETCDF4_CLASSIC"]:
raise ValueError(f"invalid format for h5netcdf backend: {format}")

kwargs = {
"invalid_netcdf": invalid_netcdf,
@@ -210,6 +216,8 @@
kwargs.update(driver_kwds)
if phony_dims is not None:
kwargs["phony_dims"] = phony_dims
if format is not None and Version(h5netcdf.__version__) > Version("1.6.4"):
kwargs["format"] = format
Comment on lines +219 to +220
Member:

Has this been in fact added to h5netcdf yet?

We should figure out the API h5netcdf will accept first before using it in xarray.

Contributor Author:

No changes have been made in h5netcdf yet.

This PR assumes that h5netcdf knows nothing of the format argument for now, but that the next version will.

The plan is then to go inside h5netcdf and make a PR to support the CLASSIC format without breaking xarray tests.

Contributor Author:

But indeed, I'm assuming h5netcdf will accept a PR adding the format argument to their API, and that it will be merged before the next release. This might be over-optimistic and I'm happy to follow suggestions here.

Member:

I really think we should finish the API on the h5netcdf side first. That eliminates the non-zero risk that h5netcdf releases a new version before your PR to h5netcdf lands, or that h5netcdf settles on a different API for this.

Contributor:

@huard I agree with @shoyer, and I'll gladly support a PR over at h5netcdf.

We should move fast, as a major version change is lurking around the corner. The integration of pyfive is almost ready. We could add the NETCDF4_CLASSIC changes before or on top of the pyfive changes and release them together as h5netcdf v2.0.0. That way we could also avoid a possible deprecation cycle.

Contributor Author:

Sounds good, see h5netcdf/h5netcdf#283

Contributor Author:

The h5netcdf PR is now merged.
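As a standalone aside (not part of the diff): the version gate in the change above relies on packaging.version.Version ordering, so the `format` keyword is only forwarded to h5netcdf.File once the installed h5netcdf is newer than 1.6.4; pre-releases of the next version also satisfy the check.

from packaging.version import Version

# Only forward `format` when h5netcdf is newer than 1.6.4.
assert Version("1.7.0") > Version("1.6.4")
assert not (Version("1.6.4") > Version("1.6.4"))
assert Version("1.7.0rc1") > Version("1.6.4")  # pre-releases of 1.7.0 also pass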


if lock is None:
if mode == "r":
@@ -223,7 +231,15 @@ def open(
else PickleableFileManager
)
manager = manager_cls(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)

return cls(
manager,
group=group,
format=format,
mode=mode,
lock=lock,
autoclose=autoclose,
)

def _acquire(self, needs_lock=True):
with self._manager.acquire_context(needs_lock) as root:
@@ -319,11 +335,27 @@ def set_dimension(self, name, length, is_unlimited=False):
else:
self.ds.dimensions[name] = length

def convert_string(self, value):
"""If format is NETCDF4_CLASSIC, convert strings to fixed width char
arrays to ensure they can be read by legacy software.

CLASSIC attributes are read by third party software as fixed width char arrays
"""
if self.format == "NETCDF4_CLASSIC":
value = encode_nc3_attr_value(value)
if isinstance(value, bytes):
value = np.bytes_(value)
Comment on lines +346 to +347
Member:

Why this special logic only for converting bytes? This seems unrelated to what we need for NETCDF4_CLASSIC.

Contributor Author:

To make sure strings are written as NC_CHAR, and not NC_STRING. See https://engee.com/helpcenter/stable/en/julia/NetCDF/strings.html

This is in fact the detail that our third party software in C++ choked on. The netCDF C library has both nc_get_att_text and nc_get_att_string functions. Calling nc_get_att_text on an NC_STRING raises an error.

return value

def set_attribute(self, key, value):
value = self.convert_string(value)
Member:

I'm not sure you need a helper function here

Contributor Author:

The convert_string function is called from two different places (variable attributes and global attributes), hence the helper function.

self.ds.attrs[key] = value

def encode_variable(self, variable, name=None):
return _encode_nc4_variable(variable, name=name)
if self.format == "NETCDF4_CLASSIC":
return encode_nc3_variable(variable, name=name)
else:
return _encode_nc4_variable(variable, name=name)

def prepare_variable(
self, name, variable, check_encoding=False, unlimited_dims=None
@@ -332,7 +364,9 @@

_ensure_no_forward_slash_in_name(name)
attrs = variable.attrs.copy()
dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)
dtype = _get_datatype(
variable, nc_format=self.format, raise_on_invalid_encoding=check_encoding
)

fillvalue = attrs.pop("_FillValue", None)

@@ -394,7 +428,7 @@ def prepare_variable(
nc4_var = self.ds[name]

for k, v in attrs.items():
nc4_var.attrs[k] = v
nc4_var.attrs[k] = self.convert_string(v)
Member:

I think this is doing the variable attribute conversion twice?

Contributor Author:

This is for variable attributes only. The set_attribute above only operates on global attributes.


target = H5NetCDFArrayWrapper(name, self)

@@ -484,7 +518,7 @@ def open_dataset(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group=None,
lock=None,
invalid_netcdf=None,
@@ -544,7 +578,7 @@ def open_datatree(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group: str | None = None,
lock=None,
invalid_netcdf=None,
@@ -587,7 +621,7 @@ def open_groups_as_dict(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group: str | None = None,
lock=None,
invalid_netcdf=None,
49 changes: 49 additions & 0 deletions xarray/tests/test_backends.py
@@ -460,6 +460,7 @@ def roundtrip(
save_kwargs = {}
if open_kwargs is None:
open_kwargs = {}

with create_tmp_file(allow_cleanup_failure=allow_cleanup_failure) as path:
self.save(data, path, **save_kwargs)
with self.open(path, **open_kwargs) as ds:
@@ -4727,6 +4728,54 @@ def create_store(self):
) as store:
yield store

@requires_h5netcdf
def test_string_attributes_stored_as_char(self, tmp_path):
import h5netcdf

original = Dataset(attrs={"foo": "bar"})
store_path = tmp_path / "tmp.nc"
original.to_netcdf(store_path, engine=self.engine, format=self.file_format)
with h5netcdf.File(store_path, "r") as ds:
# Check that the attribute is stored as a char array
assert ds._h5file.attrs["foo"].dtype == np.dtype("S3")
Comment on lines +4732 to +4740
Member:

NumPy's S dtype actually corresponds to bytes, not str. I don't think we want to use it for storing attributes in general.

Contributor Author:

Using fixed width chars replicates the behavior of the netCDF4 backend for the CLASSIC format. Again, this has to do with the NC_CHAR vs NC_STRING formats.

Sticking as close as possible to netCDF4 output increases my confidence that the h5netcdf outputs will be compatible with 3rd party software expecting the CLASSIC format.
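As a standalone sketch (not part of the diff) of the conversion discussed in this thread, assuming only NumPy: a Python str attribute ends up as a fixed-width bytes scalar, which maps to NC_CHAR rather than NC_STRING.

import numpy as np

value = "bar"
# Encode to bytes, then wrap as a NumPy bytes scalar: the result has a
# fixed-width "S3" dtype, matching what the new test asserts above.
converted = np.bytes_(value.encode("utf-8"))
assert converted == b"bar"
assert converted.dtype == np.dtype("S3")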



@requires_h5netcdf
class TestNetCDF4ClassicViaH5NetCDFData(TestNetCDF4ClassicViaNetCDF4Data):
engine: T_NetcdfEngine = "h5netcdf"
file_format: T_NetcdfTypes = "NETCDF4_CLASSIC"

@contextlib.contextmanager
def create_store(self):
with create_tmp_file() as tmp_file:
with backends.H5NetCDFStore.open(
tmp_file, mode="w", format="NETCDF4_CLASSIC"
) as store:
yield store

@requires_netCDF4
def test_cross_engine_read_write_netcdf4(self) -> None:
# Drop dim3, because its labels include strings. These appear to be
# not properly read with python-netCDF4, which converts them into
# unicode instead of leaving them as bytes.
data = create_test_data().drop_vars("dim3")
data.attrs["foo"] = "bar"
valid_engines: list[T_NetcdfEngine] = ["netcdf4", "h5netcdf"]
for write_engine in valid_engines:
with create_tmp_file() as tmp_file:
data.to_netcdf(tmp_file, engine=write_engine, format=self.file_format)
for read_engine in valid_engines:
with open_dataset(tmp_file, engine=read_engine) as actual:
assert_identical(data, actual)

def test_group_fails(self):
# Check writing group data fails with CLASSIC format
original = create_test_data()
with pytest.raises(
ValueError, match=r"Cannot create sub-groups in `NETCDF4_CLASSIC` format."
):
original.to_netcdf(group="sub", format=self.file_format, engine=self.engine)


@requires_scipy_or_netCDF4
class TestGenericNetCDFData(NetCDF3Only, CFEncodedBase):
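To close, a rough end-to-end sketch of the behaviour exercised by the new tests (illustrative, not part of the diff; assumes both the netCDF4 and h5netcdf libraries are installed, and the filenames are placeholders).

import xarray as xr

ds = xr.Dataset({"t": ("x", [1.0, 2.0])}, attrs={"foo": "bar"})

# Write a pseudo NETCDF4_CLASSIC file with h5netcdf, read it back with netCDF4.
ds.to_netcdf("roundtrip.nc", engine="h5netcdf", format="NETCDF4_CLASSIC")
with xr.open_dataset("roundtrip.nc", engine="netcdf4") as actual:
    xr.testing.assert_identical(ds, actual)

# Writing groups is rejected for the CLASSIC format.
try:
    ds.to_netcdf("grouped.nc", group="sub", engine="h5netcdf", format="NETCDF4_CLASSIC")
except ValueError as err:
    print(err)  # Cannot create sub-groups in `NETCDF4_CLASSIC` format.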