Add support for NETCDF4_CLASSIC to h5netcdf engine #10686
File 1 of 2: changes to the h5netcdf backend store
@@ -7,6 +7,7 @@
from typing import TYPE_CHECKING, Any, Self

import numpy as np
from packaging.version import Version

from xarray.backends.common import (
BACKEND_ENTRYPOINTS,

@@ -27,6 +28,7 @@
PickleableFileManager,
)
from xarray.backends.locks import HDF5_LOCK, combine_locks, ensure_lock, get_write_lock
from xarray.backends.netcdf3 import encode_nc3_attr_value, encode_nc3_variable
from xarray.backends.netCDF4_ import (
BaseNetCDF4Array,
_build_and_get_enum,
@@ -124,6 +126,7 @@ def __init__(
manager: FileManager | h5netcdf.File | h5netcdf.Group,
group=None,
mode=None,
format="NETCDF4",
lock=HDF5_LOCK,
autoclose=False,
):

@@ -143,7 +146,7 @@ def __init__(
self._manager = manager
self._group = group
self._mode = mode
self.format = None
self.format = format or "NETCDF4"
# todo: utilizing find_root_and_group seems a bit clunky
# making filename available on h5netcdf.Group seems better
self._filename = find_root_and_group(self.ds)[0].filename

@@ -152,6 +155,9 @@ def __init__(
self.autoclose = autoclose

def get_child_store(self, group: str) -> Self:
if self.format == "NETCDF4_CLASSIC":
raise ValueError("Cannot create sub-groups in `NETCDF4_CLASSIC` format.")

if self._group is not None:
group = os.path.join(self._group, group)
return type(self)(
@@ -167,7 +173,7 @@ def open(
cls,
filename,
mode="r",
format=None,
format="NETCDF4",
group=None,
lock=None,
autoclose=False,

@@ -198,8 +204,8 @@ def open(
f"{magic_number!r} is not the signature of a valid netCDF4 file"
)

if format not in [None, "NETCDF4"]:
raise ValueError("invalid format for h5netcdf backend")
if format not in [None, "NETCDF4", "NETCDF4_CLASSIC"]:
raise ValueError(f"invalid format for h5netcdf backend: {format}")

kwargs = {
"invalid_netcdf": invalid_netcdf,

@@ -210,6 +216,8 @@ def open(
kwargs.update(driver_kwds)
if phony_dims is not None:
kwargs["phony_dims"] = phony_dims
if format is not None and Version(h5netcdf.__version__) > Version("1.6.4"):
kwargs["format"] = format

if lock is None:
if mode == "r":

@@ -223,7 +231,15 @@ def open(
else PickleableFileManager
)
manager = manager_cls(h5netcdf.File, filename, mode=mode, kwargs=kwargs)
return cls(manager, group=group, mode=mode, lock=lock, autoclose=autoclose)

return cls(
manager,
group=group,
format=format,
mode=mode,
lock=lock,
autoclose=autoclose,
)

def _acquire(self, needs_lock=True):
with self._manager.acquire_context(needs_lock) as root:
@@ -319,11 +335,27 @@ def set_dimension(self, name, length, is_unlimited=False):
else:
self.ds.dimensions[name] = length

def convert_string(self, value):
"""If format is NETCDF4_CLASSIC, convert strings to fixed width char
arrays to ensure they can be read by legacy software.

CLASSIC attributes are read by third party software as fixed width char arrays
"""
if self.format == "NETCDF4_CLASSIC":
value = encode_nc3_attr_value(value)
if isinstance(value, bytes):
value = np.bytes_(value)
Comment on lines +346 to +347:

Why this special logic only for converting bytes? This seems unrelated to what we need for NETCDF4_CLASSIC.

Reply: To make sure strings are written as … This is in fact the detail that our third party software in C++ choked on. The netCDF C library has both …
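As an illustration of the conversion under discussion (a sketch, not part of the diff): a Python str attribute is first encoded to bytes, roughly what encode_nc3_attr_value does for string values, and then wrapped in a NumPy bytes scalar so the stored attribute carries a fixed-width "S" dtype rather than a variable-length string type.

```python
import numpy as np

# Sketch of the CLASSIC attribute conversion: str -> UTF-8 bytes -> fixed-width scalar.
value = "bar"
if isinstance(value, str):
    value = value.encode("utf-8")  # plain bytes
value = np.bytes_(value)           # NumPy scalar with dtype "S3"
print(value.dtype)                 # dtype('S3')
```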
return value

def set_attribute(self, key, value):
value = self.convert_string(value)
I'm not sure you need a helper function here.

Reply: The …
self.ds.attrs[key] = value

def encode_variable(self, variable, name=None):
return _encode_nc4_variable(variable, name=name)
if self.format == "NETCDF4_CLASSIC":
return encode_nc3_variable(variable, name=name)
else:
return _encode_nc4_variable(variable, name=name)

def prepare_variable(
self, name, variable, check_encoding=False, unlimited_dims=None

@@ -332,7 +364,9 @@ def prepare_variable(
_ensure_no_forward_slash_in_name(name)
attrs = variable.attrs.copy()
dtype = _get_datatype(variable, raise_on_invalid_encoding=check_encoding)
dtype = _get_datatype(
variable, nc_format=self.format, raise_on_invalid_encoding=check_encoding
)

fillvalue = attrs.pop("_FillValue", None)

@@ -394,7 +428,7 @@ def prepare_variable(
nc4_var = self.ds[name]

for k, v in attrs.items():
nc4_var.attrs[k] = v
nc4_var.attrs[k] = self.convert_string(v)
I think this is doing the variable attribute conversion twice?

Reply: This here is for variable attributes only. The …
target = H5NetCDFArrayWrapper(name, self)

@@ -484,7 +518,7 @@ def open_dataset(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group=None,
lock=None,
invalid_netcdf=None,

@@ -544,7 +578,7 @@ def open_datatree(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group: str | None = None,
lock=None,
invalid_netcdf=None,

@@ -587,7 +621,7 @@ def open_groups_as_dict(
drop_variables: str | Iterable[str] | None = None,
use_cftime=None,
decode_timedelta=None,
format=None,
format="NETCDF4",
group: str | None = None,
lock=None,
invalid_netcdf=None,
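Taken together, these changes let the h5netcdf engine write NETCDF4_CLASSIC files. A sketch of the intended usage, assuming this PR plus a new-enough h5netcdf are installed (the file name and variables below are made up for illustration):

```python
import numpy as np
import xarray as xr

# Hypothetical dataset with a global attribute and a variable attribute.
ds = xr.Dataset(
    {"t": ("x", np.arange(3, dtype="float32"), {"units": "K"})},
    attrs={"title": "classic example"},
)

# New capability added by this PR: NETCDF4_CLASSIC through the h5netcdf engine.
ds.to_netcdf("classic.nc", engine="h5netcdf", format="NETCDF4_CLASSIC")

# Reading back works with either engine.
with xr.open_dataset("classic.nc", engine="h5netcdf") as actual:
    print(actual)
```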
File 2 of 2: changes to the backend test suite
@@ -460,6 +460,7 @@ def roundtrip(
save_kwargs = {}
if open_kwargs is None:
open_kwargs = {}

with create_tmp_file(allow_cleanup_failure=allow_cleanup_failure) as path:
self.save(data, path, **save_kwargs)
with self.open(path, **open_kwargs) as ds:

@@ -4727,6 +4728,54 @@ def create_store(self):
) as store:
yield store

@requires_h5netcdf
def test_string_attributes_stored_as_char(self, tmp_path):
import h5netcdf

original = Dataset(attrs={"foo": "bar"})
store_path = tmp_path / "tmp.nc"
original.to_netcdf(store_path, engine=self.engine, format=self.file_format)
with h5netcdf.File(store_path, "r") as ds:
# Check that the attribute is stored as a char array
assert ds._h5file.attrs["foo"].dtype == np.dtype("S3")
Comment on lines +4732 to +4740:

NumPy's …

Reply: Using fixed width chars replicates the behavior of the netCDF4 backend for the CLASSIC format. Again, this has to do with the … Sticking as close as possible to netCDF4 output increases my confidence that the h5netcdf outputs will be compatible with 3rd party software expecting the CLASSIC format.
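One way to compare the two backends' CLASSIC output directly is to look at the raw HDF5 attribute types. A sketch of that check (illustrative only; the file names and the use of h5py for inspection are assumptions, and the h5netcdf write assumes this PR is installed):

```python
import h5py
import numpy as np
import xarray as xr

ds = xr.Dataset(attrs={"foo": "bar"})

# Write the same dataset with both engines in NETCDF4_CLASSIC format.
ds.to_netcdf("via_netcdf4.nc", engine="netcdf4", format="NETCDF4_CLASSIC")
ds.to_netcdf("via_h5netcdf.nc", engine="h5netcdf", format="NETCDF4_CLASSIC")

# Inspect the stored attribute; both should come back as fixed-width char data.
for path in ("via_netcdf4.nc", "via_h5netcdf.nc"):
    with h5py.File(path, "r") as f:
        print(path, f.attrs["foo"], np.asarray(f.attrs["foo"]).dtype)
```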
@requires_h5netcdf
class TestNetCDF4ClassicViaH5NetCDFData(TestNetCDF4ClassicViaNetCDF4Data):
engine: T_NetcdfEngine = "h5netcdf"
file_format: T_NetcdfTypes = "NETCDF4_CLASSIC"

@contextlib.contextmanager
def create_store(self):
with create_tmp_file() as tmp_file:
with backends.H5NetCDFStore.open(
tmp_file, mode="w", format="NETCDF4_CLASSIC"
) as store:
yield store

@requires_netCDF4
def test_cross_engine_read_write_netcdf4(self) -> None:
# Drop dim3, because its labels include strings. These appear to be
# not properly read with python-netCDF4, which converts them into
# unicode instead of leaving them as bytes.
data = create_test_data().drop_vars("dim3")
data.attrs["foo"] = "bar"
valid_engines: list[T_NetcdfEngine] = ["netcdf4", "h5netcdf"]
for write_engine in valid_engines:
with create_tmp_file() as tmp_file:
data.to_netcdf(tmp_file, engine=write_engine, format=self.file_format)
for read_engine in valid_engines:
with open_dataset(tmp_file, engine=read_engine) as actual:
assert_identical(data, actual)

def test_group_fails(self):
# Check writing group data fails with CLASSIC format
original = create_test_data()
with pytest.raises(
ValueError, match=r"Cannot create sub-groups in `NETCDF4_CLASSIC` format."
):
original.to_netcdf(group="sub", format=self.file_format, engine=self.engine)

@requires_scipy_or_netCDF4
class TestGenericNetCDFData(NetCDF3Only, CFEncodedBase):
Has this in fact been added to h5netcdf yet? We should figure out the API h5netcdf will accept first before using it in xarray.

No change has been made in h5netcdf. This PR assumes that h5netcdf knows nothing of the format argument for now, but that the next version will. The plan is then to open a PR against h5netcdf to support the CLASSIC format without breaking xarray tests.

But indeed, I'm assuming h5netcdf will accept a PR adding the format argument to their API, and that it will be merged before the next release. This might be over-optimistic and I'm happy to follow suggestions here.

I really think we should finish the API on the h5netcdf side first. That eliminates the non-zero risk that h5netcdf releases a new version before your PR to h5netcdf lands, or that h5netcdf settles on a different API for this.

@huard I agree with @shoyer and I'm gladly supporting a PR over at h5netcdf. We should move fast, as a major version change is lurking around the corner. The integration of pyfive is almost ready. We could add the NETCDF4_CLASSIC changes before or on top of the pyfive changes and release them together as h5netcdf v2.0.0. That way we could also avoid a possible deprecation cycle.

Sounds good, see h5netcdf/h5netcdf#283.

The h5netcdf PR is now merged.