From 634eeb4ff54e09dab3aa3585dfbf274cb999c8a2 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 15 Aug 2023 10:32:51 -0400 Subject: [PATCH 1/4] working with zarr in zip --- kerchunk/netCDF3.py | 3 +- kerchunk/tests/test_zarr.py | 65 +++++++++++++++++++++++++++++++++++++ kerchunk/utils.py | 9 +++-- kerchunk/zarr.py | 3 +- setup.py | 2 +- 5 files changed, 77 insertions(+), 5 deletions(-) create mode 100644 kerchunk/tests/test_zarr.py diff --git a/kerchunk/netCDF3.py b/kerchunk/netCDF3.py index b0675858..a3b0c58f 100644 --- a/kerchunk/netCDF3.py +++ b/kerchunk/netCDF3.py @@ -59,6 +59,7 @@ def __init__( self.threshold = inline_threshold self.max_chunk_size = max_chunk_size self.out = {} + self.storage_options = storage_options with fsspec.open(filename, **(storage_options or {})) as fp: super().__init__( fp, *args, mmap=False, mode="r", maskandscale=False, **kwargs @@ -259,7 +260,7 @@ def translate(self): ) if self.threshold > 0: - out = do_inline(out, self.threshold) + out = do_inline(out, self.threshold, remote_options=self.storage_options) out = _encode_for_JSON(out) return {"version": 1, "refs": out} diff --git a/kerchunk/tests/test_zarr.py b/kerchunk/tests/test_zarr.py new file mode 100644 index 00000000..98913463 --- /dev/null +++ b/kerchunk/tests/test_zarr.py @@ -0,0 +1,65 @@ +import xarray as xr +import pandas as pd +import pytest +import numpy as np + +import kerchunk.zarr +import kerchunk.utils + + +@pytest.fixture(scope="module") +def ds(): + ds = xr.Dataset( + { + "x": xr.DataArray(np.linspace(-np.pi, np.pi, 10), dims=["x"]), + "y": xr.DataArray(np.linspace(-np.pi / 2, np.pi / 2, 10), dims=["y"]), + "time": xr.DataArray(pd.date_range("2020", "2021"), dims=["time"]), + }, + ) + ds["temp"] = ( + np.cos(ds.x) + * np.sin(ds.y) + * xr.ones_like(ds.time).astype("float") + * np.random.random(ds.time.shape) + ) + return ds + + +@pytest.fixture +def zarr_in_zip(tmpdir, ds): + def _zip(file): + import os + import zipfile + + filename = file + os.path.extsep + 
"zip" + with zipfile.ZipFile( + filename, "w", compression=zipfile.ZIP_STORED, allowZip64=True + ) as fh: + for root, _, filenames in os.walk(file): + for each_filename in filenames: + each_filename = os.path.join(root, each_filename) + fh.write(each_filename, os.path.relpath(each_filename, file)) + return filename + + fn = f"{tmpdir}/test.zarr" + ds.to_zarr(fn, mode="w") + return _zip(fn) + + +def test_zarr_in_zip(zarr_in_zip, ds): + out = kerchunk.zarr.ZarrToZarr( + url="zip://", storage_options={"fo": zarr_in_zip} + ).translate() + ds2 = xr.open_dataset( + "reference://", + engine="zarr", + backend_kwargs={ + "storage_options": { + "fo": out, + "remote_protocol": "zip", + "remote_options": {"fo": zarr_in_zip}, + }, + "consolidated": False, + }, + ) + assert ds.equals(ds2) diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 502666da..40163110 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -134,13 +134,18 @@ def _encode_for_JSON(store): return store -def do_inline(store, threshold, remote_options=None): +def do_inline(store, threshold, remote_options=None, remote_protocol=None): """Replace short chunks with the value of that chunk The chunk may need encoding with base64 if not ascii, so actual length may be larger than threshold. 
""" - fs = fsspec.filesystem("reference", fo=store, **(remote_options or {})) + fs = fsspec.filesystem( + "reference", + fo=store, + remote_options=remote_options, + remote_protocol=remote_protocol, + ) out = fs.references.copy() get_keys = [ k diff --git a/kerchunk/zarr.py b/kerchunk/zarr.py index 8f06e738..1398d414 100644 --- a/kerchunk/zarr.py +++ b/kerchunk/zarr.py @@ -24,7 +24,8 @@ def single_zarr(uri_or_store, storage_options=None, inline_threshold=100, inline refs[k] = mapper[k] else: refs[k] = [fsspec.utils._unstrip_protocol(mapper._key_to_str(k), mapper.fs)] - refs = do_inline(refs, inline_threshold) + if inline_threshold: + refs = do_inline(refs, inline_threshold, remote_options=storage_options) return refs diff --git a/setup.py b/setup.py index a29c3fa1..caf480c2 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ "fill_hdf_strings = kerchunk.codecs:FillStringsCodec", "FITSAscii = kerchunk.codecs:AsciiTableCodec", "FITSVarBintable = kerchunk.codecs:VarArrCodec", - "record_member = kerchunk.codecs.RecordArrayMember", + "record_member = kerchunk.codecs:RecordArrayMember", ], }, zip_safe=False, From 5fc54707ec8e46682e8b23085b1e7784c4100964 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 25 Aug 2023 12:49:27 -0400 Subject: [PATCH 2/4] fix bad test --- kerchunk/tests/test_utils.py | 2 +- kerchunk/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/kerchunk/tests/test_utils.py b/kerchunk/tests/test_utils.py index 4b34561f..afc35e6f 100644 --- a/kerchunk/tests/test_utils.py +++ b/kerchunk/tests/test_utils.py @@ -107,7 +107,7 @@ def test_subchunk_exact(m, chunks): out = kerchunk.utils.subchunk(ref, "data", 5) nchunk = 10 // chunks[0] * 5 - assert list(ref) == [".zgroup", "data/.zarray"] + [ + assert list(out) == [".zgroup", "data/.zarray"] + [ f"data/{_}.0" for _ in range(nchunk) ] diff --git a/kerchunk/utils.py b/kerchunk/utils.py index 40163110..c06f1ee9 100644 --- a/kerchunk/utils.py +++ b/kerchunk/utils.py @@ -237,6 +237,7 
@@ def subchunk(store, variable, factor): modified store """ fs = fsspec.filesystem("reference", fo=store) + store = copy.deepcopy(store) meta_file = f"{variable}/.zarray" meta = ujson.loads(fs.cat(meta_file)) if meta["compressor"] is not None: From bc7ca94d09bd14cdd8af56bb5fc34d31f34b2d5b Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 25 Aug 2023 12:53:50 -0400 Subject: [PATCH 3/4] try h5py pin --- ci/environment-py310.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci/environment-py310.yml b/ci/environment-py310.yml index 753b760d..514ad93f 100644 --- a/ci/environment-py310.yml +++ b/ci/environment-py310.yml @@ -8,6 +8,7 @@ dependencies: - zarr - xarray - h5netcdf + - h5py<3.9 - pandas - cfgrib - cftime @@ -24,7 +25,7 @@ dependencies: - black - fastparquet - pip - - pyopenssl=23.1.1 + - pyopenssl - tifffile - netCDF4 - pip: From fdd8dcca5c4f9d69718de652798353e77b5bbd12 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 25 Aug 2023 12:58:00 -0400 Subject: [PATCH 4/4] other envs --- ci/environment-py38.yml | 3 ++- ci/environment-py39.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml index 050ad57d..e0334a88 100644 --- a/ci/environment-py38.yml +++ b/ci/environment-py38.yml @@ -8,6 +8,7 @@ dependencies: - zarr - xarray - h5netcdf + - h5py<3.9 - pandas - cfgrib - cftime @@ -22,7 +23,7 @@ dependencies: - python-blosc - flake8 - fastparquet - - pyopenssl=23.1.1 + - pyopenssl - black - pip - tifffile diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml index f4f4e4c4..ef0f41f1 100644 --- a/ci/environment-py39.yml +++ b/ci/environment-py39.yml @@ -8,6 +8,7 @@ dependencies: - zarr - xarray - h5netcdf + - h5py<3.9 - pandas - cfgrib - cftime @@ -22,7 +23,7 @@ dependencies: - python-blosc - flake8 - fastparquet - - pyopenssl=23.1.1 + - pyopenssl - black - pip - tifffile