From eab555a240e812dc067475364a37d28eca236614 Mon Sep 17 00:00:00 2001 From: Raphael Hagen Date: Wed, 9 Aug 2023 16:07:37 -0700 Subject: [PATCH] Updated parquet example to use LazyReferenceMapper --- docs/source/advanced.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/docs/source/advanced.rst b/docs/source/advanced.rst index b7b6f4a4..640954af 100644 --- a/docs/source/advanced.rst +++ b/docs/source/advanced.rst @@ -156,11 +156,28 @@ one go and may be faster, if you have a Dask cluster available. from kerchunk import hdf, combine, df import fsspec.implementations.reference + from fsspec.implementations.reference import LazyReferenceMapper + from tempfile import TemporaryDirectory + import xarray as xr files = fsspec.open(location_of_data) + + # Create LazyReferenceMapper to pass to MultiZarrToZarr fs = fsspec.filesystem("file") td = TemporaryDirectory() tmpdir = str(td.name) out = LazyReferenceMapper.create(10, tmpdir, fs) + + # Create references from input files single_ref_sets = [hdf.SingleHdf5ToZarr(_).translate() for _ in files] - out_dict = combine.MultiZarrToZarr(single_ref_sets, concat_dims=["time"]).translate() + out_dict = combine.MultiZarrToZarr( single_ref_sets, remote_protocol="memory", concat_dims=["time"], out=out).translate() os.mkdir("combined.parq") df.refs_to_dataframe(out_dict, "combined.parq")