Open kerchunk ref as virtual dataset, only json (from PR 119) #186

Closed
3 changes: 3 additions & 0 deletions docs/releases.rst
@@ -9,6 +9,9 @@ v1.0.1 (unreleased)
New Features
~~~~~~~~~~~~

- Can open `kerchunk` reference files with ``open_virtual_dataset`` if they are JSON
  (:pull:`119`, :pull:`186`). By `Raphael Hagen <https://github.com/norlandrhagen>`_ and `Kristen Thyng <https://github.com/kthyng>`_.

Breaking changes
~~~~~~~~~~~~~~~~

8 changes: 8 additions & 0 deletions docs/usage.md
@@ -419,6 +419,14 @@ Currently there are not yet any zarr v3 readers which understand the chunk manif
This store can however be read by {py:func}`~virtualizarr.xarray.open_virtual_dataset`, by passing `filetype="zarr_v3"`.
```

## Opening kerchunk files from disk as virtual datasets

You can open kerchunk reference files from disk as virtual datasets, provided they are JSON (parquet is not yet supported). This is useful, for example, when you have existing model output that you want to reference now but also append to later: create a derivative kerchunk JSON file for each batch of available model output, then combine them into a single kerchunk file representing the full dataset. As new model output becomes available, create additional derivative kerchunk files and recreate the combined file as needed.

```python
open_virtual_dataset('combined.json', filetype='kerchunk')
```
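The combine step described above can be sketched with the standard library alone. This is an illustrative stand-in (in practice you would likely use kerchunk's combine machinery); the helper name, filenames, and the assumption that the derivative files reference non-overlapping chunk keys are all hypothetical:

```python
import json

def combine_reference_files(paths, out_path):
    """Merge the "refs" mappings of several kerchunk JSON files into one.

    Assumes the derivative files reference non-overlapping chunk keys
    (e.g. different time steps written under distinct keys).
    """
    combined = {"version": 1, "refs": {}}
    for path in paths:
        with open(path) as f:
            refs = json.load(f)
        # later files win if a key is (unexpectedly) duplicated
        combined["refs"].update(refs["refs"])
    with open(out_path, "w") as f:
        json.dump(combined, f)
    return combined
```

The resulting `combined.json` is then a single reference file that can be opened as a virtual dataset.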

## Rewriting existing manifests

Sometimes it can be useful to rewrite the contents of an already-generated manifest or virtual dataset.
3 changes: 3 additions & 0 deletions virtualizarr/kerchunk.py
@@ -139,6 +139,9 @@ def _automatically_determine_filetype(
fpath = _fsspec_openfile_from_filepath(
filepath=filepath, reader_options=reader_options
)
magic_bytes = fpath.read(8)  # read the first 8 bytes to sniff the file signature
fpath.close()

if magic_bytes.startswith(b"CDF"):
filetype = FileType.netcdf3
elif magic_bytes.startswith(b"\x0e\x03\x13\x01"):
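The signature check above can be sketched in isolation. The `CDF` signature for netCDF3 is taken directly from the diff; the second signature's mapping is truncated in the diff, so the `"hdf4"` label (and the `sniff_filetype` helper name) are assumptions for illustration:

```python
def sniff_filetype(magic_bytes: bytes) -> str:
    """Guess a file's type from its leading bytes, mirroring the diff's checks."""
    if magic_bytes.startswith(b"CDF"):
        return "netcdf3"
    if magic_bytes.startswith(b"\x0e\x03\x13\x01"):
        # assumed mapping; the corresponding branch is truncated in the diff
        return "hdf4"
    raise NotImplementedError(f"unrecognized file signature: {magic_bytes[:8]!r}")
```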
20 changes: 2 additions & 18 deletions virtualizarr/utils.py
@@ -44,24 +44,8 @@ def _fsspec_openfile_from_filepath(
universal_filepath = UPath(filepath)
protocol = universal_filepath.protocol

if protocol == "":
fpath = fsspec.open(universal_filepath, "rb")
if universal_filepath.is_file():
fpath = fpath.open()

elif protocol in ["s3"]:
s3_anon_defaults = {"key": "", "secret": "", "anon": True}
if not bool(reader_options):
storage_options = s3_anon_defaults

else:
storage_options = reader_options.get("storage_options") # type: ignore

# using dict merge operator to add in defaults if keys are not specified
storage_options = s3_anon_defaults | storage_options
fpath = fsspec.filesystem(protocol, **storage_options)
if universal_filepath.is_file():
fpath = fpath.open(filepath)
if protocol == "s3":
protocol_defaults = {"key": "", "secret": "", "anon": True}
else:
protocol_defaults = {}

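The replacement code relies on Python's dict merge operator (`|`, added in 3.9) to layer user-supplied storage options over per-protocol defaults, rather than branching per protocol as the removed code did. A minimal sketch of that pattern (the `resolve_storage_options` helper is illustrative, not part of the PR):

```python
def resolve_storage_options(protocol, reader_options):
    """Layer user storage_options over per-protocol defaults, as in the diff."""
    if protocol == "s3":
        # default to anonymous S3 access unless credentials are supplied
        protocol_defaults = {"key": "", "secret": "", "anon": True}
    else:
        protocol_defaults = {}
    storage_options = (reader_options or {}).get("storage_options") or {}
    # the right-hand operand wins, so user-specified keys override the defaults
    return protocol_defaults | storage_options
```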
21 changes: 15 additions & 6 deletions virtualizarr/xarray.py
@@ -57,8 +57,10 @@ def open_virtual_dataset(
File path to open as a set of virtualized zarr arrays.
filetype : FileType, default None
Type of file to be opened. Used to determine which kerchunk file format backend to use.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3'}.
Can be one of {'netCDF3', 'netCDF4', 'HDF', 'TIFF', 'GRIB', 'FITS', 'zarr_v3', 'kerchunk'}.
If not provided will attempt to automatically infer the correct filetype from header bytes.
For filetype 'kerchunk', the file must be a JSON file containing kerchunk references,
and the filetype must be specified explicitly rather than inferred.
drop_variables: list[str], default is None
Variables in the file to drop before returning.
loadable_variables: list[str], default is None
@@ -125,9 +127,16 @@
)

if filetype == "kerchunk":
fpath = _fsspec_openfile_from_filepath(
filepath=filepath, reader_options=reader_options
)

try:
fpath = _fsspec_openfile_from_filepath(
filepath=filepath, reader_options=reader_options
)
except IsADirectoryError:
# fsspec treats a parquet reference store as a directory, so it never
# sees the parquet suffix; handle that case here for now, though this
# check should eventually move back down to the suffix dispatch below
raise NotImplementedError()

from upath import UPath

@@ -140,8 +149,8 @@

vds = dataset_from_kerchunk_refs(refs_dict)
return vds
elif kerchunk_storage_ftype == ".parquet":
raise NotImplementedError
elif kerchunk_storage_ftype in [".parquet", ".parq"]:
raise NotImplementedError("kerchunk parquet reference files are not yet supported")

# Question: How should we read the parquet files
# into a dict to pass into dataset_from_kerchunk_refs?
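The JSON branch above boils down to dispatching on the file suffix, reading the reference file into a dict, and handing it to `dataset_from_kerchunk_refs`. A stdlib-only sketch of that dispatch, with `load_kerchunk_refs` as an illustrative stand-in (the real code uses `UPath` and fsspec-opened files rather than `pathlib` and `open`):

```python
import json
from pathlib import PurePosixPath

def load_kerchunk_refs(filepath):
    """Dispatch on the reference file's suffix, as the code above does."""
    suffix = PurePosixPath(filepath).suffix
    if suffix == ".json":
        with open(filepath) as f:
            # refs dict to pass on to dataset_from_kerchunk_refs
            return json.load(f)
    if suffix in (".parquet", ".parq"):
        raise NotImplementedError("kerchunk parquet reference files are not yet supported")
    raise ValueError(f"unrecognized kerchunk reference suffix: {suffix!r}")
```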