Skip to content
Merged
17 changes: 13 additions & 4 deletions examples/xarray/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,16 @@ For more information on using dask with xarray, see `this <https://docs.xarray.d
Opening Multiple Files
^^^^^^^^^^^^^^^^^^^^^^

You can use ``xr.open_mfdataset`` to open multiple NREL data files at once:
You can use ``xr.open_mfdataset`` to open multiple NREL data files at once.

.. IMPORTANT::
By default, ``xarray`` does not assume that the coordinate data (i.e. meta variables)
match across files. As a result, it will try to load all coordinates and compare them
during the concatenation step. This process involves significant I/O and can drastically
increase the runtime of ``xr.open_mfdataset`` calls. Since the underlying assumption in
rex-style data is that the meta remains consistent across files, we can tell ``xarray``
to skip this validation by passing ``compat="override"`` and ``coords="minimal"`` to the
``xr.open_mfdataset`` call.


.. code-block:: python
Expand All @@ -228,7 +237,7 @@ You can use ``xr.open_mfdataset`` to open multiple NREL data files at once:

WTK_FPS = os.path.join(TESTDATADIR, 'wtk', 'ri_100_wtk_20*.h5')

ds = xr.open_mfdataset(WTK_FPS, engine="rex")
ds = xr.open_mfdataset(WTK_FPS, engine="rex", compat="override", coords="minimal")
ds

.. code-block:: python-console
Expand Down Expand Up @@ -405,7 +414,7 @@ so to open multiple files on S3, you have to list them out explicitly:
"s3://nrel-pds-nsrdb/current/nsrdb_1998.h5",
"s3://nrel-pds-nsrdb/current/nsrdb_1999.h5",
]
ds = xr.open_mfdataset(files, engine="rex")
ds = xr.open_mfdataset(files, engine="rex", compat="override", coords="minimal")
ds

.. code-block:: python-console
Expand Down Expand Up @@ -450,7 +459,7 @@ accept wildcard inputs:
import xarray as xr
from rex import open_mfdataset_hsds

ds = open_mfdataset_hsds("/nrel/nsrdb/v3/nsrdb_199*.h5")
ds = open_mfdataset_hsds("/nrel/nsrdb/v3/nsrdb_199*.h5", compat="override", coords="minimal")
ds


Expand Down
29 changes: 14 additions & 15 deletions examples/xarray/daily_agg.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,19 @@
"\n",
"- The `memory_limit` argument is the limit *per worker*. If you are memory constrained, try using fewer workers, setting a lower memory limit, and reducing the compute chunk size. Here, we're processing on a large NREL HPC node with 104 cores and 256 GB of memory.\n",
"\n",
"- Setting up the full aggregate dataset lazily and then doing one `.compute()` call tended to break things. Smaller multiple compute calls seem to work better. "
"- Setting up the full aggregate dataset lazily and then doing one `.compute()` call tended to break things. Multiple smaller compute calls seem to work better.\n",
"\n",
"- If you are running on a network file system (NFS) such as on an HPC, try setting the ``local_directory`` parameter of the ``Client`` instance to point to a local scratch to avoid the overhead of network calls for each of the workers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "171a1a77-a6e2-4280-8a6b-fc8a286b0895",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import xarray as xr\n",
"from rex import Resource\n",
"import numpy as np\n",
"import pandas as pd\n",
"from dask.distributed import Client"
]
Expand Down Expand Up @@ -637,7 +636,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "0c069232-6e46-4ef3-b34c-efeb623ebf49",
"metadata": {
"scrolled": true
Expand All @@ -662,14 +661,14 @@
"fp_pr = fp_base.replace('v0.2.2_beta', 'v0.2.2_beta/daily')\n",
"\n",
"kwargs = dict(engine=\"rex\", chunks={'time': 8784, 'gid': 50000})\n",
"xds_trh = xr.open_mfdataset(fp_base.format(scenario=scenario, group='trh', year=year), **kwargs)\n",
"xds_wind = xr.open_mfdataset(fp_base.format(scenario=scenario, group='wind', year=year), **kwargs)\n",
"xds_pr = xr.open_mfdataset(fp_pr.format(scenario=scenario, group='pr', year=year), **kwargs)"
"xds_trh = xr.open_dataset(fp_base.format(scenario=scenario, group='trh', year=year), **kwargs)\n",
"xds_wind = xr.open_dataset(fp_base.format(scenario=scenario, group='wind', year=year), **kwargs)\n",
"xds_pr = xr.open_dataset(fp_pr.format(scenario=scenario, group='pr', year=year), **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "d02dc16a-7727-47ee-b265-a6984181bf9e",
"metadata": {},
"outputs": [
Expand All @@ -684,13 +683,13 @@
],
"source": [
"%%time\n",
"da = xds_trh['temperature_2m'].groupby(\"time.date\").max(\"time\")\n",
"da = xds_trh['temperature_2m'].groupby(\"time.date\").max(dim=\"time\")\n",
"ds_out = da.compute().to_dataset()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "3710d159-370f-48de-8c7b-0c3b30082504",
"metadata": {},
"outputs": [
Expand All @@ -705,13 +704,13 @@
],
"source": [
"%%time\n",
"da = xds_trh['relativehumidity_2m'].groupby(\"time.date\").min(\"time\")\n",
"da = xds_trh['relativehumidity_2m'].groupby(\"time.date\").min(dim=\"time\")\n",
"ds_out['relativehumidity_2m'] = da.compute()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "0436ece4-c14a-4ca7-926e-6f5dc280ede8",
"metadata": {},
"outputs": [
Expand All @@ -726,7 +725,7 @@
],
"source": [
"%%time\n",
"da = xds_wind['windspeed_10m'].groupby(\"time.date\").mean(\"time\")\n",
"da = xds_wind['windspeed_10m'].groupby(\"time.date\").mean(dim=\"time\")\n",
"ds_out['windspeed_10m'] = da.compute() * 3.6 # m/s to km/hr"
]
},
Expand Down
19 changes: 17 additions & 2 deletions rex/external/rexarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,8 +838,14 @@ def open_mfdataset_hsds(paths, **kwargs):
for more details on HSDS files.
**kwargs
Keyword-value argument pairs to pass to :func:`open_mfdataset`.
We strongly recommend specifying ``parallel=True`` and
``chunks="auto"`` to help with data loading times.
We strongly recommend specifying the following parameters to
help with data loading times:

- parallel=True
- chunks="auto"
- compat="override"
- coords="minimal"


Returns
-------
Expand All @@ -852,6 +858,15 @@ def open_mfdataset_hsds(paths, **kwargs):
kwargs["engine"] = "rex"
kwargs["hsds"] = True

if kwargs.get("compat") != "override":
msg = ("Did not detect 'compat='override' parameter in arguments "
"passed to `rex.open_mfdataset_hsds`. You may see drastically "
"increased loading times since all of the coordinates are "
"loaded and validated by xarray. We strongly recommend passing "
"'compat='override' (and coords='minimal') for increased "
"read performance.")
warnings.warn(msg, UserWarning)

if isinstance(paths, str):
paths = _hsds_glob_to_list(paths)
elif isinstance(paths, (list, tuple)):
Expand Down
4 changes: 3 additions & 1 deletion tests/h5pyd_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ def test_sup3rcc():
def test_mf_hsds_xr(fps):
"""Test opening multiple files via HSDS with xarray"""

with open_mfdataset_hsds(fps, parallel=True, chunks="auto") as ds:
kwargs = {"parallel": True, "chunks": "auto", "compat": "override",
"coords": "minimal"}
with open_mfdataset_hsds(fps, **kwargs) as ds:
assert ds.sizes == {'time': 17544, 'gid': 2488136}
assert str(ds.time_index.isel(time=0).values).startswith("2008")
assert str(ds.time_index.isel(time=-1).values).startswith("2009")
Expand Down
Loading