Skip to content
Merged
17 changes: 13 additions & 4 deletions examples/xarray/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,16 @@ For more information on using dask with xarray, see `this <https://docs.xarray.d
Opening Multiple Files
^^^^^^^^^^^^^^^^^^^^^^

You can use ``xr.open_mfdataset`` to open multiple NREL data files at once:
You can use ``xr.open_mfdataset`` to open multiple NREL data files at once.

.. IMPORTANT::
By default, ``xarray`` does not assume that the coordinate data (i.e. meta variables)
match across files. As a result, it will try to load all coordinates and compare them
during the concatenation step. This process involves significant I/O and can drastically
increase the runtime of ``xr.open_mfdataset`` calls. Since the underlying assumption in
rex-style data is that the meta remains consistent across files, we can tell ``xarray``
to skip this validation by passing ``compat="override"`` and ``coords="minimal"`` to the
``xr.open_mfdataset`` call.


.. code-block:: python
Expand All @@ -228,7 +237,7 @@ You can use ``xr.open_mfdataset`` to open multiple NREL data files at once:

WTK_FPS = os.path.join(TESTDATADIR, 'wtk', 'ri_100_wtk_20*.h5')

ds = xr.open_mfdataset(WTK_FPS, engine="rex")
ds = xr.open_mfdataset(WTK_FPS, engine="rex", compat="override", coords="minimal")
ds

.. code-block:: python-console
Expand Down Expand Up @@ -405,7 +414,7 @@ so to open multiple files on S3, you have to list them out explicitly:
"s3://nrel-pds-nsrdb/current/nsrdb_1998.h5",
"s3://nrel-pds-nsrdb/current/nsrdb_1999.h5",
]
ds = xr.open_mfdataset(files, engine="rex")
ds = xr.open_mfdataset(files, engine="rex", compat="override", coords="minimal")
ds

.. code-block:: python-console
Expand Down Expand Up @@ -450,7 +459,7 @@ accept wildcard inputs:
import xarray as xr
from rex import open_mfdataset_hsds

ds = open_mfdataset_hsds("/nrel/nsrdb/v3/nsrdb_199*.h5")
ds = open_mfdataset_hsds("/nrel/nsrdb/v3/nsrdb_199*.h5", compat="override", coords="minimal")
ds


Expand Down
29 changes: 14 additions & 15 deletions examples/xarray/daily_agg.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,19 @@
"\n",
"- The `memory_limit` argument is the limit *per worker*. If you are memory constrained, try using fewer workers, setting a lower memory limit, and reducing the compute chunk size. Here, we're processing on a large NREL HPC node with 104 cores and 256 GB of memory.\n",
"\n",
"- Setting up the full aggregate dataset lazily and then doing one `.compute()` call tended to break things. Smaller multiple compute calls seem to work better. "
"- Setting up the full aggregate dataset lazily and then doing one `.compute()` call tended to break things. Multiple smaller compute calls seem to work better.\n",
"\n",
"- If you are running on a network file system (NFS) such as on an HPC, try setting the ``local_directory`` parameter of the ``Client`` instance to point to a local scratch to avoid the overhead of network calls for each of the workers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"id": "171a1a77-a6e2-4280-8a6b-fc8a286b0895",
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"import xarray as xr\n",
"from rex import Resource\n",
"import numpy as np\n",
"import pandas as pd\n",
"from dask.distributed import Client"
]
Expand Down Expand Up @@ -637,7 +636,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "0c069232-6e46-4ef3-b34c-efeb623ebf49",
"metadata": {
"scrolled": true
Expand All @@ -662,14 +661,14 @@
"fp_pr = fp_base.replace('v0.2.2_beta', 'v0.2.2_beta/daily')\n",
"\n",
"kwargs = dict(engine=\"rex\", chunks={'time': 8784, 'gid': 50000})\n",
"xds_trh = xr.open_mfdataset(fp_base.format(scenario=scenario, group='trh', year=year), **kwargs)\n",
"xds_wind = xr.open_mfdataset(fp_base.format(scenario=scenario, group='wind', year=year), **kwargs)\n",
"xds_pr = xr.open_mfdataset(fp_pr.format(scenario=scenario, group='pr', year=year), **kwargs)"
"xds_trh = xr.open_dataset(fp_base.format(scenario=scenario, group='trh', year=year), **kwargs)\n",
"xds_wind = xr.open_dataset(fp_base.format(scenario=scenario, group='wind', year=year), **kwargs)\n",
"xds_pr = xr.open_dataset(fp_pr.format(scenario=scenario, group='pr', year=year), **kwargs)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "d02dc16a-7727-47ee-b265-a6984181bf9e",
"metadata": {},
"outputs": [
Expand All @@ -684,13 +683,13 @@
],
"source": [
"%%time\n",
"da = xds_trh['temperature_2m'].groupby(\"time.date\").max(\"time\")\n",
"da = xds_trh['temperature_2m'].groupby(\"time.date\").max(dim=\"time\")\n",
"ds_out = da.compute().to_dataset()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "3710d159-370f-48de-8c7b-0c3b30082504",
"metadata": {},
"outputs": [
Expand All @@ -705,13 +704,13 @@
],
"source": [
"%%time\n",
"da = xds_trh['relativehumidity_2m'].groupby(\"time.date\").min(\"time\")\n",
"da = xds_trh['relativehumidity_2m'].groupby(\"time.date\").min(dim=\"time\")\n",
"ds_out['relativehumidity_2m'] = da.compute()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "0436ece4-c14a-4ca7-926e-6f5dc280ede8",
"metadata": {},
"outputs": [
Expand All @@ -726,7 +725,7 @@
],
"source": [
"%%time\n",
"da = xds_wind['windspeed_10m'].groupby(\"time.date\").mean(\"time\")\n",
"da = xds_wind['windspeed_10m'].groupby(\"time.date\").mean(dim=\"time\")\n",
"ds_out['windspeed_10m'] = da.compute() * 3.6 # m/s to km/hr"
]
},
Expand Down
19 changes: 17 additions & 2 deletions rex/external/rexarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,8 +838,14 @@ def open_mfdataset_hsds(paths, **kwargs):
for more details on HSDS files.
**kwargs
Keyword-value argument pairs to pass to :func:`open_mfdataset`.
We strongly recommend specifying ``parallel=True`` and
``chunks="auto"`` to help with data loading times.
We strongly recommend specifying the following parameters to
help with data loading times:

- parallel=True
- chunks="auto"
- compat="override"
- coords="minimal"


Returns
-------
Expand All @@ -852,6 +858,15 @@ def open_mfdataset_hsds(paths, **kwargs):
kwargs["engine"] = "rex"
kwargs["hsds"] = True

if kwargs.get("compat") != "override":
msg = ("Did not detect 'compat='override' parameter in arguments "
"passed to `rex.open_mfdataset_hsds`. You may see drastically "
"increased loading times since all of the coordinates are "
"loaded and validated by xarray. We strongly recommend passing "
"'compat='override' (and coords='minimal') for increased "
"read performance.")
warnings.warn(msg, UserWarning)

if isinstance(paths, str):
paths = _hsds_glob_to_list(paths)
elif isinstance(paths, (list, tuple)):
Expand Down
4 changes: 3 additions & 1 deletion tests/h5pyd_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ def test_sup3rcc():
def test_mf_hsds_xr(fps):
"""Test opening multiple files via HSDS with xarray"""

with open_mfdataset_hsds(fps, parallel=True, chunks="auto") as ds:
kwargs = {"parallel": True, "chunks": "auto", "compat": "override",
"coords": "minimal"}
with open_mfdataset_hsds(fps, **kwargs) as ds:
assert ds.sizes == {'time': 17544, 'gid': 2488136}
assert str(ds.time_index.isel(time=0).values).startswith("2008")
assert str(ds.time_index.isel(time=-1).values).startswith("2009")
Expand Down
Loading