Commit da7738b

1072 modelbuilder improve era5 experience (#1073)
* made arguments explicit in notebook and added epsg comment
* improved docstring and comments in preprocess_ERA5
* update comments in test_download_era5
* prevented incorrect times including testcase for download_ERA5
1 parent d9773eb commit da7738b

4 files changed: +72 additions, -28 deletions

dfm_tools/download.py

Lines changed: 3 additions & 0 deletions
@@ -71,6 +71,9 @@ def download_ERA5(varkey,
         raise KeyError(f'"{varkey}" not available, choose from: {", ".join(variables_dict.keys())}')
 
     period_range = pd.period_range(date_min,date_max,freq='M')
+    if len(period_range) == 0:
+        raise ValueError(f"requested time extents ({date_min} to {date_max}) "
+                         "resulted in empty period_range")
     print(f'retrieving data from {period_range[0]} to {period_range[-1]} (freq={period_range.freq})')
 
     #make sure the data fully covers the desired spatial extent. Download 1 additional grid cell (resolution is 1/4 degrees) in both directions
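The new guard relies on pandas returning an empty PeriodIndex when date_min falls after date_max. A minimal standalone sketch of that behaviour (plain pandas, not the dfm_tools code itself):

import pandas as pd

# reversed extents yield an empty monthly PeriodIndex, which the new check
# in download_ERA5 turns into a descriptive ValueError
assert len(pd.period_range('2010-01-01', '2009-01-01', freq='M')) == 0

# a request spanning January and February 2010 yields two monthly periods
assert len(pd.period_range('2010-01-01', '2010-02-28', freq='M')) == 2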

dfm_tools/xarray_helpers.py

Lines changed: 36 additions & 12 deletions
@@ -96,25 +96,49 @@ def preprocess_hisnc(ds):
 
 
 def preprocess_ERA5(ds):
     """
-    Reduces the expver dimension in some of the ERA5 data (mtpr and other variables), which occurs in files with very recent data. The dimension contains the unvalidated data from the latest month in the second index in the expver dimension. The reduction is done with mean, but this is arbitrary, since there is only one valid value per timestep and the other one is nan.
+    Aligning ERA5 datasets before merging them. These operations are currently
+    (2025) only required when (also) using previously retrieved ERA5 data.
+
+    In recent datasets retrieved from ERA5 the time dimension and variable are
+    now called valid_time. This is inconvenient since it causes issues when
+    merging with previously retrieved datasets. However, it is not necessary
+    for successfully running a Delft3D FM simulation.
+
+    Reducing the expver dimension: In the past, the expver dimension was
+    present if you downloaded ERA5 data that consisted of a mix of ERA5 and
+    ERA5T data. This dimension was also present in the data variables, so it
+    broke code. Therefore this dimension is reduced with a mean operation.
+    Any reduction operation would do the trick since there is only one valid
+    value per timestep and the other one is nan. In datasets downloaded
+    currently (2025) the expver dimension is not present anymore,
+    but an expver variable is present defining whether the data comes
+    from ERA5 (1) or ERA5T (5).
+
+    Removing scale_factor and add_offset: In the past, the ERA5 data was
+    supplied as integers with a scaling and offset that was different for
+    each downloaded file. This caused serious issues with merging files,
+    since the scaling/offset from the first file was assumed to be valid
+    for the others also, leading to invalid values. Only relevant for old
+    files. More info at https://github.com/Deltares/dfm_tools/issues/239.
     """
-    if 'expver' in ds.dims:
-        # TODO: this drops int encoding which leads to unzipped float32 netcdf files: https://github.com/Deltares/dfm_tools/issues/781
-        ds = ds.mean(dim='expver')
 
-    # datasets retrieved with new cds-beta have valid_time instead of time dimn/varn
-    # https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5?u=jelmer_veenstra
-    # TODO: can be removed after https://github.com/Unidata/netcdf4-python/issues/1357 or https://forum.ecmwf.int/t/3796 is fixed
+    # datasets retrieved with new CDS have valid_time instead of time dim/var
+    # https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5
     if 'valid_time' in ds.coords:
         ds = ds.rename({'valid_time':'time'})
 
-    # Prevent writing to (incorrectly scaled) int, since it might mess up mfdataset (https://github.com/Deltares/dfm_tools/issues/239)
-    # By dropping scaling/offset encoding and converting to float32 (will result in a larger dataset)
-    # ERA5 datasets retrieved with the new CDS-beta are zipped float32 instead of scaled int, so this is only needed for backwards compatibility with old files.
+    # reduce the expver dimension (not present in newly retrieved files)
+    if 'expver' in ds.dims:
+        ds = ds.mean(dim='expver')
+
+    # drop scaling/offset encoding if present and convert to float32. Not
+    # present in newly retrieved files, variables are zipped float32 instead
     for var in ds.data_vars.keys():
-        if not set(['dtype','scale_factor','add_offset']).issubset(ds.variables[var].encoding.keys()):
+        list_attrs = ['dtype','scale_factor','add_offset']
+        if not set(list_attrs).issubset(ds.variables[var].encoding.keys()):
             continue
-        # the _FillValue will still be -32767 (int default), but this is no issue for float32
+        # the _FillValue will still be -32767 (int default)
+        # this is no issue for float32
         ds[var].encoding.pop('scale_factor')
         ds[var].encoding.pop('add_offset')
         ds[var].encoding["dtype"] = "float32"
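For context, preprocess_ERA5 is meant to be passed to xarray's open_mfdataset so that each file is aligned before the merge. A minimal sketch, assuming preprocess_ERA5 is exposed at the dfm_tools top level (it lives in dfm_tools/xarray_helpers.py) and that the hypothetical pattern ./era5_msl_*.nc matches a mix of older and newer ERA5 downloads:

import xarray as xr
import dfm_tools as dfmt

# each file gets the valid_time -> time rename, the expver reduction and the
# scale_factor/add_offset cleanup before xarray concatenates along time
ds = xr.open_mfdataset('./era5_msl_*.nc', preprocess=dfmt.preprocess_ERA5)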

docs/notebooks/modelbuilder_example.ipynb

Lines changed: 2 additions & 2 deletions
@@ -70,7 +70,7 @@
     "dir_output = os.path.abspath(f'./{model_name}_model')\n",
     "# path_style = 'windows' # windows / unix\n",
     "overwrite = False # overwrite the downloaded forcing data or not. Always set to True when changing the domain\n",
-    "crs = 'EPSG:4326' # coordinate reference system\n",
+    "crs = 'EPSG:4326' # coordinate reference system, only EPSG 4326 (WGS84) is currently supported by the ERA5 and CMEMS download scripts.\n",
     "\n",
     "# domain and resolution\n",
     "# the actual maximum extents can slightly vary: see dfmt.meshkernel_get_bbox() below\n",
@@ -144,7 +144,7 @@
     ],
     "source": [
     "# generate spherical regular grid\n",
-    "mk_object = dfmt.make_basegrid(lon_min, lon_max, lat_min, lat_max, dx=dxy, dy=dxy, crs=crs)\n",
+    "mk_object = dfmt.make_basegrid(lon_min=lon_min, lon_max=lon_max, lat_min=lat_min, lat_max=lat_max, dx=dxy, dy=dxy, crs=crs)\n",
     "\n",
     "# retrieve actual lat/lon bounds from grid, the lon_max and lat_max are likely larger than requested\n",
     "lon_min, lat_min, lon_max, lat_max = dfmt.meshkernel_get_bbox(mk_object)\n",

tests/test_download.py

Lines changed: 31 additions & 14 deletions
@@ -351,16 +351,35 @@ def test_download_hycom(tmp_path):
 def test_download_era5_unsupported_varkey():
     date_min = '2010-01-31'
     date_max = '2010-02-01'
-    longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52 #test domain
+    longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52
     varkey = 'unexisting'
     with pytest.raises(KeyError) as e:
-        dfmt.download_ERA5(varkey,
-                           longitude_min=longitude_min, longitude_max=longitude_max, latitude_min=latitude_min, latitude_max=latitude_max,
-                           date_min=date_min, date_max=date_max,
-                           dir_output='.', overwrite=True)
+        dfmt.download_ERA5(
+            varkey,
+            longitude_min=longitude_min, longitude_max=longitude_max,
+            latitude_min=latitude_min, latitude_max=latitude_max,
+            date_min=date_min, date_max=date_max,
+            dir_output='.', overwrite=True)
     assert '"unexisting" not available' in str(e.value)
 
 
+@pytest.mark.requiressecrets
+@pytest.mark.unittest
+def test_download_era5_incorrect_times():
+    date_min = '2010-01-01'
+    date_max = '2009-01-01'
+    longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52
+    varkey = 'msl'
+    with pytest.raises(ValueError) as e:
+        dfmt.download_ERA5(
+            varkey,
+            longitude_min=longitude_min, longitude_max=longitude_max,
+            latitude_min=latitude_min, latitude_max=latitude_max,
+            date_min=date_min, date_max=date_max,
+            dir_output='.', overwrite=True)
+    assert 'resulted in empty period_range' in str(e.value)
+
+
 @pytest.mark.requiressecrets
 @pytest.mark.unittest
 @pytest.mark.era5slow # temporarily skip these on github
@@ -372,21 +391,19 @@ def test_download_era5(file_nc_era5_pattern):
 
     ds = xr.open_mfdataset(file_nc_era5_pattern)
 
-    assert 'valid_time' in ds.dims # TODO: if this fails, remove the exception below and in preprocess_ERA5
+    # datasets retrieved with intermediate CDS had expver dimension causing issues
+    assert 'expver' not in ds.dims # TODO: if this fails, update the docstring of preprocess_ERA5
 
-    timedim = 'time'
-    # datasets retrieved with new cds-beta have valid_time instead of time dimn/varn
-    # https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5?u=jelmer_veenstra
-    # TODO: can be removed after https://github.com/Unidata/netcdf4-python/issues/1357 or https://forum.ecmwf.int/t/3796 is fixed
-    if 'valid_time' in ds.dims:
-        timedim = 'valid_time'
+    # datasets retrieved with new CDS have valid_time instead of time dim/var
+    assert 'valid_time' in ds.dims # TODO: if this fails, remove the exception below and in preprocess_ERA5
+    timedim = 'valid_time'
 
     assert ds.sizes[timedim] == 1416
     assert ds[timedim].to_pandas().iloc[0] == pd.Timestamp('2010-01-01')
     assert ds[timedim].to_pandas().iloc[-1] == pd.Timestamp('2010-02-28 23:00')
 
-    # check if there are no integers in the dataset anymore
-    # this was the case before CDS-beta in https://github.com/Deltares/dfm_tools/issues/239
+    # datasets retrieved with new CDS are float32 instead of scaled ints
+    # ints raised problem in https://github.com/Deltares/dfm_tools/issues/239
     msl_encoding = ds['msl'].encoding
     assert str(msl_encoding['dtype']) == 'float32'
     assert 'scale_factor' not in msl_encoding.keys()
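As a quick cross-check of the 1416-timestep assertion above: the test covers January and February 2010 at hourly resolution, so the expected count is (31 + 28) * 24. A minimal sketch (plain pandas, independent of the test fixtures):

import pandas as pd

# hourly stamps from 2010-01-01 00:00 through 2010-02-28 23:00 inclusive
times = pd.date_range('2010-01-01', '2010-02-28 23:00', freq='h')
assert len(times) == (31 + 28) * 24 == 1416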
