912 make dfmt.open_dataset_extra more modular (#913)
* deprecating `reverse_depth` keyword

* moved applying conventions and conversion_dict to separate functions

* also applying 360to180 correction to longitude variable in `open_dataset_curvilinear`

* added additional tests

* updated whatsnew
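
For callers, only the deprecated `reverse_depth` keyword disappears from the signature; a minimal sketch of the refactored call (the file pattern, quantity and dates below are hypothetical, not taken from this commit):

```python
# hedged sketch: file pattern, quantity and dates are illustrative
import dfm_tools as dfmt

ds = dfmt.open_dataset_extra(
    dir_pattern="cmems_so_*.nc",   # may also contain an {ncvarname} placeholder
    quantity="salinitybnd",        # must be a key of the (default or custom) conversion_dict
    tstart="2016-01-01",           # supply yyyy-mm-dd strings: pd.Timestamp("05-04-2016") is
    tstop="2016-02-01",            # parsed month-first, i.e. 4 May instead of 5 April
)
```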
veenstrajelmer authored Aug 1, 2024
1 parent b63d726 commit f952a81
Showing 5 changed files with 159 additions and 76 deletions.
151 changes: 82 additions & 69 deletions dfm_tools/interpolate_grid2bnd.py
@@ -286,6 +286,7 @@ def extract_component(ds):


def check_time_extent(data_xr, tstart, tstop):
#convert tstart/tstop from str/dt.datetime/pd.Timestamp to pd.Timestamp. WARNING: when supplying '05-04-2016', month-first parsing is assumed, so the result will be 2016-05-04 (May instead of April). Therefore always supply yyyy-mm-dd datestrings
tstart = pd.Timestamp(tstart)
tstop = pd.Timestamp(tstop)

@@ -300,35 +301,94 @@ def check_time_extent(data_xr, tstart, tstop):
raise OutOfRangeError(f'requested tstop {tstop} outside of available range {nc_tstart} to {nc_tstop}.')


def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=None, refdate_str=None, reverse_depth=False, chunks=None):
"""
empty docstring
"""

def ds_apply_conventions(data_xr):
ncbnd_construct = get_ncbnd_construct()
dimn_depth = ncbnd_construct['dimn_depth']
varn_depth = ncbnd_construct['varn_depth']

if conversion_dict is None:
conversion_dict = get_conversion_dict()
#rename dims and vars time/depth/lat/lon/x/y #TODO: this has to be phased out some time, or made as an argument or merged with conversion_dict?
rename_dims_dict = {'depth':dimn_depth, # depth for CMEMS and many others
'lev':dimn_depth, # lev for GFDL
'lon':'longitude','lat':'latitude'}
for k,v in rename_dims_dict.items():
if k in data_xr.dims and v not in data_xr.dims:
data_xr = data_xr.rename_dims({k:v})
print(f'dimension {k} renamed to {v}')
if k in data_xr.variables and v not in data_xr.variables:
data_xr = data_xr.rename_vars({k:v})
print(f'varname {k} renamed to {v}')

# make depth positive up if not yet the case
if varn_depth in data_xr.coords:
if 'positive' in data_xr[varn_depth].attrs.keys():
if data_xr[varn_depth].attrs['positive'] == 'down': #attribute appears in CMEMS, GFDL and CMCC, safe to assume presence?
data_xr[varn_depth] = -data_xr[varn_depth]
data_xr[varn_depth].attrs['positive'] = 'up'

# get calendar and maybe convert_calendar, makes sure that nc_tstart/nc_tstop are of type pd._libs.tslibs.timestamps.Timestamp
data_xr_calendar = data_xr['time'].dt.calendar
if data_xr_calendar != 'proleptic_gregorian': #this is for instance the case for noleap (or 365_day) calendars from GFDL and CMCC
units_copy = data_xr['time'].encoding['units']
print(f'WARNING: calendar different than proleptic_gregorian found ({data_xr_calendar}), convert_calendar is called so check output carefully. It should be no issue for datasets with a monthly interval.')
data_xr = data_xr.convert_calendar('standard') #TODO: does this not result in 29feb nan values in e.g. GFDL model? Check missing argument at https://docs.xarray.dev/en/stable/generated/xarray.Dataset.convert_calendar.html
data_xr['time'].encoding['units'] = units_copy #put back dropped units

# 360 to 180 conversion
convert_360to180 = (data_xr['longitude'].to_numpy()>180).any() #TODO: replace to_numpy() with load()
if convert_360to180: #TODO: make more flexible for models that eg pass -180/+180 crossing (add overlap at lon edges).
data_xr.coords['longitude'] = (data_xr.coords['longitude'] + 180) % 360 - 180
data_xr = data_xr.sortby(data_xr['longitude'])
return data_xr


def ds_apply_conversion_dict(data_xr, conversion_dict, quantity_list):
# rename variables from conversion_dict
for k,v in conversion_dict.items():
ncvarn = v['ncvarname']
if ncvarn in data_xr.variables.mapping.keys() and k in quantity_list: #k in quantity_list so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
data_xr = data_xr.rename({ncvarn:k})
print(f'variable {ncvarn} renamed to {k}')

# optional conversion of units. Multiplications or other simple operators do not affect performance (dask.array(getitem) becomes dask.array(mul)). With more complex operations it is better to do them on the interpolated array.
for quan in quantity_list: #TODO: maybe do unit conversion before interp or is that computationally heavy?
if 'conversion' in conversion_dict[quan].keys(): #if conversion is present, unit key must also be in conversion_dict
print(f'> converting units from [{data_xr[quan].attrs["units"]}] to [{conversion_dict[quan]["unit"]}]')
#print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
data_xr[quan] = data_xr[quan] * conversion_dict[quan]['conversion'] #conversion drops all attributes of which units (which are changed anyway)
data_xr[quan].attrs['units'] = conversion_dict[quan]['unit'] #add unit attribute with resulting unit
return data_xr


def get_quantity_list(quantity):
if quantity=='uxuyadvectionvelocitybnd': #T3Dvector
quantity_list = ['ux','uy']
elif isinstance(quantity, list):
quantity_list = quantity
else:
quantity_list = [quantity]

return quantity_list


def get_ncvarname_list(quantity_list, conversion_dict):
# check existence of requested keys in conversion_dict
for quan in quantity_list:
if quan not in conversion_dict.keys():
raise KeyError(f"quantity '{quan}' not in conversion_dict, (case sensitive) options are: {str(list(conversion_dict.keys()))}")

ncvarname_list = [conversion_dict[quan]['ncvarname'] for quan in quantity_list]
return ncvarname_list
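
For reference, a hedged sketch of what a custom conversion_dict entry looks like as `ds_apply_conversion_dict` and `get_ncvarname_list` consume it; the NO3 values below are purely illustrative:

```python
# hypothetical entry: 'ncvarname' is required, 'conversion' and 'unit' are optional but come as a pair
conversion_dict = {
    "tracerbndNO3": {
        "ncvarname": "no3",         # variable name in the source netcdf, renamed to the dict key
        "conversion": 14.0 / 1000,  # lazy multiplication factor applied to the variable
        "unit": "g/m3",             # written to the 'units' attribute after conversion
    },
}
```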


#convert tstart/tstop from str/dt.datetime/pd.Timestamp to pd.Timestamp. WARNING: when supplying '05-04-2016', month-first parsing is assumed, so the result will be 2016-05-04 (May instead of April).
tstart = pd.Timestamp(tstart)
tstop = pd.Timestamp(tstop)
def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=None, refdate_str=None, chunks=None):
"""
empty docstring
"""

if conversion_dict is None:
conversion_dict = get_conversion_dict()

quantity_list = get_quantity_list(quantity=quantity)
ncvarname_list = get_ncvarname_list(quantity_list=quantity_list, conversion_dict=conversion_dict)

dir_pattern = [Path(str(dir_pattern).format(ncvarname=ncvarname)) for ncvarname in ncvarname_list]
file_list_nc = []
Expand All @@ -339,76 +399,30 @@ def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=Non

data_xr = xr.open_mfdataset(file_list_nc, chunks=chunks, join="exact") #TODO: does chunks argument solve "PerformanceWarning: Slicing is producing a large chunk."? {'time':1} is not a convenient chunking to use for timeseries extraction

for k,v in conversion_dict.items():
ncvarn = v['ncvarname']
if ncvarn in data_xr.variables.mapping.keys() and k in quantity_list: #k in quantity_list so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
data_xr = data_xr.rename({ncvarn:k})
print(f'variable {ncvarn} renamed to {k}')

#rename dims and vars time/depth/lat/lon/x/y #TODO: this has to be phased out some time, or made as an argument or merged with conversion_dict?
rename_dims_dict = {'depth':dimn_depth, # depth for CMEMS and many others
'lev':dimn_depth, # lev for GFDL
'lon':'longitude','lat':'latitude'}
for k,v in rename_dims_dict.items():
if k in data_xr.dims and v not in data_xr.dims:
data_xr = data_xr.rename_dims({k:v})
print(f'dimension {k} renamed to {v}')
if k in data_xr.variables and v not in data_xr.variables:
data_xr = data_xr.rename_vars({k:v})
print(f'varname {k} renamed to {v}')
data_xr = ds_apply_conventions(data_xr=data_xr)
data_xr = ds_apply_conversion_dict(data_xr=data_xr, conversion_dict=conversion_dict, quantity_list=quantity_list)

#get calendar and maybe convert_calendar, makes sure that nc_tstart/nc_tstop are of type pd._libs.tslibs.timestamps.Timestamp
data_xr_calendar = data_xr['time'].dt.calendar
if data_xr_calendar != 'proleptic_gregorian': #this is for instance the case for noleap (or 365_day) calendars from GFDL and CMCC
units_copy = data_xr['time'].encoding['units']
print(f'WARNING: calendar different than proleptic_gregorian found ({data_xr_calendar}), convert_calendar is called so check output carefully. It should be no issue for datasets with a monthly interval.')
data_xr = data_xr.convert_calendar('standard') #TODO: does this not result in 29feb nan values in e.g. GFDL model? Check missing argument at https://docs.xarray.dev/en/stable/generated/xarray.Dataset.convert_calendar.html
data_xr['time'].encoding['units'] = units_copy #put back dropped units

check_time_extent(data_xr, tstart, tstop)

#360 to 180 conversion
convert_360to180 = (data_xr['longitude'].to_numpy()>180).any() #TODO: replace to_numpy() with load()
if convert_360to180: #TODO: make more flexible for models that eg pass -180/+180 crossing (add overlap at lon edges).
data_xr.coords['longitude'] = (data_xr.coords['longitude'] + 180) % 360 - 180
data_xr = data_xr.sortby(data_xr['longitude'])

#retrieve var(s) (after potential longitude conversion) (also selecting relevant times)
#retrieve var(s) (after potential longitude conversion)
data_vars = list(data_xr.data_vars)
bool_quanavailable = pd.Series(quantity_list).isin(data_vars)
if not bool_quanavailable.all():
quantity_list_notavailable = pd.Series(quantity_list).loc[~bool_quanavailable].tolist()
raise KeyError(f'quantity {quantity_list_notavailable} not found in netcdf, available are: {data_vars}. Try updating conversion_dict to rename these variables.')
data_xr_vars = data_xr[quantity_list].sel(time=slice(tstart,tstop))
data_xr_vars = data_xr[quantity_list]

# slice time
check_time_extent(data_xr, tstart, tstop)
data_xr_vars = data_xr_vars.sel(time=slice(tstart,tstop))
# check time extent again to avoid issues with eg midday data being
# sliced to midnight times: https://github.com/Deltares/dfm_tools/issues/707
check_time_extent(data_xr_vars, tstart, tstop)

#optional conversion of units. Multiplications or other simple operators do not affect performance (dask.array(getitem) becomes dask.array(mul)). With more complex operations it is better to do them on the interpolated array.
for quan in quantity_list: #TODO: maybe do unit conversion before interp or is that computationally heavy?
if 'conversion' in conversion_dict[quan].keys(): #if conversion is present, unit key must also be in conversion_dict
print(f'> converting units from [{data_xr_vars[quan].attrs["units"]}] to [{conversion_dict[quan]["unit"]}]')
#print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
data_xr_vars[quan] = data_xr_vars[quan] * conversion_dict[quan]['conversion'] #conversion drops all attributes of which units (which are changed anyway)
data_xr_vars[quan].attrs['units'] = conversion_dict[quan]['unit'] #add unit attribute with resulting unit

#optional refdate changing
if refdate_str is not None:
if 'long_name' in data_xr_vars.time.attrs: #for CMEMS it is 'hours since 1950-01-01', which would be wrong now #TODO: consider also removing attrs for depth/varname, since we would like to have salinitybnd/waterlevel instead of Salinity/sea_surface_height in xr plots?
del data_xr_vars.time.attrs['long_name']
data_xr_vars.time.encoding['units'] = refdate_str

if varn_depth in data_xr_vars.coords:
#make positive up
if 'positive' in data_xr_vars[varn_depth].attrs.keys():
if data_xr_vars[varn_depth].attrs['positive'] == 'down': #attribute appears in CMEMS, GFDL and CMCC, safe to assume presence?
data_xr_vars[varn_depth] = -data_xr_vars[varn_depth]
data_xr_vars[varn_depth].attrs['positive'] = 'up'
#optional reversing depth dimension for comparison to coastserv
if reverse_depth:
print('> reversing depth dimension')
data_xr_vars = data_xr_vars.reindex({varn_depth:list(reversed(data_xr_vars[varn_depth]))})

return data_xr_vars


@@ -465,7 +479,7 @@ def interp_uds_to_plipoints(uds:xu.UgridDataset, gdf:geopandas.GeoDataFrame) ->
----------
uds : xu.UgridDataset
dfm model output read using dfm_tools. Dims: mesh2d_nLayers, mesh2d_nInterfaces, time, mesh2d_nNodes, mesh2d_nFaces, mesh2d_nMax_face_nodes, mesh2d_nEdges.
gdf : geopandas.GeoDataFrame (str/path is also supported)
gdf : geopandas.GeoDataFrame
gdf with location, geometry (Point) and crs corresponding to model crs.
Raises
@@ -479,14 +493,13 @@ def interp_uds_to_plipoints(uds:xu.UgridDataset, gdf:geopandas.GeoDataFrame) ->
xr.Dataset with dims: node, time, z.
"""
# TODO: this function requires gdf with points, but the interp_regularnc_to_plipoints requires paths to plifiles (and others also)

facedim = uds.grid.face_dimension
ncbnd_construct = get_ncbnd_construct()
dimn_point = ncbnd_construct['dimn_point']
varn_pointname = ncbnd_construct['varn_pointname']

if isinstance(gdf,(str,Path)): #TODO: align plipoints/gdf with other functions, now three input types are supported, but the interp_regularnc_to_plipoints requires paths to plifiles (and others also)
gdf = PolyFile_to_geodataframe_points(hcdfm.PolyFile(gdf))

ds = uds.ugrid.sel_points(x=gdf.geometry.x, y=gdf.geometry.y)
#TODO: drop mesh2d_face_x and mesh2d_face_y variables

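
Now that the docstring documents only a GeoDataFrame input for `interp_uds_to_plipoints`, a hedged sketch of building it explicitly (the map file, point names and coordinates below are hypothetical):

```python
import geopandas as gpd
from shapely.geometry import Point
import dfm_tools as dfmt

# two hypothetical boundary support points, with coordinates in the model crs
gdf = gpd.GeoDataFrame(
    {"location": ["bnd_0001", "bnd_0002"]},
    geometry=[Point(3.2, 51.8), Point(3.4, 51.9)],
    crs="EPSG:4326",
)
uds = dfmt.open_partitioned_dataset("model_map.nc")         # hypothetical dfm map output
ds_points = dfmt.interp_uds_to_plipoints(uds=uds, gdf=gdf)  # returns xr.Dataset with dims node, time, z
```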
6 changes: 4 additions & 2 deletions dfm_tools/xugrid_helpers.py
@@ -259,6 +259,7 @@ def open_partitioned_dataset(file_nc, decode_fillvals=False, remove_edges=True,


def open_dataset_curvilinear(file_nc,
varn_lon='longitude',
varn_vert_lon='vertices_longitude', #'grid_x'
varn_vert_lat='vertices_latitude', #'grid_y'
ij_dims=['i','j'], #['N','M']
@@ -282,9 +283,10 @@ def open_dataset_curvilinear(file_nc,
vertices_latitude = vertices_latitude.reshape(-1,vertices_latitude.shape[-1])
print(f'{(dt.datetime.now()-dtstart).total_seconds():.2f} sec')

#convert from 0to360 to -180 to 180
# convert from 0to360 to -180 to 180
if convert_360to180:
vertices_longitude = (vertices_longitude+180)%360 - 180 #TODO: check if uds.ugrid.to_nonperiodic() still works properly after doing this
vertices_longitude = (vertices_longitude+180)%360 - 180
ds[varn_lon] = (ds[varn_lon] + 180) % 360 - 180

# face_xy = np.stack([longitude,latitude],axis=-1)
# face_coords_x, face_coords_y = face_xy.T
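
The same wrap-around formula is now applied to both the vertex arrays and the longitude variable; a quick numerical check of what it does (values are illustrative):

```python
import numpy as np

lon_0to360 = np.array([0.0, 90.0, 179.5, 180.5, 359.75])
lon_180 = (lon_0to360 + 180) % 360 - 180
print(lon_180)  # -> [0., 90., 179.5, -179.5, -0.25]
```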
9 changes: 9 additions & 0 deletions docs/whats-new.md
@@ -1,5 +1,14 @@
# What's new

## UNRELEASED

### Feat
- making `dfmt.open_dataset_extra()` more modular by partly moving it to separate private functions in [#913](https://github.com/Deltares/dfm_tools/pull/913)

### Fix
- also apply convert_360to180 to longitude variable in `dfmt.open_dataset_curvilinear()` in [#913](https://github.com/Deltares/dfm_tools/pull/913)


## 0.24.0 (2024-07-12)

### Feat
2 changes: 1 addition & 1 deletion tests/examples/preprocess_interpolate_nc_to_bc.py
@@ -26,7 +26,7 @@
#quantities should be in conversion_dict.keys(). waterlevelbnd is steric/zos, tide is tidal components from FES/EOT
list_quantities = ['waterlevelbnd','salinitybnd','temperaturebnd','uxuyadvectionvelocitybnd','tracerbndNO3','tide']
#list_quantities = ['waterlevelbnd','salinitybnd','temperaturebnd','tracerbndNO3']
list_quantities = ['salinitybnd','tracerbndNO3','tide']
list_quantities = ['salinitybnd','tracerbndNO3']#,'tide'] # TODO: tide currently fails: https://github.com/Deltares/dfm_tools/issues/905
#list_quantities = ['waterlevelbnd','salinitybnd','temperaturebnd','uxuyadvectionvelocitybnd','tracerbndNO3','tracerbndOpal','tracerbndDON','tide'] #also waq vars with same ncvarname, opal not available for GFDL and CMCC

model = 'CMEMS' #CMEMS GFDL
