diff --git a/dfm_tools/interpolate_grid2bnd.py b/dfm_tools/interpolate_grid2bnd.py
index 6edbad1e0..2ff55853a 100644
--- a/dfm_tools/interpolate_grid2bnd.py
+++ b/dfm_tools/interpolate_grid2bnd.py
@@ -286,6 +286,7 @@ def extract_component(ds):
 
 
 def check_time_extent(data_xr, tstart, tstop):
+    #convert tstart/tstop from str/dt.datetime/pd.Timestamp to pd.Timestamp. WARNING: when supplying '05-04-2016', monthfirst is assumed, so 2016-05-04 will be the result (May instead of April). So always supply yyyy-mm-dd datestrings
     tstart = pd.Timestamp(tstart)
     tstop = pd.Timestamp(tstop)
 
@@ -300,35 +301,94 @@ def check_time_extent(data_xr, tstart, tstop):
         raise OutOfRangeError(f'requested tstop {tstop} outside of available range {nc_tstart} to {nc_tstop}.')
 
 
-def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=None, refdate_str=None, reverse_depth=False, chunks=None):
-    """
-    empty docstring
-    """
-
+def ds_apply_conventions(data_xr):
     ncbnd_construct = get_ncbnd_construct()
     dimn_depth = ncbnd_construct['dimn_depth']
     varn_depth = ncbnd_construct['varn_depth']
 
-    if conversion_dict is None:
-        conversion_dict = get_conversion_dict()
+    #rename dims and vars time/depth/lat/lon/x/y #TODO: this has to be phased out some time, or made as an argument or merged with conversion_dict?
+    rename_dims_dict = {'depth':dimn_depth, # depth for CMEMS and many others
+                        'lev':dimn_depth, # lev for GFDL
+                        'lon':'longitude','lat':'latitude'}
+    for k,v in rename_dims_dict.items():
+        if k in data_xr.dims and v not in data_xr.dims:
+            data_xr = data_xr.rename_dims({k:v})
+            print(f'dimension {k} renamed to {v}')
+        if k in data_xr.variables and v not in data_xr.variables:
+            data_xr = data_xr.rename_vars({k:v})
+            print(f'varname {k} renamed to {v}')
+
+    # make depth positive up if not yet the case
+    if varn_depth in data_xr.coords:
+        if 'positive' in data_xr[varn_depth].attrs.keys():
+            if data_xr[varn_depth].attrs['positive'] == 'down': #attribute appears in CMEMS, GFDL and CMCC, safe to assume presence?
+                data_xr[varn_depth] = -data_xr[varn_depth]
+                data_xr[varn_depth].attrs['positive'] = 'up'
+
+    # get calendar and maybe convert_calendar, makes sure that nc_tstart/nc_tstop are of type pd._libs.tslibs.timestamps.Timestamp
+    data_xr_calendar = data_xr['time'].dt.calendar
+    if data_xr_calendar != 'proleptic_gregorian': #this is for instance the case in case of noleap (or 365_days) calendars from GFDL and CMCC
+        units_copy = data_xr['time'].encoding['units']
+        print(f'WARNING: calendar different than proleptic_gregorian found ({data_xr_calendar}), convert_calendar is called so check output carefully. It should be no issue for datasets with a monthly interval.')
+        data_xr = data_xr.convert_calendar('standard') #TODO: does this not result in 29feb nan values in e.g. GFDL model? Check missing argument at https://docs.xarray.dev/en/stable/generated/xarray.Dataset.convert_calendar.html
+        data_xr['time'].encoding['units'] = units_copy #put back dropped units
+    # 360 to 180 conversion
+    convert_360to180 = (data_xr['longitude'].to_numpy()>180).any() #TODO: replace to_numpy() with load()
+    if convert_360to180: #TODO: make more flexible for models that eg pass -180/+180 crossing (add overlap at lon edges).
+        data_xr.coords['longitude'] = (data_xr.coords['longitude'] + 180) % 360 - 180
+        data_xr = data_xr.sortby(data_xr['longitude'])
+    return data_xr
+
+
+def ds_apply_conversion_dict(data_xr, conversion_dict, quantity_list):
+    # rename variables from conversion_dict
+    for k,v in conversion_dict.items():
+        ncvarn = v['ncvarname']
+        if ncvarn in data_xr.variables.mapping.keys() and k in quantity_list: #k in quantity_list so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
+            data_xr = data_xr.rename({ncvarn:k})
+            print(f'variable {ncvarn} renamed to {k}')
+
+    # optional conversion of units. Multiplications or other simple operators do not affect performance (dask.array(getitem) becomes dask.array(mul)). With more complex operations it is better to do it on the interpolated array.
+    for quan in quantity_list: #TODO: maybe do unit conversion before interp or is that computationally heavy?
+        if 'conversion' in conversion_dict[quan].keys(): #if conversion is present, unit key must also be in conversion_dict
+            print(f'> converting units from [{data_xr[quan].attrs["units"]}] to [{conversion_dict[quan]["unit"]}]')
+            #print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
+            data_xr[quan] = data_xr[quan] * conversion_dict[quan]['conversion'] #conversion drops all attributes of which units (which are changed anyway)
+            data_xr[quan].attrs['units'] = conversion_dict[quan]['unit'] #add unit attribute with resulting unit
+    return data_xr
+
+
+def get_quantity_list(quantity):
     if quantity=='uxuyadvectionvelocitybnd': #T3Dvector
         quantity_list = ['ux','uy']
     elif isinstance(quantity, list):
         quantity_list = quantity
     else:
         quantity_list = [quantity]
-
+    return quantity_list
+
+
+def get_ncvarname_list(quantity_list, conversion_dict):
     # check existence of requested keys in conversion_dict
     for quan in quantity_list:
         if quan not in conversion_dict.keys():
             raise KeyError(f"quantity '{quan}' not in conversion_dict, (case sensitive) options are: {str(list(conversion_dict.keys()))}")
     ncvarname_list = [conversion_dict[quan]['ncvarname'] for quan in quantity_list]
+    return ncvarname_list
+
 
-    #convert tstart/tstop from str/dt.datetime/pd.Timestamp to pd.Timestamp. WARNING: when supplying '05-04-2016', monthfirst is assumed, so 2016-05-04 will be the result (may instead of april).
-    tstart = pd.Timestamp(tstart)
-    tstop = pd.Timestamp(tstop)
+def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=None, refdate_str=None, chunks=None):
+    """
+    empty docstring
+    """
+
+    if conversion_dict is None:
+        conversion_dict = get_conversion_dict()
+
+    quantity_list = get_quantity_list(quantity=quantity)
+    ncvarname_list = get_ncvarname_list(quantity_list=quantity_list, conversion_dict=conversion_dict)
 
     dir_pattern = [Path(str(dir_pattern).format(ncvarname=ncvarname)) for ncvarname in ncvarname_list]
     file_list_nc = []
@@ -339,76 +399,30 @@ def open_dataset_extra(dir_pattern, quantity, tstart, tstop, conversion_dict=Non
     data_xr = xr.open_mfdataset(file_list_nc, chunks=chunks, join="exact") #TODO: does chunks argument solve "PerformanceWarning: Slicing is producing a large chunk."? {'time':1} is not a convenient chunking to use for timeseries extraction
 
-    for k,v in conversion_dict.items():
-        ncvarn = v['ncvarname']
-        if ncvarn in data_xr.variables.mapping.keys() and k in quantity_list: #k in quantity_list so phyc is not always renamed to tracerbndPON1 (first in conversion_dict)
-            data_xr = data_xr.rename({ncvarn:k})
-            print(f'variable {ncvarn} renamed to {k}')
-
-    #rename dims and vars time/depth/lat/lon/x/y #TODO: this has to be phased out some time, or made as an argument or merged with conversion_dict?
-    rename_dims_dict = {'depth':dimn_depth, # depth for CMEMS and many others
-                        'lev':dimn_depth, # lev for GFDL
-                        'lon':'longitude','lat':'latitude'}
-    for k,v in rename_dims_dict.items():
-        if k in data_xr.dims and v not in data_xr.dims:
-            data_xr = data_xr.rename_dims({k:v})
-            print(f'dimension {k} renamed to {v}')
-        if k in data_xr.variables and v not in data_xr.variables:
-            data_xr = data_xr.rename_vars({k:v})
-            print(f'varname {k} renamed to {v}')
+    data_xr = ds_apply_conventions(data_xr=data_xr)
+    data_xr = ds_apply_conversion_dict(data_xr=data_xr, conversion_dict=conversion_dict, quantity_list=quantity_list)
 
-    #get calendar and maybe convert_calendar, makes sure that nc_tstart/nc_tstop are of type pd._libs.tslibs.timestamps.Timestamp
-    data_xr_calendar = data_xr['time'].dt.calendar
-    if data_xr_calendar != 'proleptic_gregorian': #this is for instance the case in case of noleap (or 365_days) calendars from GFDL and CMCC
-        units_copy = data_xr['time'].encoding['units']
-        print(f'WARNING: calendar different than proleptic_gregorian found ({data_xr_calendar}), convert_calendar is called so check output carefully. It should be no issue for datasets with a monthly interval.')
-        data_xr = data_xr.convert_calendar('standard') #TODO: does this not result in 29feb nan values in e.g. GFDL model? Check missing argument at https://docs.xarray.dev/en/stable/generated/xarray.Dataset.convert_calendar.html
-        data_xr['time'].encoding['units'] = units_copy #put back dropped units
-
-    check_time_extent(data_xr, tstart, tstop)
-
-    #360 to 180 conversion
-    convert_360to180 = (data_xr['longitude'].to_numpy()>180).any() #TODO: replace to_numpy() with load()
-    if convert_360to180: #TODO: make more flexible for models that eg pass -180/+180 crossing (add overlap at lon edges).
-        data_xr.coords['longitude'] = (data_xr.coords['longitude'] + 180) % 360 - 180
-        data_xr = data_xr.sortby(data_xr['longitude'])
-
-    #retrieve var(s) (after potential longitude conversion) (also selecting relevant times)
+    #retrieve var(s) (after potential longitude conversion)
     data_vars = list(data_xr.data_vars)
     bool_quanavailable = pd.Series(quantity_list).isin(data_vars)
     if not bool_quanavailable.all():
         quantity_list_notavailable = pd.Series(quantity_list).loc[~bool_quanavailable].tolist()
         raise KeyError(f'quantity {quantity_list_notavailable} not found in netcdf, available are: {data_vars}. Try updating conversion_dict to rename these variables.')
-    data_xr_vars = data_xr[quantity_list].sel(time=slice(tstart,tstop))
+    data_xr_vars = data_xr[quantity_list]
+
+    # slice time
+    check_time_extent(data_xr, tstart, tstop)
+    data_xr_vars = data_xr_vars.sel(time=slice(tstart,tstop))
     # check time extent again to avoid issues with eg midday data being
     # sliced to midnight times: https://github.com/Deltares/dfm_tools/issues/707
     check_time_extent(data_xr_vars, tstart, tstop)
 
-    #optional conversion of units. Multiplications or other simple operatiors do not affect performance (dask.array(getitem) becomes dask.array(mul). With more complex operation it is better do do it on the interpolated array.
-    for quan in quantity_list: #TODO: maybe do unit conversion before interp or is that computationally heavy?
-        if 'conversion' in conversion_dict[quan].keys(): #if conversion is present, unit key must also be in conversion_dict
-            print(f'> converting units from [{data_xr_vars[quan].attrs["units"]}] to [{conversion_dict[quan]["unit"]}]')
-            #print(f'attrs are discarded:\n{data_xr_vars[quan].attrs}')
-            data_xr_vars[quan] = data_xr_vars[quan] * conversion_dict[quan]['conversion'] #conversion drops all attributes of which units (which are changed anyway)
-            data_xr_vars[quan].attrs['units'] = conversion_dict[quan]['unit'] #add unit attribute with resulting unit
-
     #optional refdate changing
     if refdate_str is not None:
         if 'long_name' in data_xr_vars.time.attrs: #for CMEMS it is 'hours since 1950-01-01', which would be wrong now #TODO: consider also removing attrs for depth/varname, since we would like to have salinitybnd/waterlevel instead of Salinity/sea_surface_height in xr plots?
             del data_xr_vars.time.attrs['long_name']
         data_xr_vars.time.encoding['units'] = refdate_str
-    if varn_depth in data_xr_vars.coords:
-        #make positive up
-        if 'positive' in data_xr_vars[varn_depth].attrs.keys():
-            if data_xr_vars[varn_depth].attrs['positive'] == 'down': #attribute appears in CMEMS, GFDL and CMCC, save to assume presence?
-                data_xr_vars[varn_depth] = -data_xr_vars[varn_depth]
-                data_xr_vars[varn_depth].attrs['positive'] = 'up'
-    #optional reversing depth dimension for comparison to coastserv
-    if reverse_depth:
-        print('> reversing depth dimension')
-        data_xr_vars = data_xr_vars.reindex({varn_depth:list(reversed(data_xr_vars[varn_depth]))})
-
     return data_xr_vars
 
@@ -465,7 +479,7 @@ def interp_uds_to_plipoints(uds:xu.UgridDataset, gdf:geopandas.GeoDataFrame) ->
     ----------
     uds : xu.UgridDataset
         dfm model output read using dfm_tools. Dims: mesh2d_nLayers, mesh2d_nInterfaces, time, mesh2d_nNodes, mesh2d_nFaces, mesh2d_nMax_face_nodes, mesh2d_nEdges.
-    gdf : geopandas.GeoDataFrame (str/path is also supported)
+    gdf : geopandas.GeoDataFrame
         gdf with location, geometry (Point) and crs corresponding to model crs.
 
     Raises
@@ -479,14 +493,13 @@ def interp_uds_to_plipoints(uds:xu.UgridDataset, gdf:geopandas.GeoDataFrame) ->
         xr.Dataset with dims: node, time, z.
""" + # TODO: this function requires gdf with points, but the interp_regularnc_to_plipoints requires paths to plifiles (and others also) + facedim = uds.grid.face_dimension ncbnd_construct = get_ncbnd_construct() dimn_point = ncbnd_construct['dimn_point'] varn_pointname = ncbnd_construct['varn_pointname'] - if isinstance(gdf,(str,Path)): #TODO: align plipoints/gdf with other functions, now three input types are supported, but the interp_regularnc_to_plipoints requires paths to plifiles (and others also) - gdf = PolyFile_to_geodataframe_points(hcdfm.PolyFile(gdf)) - ds = uds.ugrid.sel_points(x=gdf.geometry.x, y=gdf.geometry.y) #TODO: drop mesh2d_face_x and mesh2d_face_y variables diff --git a/dfm_tools/xugrid_helpers.py b/dfm_tools/xugrid_helpers.py index 5e7eb3dca..538a2e112 100644 --- a/dfm_tools/xugrid_helpers.py +++ b/dfm_tools/xugrid_helpers.py @@ -259,6 +259,7 @@ def open_partitioned_dataset(file_nc, decode_fillvals=False, remove_edges=True, def open_dataset_curvilinear(file_nc, + varn_lon='longitude', varn_vert_lon='vertices_longitude', #'grid_x' varn_vert_lat='vertices_latitude', #'grid_y' ij_dims=['i','j'], #['N','M'] @@ -282,9 +283,10 @@ def open_dataset_curvilinear(file_nc, vertices_latitude = vertices_latitude.reshape(-1,vertices_latitude.shape[-1]) print(f'{(dt.datetime.now()-dtstart).total_seconds():.2f} sec') - #convert from 0to360 to -180 to 180 + # convert from 0to360 to -180 to 180 if convert_360to180: - vertices_longitude = (vertices_longitude+180)%360 - 180 #TODO: check if uds.ugrid.to_nonperiodic() still works properly after doing this + vertices_longitude = (vertices_longitude+180)%360 - 180 + ds[varn_lon] = (ds[varn_lon] + 180) % 360 - 180 # face_xy = np.stack([longitude,latitude],axis=-1) # face_coords_x, face_coords_y = face_xy.T diff --git a/docs/whats-new.md b/docs/whats-new.md index 6791fa63a..f938c192b 100644 --- a/docs/whats-new.md +++ b/docs/whats-new.md @@ -1,5 +1,14 @@ # What's new +## UNRELEASED + +### Feat +- making `dfmt.open_dataset_extra()` more modular by partly moving it to separate private functions in [#913](https://github.com/Deltares/dfm_tools/pull/913) + +### Fix +- also apply convert_360to180 to longitude variable in `dfmt.open_dataset_curvilinear()` in [#913](https://github.com/Deltares/dfm_tools/pull/913) + + ## 0.24.0 (2024-07-12) ### Feat diff --git a/tests/examples/preprocess_interpolate_nc_to_bc.py b/tests/examples/preprocess_interpolate_nc_to_bc.py index 9dc8a0ff4..a2a8fd9f6 100644 --- a/tests/examples/preprocess_interpolate_nc_to_bc.py +++ b/tests/examples/preprocess_interpolate_nc_to_bc.py @@ -26,7 +26,7 @@ #quantities should be in conversion_dict.keys(). 
waterlevelbnd is steric/zos, tide is tidal components from FES/EOT list_quantities = ['waterlevelbnd','salinitybnd','temperaturebnd','uxuyadvectionvelocitybnd','tracerbndNO3','tide'] #list_quantities = ['waterlevelbnd','salinitybnd','temperaturebnd','tracerbndNO3'] -list_quantities = ['salinitybnd','tracerbndNO3','tide'] +list_quantities = ['salinitybnd','tracerbndNO3']#,'tide'] # TODO: tide currently fails: https://github.com/Deltares/dfm_tools/issues/905 #list_quantities = ['waterlevelbnd','salinitybnd','temperaturebnd','uxuyadvectionvelocitybnd','tracerbndNO3','tracerbndOpal','tracerbndDON','tide'] #also waq vars with same ncvarname, opal not available for GFDL and CMCC model = 'CMEMS' #CMEMS GFDL diff --git a/tests/test_interpolate_grid2bnd.py b/tests/test_interpolate_grid2bnd.py index fba887026..ff5bda5c9 100644 --- a/tests/test_interpolate_grid2bnd.py +++ b/tests/test_interpolate_grid2bnd.py @@ -20,6 +20,10 @@ interp_regularnc_to_plipointsDataset, check_time_extent, ext_add_boundary_object_per_polyline, + get_quantity_list, + get_ncvarname_list, + ds_apply_conventions, + ds_apply_conversion_dict, ) from dfm_tools.hydrolib_helpers import PolyFile_to_geodataframe_points import hydrolib.core.dflowfm as hcdfm @@ -251,14 +255,48 @@ def test_open_dataset_extra_correctdepths(tmp_path): ds_moretime.to_netcdf(file_nc) ds_moretime_import = dfmt.open_dataset_extra(dir_pattern=file_nc, quantity='salinitybnd', tstart='2020-01-01 12:00:00', tstop='2020-01-02 12:00:00') + assert len(ds_moretime_import.time) == 2 + + +@pytest.mark.unittest +def test_ds_apply_conventions(): + # generate datset with depths defined positive down + ds_moretime = cmems_dataset_4times() + ds_moretime['depth'] = -1 * ds_moretime['depth'] + ds_moretime['depth'].attrs['positive'] = 'down' + ds_converted = ds_apply_conventions(data_xr=ds_moretime) ncbnd_construct = get_ncbnd_construct() varn_depth = ncbnd_construct['varn_depth'] - depth_actual = ds_moretime_import[varn_depth].to_numpy() - depth_expected = ds_moretime['depth'].to_numpy() + dimn_depth = ncbnd_construct['dimn_depth'] + assert varn_depth in ds_converted.variables + assert dimn_depth in ds_converted.dims - assert (np.abs(depth_actual - depth_expected) < 1e-9).all() - assert len(ds_moretime_import.time) == 2 + depth_expected = np.array([-0.494025, -1.541375, -2.645669, -3.819495, -5.078224]) + depth_actual = ds_converted[varn_depth].to_numpy() + assert np.allclose(depth_actual, depth_expected) + + +@pytest.mark.unittest +def test_ds_apply_conversion_dict_rename(): + conversion_dict = dfmt.get_conversion_dict() + ds_moretime = cmems_dataset_4times() + ds_converted = ds_apply_conversion_dict(data_xr=ds_moretime, conversion_dict=conversion_dict, quantity_list=['salinitybnd']) + assert 'so' in ds_moretime.data_vars + assert 'salinitybnd' in ds_converted.data_vars + assert np.allclose(ds_converted['salinitybnd'], ds_moretime['so'], equal_nan=True) + + +@pytest.mark.unittest +def test_ds_apply_conversion_dict_rename_and_factor(): + conversion_dict = dfmt.get_conversion_dict() + ds_moretime = cmems_dataset_4times() + ds_moretime = ds_moretime.rename_vars({'so':'o2'}) + ds_moretime['o2'] = ds_moretime['o2'].assign_attrs({'units':'dummy'}) + ds_converted = ds_apply_conversion_dict(data_xr=ds_moretime, conversion_dict=conversion_dict, quantity_list=['tracerbndOXY']) + assert 'o2' in ds_moretime.data_vars + assert 'tracerbndOXY' in ds_converted.data_vars + assert np.allclose(ds_converted['tracerbndOXY'], ds_moretime['o2']*0.032, equal_nan=True) @pytest.mark.unittest @@ -604,3 
+642,24 @@ def test_ext_add_boundary_object_per_polyline_wrong_name(tmp_path): with pytest.raises(ValueError) as e: ext_add_boundary_object_per_polyline(ext_new=ext_new, boundary_object=boundary_object) assert "The names of one of the polylines in the polyfile is the same as the polyfilename" in str(e.value) + + +def test_get_quantity_list(): + quantity_list = get_quantity_list('uxuyadvectionvelocitybnd') + assert quantity_list == ['ux','uy'] + quantity_list = get_quantity_list('salinitybnd') + assert quantity_list == ['salinitybnd'] + quantity_list = get_quantity_list(['salinitybnd']) + assert quantity_list == ['salinitybnd'] + + +def test_get_ncvarname_list(): + conversion_dict = dfmt.get_conversion_dict() + + ncvarname_list = get_ncvarname_list(quantity_list=['salinitybnd'], conversion_dict=conversion_dict) + assert ncvarname_list == ["so"] + + with pytest.raises(KeyError) as e: + get_ncvarname_list(quantity_list=['nonexistingbnd'], conversion_dict=conversion_dict) + assert "quantity 'nonexistingbnd' not in conversion_dict" in str(e.value) +
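Usage note (illustrative, not part of the patch above): the sketch below shows how the helpers split out of open_dataset_extra() compose, mirroring what that function now does internally. The helper functions, their keyword arguments and dfmt.get_conversion_dict() are taken from this diff and its tests; the CMEMS file pattern 'cmems_so_*.nc' is a hypothetical example input.

import xarray as xr
import dfm_tools as dfmt
from dfm_tools.interpolate_grid2bnd import (
    get_quantity_list,
    get_ncvarname_list,
    ds_apply_conventions,
    ds_apply_conversion_dict,
)

conversion_dict = dfmt.get_conversion_dict()
quantity_list = get_quantity_list(quantity='salinitybnd')  # ['salinitybnd']
ncvarname_list = get_ncvarname_list(quantity_list=quantity_list,
                                    conversion_dict=conversion_dict)  # ['so']

# hypothetical CMEMS files containing the 'so' variable
data_xr = xr.open_mfdataset('cmems_so_*.nc')
# rename dims/vars, make depth positive up, convert 0/360 to -180/180 longitudes
data_xr = ds_apply_conventions(data_xr=data_xr)
# rename 'so' to 'salinitybnd' and convert units where conversion_dict defines a factor
data_xr = ds_apply_conversion_dict(data_xr=data_xr,
                                   conversion_dict=conversion_dict,
                                   quantity_list=quantity_list)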