From c834d16794a3ec6c2c6da5d456489f2212814880 Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 11:17:49 +0100 Subject: [PATCH 01/13] Updated filter fn docstrings + added list_filters --- openghg_inversions/filters.py | 494 ++++++++++++++++++++-------------- 1 file changed, 286 insertions(+), 208 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 1caff42c..8e4cea01 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -1,10 +1,64 @@ +""" +Functions for filtering data. + +All filters are accessed via the `filtering` function. + +New filters are registered using `@register_filter`. + +To see the available filters call `list_filters`. +""" +from typing import Callable, Union + import numpy as np import pandas as pd import xarray as xr from openghg_inversions.utils import combine_datasets -def filtering(datasets_in, filters, keep_missing=False): + +# this dictionary will be populated by using the decorator `register_filter` +filtering_functions = {} + + +def register_filter(filt: Callable) -> Callable: + """Decorator function to register filters + + Args: + filt: filter function to register + + Returns: + filt, the input function (no modifications made) + + + For instance, the following use of `register_filter` as a decorator adds `my_new_filter` + to the `filtering_functions` dictionary, under the key "my_new_filter": + + >>> @register_filter + def my_new_filter(data): + return data + >>> "my_new_filter" in filtering_functions + True + """ + filtering_functions[filt.__name__] = filt + return filt + + +def list_filters() -> None: + """Print a list of the available filters with a short description.""" + spacing = max([len(k) for k in filtering_functions]) + 4 + + print("All available filters:") + for k, v in filtering_functions.items(): + # print function name and first line of docstring + try: + first_line_of_docstring = v.__doc__.strip().split('\n')[0] + except AttributeError: + 
first_line_of_docstring = "No docstring" + + print(f"\t{k:{spacing}}{first_line_of_docstring}") + + +def filtering(datasets_in: dict, filters: Union[dict[str, list[str]], list[str]], keep_missing: bool = False) -> dict: """ Applies time filtering to entire dataset. Filters supplied in a list and then applied in order. @@ -19,26 +73,27 @@ def filtering(datasets_in, filters, keep_missing=False): ["daytime","daily_median"] ----------------------------------- Args: - datasets_in (dict): - Output from ModelScenario.footprints_merge(). Dictionary of datasets. - filters (list): - Filters to apply to the datasets. - All options are: - "daytime" : selects data between 1100 and 1500 local solar time - "daytime9to5" : selects data between 0900 and 1700 local solar time - "nighttime" : Only b/w 23:00 - 03:00 inclusive - "noon" : Only 12:00 fp and obs used - "daily_median" : calculates the daily median - "pblh_min" : Only keeps times when pblh is > threshold (default 200m) - "pblh_inlet_diff" : Only keeps times when inlet is at least a threshold (default 50m) below the pblh - "local_influence" : Only keep times when localness is low - "six_hr_mean" : - "local_lapse" : - keep_missing (bool) : Whether to reindex to retain missing data. + datasets_in: dictionary of datasets containing output from ModelScenario.footprints_merge(). + filters: filters to apply to the datasets. Either a list of filters, which will be applied to every site, + or a dictionary of lists of the form {: [filter1, filter2, ...]}, with specific filters to + be applied at each site. 
+ + All options for filters are: + "daytime" : selects data between 1100 and 1500 local solar time + "daytime9to5" : selects data between 0900 and 1700 local solar time + "nighttime" : Only b/w 23:00 - 03:00 inclusive + "noon" : Only 12:00 fp and obs used + "daily_median" : calculates the daily median + "pblh_min" : Only keeps times when pblh is > threshold (default 200m) + "pblh_inlet_diff" : Only keeps times when inlet is at least a threshold (default 50m) below the pblh + "local_influence" : Only keep times when localness is low + "six_hr_mean" : + "local_lapse" : + keep_missing: if True, drop missing data ) Returns: - Same format as datasets_in : Datasets with filters applied. - ----------------------------------- + dict in same format as datasets_in, with filters applied + """ # Get list of sites sites = [key for key in list(datasets_in.keys()) if key[0] != "."] @@ -61,220 +116,243 @@ def filtering(datasets_in, filters, keep_missing=False): datasets = datasets_in.copy() - def local_solar_time(dataset): - """ - Returns hour of day as a function of local solar time - relative to the Greenwich Meridian. 
- """ - sitelon = dataset.release_lon.values[0] - # convert lon to [-180,180], so time offset is negative west of 0 degrees - if sitelon > 180: - sitelon = sitelon - 360.0 - dataset["time"] = dataset.time + pd.Timedelta(minutes=float(24 * 60 * sitelon / 360.0)) - hours = dataset.time.to_pandas().index.hour - return hours - - def local_ratio(dataset): - """ - Calculates the local ratio in the surrounding grid cells - """ - dlon = dataset.lon[1].values - dataset.lon[0].values - dlat = dataset.lat[1].values - dataset.lat[0].values - local_sum = np.zeros((len(dataset.mf))) - - for ti in range(len(dataset.mf)): - release_lon = dataset.release_lon[ti].values - release_lat = dataset.release_lat[ti].values - wh_rlon = np.where(abs(dataset.lon.values - release_lon) < dlon / 2.0) - wh_rlat = np.where(abs(dataset.lat.values - release_lat) < dlat / 2.0) - if np.any(wh_rlon[0]) and np.any(wh_rlat[0]): - local_sum[ti] = np.sum( - dataset.fp[ - wh_rlat[0][0] - 2 : wh_rlat[0][0] + 3, wh_rlon[0][0] - 2 : wh_rlon[0][0] + 3, ti - ].values - ) / np.sum(dataset.fp[:, :, ti].values) - else: - local_sum[ti] = 0.0 - - return local_sum - - # Filter functions - def daily_median(dataset, keep_missing=False): - """Calculate daily median""" - if keep_missing: - return dataset.resample(indexer={"time": "1D"}).median() - else: - return dataset.resample(indexer={"time": "1D"}).median().dropna(dim="time") - def six_hr_mean(dataset, keep_missing=False): - """Calculate six-hour median""" - if keep_missing: - return dataset.resample(indexer={"time": "6H"}).mean() - else: - return dataset.resample(indexer={"time": "6H"}).mean().dropna(dim="time") + # filtering_functions = { + # "daily_median": daily_median, + # "daytime": daytime, + # "daytime9to5": daytime9to5, + # "nighttime": nighttime, + # "noon": noon, + # "local_influence": local_influence, + # "six_hr_mean": six_hr_mean, + # "pblh_inlet_diff": pblh_inlet_diff, + # "pblh_min": pblh_min, + # "pblh": pblh, + # } - def daytime(dataset, site, 
keep_missing=False): - """Subset during daytime hours (11:00-15:00)""" - hours = local_solar_time(dataset) - ti = [i for i, h in enumerate(hours) if h >= 11 and h <= 15] + # Apply filtering + for site in sites: + if filters[site] is not None: + for filt in filters[site]: + n_nofilter = datasets[site].time.values.shape[0] + if filt in ["daily_median", "six_hr_mean", "pblh_inlet_diff", "pblh_min", "pblh"]: + datasets[site] = filtering_functions[filt](datasets[site], keep_missing=keep_missing) + else: + datasets[site] = filtering_functions[filt](datasets[site], site, keep_missing=keep_missing) + n_filter = datasets[site].time.values.shape[0] + n_dropped = n_nofilter - n_filter + perc_dropped = np.round(n_dropped / n_nofilter * 100, 2) + print(f"{filt} filter removed {n_dropped} ({perc_dropped} %) obs at site {site}") - if keep_missing: - dataset_temp = dataset[dict(time=ti)] - dataset_out = dataset_temp.reindex_like(dataset) - return dataset_out - else: - return dataset[dict(time=ti)] + return datasets - def daytime9to5(dataset, site, keep_missing=False): - """Subset during daytime hours (9:00-17:00)""" - hours = local_solar_time(dataset) - ti = [i for i, h in enumerate(hours) if h >= 9 and h <= 17] - if keep_missing: - dataset_temp = dataset[dict(time=ti)] - dataset_out = dataset_temp.reindex_like(dataset) - return dataset_out - else: - return dataset[dict(time=ti)] +@register_filter +def local_solar_time(dataset): + """ + Returns hour of day as a function of local solar time relative to the Greenwich Meridian. 
+ """ + sitelon = dataset.release_lon.values[0] + # convert lon to [-180,180], so time offset is negative west of 0 degrees + if sitelon > 180: + sitelon = sitelon - 360.0 + dataset["time"] = dataset.time + pd.Timedelta(minutes=float(24 * 60 * sitelon / 360.0)) + hours = dataset.time.to_pandas().index.hour + return hours - def nighttime(dataset, site, keep_missing=False): - """Subset during nighttime hours (23:00 - 03:00)""" - hours = local_solar_time(dataset) - ti = [i for i, h in enumerate(hours) if h >= 23 or h <= 3] - if keep_missing: - dataset_temp = dataset[dict(time=ti)] - dataset_out = dataset_temp.reindex_like(dataset) - return dataset_out +@register_filter +def local_ratio(dataset): + """ + Calculates the local ratio in the surrounding grid cells + """ + dlon = dataset.lon[1].values - dataset.lon[0].values + dlat = dataset.lat[1].values - dataset.lat[0].values + local_sum = np.zeros((len(dataset.mf))) + + for ti in range(len(dataset.mf)): + release_lon = dataset.release_lon[ti].values + release_lat = dataset.release_lat[ti].values + wh_rlon = np.where(abs(dataset.lon.values - release_lon) < dlon / 2.0) + wh_rlat = np.where(abs(dataset.lat.values - release_lat) < dlat / 2.0) + if np.any(wh_rlon[0]) and np.any(wh_rlat[0]): + local_sum[ti] = np.sum( + dataset.fp[ + wh_rlat[0][0] - 2 : wh_rlat[0][0] + 3, wh_rlon[0][0] - 2 : wh_rlon[0][0] + 3, ti + ].values + ) / np.sum(dataset.fp[:, :, ti].values) else: - return dataset[dict(time=ti)] + local_sum[ti] = 0.0 - def noon(dataset, site, keep_missing=False): - """Select only 12pm data""" - hours = local_solar_time(dataset) - ti = [i for i, h in enumerate(hours) if h == 12] + return local_sum - if keep_missing: - dataset_temp = dataset[dict(time=ti)] - dataset_out = dataset_temp.reindex_like(dataset) - return dataset_out - else: - return dataset[dict(time=ti)] - - def local_influence(dataset, site, keep_missing=False): - """ - Subset for times when local influence is below threshold. 
- Local influence expressed as a fraction of the sum of entire footprint domain. - """ - if not dataset.filter_by_attrs(standard_name="local_ratio"): - lr = local_ratio(dataset) - else: - lr = dataset.local_ratio - pc = 0.1 - ti = [i for i, local_ratio in enumerate(lr) if local_ratio <= pc] - if keep_missing is True: - mf_data_array = dataset.mf - dataset_temp = dataset.drop("mf") +@register_filter +def daily_median(dataset, keep_missing=False): + """Calculate daily median""" + if keep_missing: + return dataset.resample(indexer={"time": "1D"}).median() + else: + return dataset.resample(indexer={"time": "1D"}).median().dropna(dim="time") + - dataarray_temp = mf_data_array[dict(time=ti)] +@register_filter +def six_hr_mean(dataset, keep_missing=False): + """Calculate six-hour median""" + if keep_missing: + return dataset.resample(indexer={"time": "6H"}).mean() + else: + return dataset.resample(indexer={"time": "6H"}).mean().dropna(dim="time") - mf_ds = xr.Dataset( - {"mf": (["time"], dataarray_temp)}, coords={"time": (dataarray_temp.coords["time"])} - ) - dataset_out = combine_datasets(dataset_temp, mf_ds, method=None) - return dataset_out - else: - return dataset[dict(time=ti)] +@register_filter +def daytime(dataset, site, keep_missing=False): + """Subset during daytime hours (11:00-15:00)""" + hours = local_solar_time(dataset) + ti = [i for i, h in enumerate(hours) if h >= 11 and h <= 15] - def pblh_min(dataset, pblh_threshold=200.0, keep_missing=False): - """ - Subset for times when the PBLH is greater than 200m. 
- """ - pblh_da = dataset.PBLH if "PBLH" in dataset.data_vars else dataset.atmosphere_boundary_layer_thickness + if keep_missing: + dataset_temp = dataset[dict(time=ti)] + dataset_out = dataset_temp.reindex_like(dataset) + return dataset_out + else: + return dataset[dict(time=ti)] - ti = [ - i for i, pblh in enumerate(pblh_da) if pblh > pblh_threshold - ] - if keep_missing is True: - mf_data_array = dataset.mf - dataset_temp = dataset.drop("mf") +@register_filter +def daytime9to5(dataset, site, keep_missing=False): + """Subset during daytime hours (9:00-17:00)""" + hours = local_solar_time(dataset) + ti = [i for i, h in enumerate(hours) if h >= 9 and h <= 17] - dataarray_temp = mf_data_array[dict(time=ti)] + if keep_missing: + dataset_temp = dataset[dict(time=ti)] + dataset_out = dataset_temp.reindex_like(dataset) + return dataset_out + else: + return dataset[dict(time=ti)] - mf_ds = xr.Dataset( - {"mf": (["time"], dataarray_temp)}, coords={"time": (dataarray_temp.coords["time"])} - ) - dataset_out = combine_datasets(dataset_temp, mf_ds, method=None) - return dataset_out - else: - return dataset[dict(time=ti)] - - def pblh_inlet_diff(dataset, diff_threshold=50.0, keep_missing=False): - """ - Subset for times when observations are taken at a height of less than 50 m below the PBLH. 
- """ - if "inlet_height_magl" in dataset.attrs: - inlet_height = float(dataset.inlet_height_magl) - elif "inlet" in dataset.attrs: - m = re.search(r"\d+", dataset.attrs["inlet"]) - if m is not None: - inlet_height = float(m.group(0)) - else: - raise ValueError("Could not find inlet height from `inlet_height_magl` or `inlet` dataset attributes.") +@register_filter +def nighttime(dataset, site, keep_missing=False): + """Subset during nighttime hours (23:00 - 03:00)""" + hours = local_solar_time(dataset) + ti = [i for i, h in enumerate(hours) if h >= 23 or h <= 3] - pblh_da = dataset.PBLH if "PBLH" in dataset.data_vars else dataset.atmosphere_boundary_layer_thickness + if keep_missing: + dataset_temp = dataset[dict(time=ti)] + dataset_out = dataset_temp.reindex_like(dataset) + return dataset_out + else: + return dataset[dict(time=ti)] - ti = [ - i for i, pblh in enumerate(pblh_da) if inlet_height < pblh - diff_threshold - ] - if keep_missing is True: - mf_data_array = dataset.mf - dataset_temp = dataset.drop("mf") +@register_filter +def noon(dataset, site, keep_missing=False): + """Select only 12pm data""" + hours = local_solar_time(dataset) + ti = [i for i, h in enumerate(hours) if h == 12] - dataarray_temp = mf_data_array[dict(time=ti)] + if keep_missing: + dataset_temp = dataset[dict(time=ti)] + dataset_out = dataset_temp.reindex_like(dataset) + return dataset_out + else: + return dataset[dict(time=ti)] - mf_ds = xr.Dataset( - {"mf": (["time"], dataarray_temp)}, coords={"time": (dataarray_temp.coords["time"])} - ) - dataset_out = combine_datasets(dataset_temp, mf_ds, method=None) - return dataset_out - else: - return dataset[dict(time=ti)] - - def pblh(dataset, keep_missing=False): - raise NotImplementedError("pblh is now called pblh_inlet_diff") - - filtering_functions = { - "daily_median": daily_median, - "daytime": daytime, - "daytime9to5": daytime9to5, - "nighttime": nighttime, - "noon": noon, - "local_influence": local_influence, - "six_hr_mean": six_hr_mean, 
- "pblh_inlet_diff": pblh_inlet_diff, - "pblh_min": pblh_min, - "pblh": pblh, - } +@register_filter +def local_influence(dataset, site, keep_missing=False): + """ + Subset for times when local influence is below threshold. + Local influence expressed as a fraction of the sum of entire footprint domain. + """ + if not dataset.filter_by_attrs(standard_name="local_ratio"): + lr = local_ratio(dataset) + else: + lr = dataset.local_ratio - # Apply filtering - for site in sites: - if filters[site] is not None: - for filt in filters[site]: - n_nofilter = datasets[site].time.values.shape[0] - if filt in ["daily_median", "six_hr_mean", "pblh_inlet_diff", "pblh_min", "pblh"]: - datasets[site] = filtering_functions[filt](datasets[site], keep_missing=keep_missing) - else: - datasets[site] = filtering_functions[filt](datasets[site], site, keep_missing=keep_missing) - n_filter = datasets[site].time.values.shape[0] - n_dropped = n_nofilter - n_filter - perc_dropped = np.round(n_dropped / n_nofilter * 100, 2) - print(f"{filt} filter removed {n_dropped} ({perc_dropped} %) obs at site {site}") + pc = 0.1 + ti = [i for i, local_ratio in enumerate(lr) if local_ratio <= pc] + if keep_missing is True: + mf_data_array = dataset.mf + dataset_temp = dataset.drop("mf") - return datasets + dataarray_temp = mf_data_array[dict(time=ti)] + + mf_ds = xr.Dataset( + {"mf": (["time"], dataarray_temp)}, coords={"time": (dataarray_temp.coords["time"])} + ) + + dataset_out = combine_datasets(dataset_temp, mf_ds, method=None) + return dataset_out + else: + return dataset[dict(time=ti)] + + +@register_filter +def pblh_min(dataset, pblh_threshold=200.0, keep_missing=False): + """ + Subset for times when the PBLH is greater than 200m. 
+ """ + pblh_da = dataset.PBLH if "PBLH" in dataset.data_vars else dataset.atmosphere_boundary_layer_thickness + + ti = [ + i for i, pblh in enumerate(pblh_da) if pblh > pblh_threshold + ] + + if keep_missing is True: + mf_data_array = dataset.mf + dataset_temp = dataset.drop("mf") + + dataarray_temp = mf_data_array[dict(time=ti)] + + mf_ds = xr.Dataset( + {"mf": (["time"], dataarray_temp)}, coords={"time": (dataarray_temp.coords["time"])} + ) + + dataset_out = combine_datasets(dataset_temp, mf_ds, method=None) + return dataset_out + else: + return dataset[dict(time=ti)] + + +@register_filter +def pblh_inlet_diff(dataset, diff_threshold=50.0, keep_missing=False): + """ + Subset for times when observations are taken at a height of less than 50 m below the PBLH. + """ + if "inlet_height_magl" in dataset.attrs: + inlet_height = float(dataset.inlet_height_magl) + elif "inlet" in dataset.attrs: + m = re.search(r"\d+", dataset.attrs["inlet"]) + if m is not None: + inlet_height = float(m.group(0)) + else: + raise ValueError("Could not find inlet height from `inlet_height_magl` or `inlet` dataset attributes.") + + pblh_da = dataset.PBLH if "PBLH" in dataset.data_vars else dataset.atmosphere_boundary_layer_thickness + + ti = [ + i for i, pblh in enumerate(pblh_da) if inlet_height < pblh - diff_threshold + ] + + if keep_missing is True: + mf_data_array = dataset.mf + dataset_temp = dataset.drop("mf") + + dataarray_temp = mf_data_array[dict(time=ti)] + + mf_ds = xr.Dataset( + {"mf": (["time"], dataarray_temp)}, coords={"time": (dataarray_temp.coords["time"])} + ) + + dataset_out = combine_datasets(dataset_temp, mf_ds, method=None) + return dataset_out + else: + return dataset[dict(time=ti)] + + +@register_filter +def pblh(dataset, keep_missing=False): + raise NotImplementedError("pblh is now called pblh_inlet_diff") From e4b1a2d2c4f452aba003502d1f57f130f3b1ec88 Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 11:30:50 +0100 Subject: [PATCH 02/13] Added 
missing import for filters.py --- openghg_inversions/filters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 8e4cea01..97fb174f 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -7,6 +7,7 @@ To see the available filters call `list_filters`. """ +import re from typing import Callable, Union import numpy as np From e76e42c0d88aeed9a1818cf0c07bbbb33c26e06f Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 11:31:09 +0100 Subject: [PATCH 03/13] Updated `filtering` function docstring --- openghg_inversions/filters.py | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 97fb174f..4577adce 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -72,25 +72,13 @@ def filtering(datasets_in: dict, filters: Union[dict[str, list[str]], list[str]] instance when applying the "daily_median" filter if you only wanted to look at daytime values the filters list should be ["daytime","daily_median"] - ----------------------------------- + Args: datasets_in: dictionary of datasets containing output from ModelScenario.footprints_merge(). filters: filters to apply to the datasets. Either a list of filters, which will be applied to every site, or a dictionary of lists of the form {: [filter1, filter2, ...]}, with specific filters to - be applied at each site. 
- - All options for filters are: - "daytime" : selects data between 1100 and 1500 local solar time - "daytime9to5" : selects data between 0900 and 1700 local solar time - "nighttime" : Only b/w 23:00 - 03:00 inclusive - "noon" : Only 12:00 fp and obs used - "daily_median" : calculates the daily median - "pblh_min" : Only keeps times when pblh is > threshold (default 200m) - "pblh_inlet_diff" : Only keeps times when inlet is at least a threshold (default 50m) below the pblh - "local_influence" : Only keep times when localness is low - "six_hr_mean" : - "local_lapse" : - keep_missing: if True, drop missing data ) + be applied at each site. Use the `list_filters` function to list available filters. + keep_missing: if True, drop missing data Returns: dict in same format as datasets_in, with filters applied @@ -110,27 +98,13 @@ def filtering(datasets_in: dict, filters: Union[dict[str, list[str]], list[str]] filters[site] = [filt] # Check that filters are defined for all sites + # TODO: just set filters for missing sites to None? 
tmp = [(site in filters) for site in sites] if not all(tmp): raise ValueError(f"Missing entry for sites {np.array(sites)[~np.array(tmp)]} in filters.") - datasets = datasets_in.copy() - - # filtering_functions = { - # "daily_median": daily_median, - # "daytime": daytime, - # "daytime9to5": daytime9to5, - # "nighttime": nighttime, - # "noon": noon, - # "local_influence": local_influence, - # "six_hr_mean": six_hr_mean, - # "pblh_inlet_diff": pblh_inlet_diff, - # "pblh_min": pblh_min, - # "pblh": pblh, - # } - # Apply filtering for site in sites: if filters[site] is not None: From 1a62c78f3fb548743da2e8469e435c4946ec8484 Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 11:40:30 +0100 Subject: [PATCH 04/13] Update docstrings in filters.py --- openghg_inversions/filters.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 4577adce..b89dd5de 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -1,9 +1,10 @@ """ Functions for filtering data. -All filters are accessed via the `filtering` function. +All filters are accessed and applied to data via the `filtering` function. New filters are registered using `@register_filter`. +A filter function should accept as arguments: an xr.Dataset, a bool called "keep_missing" To see the available filters call `list_filters`. """ @@ -39,6 +40,7 @@ def my_new_filter(data): return data >>> "my_new_filter" in filtering_functions True + """ filtering_functions[filt.__name__] = filt return filt @@ -122,10 +124,11 @@ def filtering(datasets_in: dict, filters: Union[dict[str, list[str]], list[str]] return datasets -@register_filter -def local_solar_time(dataset): +def _local_solar_time(dataset): """ Returns hour of day as a function of local solar time relative to the Greenwich Meridian. + + NOTE: This is not a filter; it is used by other filters. 
""" sitelon = dataset.release_lon.values[0] # convert lon to [-180,180], so time offset is negative west of 0 degrees From e51ccd59190fc27a4c5ec469ee0d9a0825989c5c Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 11:41:02 +0100 Subject: [PATCH 05/13] Moved local_ratio, renamed to _local_ratio This is a helper function used by the "local_influence" function, so we don't want it to appear to the user. --- openghg_inversions/filters.py | 51 ++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index b89dd5de..90a0ce57 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -140,30 +140,6 @@ def _local_solar_time(dataset): @register_filter -def local_ratio(dataset): - """ - Calculates the local ratio in the surrounding grid cells - """ - dlon = dataset.lon[1].values - dataset.lon[0].values - dlat = dataset.lat[1].values - dataset.lat[0].values - local_sum = np.zeros((len(dataset.mf))) - - for ti in range(len(dataset.mf)): - release_lon = dataset.release_lon[ti].values - release_lat = dataset.release_lat[ti].values - wh_rlon = np.where(abs(dataset.lon.values - release_lon) < dlon / 2.0) - wh_rlat = np.where(abs(dataset.lat.values - release_lat) < dlat / 2.0) - if np.any(wh_rlon[0]) and np.any(wh_rlat[0]): - local_sum[ti] = np.sum( - dataset.fp[ - wh_rlat[0][0] - 2 : wh_rlat[0][0] + 3, wh_rlon[0][0] - 2 : wh_rlon[0][0] + 3, ti - ].values - ) / np.sum(dataset.fp[:, :, ti].values) - else: - local_sum[ti] = 0.0 - - return local_sum - @register_filter def daily_median(dataset, keep_missing=False): @@ -239,6 +215,33 @@ def noon(dataset, site, keep_missing=False): return dataset[dict(time=ti)] +def _local_ratio(dataset): + """ + Calculates the local ratio in the surrounding grid cells. + + NOTE: This is not a filter; it is used by the `local_influence` filter. 
+ """ + dlon = dataset.lon[1].values - dataset.lon[0].values + dlat = dataset.lat[1].values - dataset.lat[0].values + local_sum = np.zeros((len(dataset.mf))) + + for ti in range(len(dataset.mf)): + release_lon = dataset.release_lon[ti].values + release_lat = dataset.release_lat[ti].values + wh_rlon = np.where(abs(dataset.lon.values - release_lon) < dlon / 2.0) + wh_rlat = np.where(abs(dataset.lat.values - release_lat) < dlat / 2.0) + if np.any(wh_rlon[0]) and np.any(wh_rlat[0]): + local_sum[ti] = np.sum( + dataset.fp[ + wh_rlat[0][0] - 2 : wh_rlat[0][0] + 3, wh_rlon[0][0] - 2 : wh_rlon[0][0] + 3, ti + ].values + ) / np.sum(dataset.fp[:, :, ti].values) + else: + local_sum[ti] = 0.0 + + return local_sum + + @register_filter def local_influence(dataset, site, keep_missing=False): """ From a3e6bfdc85a781aee55da1255e344b4e98053c21 Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 11:42:22 +0100 Subject: [PATCH 06/13] Updated filter function signature --- openghg_inversions/filters.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 90a0ce57..017a6265 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -140,9 +140,7 @@ def _local_solar_time(dataset): @register_filter - -@register_filter -def daily_median(dataset, keep_missing=False): +def daily_median(dataset, keep_missing: bool = False): """Calculate daily median""" if keep_missing: return dataset.resample(indexer={"time": "1D"}).median() @@ -151,7 +149,7 @@ def daily_median(dataset, keep_missing=False): @register_filter -def six_hr_mean(dataset, keep_missing=False): +def six_hr_mean(dataset, keep_missing: bool = False): """Calculate six-hour median""" if keep_missing: return dataset.resample(indexer={"time": "6H"}).mean() @@ -160,9 +158,9 @@ def six_hr_mean(dataset, keep_missing=False): @register_filter -def daytime(dataset, site, keep_missing=False): 
+def daytime(dataset, keep_missing: bool = False): """Subset during daytime hours (11:00-15:00)""" - hours = local_solar_time(dataset) + hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h >= 11 and h <= 15] if keep_missing: @@ -174,9 +172,9 @@ def daytime(dataset, site, keep_missing=False): @register_filter -def daytime9to5(dataset, site, keep_missing=False): +def daytime9to5(dataset, keep_missing: bool = False): """Subset during daytime hours (9:00-17:00)""" - hours = local_solar_time(dataset) + hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h >= 9 and h <= 17] if keep_missing: @@ -188,9 +186,9 @@ def daytime9to5(dataset, site, keep_missing=False): @register_filter -def nighttime(dataset, site, keep_missing=False): +def nighttime(dataset, keep_missing: bool = False): """Subset during nighttime hours (23:00 - 03:00)""" - hours = local_solar_time(dataset) + hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h >= 23 or h <= 3] if keep_missing: @@ -202,9 +200,9 @@ def nighttime(dataset, site, keep_missing=False): @register_filter -def noon(dataset, site, keep_missing=False): +def noon(dataset, keep_missing: bool = False): """Select only 12pm data""" - hours = local_solar_time(dataset) + hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h == 12] if keep_missing: @@ -243,13 +241,13 @@ def _local_ratio(dataset): @register_filter -def local_influence(dataset, site, keep_missing=False): +def local_influence(dataset, keep_missing: bool = False): """ Subset for times when local influence is below threshold. Local influence expressed as a fraction of the sum of entire footprint domain. 
""" if not dataset.filter_by_attrs(standard_name="local_ratio"): - lr = local_ratio(dataset) + lr = _local_ratio(dataset) else: lr = dataset.local_ratio @@ -272,7 +270,7 @@ def local_influence(dataset, site, keep_missing=False): @register_filter -def pblh_min(dataset, pblh_threshold=200.0, keep_missing=False): +def pblh_min(dataset, pblh_threshold=200.0, keep_missing: bool = False): """ Subset for times when the PBLH is greater than 200m. """ @@ -299,7 +297,7 @@ def pblh_min(dataset, pblh_threshold=200.0, keep_missing=False): @register_filter -def pblh_inlet_diff(dataset, diff_threshold=50.0, keep_missing=False): +def pblh_inlet_diff(dataset, diff_threshold=50.0, keep_missing: bool = False): """ Subset for times when observations are taken at a height of less than 50 m below the PBLH. """ @@ -335,5 +333,6 @@ def pblh_inlet_diff(dataset, diff_threshold=50.0, keep_missing=False): @register_filter -def pblh(dataset, keep_missing=False): +def pblh(dataset, keep_missing: bool = False): + """Deprecated: pblh is now called pblh_inlet_diff""" raise NotImplementedError("pblh is now called pblh_inlet_diff") From 8338553b9759749cf2965fcd1e363b326c54f49e Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 12:00:47 +0100 Subject: [PATCH 07/13] Updated _local_solar_time docs/output type --- openghg_inversions/filters.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 017a6265..2e922aaa 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -124,10 +124,12 @@ def filtering(datasets_in: dict, filters: Union[dict[str, list[str]], list[str]] return datasets -def _local_solar_time(dataset): +def _local_solar_time(dataset: xr.Dataset) -> list[int]: """ Returns hour of day as a function of local solar time relative to the Greenwich Meridian. + This function also modifies `dataset` by changing the time coordinates. 
+ NOTE: This is not a filter; it is used by other filters. """ sitelon = dataset.release_lon.values[0] @@ -136,11 +138,11 @@ def _local_solar_time(dataset): sitelon = sitelon - 360.0 dataset["time"] = dataset.time + pd.Timedelta(minutes=float(24 * 60 * sitelon / 360.0)) hours = dataset.time.to_pandas().index.hour - return hours + return list(hours) @register_filter -def daily_median(dataset, keep_missing: bool = False): +def daily_median(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: """Calculate daily median""" if keep_missing: return dataset.resample(indexer={"time": "1D"}).median() From d46ad6ebc4f557ce05ddc1d946fdfcf823efcde0 Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 12:01:13 +0100 Subject: [PATCH 08/13] Updated filter function typing --- openghg_inversions/filters.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 2e922aaa..ece9d758 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -151,7 +151,7 @@ def daily_median(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: @register_filter -def six_hr_mean(dataset, keep_missing: bool = False): +def six_hr_mean(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: """Calculate six-hour median""" if keep_missing: return dataset.resample(indexer={"time": "6H"}).mean() @@ -160,7 +160,7 @@ def six_hr_mean(dataset, keep_missing: bool = False): @register_filter -def daytime(dataset, keep_missing: bool = False): +def daytime(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: """Subset during daytime hours (11:00-15:00)""" hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h >= 11 and h <= 15] @@ -174,7 +174,7 @@ def daytime(dataset, keep_missing: bool = False): @register_filter -def daytime9to5(dataset, keep_missing: bool = False): +def daytime9to5(dataset: xr.Dataset, keep_missing: 
bool = False) -> xr.Dataset: """Subset during daytime hours (9:00-17:00)""" hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h >= 9 and h <= 17] @@ -188,7 +188,7 @@ def daytime9to5(dataset, keep_missing: bool = False): @register_filter -def nighttime(dataset, keep_missing: bool = False): +def nighttime(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: """Subset during nighttime hours (23:00 - 03:00)""" hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h >= 23 or h <= 3] @@ -202,7 +202,7 @@ def nighttime(dataset, keep_missing: bool = False): @register_filter -def noon(dataset, keep_missing: bool = False): +def noon(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: """Select only 12pm data""" hours = _local_solar_time(dataset) ti = [i for i, h in enumerate(hours) if h == 12] @@ -215,7 +215,7 @@ def noon(dataset, keep_missing: bool = False): return dataset[dict(time=ti)] -def _local_ratio(dataset): +def _local_ratio(dataset: xr.Dataset) -> xr.Dataset: """ Calculates the local ratio in the surrounding grid cells. @@ -243,7 +243,7 @@ def _local_ratio(dataset): @register_filter -def local_influence(dataset, keep_missing: bool = False): +def local_influence(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: """ Subset for times when local influence is below threshold. Local influence expressed as a fraction of the sum of entire footprint domain. @@ -272,7 +272,7 @@ def local_influence(dataset, keep_missing: bool = False): @register_filter -def pblh_min(dataset, pblh_threshold=200.0, keep_missing: bool = False): +def pblh_min(dataset: xr.Dataset, pblh_threshold=200.0, keep_missing: bool = False) -> xr.Dataset: """ Subset for times when the PBLH is greater than 200m. 
""" @@ -299,7 +299,7 @@ def pblh_min(dataset, pblh_threshold=200.0, keep_missing: bool = False): @register_filter -def pblh_inlet_diff(dataset, diff_threshold=50.0, keep_missing: bool = False): +def pblh_inlet_diff(dataset: xr.Dataset, diff_threshold=50.0, keep_missing: bool = False) -> xr.Dataset: """ Subset for times when observations are taken at a height of less than 50 m below the PBLH. """ @@ -335,6 +335,6 @@ def pblh_inlet_diff(dataset, diff_threshold=50.0, keep_missing: bool = False): @register_filter -def pblh(dataset, keep_missing: bool = False): +def pblh(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: """Deprecated: pblh is now called pblh_inlet_diff""" raise NotImplementedError("pblh is now called pblh_inlet_diff") From ed070a115d2c3f447976148aad640b3d4cd3b048 Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 12:04:10 +0100 Subject: [PATCH 09/13] Fixed _local_ratio return type --- openghg_inversions/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index ece9d758..a1969631 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -215,7 +215,7 @@ def noon(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset: return dataset[dict(time=ti)] -def _local_ratio(dataset: xr.Dataset) -> xr.Dataset: +def _local_ratio(dataset: xr.Dataset) -> np.ndarray: """ Calculates the local ratio in the surrounding grid cells. 
From 4520099520d6ed0636155d2a545e9bf66a75e80d Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 12:04:36 +0100 Subject: [PATCH 10/13] Ran black on filters.py --- openghg_inversions/filters.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index a1969631..d6d1be56 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -8,6 +8,7 @@ To see the available filters call `list_filters`. """ + import re from typing import Callable, Union @@ -54,14 +55,16 @@ def list_filters() -> None: for k, v in filtering_functions.items(): # print function name and first line of docstring try: - first_line_of_docstring = v.__doc__.strip().split('\n')[0] + first_line_of_docstring = v.__doc__.strip().split("\n")[0] except AttributeError: first_line_of_docstring = "No docstring" print(f"\t{k:{spacing}}{first_line_of_docstring}") -def filtering(datasets_in: dict, filters: Union[dict[str, list[str]], list[str]], keep_missing: bool = False) -> dict: +def filtering( + datasets_in: dict, filters: Union[dict[str, list[str]], list[str]], keep_missing: bool = False +) -> dict: """ Applies time filtering to entire dataset. Filters supplied in a list and then applied in order. 
@@ -115,7 +118,9 @@ def filtering(datasets_in: dict, filters: Union[dict[str, list[str]], list[str]] if filt in ["daily_median", "six_hr_mean", "pblh_inlet_diff", "pblh_min", "pblh"]: datasets[site] = filtering_functions[filt](datasets[site], keep_missing=keep_missing) else: - datasets[site] = filtering_functions[filt](datasets[site], site, keep_missing=keep_missing) + datasets[site] = filtering_functions[filt]( + datasets[site], site, keep_missing=keep_missing + ) n_filter = datasets[site].time.values.shape[0] n_dropped = n_nofilter - n_filter perc_dropped = np.round(n_dropped / n_nofilter * 100, 2) @@ -278,9 +283,7 @@ def pblh_min(dataset: xr.Dataset, pblh_threshold=200.0, keep_missing: bool = Fal """ pblh_da = dataset.PBLH if "PBLH" in dataset.data_vars else dataset.atmosphere_boundary_layer_thickness - ti = [ - i for i, pblh in enumerate(pblh_da) if pblh > pblh_threshold - ] + ti = [i for i, pblh in enumerate(pblh_da) if pblh > pblh_threshold] if keep_missing is True: mf_data_array = dataset.mf @@ -310,13 +313,13 @@ def pblh_inlet_diff(dataset: xr.Dataset, diff_threshold=50.0, keep_missing: bool if m is not None: inlet_height = float(m.group(0)) else: - raise ValueError("Could not find inlet height from `inlet_height_magl` or `inlet` dataset attributes.") + raise ValueError( + "Could not find inlet height from `inlet_height_magl` or `inlet` dataset attributes." 
+ ) pblh_da = dataset.PBLH if "PBLH" in dataset.data_vars else dataset.atmosphere_boundary_layer_thickness - ti = [ - i for i, pblh in enumerate(pblh_da) if inlet_height < pblh - diff_threshold - ] + ti = [i for i, pblh in enumerate(pblh_da) if inlet_height < pblh - diff_threshold] if keep_missing is True: mf_data_array = dataset.mf From 70c62cbfee7a20c5eeb4b166818b74c124adeff2 Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 12:36:37 +0100 Subject: [PATCH 11/13] Updated filter typing/docstrings --- openghg_inversions/filters.py | 112 ++++++++++++++++++++++++++++++---- 1 file changed, 101 insertions(+), 11 deletions(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index d6d1be56..912aca6b 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -66,9 +66,14 @@ def filtering( datasets_in: dict, filters: Union[dict[str, list[str]], list[str]], keep_missing: bool = False ) -> dict: """ - Applies time filtering to entire dataset. - Filters supplied in a list and then applied in order. - For example if you wanted a daily, daytime average, you could do this: + Applies time filtering to all datasets in `datasets_in`. + + If `filters` is a list, the same filters are applied to all sites. If `filters` is a dict + with site codes as keys, then the filters applied to each site depend on the list supplied + for that site. + + In any case, filters supplied in a list are applied in order. + For example, if you wanted a daily, daytime average, you could do this: datasets_dictionary = filtering(datasets_dictionary, ["daytime", "daily_median"]) @@ -136,6 +141,14 @@ def _local_solar_time(dataset: xr.Dataset) -> list[int]: This function also modifies `dataset` by changing the time coordinates. NOTE: This is not a filter; it is used by other filters. + TODO: do we want this to modify `dataset`? currently it changes the time coordinate + TODO: return np.ndarray and use vectorised filtering? 
+
+    Args:
+        dataset: dataset to extract hours of the day from; this dataset is modified in place
+
+    Returns:
+        list of hours of the day for each time value in dataset.time
     """
     sitelon = dataset.release_lon.values[0]
     # convert lon to [-180,180], so time offset is negative west of 0 degrees
@@ -148,7 +161,15 @@ def _local_solar_time(dataset: xr.Dataset) -> list[int]:
 
 @register_filter
 def daily_median(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
-    """Calculate daily median"""
+    """Resample data to daily frequency and use daily median values.
+
+    Args:
+        dataset: dataset to filter
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
+    """
     if keep_missing:
         return dataset.resample(indexer={"time": "1D"}).median()
     else:
@@ -157,7 +178,16 @@ def daily_median(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
 
 @register_filter
 def six_hr_mean(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
-    """Calculate six-hour median"""
+    """Resample data to 6h frequency and use 6h mean values.
+
+    Args:
+        dataset: dataset to filter
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
+
+    """
     if keep_missing:
         return dataset.resample(indexer={"time": "6H"}).mean()
     else:
@@ -166,7 +196,15 @@ def six_hr_mean(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
 
 @register_filter
 def daytime(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
-    """Subset during daytime hours (11:00-15:00)"""
+    """Subset during daytime hours (11:00-15:00)
+
+    Args:
+        dataset: dataset to filter
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
+    """
     hours = _local_solar_time(dataset)
     ti = [i for i, h in enumerate(hours) if h >= 11 and h <= 15]
 
@@ -180,7 +218,15 @@ def daytime(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
 
 @register_filter
 def daytime9to5(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
-    """Subset during daytime hours (9:00-17:00)"""
+    """Subset during daytime hours (9:00-17:00)
+
+    Args:
+        dataset: dataset to filter
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
+    """
     hours = _local_solar_time(dataset)
     ti = [i for i, h in enumerate(hours) if h >= 9 and h <= 17]
 
@@ -194,7 +240,15 @@ def daytime9to5(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
 
 @register_filter
 def nighttime(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
-    """Subset during nighttime hours (23:00 - 03:00)"""
+    """Subset during nighttime hours (23:00 - 03:00)
+
+    Args:
+        dataset: dataset to filter
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
+    """
     hours = _local_solar_time(dataset)
     ti = [i for i, h in enumerate(hours) if h >= 23 or h <= 3]
 
@@ -208,7 +262,15 @@ def nighttime(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
 
 @register_filter
 def noon(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
-    """Select only 12pm data"""
+    """Select only 12pm data
+
+    Args:
+        dataset: dataset to filter
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
+    """
     hours = _local_solar_time(dataset)
     ti = [i for i, h in enumerate(hours) if h == 12]
 
@@ -250,8 +312,16 @@ def _local_ratio(dataset: xr.Dataset) -> np.ndarray:
 @register_filter
 def local_influence(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Dataset:
     """
-    Subset for times when local influence is below threshold.
+    Subset for times when "local influence" is below threshold.
+
     Local influence expressed as a fraction of the sum of entire footprint domain.
+
+    Args:
+        dataset: dataset to filter
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
     """
     if not dataset.filter_by_attrs(standard_name="local_ratio"):
         lr = _local_ratio(dataset)
@@ -277,9 +347,19 @@ def local_influence(dataset: xr.Dataset, keep_missing: bool = False) -> xr.Datas
 
 @register_filter
-def pblh_min(dataset: xr.Dataset, pblh_threshold=200.0, keep_missing: bool = False) -> xr.Dataset:
+def pblh_min(dataset: xr.Dataset, pblh_threshold: float = 200.0, keep_missing: bool = False) -> xr.Dataset:
     """
     Subset for times when the PBLH is greater than 200m.
+
+    Args:
+        dataset: dataset to filter
+        pblh_threshold: filter will discard times where PBLH/atmosphere boundary layer thickness is below pblh_threshold
+        keep_missing: if True, retain times removed by the filter as missing (NaN) values
+
+    Returns:
+        filtered dataset
+
+    TODO: need way to pass pblh_threshold to filter
     """
 
     pblh_da = dataset.PBLH if "PBLH" in dataset.data_vars else dataset.atmosphere_boundary_layer_thickness
@@ -305,6 +385,16 @@ def pblh_min(dataset: xr.Dataset, pblh_threshold=200.0, keep_missing: bool = Fal
 def pblh_inlet_diff(dataset: xr.Dataset, diff_threshold=50.0, keep_missing: bool = False) -> xr.Dataset:
     """
     Subset for times when observations are taken at a height of less than 50 m below the PBLH.
+ + Args: + dataset: dataset to filter + diff_threshold: filter will discard times where obs. are taken at a height of less than diff_threshold below PBLH + keep_missing: if True, drop time points removed by filter + + Returns: + filtered dataset + + TODO: need way to pass diff_threshold to filter """ if "inlet_height_magl" in dataset.attrs: inlet_height = float(dataset.inlet_height_magl) From 2a058d113896033fbe07c2ae2e3fe1ee1caacc7e Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 12:37:58 +0100 Subject: [PATCH 12/13] fixup! Updated filter typing/docstrings --- openghg_inversions/filters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openghg_inversions/filters.py b/openghg_inversions/filters.py index 912aca6b..ded84a5f 100644 --- a/openghg_inversions/filters.py +++ b/openghg_inversions/filters.py @@ -382,7 +382,7 @@ def pblh_min(dataset: xr.Dataset, pblh_threshold: float = 200.0, keep_missing: b @register_filter -def pblh_inlet_diff(dataset: xr.Dataset, diff_threshold=50.0, keep_missing: bool = False) -> xr.Dataset: +def pblh_inlet_diff(dataset: xr.Dataset, diff_threshold: float = 50.0, keep_missing: bool = False) -> xr.Dataset: """ Subset for times when observations are taken at a height of less than 50 m below the PBLH. From e9a98407e216d80fe7546299807f7354dcc2b73d Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Fri, 5 Jul 2024 13:58:05 +0100 Subject: [PATCH 13/13] Updated change log --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 06437be0..71494eef 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ # Version 0.2.0 +- Refactored `filters.py` so filter functions aren't nested inside `filtering`. Added code to keep track of filter functions. Updated docstrings. 
[#PR 163](https://github.com/openghg/openghg_inversions/pull/163) + - Moved filters from `utils.py` to new submodule `filters.py` [#PR 159](https://github.com/openghg/openghg_inversions/pull/159) - Removed `site_info.json` and `species_info.json` and replaced with calls to functions in `openghg.util`, which pull the same info from `openghg_defs`. [#PR 152](https://github.com/openghg/openghg_inversions/pull/152)