diff --git a/ioos_qc/axds.py b/ioos_qc/axds.py index 79a8e8c..dc5045d 100644 --- a/ioos_qc/axds.py +++ b/ioos_qc/axds.py @@ -67,11 +67,7 @@ def valid_range_test( # This is required because we don't want to restrict a user from using a pd.Series # directly with this function. If the data was coming from a Store, it would # always be a numpy array. - elif ( - dtype is None - and hasattr(inp, "values") - and hasattr(inp.values, "dtype") - ): + elif dtype is None and hasattr(inp, "values") and hasattr(inp.values, "dtype"): dtype = inp.values.dtype # Save original shape diff --git a/ioos_qc/config.py b/ioos_qc/config.py index b4ce2b1..0584e22 100644 --- a/ioos_qc/config.py +++ b/ioos_qc/config.py @@ -158,13 +158,9 @@ def run(self, **passedkwargs): # Get the arguments that the test functions support sig = signature(self.func) valid_keywords = [ - p.name - for p in sig.parameters.values() - if p.kind == p.POSITIONAL_OR_KEYWORD + p.name for p in sig.parameters.values() if p.kind == p.POSITIONAL_OR_KEYWORD ] - testkwargs = { - k: v for k, v in testkwargs.items() if k in valid_keywords - } + testkwargs = {k: v for k, v in testkwargs.items() if k in valid_keywords} try: results.append( CallResult( @@ -320,11 +316,7 @@ def calls(self): @property def aggregate_calls(self): - return [ - c - for c in self._calls - if hasattr(c.func, "aggregate") and c.func.aggregate is True - ] + return [c for c in self._calls if hasattr(c.func, "aggregate") and c.func.aggregate is True] def has(self, stream_id: str, method: Union[callable, str]): if isinstance(method, str): @@ -422,10 +414,7 @@ def __init__(self, source: ConfigTypes) -> None: elif self.config["region"] and "features" in self.config["region"]: # Feature based GeoJSON self.region = GeometryCollection( - [ - shape(feature["geometry"]) - for feature in self.config["region"]["features"] - ], + [shape(feature["geometry"]) for feature in self.config["region"]["features"]], ) elif self.config["region"] and "geometry" in self.config["region"]: # Geometry based GeoJSON diff --git a/ioos_qc/config_creator/config_creator.py b/ioos_qc/config_creator/config_creator.py index 347c495..b328389 100644 --- a/ioos_qc/config_creator/config_creator.py +++ b/ioos_qc/config_creator/config_creator.py @@ -304,10 +304,7 @@ def create_config(self, variable_config): def _load_datasets(self): """Load datasets.""" L.debug(f"Loading {len(self.config)} datasets...") - return { - name: xr.load_dataset(self.config[name]["file_path"]) - for name in self.config - } + return {name: xr.load_dataset(self.config[name]["file_path"]) for name in self.config} def _determine_dataset_years(self): """Determine year used in datasets, return as dict {dataset_name, year}. diff --git a/ioos_qc/config_creator/get_assets.py b/ioos_qc/config_creator/get_assets.py index 42da782..a6ecccf 100644 --- a/ioos_qc/config_creator/get_assets.py +++ b/ioos_qc/config_creator/get_assets.py @@ -102,9 +102,7 @@ def ocean_atlas_variable_enhance(output_dir, month) -> None: def ocean_atlas_merge_time(output_dir) -> None: variable_merged_files = output_dir.glob("ocean_atlas_??.nc") - variable_merged_files = [ - str(merged_file) for merged_file in list(variable_merged_files) - ] + variable_merged_files = [str(merged_file) for merged_file in list(variable_merged_files)] variable_merged_files.sort() output_file = output_dir.parent / "ocean_atlas.nc" diff --git a/ioos_qc/qartod.py b/ioos_qc/qartod.py index c282c43..d69772c 100644 --- a/ioos_qc/qartod.py +++ b/ioos_qc/qartod.py @@ -174,12 +174,9 @@ def location_test( # Ignore warnings when comparing NaN values even though they are masked # https://github.com/numpy/numpy/blob/master/doc/release/1.8.0-notes.rst#runtime-warnings-when-comparing-nan-numbers with np.errstate(invalid="ignore"): - flag_arr[ - (lon < bbox.minx) - | (lat < bbox.miny) - | (lon > bbox.maxx) - | (lat > bbox.maxy) - ] = QartodFlags.FAIL + flag_arr[(lon < bbox.minx) | (lat < bbox.miny) | (lon > bbox.maxx) | (lat > bbox.maxy)] = ( + QartodFlags.FAIL + ) return flag_arr.reshape(original_shape) @@ -238,9 +235,7 @@ def gross_range_test( raise ValueError(msg) # Flag suspect outside of user span with np.errstate(invalid="ignore"): - flag_arr[(inp < uspan.minv) | (inp > uspan.maxv)] = ( - QartodFlags.SUSPECT - ) + flag_arr[(inp < uspan.minv) | (inp > uspan.maxv)] = QartodFlags.SUSPECT # Flag suspect outside of sensor span with np.errstate(invalid="ignore"): @@ -421,11 +416,7 @@ def check(self, tinp, inp, zinp): # Only test non-masked values between the min and max. # Ignore warnings about comparing masked values with np.errstate(invalid="ignore"): - z_idx = ( - (~zinp.mask) - & (zinp >= m.zspan.minv) - & (zinp <= m.zspan.maxv) - ) + z_idx = (~zinp.mask) & (zinp >= m.zspan.minv) & (zinp <= m.zspan.maxv) else: # If there is no z data in the config, don't try to filter by depth! # Set z_idx to all True to prevent filtering @@ -451,12 +442,8 @@ def check(self, tinp, inp, zinp): with np.errstate(invalid="ignore"): flag_arr[(values_idx & fail_idx)] = QartodFlags.FAIL - flag_arr[(values_idx & ~fail_idx & suspect_idx)] = ( - QartodFlags.SUSPECT - ) - flag_arr[(values_idx & ~fail_idx & ~suspect_idx)] = ( - QartodFlags.GOOD - ) + flag_arr[(values_idx & ~fail_idx & suspect_idx)] = QartodFlags.SUSPECT + flag_arr[(values_idx & ~fail_idx & ~suspect_idx)] = QartodFlags.GOOD return flag_arr @@ -752,9 +739,7 @@ def flat_line_test( tinp = mapdates(tinp).flatten() # The thresholds are in seconds so we round make sure the interval is also in seconds - time_interval = ( - np.median(np.diff(tinp)).astype("timedelta64[s]").astype(float) - ) + time_interval = np.median(np.diff(tinp)).astype("timedelta64[s]").astype(float) def rolling_window(a, window): """https://rigtorp.se/2011/01/01/rolling-statistics-numpy.html.""" @@ -880,9 +865,7 @@ def window_func(w): if min_obs is not None: min_periods = min_obs elif min_period is not None: - time_interval = ( - np.median(np.diff(tinp)).astype("timedelta64[s]").astype(float) - ) + time_interval = np.median(np.diff(tinp)).astype("timedelta64[s]").astype(float) min_periods = (min_period / time_interval).astype(int) else: min_periods = None diff --git a/ioos_qc/results.py b/ioos_qc/results.py index 2ca0a86..f944d42 100644 --- a/ioos_qc/results.py +++ b/ioos_qc/results.py @@ -50,7 +50,9 @@ class CollectedResult: lon: np.ndarray = None def __repr__(self) -> str: - return f"" + return ( + f"" + ) def function_name(self) -> str: return self.function.__name__ @@ -178,8 +180,6 @@ def collect_results_dict(results): collected[r.stream_id][testpackage][testname] = np.copy( flag_arr, ) - collected[r.stream_id][testpackage][testname][r.subset_indexes] = ( - testresults - ) + collected[r.stream_id][testpackage][testname][r.subset_indexes] = testresults return collected diff --git a/ioos_qc/stores.py b/ioos_qc/stores.py index 9574f69..4de80fe 100644 --- a/ioos_qc/stores.py +++ b/ioos_qc/stores.py @@ -132,9 +132,7 @@ def save( # Exclusion list, skip everything defined if exclude is not None and ( - cr.function in exclude - or cr.stream_id in exclude - or cr.test in cr.test in include + cr.function in exclude or cr.stream_id in exclude or cr.test in cr.test in include ): continue @@ -223,9 +221,7 @@ def save( # Get flags from module attribute called FLAGS flags = inspect.getmodule(cr.function).FLAGS - varflagnames = [ - d for d in flags.__dict__ if not d.startswith("__") - ] + varflagnames = [d for d in flags.__dict__ if not d.startswith("__")] varflagvalues = [getattr(flags, d) for d in varflagnames] # Set QC variable attributes @@ -247,11 +243,7 @@ def save( if len(config.contexts) == 1: calls = config.calls_by_stream_id(cr.stream_id) - calls = [ - c - for c in calls - if c.module == cr.package and c.method == cr.test - ] + calls = [c for c in calls if c.module == cr.package and c.method == cr.test] if not calls: # No stream_id found! continue @@ -372,12 +364,8 @@ def save(self, path_or_ncd, config, results): # Get flags from module attribute called FLAGS flags = testpackage.FLAGS - varflagnames = [ - d for d in flags.__dict__ if not d.startswith("__") - ] - varflagvalues = [ - getattr(flags, d) for d in varflagnames - ] + varflagnames = [d for d in flags.__dict__ if not d.startswith("__")] + varflagvalues = [getattr(flags, d) for d in varflagnames] if qcvarname not in ncd.variables: v = ncd.createVariable( @@ -412,11 +400,7 @@ def save(self, path_or_ncd, config, results): v.setncattr("ioos_qc_target", vname) # If there is only one context we can write variable specific configs if len(config.contexts) == 1: - varconfig = ( - config.contexts[0] - .streams[vname] - .config[modu][testname] - ) + varconfig = config.contexts[0].streams[vname].config[modu][testname] varconfig = json.dumps( varconfig, cls=GeoNumpyDateEncoder, diff --git a/ioos_qc/streams.py b/ioos_qc/streams.py index 6593897..7d04432 100644 --- a/ioos_qc/streams.py +++ b/ioos_qc/streams.py @@ -1,7 +1,5 @@ -#!/usr/bin/env python import logging -from collections import OrderedDict as odict -from collections import defaultdict +from collections import OrderedDict, defaultdict import numpy as np import pandas as pd @@ -20,16 +18,19 @@ class BaseStream: - """Each stream should define how to return a list of datastreams along with their time and depth association. - Each of these streams will passed through quality control configurations and returned back to it. Each stream - needs to also define what to do with the resulting results (how to store them.). + """Each stream should define how to return a list of datastreams along with + their time and depth association. Each of these streams will passed through + quality control configurations and returned back to it. Each stream needs + to also define what to do with the resulting results (how to store them.). """ def __init__(self, *args, **kwargs) -> None: """df: the dataframe.""" def time(self) -> None: - """Return the time array from the source dataset. This is useful when plotting QC results.""" + """Return the time array from the source dataset. + This is useful when plotting QC results. + """ def data(self, stream_id) -> None: """Return the data array from the source dataset based on stream_id. This is useful when @@ -43,7 +44,7 @@ def run(self, config: Config) -> None: class PandasStream: - def __init__( + def __init__( # noqa: PLR0913 self, df, time=None, @@ -57,7 +58,8 @@ def __init__( z: the column to use for depth lat: the column to use for latitude, this or geom is required if using regional subsets lon: the column to use for longitude, this or geom is required if using regional subsets - geom: the column containing the geometry, this or lat and lon are required if using regional subsets. + geom: the column containing the geometry, this or lat and lon are + required if using regional subsets. """ self.df = df self.time_column = time or "time" @@ -81,7 +83,7 @@ def time(self): def data(self, stream_id): return self.df[stream_id] - def run(self, config: Config): + def run(self, config: Config): # noqa: C901, PLR0912 for context, calls in config.contexts.items(): # Subset first by the stream id in each call stream_ids = [] @@ -95,24 +97,17 @@ def run(self, config: Config): subset = self.df.loc[:, list(set(stream_ids + self.axis_columns))] if context.region: - # TODO: yeah this does nothing right now - # Figure out if this is a geopandas DataFrame already. If not, create one using - # the specified lat_column and lon_column attributes in the constructor - # if self.geom_column not in subset: - # subset = gpd.DataFrame(subset) - # subset[self.geom_column] = 'wut' - # subset = subset[[ subset[self.geom_column].within(context.region) ]] + # TODO: does nothing right now + # Figure out if this is a geopandas DataFrame already. + # If not, create one using the specified lat_column and + # lon_column attributes in the constructor. pass - if ( - context.window.starting is not None - or context.window.ending is not None - ): + if context.window.starting is not None or context.window.ending is not None: if self.time_column in self.axis_columns: if context.window.starting: subset = subset.loc[ - subset[self.time_column] - >= context.window.starting, + subset[self.time_column] >= context.window.starting, :, ] if context.window.ending: @@ -125,8 +120,9 @@ def run(self, config: Config): f"Skipping window subset, {self.time_column} not in columns", ) - # This is a boolean array of what was subset and tested based on the initial data feed - # Take the index of the subset and set those to true + # This is a boolean array of what was subset and tested based on + # the initial data feed. Take the index of the subset and set + # those to true. subset_indexes = pd.Series(0, index=self.df.index, dtype="bool") subset_indexes.iloc[subset.index] = True @@ -144,10 +140,6 @@ def run(self, config: Config): # Perform the "run" function on each Call for call in calls: - # if call.is_aggregate: - # # We compute aggregates using the results - # continue - if call.stream_id not in subset: L.warning( f"{call.stream_id} not a column in the input dataframe, skipping", @@ -167,29 +159,29 @@ def run(self, config: Config): yield ContextResult( results=run_result, stream_id=call.stream_id, - subset_indexes=subset_indexes.values, - data=data_input.values, + subset_indexes=subset_indexes.to_numpy(), + data=data_input.to_numpy(), tinp=subset_kwargs.get( "tinp", pd.Series(dtype="datetime64[ns]"), - ).values, + ).to_numpy(), zinp=subset_kwargs.get( "zinp", pd.Series(dtype="float64"), - ).values, + ).to_numpy(), lat=subset_kwargs.get( "lat", pd.Series(dtype="float64"), - ).values, + ).to_numpy(), lon=subset_kwargs.get( "lon", pd.Series(dtype="float64"), - ).values, + ).to_numpy(), ) class NumpyStream: - def __init__( + def __init__( # noqa: PLR0913 self, inp=None, time=None, @@ -206,10 +198,9 @@ def __init__( geom: numpy array of geometry, this or lat and lon are required if using regional subsets. """ self.inp = inp - try: - assert time is not None + if time is not None: self.tinp = pd.DatetimeIndex(mapdates(time)) - except BaseException: + else: self.tinp = time self.zinp = z self.lat = lat @@ -219,10 +210,10 @@ def __init__( def time(self): return self.tinp - def data(self, stream_id=None): + def data(self): return self.inp - def run(self, config: Config): + def run(self, config: Config): # noqa: C901, PLR0912 for context, calls in config.contexts.items(): # This is a boolean array of what was subset and tested based on the initial data feed # Take the index of the subset and set those to true @@ -238,19 +229,12 @@ def run(self, config: Config): 'Skipping region subset, "lat" and "lon" must be passed into NumpySource', ) - if ( - context.window.starting is not None - or context.window.ending is not None - ): + if context.window.starting is not None or context.window.ending is not None: if self.tinp is not None: if context.window.starting: - subset_indexes = (subset_indexes) & ( - self.tinp >= context.window.starting - ) + subset_indexes = (subset_indexes) & (self.tinp >= context.window.starting) if context.window.ending: - subset_indexes = (subset_indexes) & ( - self.tinp < context.window.ending - ) + subset_indexes = (subset_indexes) & (self.tinp < context.window.ending) else: L.warning( 'Skipping window subset, "time" array must be passed into "run"', @@ -314,31 +298,30 @@ def run(self, config: Config): tinp=subset_kwargs.get( "tinp", pd.Series(dtype="datetime64[ns]"), - ).values, + ).to_numpy(), zinp=subset_kwargs.get( "zinp", - pd.Series(dtype="float64").values, + pd.Series(dtype="float64").to_numpy(), ), lat=subset_kwargs.get( "lat", - pd.Series(dtype="float64").values, + pd.Series(dtype="float64").to_numpy(), ), lon=subset_kwargs.get( "lon", - pd.Series(dtype="float64").values, + pd.Series(dtype="float64").to_numpy(), ), ) class NetcdfStream: - def __init__( + def __init__( # noqa: PLR0913 self, path_or_ncd, time=None, z=None, lat=None, lon=None, - geom=None, ) -> None: self.path_or_ncd = path_or_ncd @@ -371,7 +354,7 @@ def _open(self): return do_close, ds - def run(self, config: Config): + def run(self, config: Config): # noqa: C901 do_close, ds = self._open() stream_ids = [] @@ -388,19 +371,19 @@ def run(self, config: Config): varkwargs = {"inp": {}} if self.time_var in ds.variables: varkwargs["time"] = pd.DatetimeIndex( - mapdates(ds.variables[self.time_var].values), + mapdates(ds.variables[self.time_var].to_numpy()), ) if self.z_var in ds.variables: - varkwargs["z"] = ds.variables[self.z_var].values + varkwargs["z"] = ds.variables[self.z_var].to_numpy() if self.lat_var in ds.variables: - varkwargs["lat"] = ds.variables[self.lat_var].values + varkwargs["lat"] = ds.variables[self.lat_var].to_numpy() if self.lon_var in ds.variables: - varkwargs["lon"] = ds.variables[self.lon_var].values + varkwargs["lon"] = ds.variables[self.lon_var].to_numpy() # Now populate the `inp` dict for each valid data stream for s in stream_ids: if s in ds.variables: - varkwargs["inp"][s] = ds.variables[s].values + varkwargs["inp"][s] = ds.variables[s].to_numpy() if do_close is True: ds.close() @@ -410,7 +393,7 @@ def run(self, config: Config): class XarrayStream: - def __init__( + def __init__( # noqa: PLR0913 self, path_or_ncd, time=None, @@ -427,14 +410,14 @@ def __init__( def time(self): do_close, ds = self._open() - tdata = ds[self.time_var].values + tdata = ds[self.time_var].to_numpy() if do_close is True: ds.close() return tdata def data(self, stream_id): do_close, ds = self._open() - vdata = ds[stream_id].values + vdata = ds[stream_id].to_numpy() if do_close is True: ds.close() return vdata @@ -455,10 +438,10 @@ def _open(self): return do_close, ds - def run(self, config: Config): + def run(self, config: Config): # noqa: C901, PLR0912 # Magic for nested key generation # https://stackoverflow.com/a/27809959 - results = defaultdict(lambda: defaultdict(odict)) + results = defaultdict(lambda: defaultdict(OrderedDict)) do_close, ds = self._open() @@ -473,131 +456,111 @@ def run(self, config: Config): # Because the variables could have different dimensions # we calculate the coordinates and subset for each - # This is xarray style subsetting, so will look something like: - # { - # 'time': slice(datetime.datetime(2020, 1, 1, 0, 0), datetime.datetime(2020, 4, 1, 0, 0), None) - # } + # This is xarray style subsetting. label_indexes = {} subset_kwargs = {} # Region subset - # TODO: yeah this does nothing right now + # TODO: does nothing right now # Subset against the passed in lat/lons variable keys # and build up the subset dict to apply later # Time subset - if self.time_var in ds[call.stream_id].coords: - if context.window.starting and context.window.ending: - label_indexes[self.time_var] = slice( - context.window.starting, - context.window.ending, - ) + if ( + self.time_var in ds[call.stream_id].coords + and context.window.starting + and context.window.ending + ): + label_indexes[self.time_var] = slice( + context.window.starting, + context.window.ending, + ) subset_stream = ds[call.stream_id].sel(**label_indexes) if self.time_var in subset_stream.coords: - # Already subset with the stream, best case. Good netCDF file. - subset_kwargs["tinp"] = subset_stream.coords[ - self.time_var - ].values + # Already subset with the stream, best case. + # Good netCDF file. + subset_kwargs["tinp"] = subset_stream.coords[self.time_var].to_numpy() elif ( self.time_var in ds.variables and ds[self.time_var].dims == ds[call.stream_id].dims ): # Same dimensions as the stream, so use the same subset - subset_kwargs["tinp"] = ( - ds[self.time_var].sel(**label_indexes).values - ) + subset_kwargs["tinp"] = ds[self.time_var].sel(**label_indexes).to_numpy() elif ( self.time_var in ds.variables and ds[self.time_var].size == ds[call.stream_id].size ): - # Not specifically connected, but hey, the user asked for it - subset_kwargs["tinp"] = ( - ds[self.time_var].sel(**label_indexes).values - ) + # Not specifically connected, but hey, + # the user asked for it. + subset_kwargs["tinp"] = ds[self.time_var].sel(**label_indexes).to_numpy() if self.z_var in subset_stream.coords: - # Already subset with the stream, best case. Good netCDF file. - subset_kwargs["zinp"] = subset_stream.coords[ - self.z_var - ].values - elif ( - self.z_var in ds.variables - and ds[self.z_var].dims == ds[call.stream_id].dims - ): + # Already subset with the stream, best case. + # Good netCDF file. + subset_kwargs["zinp"] = subset_stream.coords[self.z_var].to_numpy() + elif self.z_var in ds.variables and ds[self.z_var].dims == ds[call.stream_id].dims: # Same dimensions as the stream, so use the same subset - subset_kwargs["zinp"] = ( - ds[self.z_var].sel(**label_indexes).values - ) - elif ( - self.z_var in ds.variables - and ds[self.z_var].size == ds[call.stream_id].size - ): - # Not specifically connected, but hey, the user asked for it - subset_kwargs["zinp"] = ( - ds[self.z_var].sel(**label_indexes).values - ) + subset_kwargs["zinp"] = ds[self.z_var].sel(**label_indexes).to_numpy() + elif self.z_var in ds.variables and ds[self.z_var].size == ds[call.stream_id].size: + # Not specifically connected, but hey, + # the user asked for it. + subset_kwargs["zinp"] = ds[self.z_var].sel(**label_indexes).to_numpy() if self.lat_var in subset_stream.coords: - # Already subset with the stream, best case. Good netCDF file. - subset_kwargs["lat"] = subset_stream.coords[ - self.lat_var - ].values + # Already subset with the stream, best case. + # Good netCDF file. + subset_kwargs["lat"] = subset_stream.coords[self.lat_var].to_numpy() elif ( self.lat_var in ds.variables and ds[self.lat_var].dims == ds[call.stream_id].dims ): # Same dimensions as the stream, so use the same subset - subset_kwargs["lat"] = ( - ds[self.lat_var].sel(**label_indexes).values - ) + subset_kwargs["lat"] = ds[self.lat_var].sel(**label_indexes).to_numpy() elif ( self.lat_var in ds.variables and ds[self.lat_var].size == ds[call.stream_id].size ): - # Not specifically connected, but hey, the user asked for it - subset_kwargs["lat"] = ( - ds[self.lat_var].sel(**label_indexes).values - ) + # Not specifically connected, but hey, + # the user asked for it. + subset_kwargs["lat"] = ds[self.lat_var].sel(**label_indexes).to_numpy() if self.lon_var in subset_stream.coords: - # Already subset with the stream, best case. Good netCDF file. - subset_kwargs["lon"] = subset_stream.coords[ - self.lon_var - ].values + # Already subset with the stream, best case. + # Good netCDF file. + subset_kwargs["lon"] = subset_stream.coords[self.lon_var].to_numpy() elif ( self.lon_var in ds.variables and ds[self.lon_var].dims == ds[call.stream_id].dims ): # Same dimensions as the stream, so use the same subset - subset_kwargs["lon"] = ( - ds[self.lon_var].sel(**label_indexes).values - ) + subset_kwargs["lon"] = ds[self.lon_var].sel(**label_indexes).to_numpy() elif ( self.lon_var in ds.variables and ds[self.lon_var].size == ds[call.stream_id].size ): - # Not specifically connected, but hey, the user asked for it - subset_kwargs["lon"] = ( - ds[self.lon_var].sel(**label_indexes).values - ) + # Not specifically connected, but hey, + # the user asked for it. + subset_kwargs["lon"] = ds[self.lon_var].sel(**label_indexes).to_numpy() - data_input = subset_stream.values + data_input = subset_stream.to_numpy() run_result = call.run( **subset_kwargs, inp=data_input, ) - # Here we turn the labeled xarray indexes into boolean index arrays that numpy - # can use to subset a basic array. This takes each labeled index, converts it to - # its integer index representation (label -> integers) and then matches the keys - # on each label with the dimension of the data variable. This result should be - # able to be used on the original data feed AS IS using a direct subset notation - # data[subset_indexes]. I'm pretty sure this works and if it doesn't blame my cat. - # We start by subsetting nothing + # Here we turn the labeled xarray indexes into boolean index + # arrays that numpy can use to subset a basic array. + # This takes each labeled index, converts it to its integer + # index representation (label -> integers) and then matches the + # keys on each label with the dimension of the data variable. + # This result should be able to be used on the original data + # feed AS IS using a direct subset notation + # data[subset_indexes]. I'm pretty sure this works and if it + # doesn't blame my cat. We start by subsetting nothing. subset_indexes = np.full_like( - ds[call.stream_id].values, + ds[call.stream_id].to_numpy(), 0, dtype=bool, ) @@ -611,22 +574,22 @@ def run(self, config: Config): int_indexes = int_indexes[0] else: int_indexes = int_indexes.dim_indexers - # Initial slicer will select everything. This selects all values in a dimension - # if there are no labeled indexes for it. + # Initial slicer will select everything. + # This selects all values in a dimension if there are no + # labeled indexes for it. slicers = [slice(None) for x in range(ds[call.stream_id].ndim)] for index_key, index_value in int_indexes.items(): if index_key in ds[call.stream_id].dims: - slicers[ds[call.stream_id].dims.index(index_key)] = ( - index_value - ) - # We started with an empty subset_indexes, not set to True what we actually subset - # using the labeled dimensions. + slicers[ds[call.stream_id].dims.index(index_key)] = index_value + # We started with an empty subset_indexes, not set to True what + # we actually subset using the labeled dimensions. # Casting to a tuple to handle a numpy deprecation: - # FutureWarning: Using a non-tuple sequence for multidimensional indexing is - # deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will - # be interpreted as an array index, `arr[np.array(seq)]`, which will result either - # in an error or a different result. + # FutureWarning: Using a non-tuple sequence for + # multidimensional indexing is deprecated; use + # `arr[tuple(seq)]` instead of `arr[seq]`. In the future this + # will be interpreted as an array index, `arr[np.array(seq)]`, + # which will result either in an error or a different result. subset_indexes[tuple(slicers)] = True yield ContextResult( @@ -636,19 +599,19 @@ def run(self, config: Config): data=data_input, tinp=subset_kwargs.get( "tinp", - pd.Series(dtype="datetime64[ns]").values, + pd.Series(dtype="datetime64[ns]").to_numpy(), ), zinp=subset_kwargs.get( "zinp", - pd.Series(dtype="float64").values, + pd.Series(dtype="float64").to_numpy(), ), lat=subset_kwargs.get( "lat", - pd.Series(dtype="float64").values, + pd.Series(dtype="float64").to_numpy(), ), lon=subset_kwargs.get( "lon", - pd.Series(dtype="float64").values, + pd.Series(dtype="float64").to_numpy(), ), ) diff --git a/ioos_qc/utils.py b/ioos_qc/utils.py index a601710..0df98f2 100644 --- a/ioos_qc/utils.py +++ b/ioos_qc/utils.py @@ -157,10 +157,7 @@ def isfixedlength( raise TypeError(msg) if len(lst) != length: - msg = ( - f"Incorrect list/tuple length for {lst}. Required: {length}, " - "Got: {len(lst)}" - ) + msg = f"Incorrect list/tuple length for {lst}. Required: {length}, " "Got: {len(lst)}" raise ValueError( msg, ) diff --git a/ruff.toml b/ruff.toml index 478bea4..052632b 100644 --- a/ruff.toml +++ b/ruff.toml @@ -1,12 +1,19 @@ -line-length = 79 +line-length = 100 lint.select = ["ALL"] lint.ignore = [ "ANN", # type annotation + "D107", # Missing docstring in `__init__` + "D100", # Missing docstring in public module + "D101", # Missing docstring in public class + "D102", # Missing docstring in public method "D203", # 1 blank line required before class docstring "D205", # 1 blank line required between summary line and description "D213", # incompatible. Ignoring `multi-line-summary-second-line` + "FIX002", # Line contains TODO, consider resolving the issue + "TD002", # Missing author in TODO + "TD003", # Missing issue link on the line following this TODO "TRY003", # Avoid specifying long messages outside the exception class ] diff --git a/tests/test_config_creator.py b/tests/test_config_creator.py index eadc87e..3cc7eb2 100644 --- a/tests/test_config_creator.py +++ b/tests/test_config_creator.py @@ -53,9 +53,7 @@ def test_creator_config(self): class TestQcVariableConfig(unittest.TestCase): def test_init(self): - qc_variable_config_file = ( - Path().parent / "tests/data/qc_variable_config.json" - ) + qc_variable_config_file = Path().parent / "tests/data/qc_variable_config.json" config = QcVariableConfig(qc_variable_config_file) assert config["variable"] == "air" @@ -149,9 +147,7 @@ def setUp(self): self.creator_config = CreatorConfig(creator_config_file) self.config_creator = QcConfigCreator(self.creator_config) - qc_variable_config_file = ( - Path().parent / "tests/data/qc_variable_config.json" - ) + qc_variable_config_file = Path().parent / "tests/data/qc_variable_config.json" self.variable_config = QcVariableConfig(qc_variable_config_file) def test_file_load(self): @@ -272,22 +268,10 @@ def test_data(self): }, } grt = config[var]["qartod"]["gross_range_test"] - assert ( - grt["suspect_span"][0] - == ref["qartod"]["gross_range_test"]["suspect_span"][0] - ) - assert ( - grt["suspect_span"][1] - == ref["qartod"]["gross_range_test"]["suspect_span"][1] - ) - assert ( - grt["fail_span"][0] - == ref["qartod"]["gross_range_test"]["fail_span"][0] - ) - assert ( - grt["fail_span"][1] - == ref["qartod"]["gross_range_test"]["fail_span"][1] - ) + assert grt["suspect_span"][0] == ref["qartod"]["gross_range_test"]["suspect_span"][0] + assert grt["suspect_span"][1] == ref["qartod"]["gross_range_test"]["suspect_span"][1] + assert grt["fail_span"][0] == ref["qartod"]["gross_range_test"]["fail_span"][0] + assert grt["fail_span"][1] == ref["qartod"]["gross_range_test"]["fail_span"][1] def test_no_data(self): # data not available for given box, so code expands box until it gets something diff --git a/tests/test_performance.py b/tests/test_performance.py index 1006f95..42ab2a9 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -256,10 +256,7 @@ def test_qartod_compare(self): tinp=self.times, zinp=self.zinp, ) - all_tests = [ - results["qartod"][test_name] - for test_name in list(results["qartod"]) - ] + all_tests = [results["qartod"][test_name] for test_name in list(results["qartod"])] def run_fn(): qartod.qartod_compare(all_tests) diff --git a/tests/test_qartod.py b/tests/test_qartod.py index d76630f..dda22ae 100644 --- a/tests/test_qartod.py +++ b/tests/test_qartod.py @@ -1290,9 +1290,7 @@ def setUp(self): dtype=np.datetime64, ) self.times_epoch_secs = [t.astype(int) for t in self.times] - self.threshold = ( - 5 / 15 / 60 - ) # 5 units per 15 minutes --> 5/15/60 units per second + self.threshold = 5 / 15 / 60 # 5 units per 15 minutes --> 5/15/60 units per second def test_rate_of_change(self): times = self.times @@ -1654,10 +1652,7 @@ def test_attenuated_signal(self): # good signal, all pass signal = np.array([1, 2, 3, 4]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([1, 1, 1, 1]) self._run_test( @@ -1672,10 +1667,7 @@ def test_attenuated_signal(self): # Only suspect signal = np.array([1, 2, 3, 4]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([3, 3, 3, 3]) self._run_test( @@ -1690,10 +1682,7 @@ def test_attenuated_signal(self): # Not changing should fail signal = np.array([1, 1, 1, 1]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([4, 4, 4, 4]) self._run_test( @@ -1708,10 +1697,7 @@ def test_attenuated_signal(self): # std deviation less than fail threshold signal = np.array([10, 20, 30, 40]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([4, 4, 4, 4]) self._run_test( @@ -1727,10 +1713,7 @@ def test_attenuated_signal_range(self): # range less than fail threshold signal = np.array([10, 20, 30, 40]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([4, 4, 4, 4]) self._run_test( @@ -1745,10 +1728,7 @@ def test_attenuated_signal_range(self): # range less than suspect threshold signal = np.array([10, 20, 30, 40]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([3, 3, 3, 3]) self._run_test( @@ -1762,10 +1742,7 @@ def test_attenuated_signal_range(self): signal = np.array([3, 4, 5, 8.1, 9, 8.5, 8.7, 8.4, 8.2, 8.35, 2, 1]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]) self._run_test( @@ -1781,10 +1758,7 @@ def test_attenuated_signal_time_window(self): # test time windowed range signal = [1, 2, 3, 100, 1000] times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(len(signal)) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(len(signal))], ) time_window = 2 * 86400 # 2 days @@ -1828,10 +1802,7 @@ def _run_test_time_window(min_obs, min_period, expected): def test_attenuated_signal_missing(self): signal = np.array([None, 2, 3, 4]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([9, 1, 1, 1]) self._run_test( @@ -1845,10 +1816,7 @@ def test_attenuated_signal_missing(self): signal = np.array([None, None, None, None]) times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(signal.size) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(signal.size)], ) expected = np.array([9, 9, 9, 9]) self._run_test( @@ -1863,10 +1831,7 @@ def test_attenuated_signal_missing(self): # range less than 30 signal = [10, None, None, 40] times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(len(signal)) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(len(signal))], ) expected = np.array([4, 9, 9, 4]) self._run_test( @@ -1882,10 +1847,7 @@ def test_attenuated_signal_missing_time_window(self): # test time windowed range with missing values signal = [1, None, 10, 100, 1000] times = np.array( - [ - np.datetime64("2019-01-01") + np.timedelta64(i, "D") - for i in range(len(signal)) - ], + [np.datetime64("2019-01-01") + np.timedelta64(i, "D") for i in range(len(signal))], ) time_window = 2 * 86400 # 2 days min_obs = 2 # 2 days (since 1 obs per day)