From 401ebca93ddbe98c88583a1bcb1c10704f8394e6 Mon Sep 17 00:00:00 2001 From: William Jones Date: Wed, 25 Sep 2024 10:36:44 +0100 Subject: [PATCH 1/7] Fix bug that caused statistics to be recalculated for each threshold during feature detection --- tobac/feature_detection.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index fc491ff0..e5dedee4 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1109,29 +1109,30 @@ def feature_detection_multithreshold_timestep( elif i_threshold == 0: regions_old = regions_i - if statistic: - # reconstruct the labeled regions based on the regions dict - labels = np.zeros(track_data.shape) - labels = labels.astype(int) - for key in regions_old.keys(): - labels.ravel()[regions_old[key]] = key - # apply function to get statistics based on labeled regions and functions provided by the user - # the feature dataframe is updated by appending a column for each metric - features_thresholds = get_statistics( - features_thresholds, - labels, - track_data, - statistic=statistic, - index=np.unique(labels[labels > 0]), - id_column="idx", - ) - logging.debug( "Finished feature detection for threshold " + str(i_threshold) + " : " + str(threshold_i) ) + + if statistic: + # reconstruct the labeled regions based on the regions dict + labels = np.zeros(track_data.shape) + labels = labels.astype(int) + for key in regions_old.keys(): + labels.ravel()[regions_old[key]] = key + # apply function to get statistics based on labeled regions and functions provided by the user + # the feature dataframe is updated by appending a column for each metric + features_thresholds = get_statistics( + features_thresholds, + labels, + track_data, + statistic=statistic, + index=np.unique(labels[labels > 0]), + id_column="idx", + ) + return features_thresholds From fdd1ff86137d27eee7c4ea27e659cc44db7f2ff5 Mon Sep 17 00:00:00 2001 From: William Jones Date: Wed, 25 Sep 2024 11:22:45 +0100 Subject: [PATCH 2/7] Move filter_min_distance to after coordinate interpolation in feature detection --- tobac/feature_detection.py | 55 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index e5dedee4..4830175e 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1372,30 +1372,7 @@ def feature_detection_multithreshold( strict_thresholding=strict_thresholding, statistic=statistic, ) - # check if list of features is not empty, then merge features from different threshold - # values into one DataFrame and append to list for individual timesteps: - if not features_thresholds.empty: - hdim1_ax, hdim2_ax = internal_utils.find_hdim_axes_3D( - field_in, vertical_coord=vertical_coord - ) - hdim1_max = field_in.shape[hdim1_ax] - 1 - hdim2_max = field_in.shape[hdim2_ax] - 1 - # Loop over DataFrame to remove features that are closer than distance_min to each - # other: - if min_distance > 0: - features_thresholds = filter_min_distance( - features_thresholds, - dxy=dxy, - dz=dz, - min_distance=min_distance, - z_coordinate_name=vertical_coord, - target=target, - PBC_flag=PBC_flag, - min_h1=0, - max_h1=hdim1_max, - min_h2=0, - max_h2=hdim2_max, - ) + list_features_timesteps.append(features_thresholds) logging.debug( @@ -1417,9 +1394,39 @@ def feature_detection_multithreshold( ) else: features = add_coordinates(features, field_in) + + # Loop over DataFrame to remove features that are closer than distance_min to each + # other: + filtered_features = [] + if min_distance > 0: + hdim1_ax, hdim2_ax = internal_utils.find_hdim_axes_3D( + field_in, vertical_coord=vertical_coord + ) + hdim1_max = field_in.shape[hdim1_ax] - 1 + hdim2_max = field_in.shape[hdim2_ax] - 1 + + for _, features_frame in features.groupby("frame"): + filtered_features.append( + filter_min_distance( + features_frame, + dxy=dxy, + dz=dz, + min_distance=min_distance, + z_coordinate_name=vertical_coord, + target=target, + PBC_flag=PBC_flag, + min_h1=0, + max_h1=hdim1_max, + min_h2=0, + max_h2=hdim2_max, + ) + ) + features = pd.concat(filtered_features, ignore_index=True) + else: features = None logging.debug("No features detected") + logging.debug("feature detection completed") return features From b9c2cece159907ad6eb906cd443bfbad67b6277f Mon Sep 17 00:00:00 2001 From: William Jones Date: Wed, 25 Sep 2024 11:38:01 +0100 Subject: [PATCH 3/7] Enable use of vertical coordinate rather than fixed dz in filter_min_distance --- tobac/feature_detection.py | 71 ++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index 4830175e..e23407dc 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1274,6 +1274,9 @@ def feature_detection_multithreshold( if detect_subset is not None and ndim_time in detect_subset: raise NotImplementedError("Cannot subset on time") + # Remember if dz is set and not vertical coord for min distance filtering + use_dz_for_filtering = dz is not None + if is_3D: # We need to determine the time axis so that we can determine the # vertical axis in each timestep if vertical_axis is not none. @@ -1409,10 +1412,10 @@ def feature_detection_multithreshold( filtered_features.append( filter_min_distance( features_frame, - dxy=dxy, - dz=dz, - min_distance=min_distance, - z_coordinate_name=vertical_coord, + dxy = dxy, + dz = dz if use_dz_for_filtering else None, + min_distance = min_distance, + z_coordinate_name = None if use_dz_for_filtering else vertical_coord, target=target, PBC_flag=PBC_flag, min_h1=0, @@ -1422,7 +1425,7 @@ def feature_detection_multithreshold( ) ) features = pd.concat(filtered_features, ignore_index=True) - + else: features = None logging.debug("No features detected") @@ -1496,24 +1499,13 @@ def filter_min_distance( pandas DataFrame features after filtering """ + # Optional coordinate names are not yet implemented, set to defaults here: if dxy is None: raise NotImplementedError("dxy currently must be set.") - # if PBC_flag != "none": - # raise NotImplementedError("We haven't yet implemented PBCs into this.") - - # if we are 3D, the vertical dimension is in features. if we are 2D, there - # is no vertical dimension in features. - is_3D = "vdim" in features - - if is_3D and dz is None: - z_coordinate_name = internal_utils.find_dataframe_vertical_coord( - features, z_coordinate_name - ) - # Check if both dxy and their coordinate names are specified. # If they are, warn that we will use dxy. - if dxy is not None and ( + elif ( x_coordinate_name in features and y_coordinate_name in features ): warnings.warn( @@ -1522,22 +1514,35 @@ def filter_min_distance( "interpolated coordinates, or set `x_coordinate_name` and " "`y_coordinate_name` to None to use a constant dxy." ) + y_coordinate_name = "hdim_1" + x_coordinate_name = "hdim_2" + # If dxy only, use hdim_1, hdim_1 as default horizontal dimensions + else: + y_coordinate_name = "hdim_1" + x_coordinate_name = "hdim_2" - # Check and if both dz is specified and altitude is available, warn that we will use dz. - if is_3D and (dz is not None and z_coordinate_name in features): - warnings.warn( - "Both " - + z_coordinate_name - + " and dz available to filter_min_distance; using constant dz. " - "Set dz to none if you want to use altitude or set `z_coordinate_name` to None to use " - "constant dz." - ) - - # As optional coordinate names are not yet implemented, set to defaults here: - z_coordinate_name = "vdim" - y_coordinate_name = "hdim_1" - x_coordinate_name = "hdim_2" - + # if we are 3D, the vertical dimension is in features + is_3D = "vdim" in features + if is_3D: + if dz is None: + # Find vertical coord name and set dz to 1 + z_coordinate_name = internal_utils.find_dataframe_vertical_coord( + variable_dataframe=tracks, vertical_coord=z_coordinate_name + ) + dz = 1 + else: + # Use dz, warn if both are set + if z_coordinate_name is not None: + warnings.warn( + "Both " + + z_coordinate_name + + " and dz available to filter_min_distance; using constant dz. " + "Set dz to none if you want to use altitude or set `z_coordinate_name` to None to use " + "constant dz.", + UserWarning, + ) + z_coordinate_name = "vdim" + if target not in ["minimum", "maximum"]: raise ValueError( "target parameter must be set to either 'minimum' or 'maximum'" From c9f3cb973b92146903a5b3af56107df711ac137f Mon Sep 17 00:00:00 2001 From: William Jones Date: Wed, 25 Sep 2024 11:44:55 +0100 Subject: [PATCH 4/7] Add tests for minimum distance filtering using vertical coord --- tobac/feature_detection.py | 2 +- tobac/tests/test_feature_detection.py | 50 ++++++++++++++++++++------- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index e23407dc..eb3ea88b 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1527,7 +1527,7 @@ def filter_min_distance( if dz is None: # Find vertical coord name and set dz to 1 z_coordinate_name = internal_utils.find_dataframe_vertical_coord( - variable_dataframe=tracks, vertical_coord=z_coordinate_name + variable_dataframe=features, vertical_coord=z_coordinate_name ) dz = 1 else: diff --git a/tobac/tests/test_feature_detection.py b/tobac/tests/test_feature_detection.py index 37b6b586..d872e2d8 100644 --- a/tobac/tests/test_feature_detection.py +++ b/tobac/tests/test_feature_detection.py @@ -319,6 +319,34 @@ def test_feature_detection_position(position_threshold): True, False, ), + ( # Test using z coord name + (0, 0, 0, 4, 1), + (1, 1, 1, 4, 1), + 1000, + None, + 1, + "maximum", + False, + False, + True, + "none", + True, + True, + ), + ( # Test using z coord name + (0, 0, 0, 5, 1), + (1, 1, 1, 4, 1), + 1, + None, + 101, + "maximum", + False, + False, + True, + "none", + True, + False, + ), ], ) def test_filter_min_distance( @@ -415,18 +443,6 @@ def test_filter_min_distance( feat_combined = pd.concat([feat_1_interp, feat_2_interp], ignore_index=True) - filter_dist_opts = dict() - - if add_x_coords: - feat_combined[x_coord_name] = feat_combined["hdim_2"] * assumed_dxy - filter_dist_opts["x_coordinate_name"] = x_coord_name - if add_y_coords: - feat_combined[y_coord_name] = feat_combined["hdim_1"] * assumed_dxy - filter_dist_opts["y_coordinate_name"] = y_coord_name - if add_z_coords and is_3D: - feat_combined[z_coord_name] = feat_combined["vdim"] * assumed_dz - filter_dist_opts["z_coordinate_name"] = z_coord_name - filter_dist_opts = { "features": feat_combined, "dxy": dxy, @@ -439,6 +455,16 @@ def test_filter_min_distance( "min_h2": 0, "max_h2": 100, } + if add_x_coords: + feat_combined[x_coord_name] = feat_combined["hdim_2"] * assumed_dxy + filter_dist_opts["x_coordinate_name"] = x_coord_name + if add_y_coords: + feat_combined[y_coord_name] = feat_combined["hdim_1"] * assumed_dxy + filter_dist_opts["y_coordinate_name"] = y_coord_name + if add_z_coords and is_3D: + feat_combined[z_coord_name] = feat_combined["vdim"] * assumed_dz + filter_dist_opts["z_coordinate_name"] = z_coord_name + if target not in ["maximum", "minimum"]: with pytest.raises(ValueError): out_feats = feat_detect.filter_min_distance(**filter_dist_opts) From 9c1058f29e9f99fdd8cdb41a3897fb4177866336 Mon Sep 17 00:00:00 2001 From: William Jones Date: Wed, 25 Sep 2024 11:45:21 +0100 Subject: [PATCH 5/7] Formatting --- tobac/feature_detection.py | 48 +++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index eb3ea88b..a480ff94 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -627,9 +627,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, y_val_alt, x_val_alt] - labels_2[ - label_locs_v, label_locs_h1, label_locs_h2 - ] = labels_2_alt + labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( + labels_2_alt + ) skip_list = np.append(skip_list, label_ind) break @@ -673,9 +673,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, y_val_alt, label_x] - labels_2[ - label_locs_v, label_locs_h1, label_locs_h2 - ] = labels_2_alt + labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( + labels_2_alt + ) new_label_ind = labels_2_alt skip_list = np.append(skip_list, label_ind) @@ -717,9 +717,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, label_y, x_val_alt] - labels_2[ - label_locs_v, label_locs_h1, label_locs_h2 - ] = labels_2_alt + labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( + labels_2_alt + ) new_label_ind = labels_2_alt skip_list = np.append(skip_list, label_ind) @@ -1115,7 +1115,7 @@ def feature_detection_multithreshold_timestep( + " : " + str(threshold_i) ) - + if statistic: # reconstruct the labeled regions based on the regions dict labels = np.zeros(track_data.shape) @@ -1132,7 +1132,7 @@ def feature_detection_multithreshold_timestep( index=np.unique(labels[labels > 0]), id_column="idx", ) - + return features_thresholds @@ -1276,7 +1276,7 @@ def feature_detection_multithreshold( # Remember if dz is set and not vertical coord for min distance filtering use_dz_for_filtering = dz is not None - + if is_3D: # We need to determine the time axis so that we can determine the # vertical axis in each timestep if vertical_axis is not none. @@ -1375,7 +1375,7 @@ def feature_detection_multithreshold( strict_thresholding=strict_thresholding, statistic=statistic, ) - + list_features_timesteps.append(features_thresholds) logging.debug( @@ -1397,7 +1397,7 @@ def feature_detection_multithreshold( ) else: features = add_coordinates(features, field_in) - + # Loop over DataFrame to remove features that are closer than distance_min to each # other: filtered_features = [] @@ -1412,10 +1412,12 @@ def feature_detection_multithreshold( filtered_features.append( filter_min_distance( features_frame, - dxy = dxy, - dz = dz if use_dz_for_filtering else None, - min_distance = min_distance, - z_coordinate_name = None if use_dz_for_filtering else vertical_coord, + dxy=dxy, + dz=dz if use_dz_for_filtering else None, + min_distance=min_distance, + z_coordinate_name=( + None if use_dz_for_filtering else vertical_coord + ), target=target, PBC_flag=PBC_flag, min_h1=0, @@ -1505,9 +1507,7 @@ def filter_min_distance( # Check if both dxy and their coordinate names are specified. # If they are, warn that we will use dxy. - elif ( - x_coordinate_name in features and y_coordinate_name in features - ): + elif x_coordinate_name in features and y_coordinate_name in features: warnings.warn( "Both " + x_coordinate_name + "/" + y_coordinate_name + " and dxy " "set. Using constant dxy. Set dxy to None if you want to use the " @@ -1538,11 +1538,11 @@ def filter_min_distance( + z_coordinate_name + " and dz available to filter_min_distance; using constant dz. " "Set dz to none if you want to use altitude or set `z_coordinate_name` to None to use " - "constant dz.", - UserWarning, + "constant dz.", + UserWarning, ) z_coordinate_name = "vdim" - + if target not in ["minimum", "maximum"]: raise ValueError( "target parameter must be set to either 'minimum' or 'maximum'" From f5e5e25caa65bff3585b87795e4a3f77e77d08e5 Mon Sep 17 00:00:00 2001 From: William Jones Date: Wed, 25 Sep 2024 11:53:04 +0100 Subject: [PATCH 6/7] Manually reformat --- tobac/feature_detection.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index a480ff94..6db4ec57 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -627,9 +627,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, y_val_alt, x_val_alt] - labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( - labels_2_alt - ) + labels_2[ + label_locs_v, label_locs_h1, label_locs_h2 + ] = labels_2_alt skip_list = np.append(skip_list, label_ind) break @@ -673,9 +673,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, y_val_alt, label_x] - labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( - labels_2_alt - ) + labels_2[ + label_locs_v, label_locs_h1, label_locs_h2 + ] = labels_2_alt new_label_ind = labels_2_alt skip_list = np.append(skip_list, label_ind) @@ -717,9 +717,9 @@ def feature_detection_threshold( # find the updated label, and overwrite all of label_ind indices with # updated label labels_2_alt = labels_2[label_z, label_y, x_val_alt] - labels_2[label_locs_v, label_locs_h1, label_locs_h2] = ( - labels_2_alt - ) + labels_2[ + label_locs_v, label_locs_h1, label_locs_h2 + ] = labels_2_alt new_label_ind = labels_2_alt skip_list = np.append(skip_list, label_ind) From bea540c26e68066b09897df73cb99c3de3455bb9 Mon Sep 17 00:00:00 2001 From: William Jones Date: Thu, 17 Oct 2024 11:16:25 +0100 Subject: [PATCH 7/7] Re-add statistics on unsmoothed data --- tobac/feature_detection.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tobac/feature_detection.py b/tobac/feature_detection.py index 454c8387..a866e897 100644 --- a/tobac/feature_detection.py +++ b/tobac/feature_detection.py @@ -1136,10 +1136,14 @@ def feature_detection_multithreshold_timestep( labels.ravel()[regions_old[key]] = key # apply function to get statistics based on labeled regions and functions provided by the user # the feature dataframe is updated by appending a column for each metric + + # select which data to use according to statistics_unsmoothed option + stats_data = data_i.core_data() if statistics_unsmoothed else track_data + features_thresholds = get_statistics( features_thresholds, labels, - track_data, + stats_data, statistic=statistic, index=np.unique(labels[labels > 0]), id_column="idx",