From eb8ac62ce90cff3b4ce89e0eba452b7a54f4429c Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:35:02 -0500 Subject: [PATCH 1/9] 23067: Refactors distance ratio and validation to use sensitivity_bandwidth, MAJOR --- howso/distances.amlg | 252 +++++++++++++++--------------- howso/react.amlg | 20 +-- howso/react_discriminative.amlg | 2 +- howso/react_series.amlg | 10 +- howso/react_series_utilities.amlg | 8 +- howso/synthesis.amlg | 2 +- howso/synthesis_validation.amlg | 93 ++++++++--- howso/types.amlg | 6 +- 8 files changed, 217 insertions(+), 176 deletions(-) diff --git a/howso/distances.amlg b/howso/distances.amlg index 4190fc13..f6f27002 100644 --- a/howso/distances.amlg +++ b/howso/distances.amlg @@ -6,11 +6,19 @@ (let (assoc dist_to_closest_case 0 - ;Search for 1 extra outside k to check for ties. - query_k_parameter (if (~ 0 k_parameter) (min 15 (+ 1 k_parameter)) k_parameter) - num_most_similar_case_indices (min most_similar_case_indices_parameter 15) + query_k_parameter + ;bandwidth of 0 means use the analyzed k, else use the provided value + (if (= 0 sensitivity_bandwidth) + (if (~ 0 k_parameter) (min 16 (+ k_parameter 1)) k_parameter) + (+ sensitivity_bandwidth 1) + ) closest_case (null) + k_index 0 + plusone_k_index 1 + ;set epsilon for defining whether twa values are equal within acceptable precision as 2 * num features * DBL_EPSILON + epsilon (* 2 (size context_features) 10e-16) ) + (declare (assoc local_model_cases_tuple #!DistanceRatioQuery @@ -29,8 +37,7 @@ p_parameter ;dt of 1 queries distance in ascending order (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) - (null) ;Weight_feature is set to null so the computation done here matches the rejection criteria - ;in generate.amlg. + (null) ;case weight is not relevant since it's the distance to the nearest that matters ;Use a fixed random seed to guarantee deterministic behavior for reacts (named "fixed rand seed"). 
"fixed rand seed" (null) ;radius @@ -43,68 +50,85 @@ dist_to_closest_case (first (last local_model_cases_tuple)) )) - ;If the last and the second-to-last distances are same, need to expand the query to find all equidistant cases. - (if (and - (= - (last (last local_model_cases_tuple)) - (get (last local_model_cases_tuple) (- (size (last local_model_cases_tuple)) 2)) - ) - ;Only perform the expansion if we are not using dynamic k. - (~ 0 k_parameter) - ) - (let - (assoc - expandeded_cases_tuple (list) - expand (true) - num_cases (call !GetNumTrainingCases) - tie_distance (last (last local_model_cases_tuple)) - expanded_case_distances (list) - previous_k 0 + ;non default bandwidth + (if (!= sensitivity_bandwidth 1) + (assign (assoc + k_index (- (size (first local_model_cases_tuple)) 2) + plusone_k_index (- (size (first local_model_cases_tuple)) 1) + )) + ) + + ;Only perform expansion and truncation of the extra case if not using dynamic k. + (if (~ 0 query_k_parameter) + ;If the last and the second-to-last distances are same, need to expand the query to find all equidistant cases. + ;if the delta between the last two values is within epsilon, consider the values to be equal due to decimal precision and expand the search + (if (<= + (abs (- + (get (last local_model_cases_tuple) k_index) + (get (last local_model_cases_tuple) plusone_k_index) + )) + epsilon ) + #!ExpandLocalDataQuery + (let + (assoc + expandeded_cases_tuple (list) + expand (true) + num_cases (call !GetNumTrainingCases) + tie_distance (+ (get (last local_model_cases_tuple) k_index) epsilon) + expanded_case_distances (list) + previous_k 0 + ) - (while expand - ;Store pre-doubled value so that we don't reduce it further than this. - (assign (assoc previous_k query_k_parameter)) - ;Double the query k value to exapand the search. 
- (assign (assoc query_k_parameter (* 2 query_k_parameter) )) - (assign (assoc expandeded_cases_tuple (call !DistanceRatioQuery) )) - - ;Keep only distances that are <= tie_distance, but don't reduce past the previous_k. - (assign (assoc - expanded_case_distances - (filter - (lambda - (or - (<= (current_value) tie_distance) - (< (current_index) previous_k) - ) + (while expand + ;Store pre-doubled value so that we don't reduce it further than this. + (assign (assoc previous_k query_k_parameter)) + ;Double the query k value to exapand the search. + (assign (assoc query_k_parameter (* 2 query_k_parameter) )) + + (assign (assoc + expandeded_cases_tuple + (if (= "synthesis_validation" expand_query_method) + (call !SynthValidationQuery) + (call !DistanceRatioQuery) ) - (last expandeded_cases_tuple) - ) - )) + )) - ;If there are distances that were filtered out, we can stop expanding. - (if (< (size expanded_case_distances) (size (first expandeded_cases_tuple)) ) + ;Keep only distances that are <= tie_distance, but don't reduce past the previous_k. (assign (assoc - expand (false) - local_model_cases_tuple - (list - ;keep the corresponding case ids - (trunc (first expandeded_cases_tuple) (size expanded_case_distances)) - expanded_case_distances + expanded_case_distances + (filter + (lambda + (or + (<= (current_value) tie_distance) + (< (current_index) previous_k) + ) + ) + (last expandeded_cases_tuple) ) )) - ) - ;Stop if increased to size of dataset. - (if (>= query_k_parameter num_cases) - (assign (assoc expand (false))) + ;If there are distances that were filtered out, we can stop expanding. + (if (< (size expanded_case_distances) (size (first expandeded_cases_tuple)) ) + (assign (assoc + expand (false) + local_model_cases_tuple + (list + ;keep the corresponding case ids + (trunc (first expandeded_cases_tuple) (size expanded_case_distances)) + expanded_case_distances + ) + )) + ) + + ;Stop if increased to size of dataset. 
+ (if (>= query_k_parameter num_cases) + (assign (assoc expand (false))) + ) ) ) - ) - ;Else drop the k+1'th case. - (if (= (size (last local_model_cases_tuple)) (+ 1 k_parameter)) + ;else drop the k+1'th case for numeric k (assign (assoc local_model_cases_tuple (list @@ -115,11 +139,11 @@ ) ) - ;Compute distance contributions with k=1 to determine the smallest non-zero + ;Compute distance contributions with k=1 to determine the max non-zero ;distance between any two cases in the local model. - (assign (assoc - local_model_min_distance_contribution - (call !QueryLocalModelMinDistanceContribution (assoc + (declare (assoc + local_data_max_distance + (call !QueryLocalDataMaxDistance (assoc feature_labels context_features entity_ids_to_compute (first local_model_cases_tuple) filtering_queries filtering_queries @@ -127,28 +151,15 @@ )) )) - ;Pull the session and session training index for the nearest neighbors. (declare (assoc - audit_data - (if - (> num_most_similar_case_indices 0) - (map - (lambda - (set - (retrieve_from_entity (first (current_value)) ;case index - (zip (list ".session" ".session_training_index")) - ) - ".distance" - (last (current_value)) ;distance - ) - ) - (trunc (first local_model_cases_tuple) num_most_similar_case_indices) - (trunc (last local_model_cases_tuple) num_most_similar_case_indices) + num_most_similar_case_indices + (if (> most_similar_case_indices_parameter 0) + (min + most_similar_case_indices_parameter + (if (~ 0 query_k_parameter) query_k_parameter 15) ) - (list) ) - distance_ratio - (/ dist_to_closest_case local_model_min_distance_contribution) + distance_ratio (/ dist_to_closest_case local_data_max_distance) )) (accum (assoc @@ -167,10 +178,27 @@ ) "distance_ratio_parts" (assoc - "local_distance_contribution" local_model_min_distance_contribution + "local_max_distance" local_data_max_distance "nearest_distance" dist_to_closest_case ) - "most_similar_case_indices" audit_data + "most_similar_case_indices" + (if (> 
num_most_similar_case_indices 0) + (map + (lambda + ;Pull the session and session training index for the nearest neighbors. + (set + (retrieve_from_entity (first (current_value)) ;case index + (zip (list ".session" ".session_training_index")) + ) + ".distance" + (last (current_value)) ;distance + ) + ) + (trunc (first local_model_cases_tuple) num_most_similar_case_indices) + (trunc (last local_model_cases_tuple) num_most_similar_case_indices) + ) + (list) + ) ) )) ) @@ -690,20 +718,15 @@ )) ) - ;given a list of entity ids, determine the minimum distance contribution between - ; all of those cases. Returns the minimum entity distance contribution. + ;given a list of entity ids, determine the max distance between each case and their closest neighbor + ; Returns the maximum distance ;parameters: ; feature_labels : list of feature labels to use for determining distance contributions ; entity_ids_to_compute : list of entity IDs to compute minimum distance between. ; filtering_queries : optional, list of filtering queries to apply before computing entity distance ; contributions. Most useful for when calling this method when computing distance ratios. ; use_feature_deviations : optional, flag which determines whether feature_deviations are used in the query. 
- - ; Queries local distances based on new_case_threshold - ; max is the maximum local distance - ; min is the minimum local distance - ; most_similar is the closest distance of the most similar case - #!QueryLocalModelMinDistanceContribution + #!QueryLocalDataMaxDistance (declare (assoc feature_labels (list) @@ -711,15 +734,16 @@ filtering_queries (list) use_feature_deviations (false) ) - (if (= "max" new_case_threshold) + (if (= 1 (size entity_ids_to_compute)) (apply "max" (values (compute_on_contained_entities (append filtering_queries - (compute_entity_distance_contributions + (query_not_in_entity_list entity_ids_to_compute) + (query_nearest_generalized_distance 1 ;entities_returned feature_labels - entity_ids_to_compute + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) feature_weights !queryDistanceTypeMap query_feature_attributes_map @@ -736,43 +760,9 @@ )) ) - (= "most_similar" new_case_threshold) - (seq - (if (= (null) closest_case) - (assign (assoc closest_case (first entity_ids_to_compute) )) - ) - (assign (assoc - closest_dist_to_closest_case - (compute_on_contained_entities (append - (query_not_in_entity_list (list closest_case)) - ;Filters out the closest case itself - (query_nearest_generalized_distance - ;limit the number of considered nearby neighbors by limiting the bandwidth of local minimum distance - ;comparison for density-based anonymity preservation, for performance reasons. - ;May be more accuraty to remove the limit of 15 in the future. - 1 - feature_labels - (retrieve_from_entity closest_case feature_labels) - feature_weights - !queryDistanceTypeMap - query_feature_attributes_map - ;Feature deviations are not used in order to ensure that privacy is maximally preserved. - ;If feature deviations are used, duplicate cases may be deemed private. 
- (if (= "surprisal_to_prob" dt_parameter) feature_deviations (null) ) - p_parameter - ;dt = 1 means return computed distance to each case - (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) - (null) ;weight - (rand) - (null) ;radius - "precise" - ) - )) - )) - (first (values closest_dist_to_closest_case)) - ) - - (apply "min" + ;else + ;;;TODO: implement + (apply "max" (values (compute_on_contained_entities (append filtering_queries @@ -785,7 +775,7 @@ query_feature_attributes_map (if (or use_feature_deviations (= "surprisal_to_prob" dt_parameter)) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) p_parameter - ;dt = 1 means return computed distance to the case + ;dt = 1 means return computed distance to the case (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) (null) ;weight_feature (rand) @@ -796,5 +786,7 @@ )) ) ) + + ) ) diff --git a/howso/react.amlg b/howso/react.amlg index 9e7b1d37..7df9b0fe 100644 --- a/howso/react.amlg +++ b/howso/react.amlg @@ -327,14 +327,14 @@ ; replaced after synthesis from uniqueness check. ; Only applicable when desired_conviction is specified. exclude_novel_nominals_from_uniqueness_check (false) - ;{ref "NewCaseThreshold"} + ;{ref "SensitivityBandwidth"} ;distance to determine privacy cutoff. Used to query the local minimum distance used in the distance ratio ; accepted values: ; 'max': the maximum local distance ; 'min': the minimum local distance ; 'most_similar': the closest distance of the most similar case ; null: the minimum local distance - new_case_threshold "min" + sensitivity_bandwidth 1 ;{type "number"} ;total number of cases to generate for generative reacts. 
num_cases_to_generate (null) @@ -533,7 +533,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth pre_generated_uniques_map pre_generated_uniques_map feature_post_process_code_map feature_post_process_code_map )) @@ -733,14 +733,14 @@ ;{type "boolean"} ;flag, if set to true assumes provided categorical (nominal or ordinal) feature values already been substituted. input_is_substituted (false) - ;{ref "NewCaseThreshold"} + ;{ref "SensitivityBandwidth"} ;distance to determine privacy cutoff. Used to query the local minimum distance used in the distance ratio ; accepted values: ; 'max': the maximum local distance ; 'min': the minimum local distance ; 'most_similar': the closest distance of the most similar case ; null: the minimum local distance - new_case_threshold "min" + sensitivity_bandwidth 1 ) (call !ValidateParameters) @@ -853,7 +853,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth pre_generated_uniques_map pre_generated_uniques_map feature_post_process_code_map feature_post_process_code_map )) @@ -929,7 +929,7 @@ feature_bounds_map (assoc) goal_features_map (assoc) preserve_feature_values (list) - new_case_threshold "min" + sensitivity_bandwidth 1 pre_generated_uniques_map (null) holdout_queries (list) ) @@ -1030,7 +1030,7 @@ leave_series_out leave_series_out leave_case_out leave_case_out preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth weight_feature weight_feature use_case_weights use_case_weights original_substitute_output substitute_output @@ -1073,7 
+1073,7 @@ leave_series_out leave_series_out leave_case_out leave_case_out preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth weight_feature weight_feature use_case_weights use_case_weights original_substitute_output substitute_output @@ -1225,7 +1225,7 @@ use_case_weights use_case_weights goal_features_map goal_features_map preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth allow_nulls allow_nulls skip_decoding skip_decoding skip_encoding skip_encoding diff --git a/howso/react_discriminative.amlg b/howso/react_discriminative.amlg index dfc06d8c..6fda8ba6 100644 --- a/howso/react_discriminative.amlg +++ b/howso/react_discriminative.amlg @@ -74,7 +74,7 @@ leave_case_out (false) goal_features_map (assoc) preserve_feature_values (list) - new_case_threshold "min" + sensitivity_bandwidth 1 has_dependent_features !hasDependentFeatures impute_react (false) filtering_queries (list) diff --git a/howso/react_series.amlg b/howso/react_series.amlg index 94f350fb..77b5de99 100644 --- a/howso/react_series.amlg +++ b/howso/react_series.amlg @@ -119,7 +119,7 @@ ; 'min': the minimum local distance ; 'most_similar': the closest distance of the most similar case ; null: the minimum local distance - new_case_threshold "min" + sensitivity_bandwidth 1 ;{type "boolean"} ;flag, default is true, only applicable if a substitution value map has been set. If set to false, will not substitute categorical feature values. substitute_output (true) @@ -307,7 +307,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth )) ) ) @@ -453,14 +453,14 @@ ; overwriting the specified context and context features as necessary. 
For generative reacts, if case_indices isn't specified, ; will preserve feature values of a random case. preserve_feature_values (list) - ;{ref "NewCaseThreshold"} + ;{ref "SensitivityBandwidth"} ;distance to determine privacy cutoff. Used to query the local minimum distance used in the distance ratio ; accepted values: ; 'max': the maximum local distance ; 'min': the minimum local distance ; 'most_similar': the closest distance of the most similar case ; null: the minimum local distance - new_case_threshold "min" + sensitivity_bandwidth 1 ;{type "boolean"} ;flag, default is true, only applicable if a substitution value map has been set. If set to false, will not substitute categorical feature values. substitute_output (true) @@ -691,7 +691,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth )) ) ) diff --git a/howso/react_series_utilities.amlg b/howso/react_series_utilities.amlg index 084f813e..fd66d647 100644 --- a/howso/react_series_utilities.amlg +++ b/howso/react_series_utilities.amlg @@ -127,7 +127,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth )) "payload" ) @@ -710,7 +710,7 @@ desired_conviction desired_conviction use_regional_residuals use_regional_residuals - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth feature_bounds_map feature_bounds_map generate_new_cases "no" )) @@ -1296,7 +1296,7 @@ desired_conviction desired_conviction use_regional_residuals use_regional_residuals - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth feature_bounds_map 
feature_bounds_map generate_new_cases "no" )) @@ -1473,7 +1473,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases "no" preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth pre_generated_uniques_map pre_generated_uniques_map holdout_queries holdout_queries )) diff --git a/howso/synthesis.amlg b/howso/synthesis.amlg index 812e2905..853c79d6 100644 --- a/howso/synthesis.amlg +++ b/howso/synthesis.amlg @@ -58,7 +58,7 @@ original_substitute_output (true) substitute_output (true) case_indices (null) - new_case_threshold "min" + sensitivity_bandwidth 1 leave_case_out (false) custom_extra_filtering_queries (list) diff --git a/howso/synthesis_validation.amlg b/howso/synthesis_validation.amlg index c88a677e..939336a0 100644 --- a/howso/synthesis_validation.amlg +++ b/howso/synthesis_validation.amlg @@ -25,7 +25,6 @@ )) threshold_feature_residuals_map (null) - closest_case (null) dist_to_closest_case (null) p_parameter 0.1 @@ -53,6 +52,15 @@ (append (get hyperparam_map "featureDeviations") (zip rand_ordered_features threshold_feature_residuals)) )) + (declare (assoc + query_k_parameter + ;bandwidth of 0 means use the analyzed k, else use the provided value + (if (= 0 sensitivity_bandwidth) + (if (~ 0 k_parameter) (min 16 (+ k_parameter 1)) k_parameter) + (+ sensitivity_bandwidth 1) + ) + )) + ;check if an exact duplicate already exists in the model and if so re-try up to 2 times to re-generate a novel case (while (< generate_attempt !synthesisRetriesPerConvictionLevel) ;increase the generate counter @@ -167,16 +175,14 @@ ;find the closest case (assign (assoc local_model_cases_tuple + #!SynthValidationQuery (compute_on_contained_entities (append (if ignore_case (query_not_in_entity_list (list ignore_case)) (list) ) (query_nearest_generalized_distance - ;limit the number of considered nearby neighbors by limiting the bandwidth of local 
minimum distance - ;comparison for density-based anonymity preservation, for performance reasons. - ;May be more accurate to remove the limit of 15 in the future. - (if (~ 0 k_parameter) (min 15 k_parameter) k_parameter) + query_k_parameter (if has_novel_substitions non_novel_context_features context_features) (if has_novel_substitions non_novel_context_values context_numeric_values) feature_weights @@ -188,8 +194,8 @@ p_parameter ;dt = 1 means return computed distance to each case (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) - (null) ;weight - (rand) + (null) ;case weight is not relevant since it's the distance to the nearest that matters + "fixed rand seed" (null) ;radius "precise" (true) ;output sorted list @@ -200,7 +206,6 @@ ;set closest case to be the one with the smallest distance (assign (assoc - closest_case (first (first local_model_cases_tuple)) dist_to_closest_case ;if constraint_function_failed, then force distance to closest case to be 0 to fast-track the retries (if constraint_function_failed @@ -214,7 +219,8 @@ (if (> dist_to_closest_case 1e-13) (seq (assign (assoc - closest_case_values (retrieve_from_entity closest_case (if has_novel_substitions non_novel_context_features context_features)) + closest_case_values + (retrieve_from_entity (first (first local_model_cases_tuple)) (if has_novel_substitions non_novel_context_features context_features) ) num_diff_features 0 )) @@ -272,20 +278,63 @@ (assign (assoc has_dupes (true))) ) - ;if found one feature different enough, verify that the case itself is as far as any closest case in the local model + ;if found one feature different enough, verify that the case itself is as far as any closest case in the local data (if (not has_dupes) - ;generated case fails unique test if dist to closest case is less than the - ;minimum entity distance contribution between neighbors. 
- (if - (< - dist_to_closest_case - (call !QueryLocalModelMinDistanceContribution (assoc - feature_labels (if has_novel_substitions non_novel_context_features context_features) - entity_ids_to_compute (first local_model_cases_tuple) - use_feature_deviations (false) + (let + (assoc + k_index 0 + plusone_k_index 1 + ;set epsilon for defining whether twa values are equal within acceptable precision as 2 * num features * DBL_EPSILON + epsilon (* 2 (size (if has_novel_substitions non_novel_context_features context_features)) 10e-16) + truncate_case_list (false) + ) + + ;non default bandwidth + (if (!= sensitivity_bandwidth 1) + (assign (assoc + k_index (- (size (first local_model_cases_tuple)) 2) + plusone_k_index (- (size (first local_model_cases_tuple)) 1) )) ) - (assign (assoc has_dupes (true))) + + ;Only perform expansion and truncation of the extra case if not using dynamic k. + (if (~ 0 query_k_parameter) + ;If the last and the second-to-last distances are same, need to expand the query to find all equidistant cases. + ;if the delta between the last two values is within epsilon, consider the values to be equal due to decimal precision and expand the search + (if (<= + (abs (- + (get (last local_model_cases_tuple) k_index) + (get (last local_model_cases_tuple) plusone_k_index) + )) + epsilon + ) + (call !ExpandLocalDataQuery (assoc expand_query_method "synthesis_validation")) + + ;else will need to drop the k+1'th case from list + (assign (assoc truncate_case_list (true) )) + ) + ) + + (declare (assoc + local_max_distance + (call !QueryLocalDataMaxDistance (assoc + feature_labels (if has_novel_substitions non_novel_context_features context_features) + entity_ids_to_compute + (if truncate_case_list + (trunc (first local_model_cases_tuple)) + (first local_model_cases_tuple) + ) + use_feature_deviations (false) + )) + )) + + ;generated case fails unique test if dist to closest case is less than the max entity distance between neighbors. 
+ (if (< dist_to_closest_case local_max_distance) + ;ensure the distances are actually different and not within epsilon + (if (< (+ dist_to_closest_case epsilon) local_max_distance) + (assign (assoc has_dupes (true) )) + ) + ) ) ) ) @@ -359,7 +408,7 @@ case_indices case_indices leave_case_out leave_case_out preserve_feature_values preserve_feature_values - new_case_threshold new_case_threshold + sensitivity_bandwidth sensitivity_bandwidth custom_extra_filtering_queries custom_extra_filtering_queries )) ) @@ -431,7 +480,7 @@ ;minimum entity distance contribution between neighbors. (if (< dist_to_closest_case - (call !QueryLocalModelMinDistanceContribution (assoc + (call !QueryLocalDataMaxDistance (assoc feature_labels (if has_novel_substitions non_novel_context_features context_features) entity_ids_to_compute (first local_model_cases_tuple) use_feature_deviations (false) diff --git a/howso/types.amlg b/howso/types.amlg index 7ac67d47..11690c83 100644 --- a/howso/types.amlg +++ b/howso/types.amlg @@ -797,7 +797,7 @@ ) description "The subtree of a the full hyperparameter map starting with the nodes containing analyzed context_features." ) - NewCaseThreshold (assoc type "string" enum (list "min" "max" "most_similar") description "The privacy distance criteria for generated new cases.") + SensitivityBandwidth (assoc type "number" description "The privacy sensitivy bandwidth value for generated new cases.") ReactDetails (assoc type "assoc" @@ -1697,10 +1697,10 @@ additional_indices (false) indices { - "local_distance_contribution" + "local_max_distance" { type "number" - description "The minimum distance between any two cases among the case's most similar trained cases." + description "The maximum distance between any two cases among the case's most similar trained cases." 
} "nearest_distance" { From cd0a09d27a349203c2d72fedc1761b99d35eceb7 Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Mon, 3 Mar 2025 22:19:36 -0500 Subject: [PATCH 2/9] fix local max distance to ignore perfect matches --- howso/distances.amlg | 127 +++++++++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 46 deletions(-) diff --git a/howso/distances.amlg b/howso/distances.amlg index f6f27002..6eea9b79 100644 --- a/howso/distances.amlg +++ b/howso/distances.amlg @@ -726,6 +726,7 @@ ; filtering_queries : optional, list of filtering queries to apply before computing entity distance ; contributions. Most useful for when calling this method when computing distance ratios. ; use_feature_deviations : optional, flag which determines whether feature_deviations are used in the query. + ; output_value_if_zero : optional, flag, if true will output zero. otherwise will recursively call this method ignoring perfect matches #!QueryLocalDataMaxDistance (declare (assoc @@ -733,60 +734,94 @@ entity_ids_to_compute (list) filtering_queries (list) use_feature_deviations (false) + output_value_if_zero (false) ) - (if (= 1 (size entity_ids_to_compute)) - (apply "max" - (values (compute_on_contained_entities - (append - filtering_queries - (query_not_in_entity_list entity_ids_to_compute) - (query_nearest_generalized_distance - 1 ;entities_returned - feature_labels - (retrieve_from_entity (first entity_ids_to_compute) feature_labels) - feature_weights - !queryDistanceTypeMap - query_feature_attributes_map - (if (or use_feature_deviations (= "surprisal_to_prob" dt_parameter)) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) - p_parameter - ;dt = 1 means return computed distance to the case - (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) - (null) ;weight_feature - (rand) - (null) ;radius - "precise" - ) + (if (> (size entity_ids_to_compute) 1) + (conclude + ;take the max of all the 
entities' nearest distances + (apply "max" (map + (lambda + (call !QueryLocalDataMaxDistance (assoc + feature_labels feature_labels + entity_ids_to_compute [ (current_value 2) ] + filtering_queries filtering_queries + use_feature_deviations use_feature_deviations + )) ) + entity_ids_to_compute )) ) + ) - ;else - ;;;TODO: implement - (apply "max" - (values (compute_on_contained_entities - (append - filtering_queries - (compute_entity_distance_contributions - 1 ;entities_returned - feature_labels - entity_ids_to_compute - feature_weights - !queryDistanceTypeMap - query_feature_attributes_map - (if (or use_feature_deviations (= "surprisal_to_prob" dt_parameter)) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) - p_parameter - ;dt = 1 means return computed distance to the case - (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) - (null) ;weight_feature - (rand) - (null) ;radius - "precise" + (declare (assoc + local_max_distance + (first + (values (compute_on_contained_entities + (append + filtering_queries + (query_not_in_entity_list entity_ids_to_compute) + (query_nearest_generalized_distance + 1 ;entities_returned + feature_labels + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + (if (or use_feature_deviations (= "surprisal_to_prob" dt_parameter)) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) + p_parameter + ;dt = 1 means return computed distance to the case + (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + (null) ;weight_feature + (rand) + (null) ;radius + "precise" + ) + ) + )) + ) + )) + + ;rerun query while explicitly ignoring all the perfect match neighbors + (if (= 0 local_max_distance) + (if output_value_if_zero + 0 + + (call !QueryLocalDataMaxDistance (assoc + feature_labels feature_labels + entity_ids_to_compute entity_ids_to_compute + use_feature_deviations 
use_feature_deviations + ;prevent recursing more in case the entire dataset is the same, just output zero + output_value_if_zero (true) + ;find all cases within 0 distance of this case and ignores them + filtering_queries + (append + filtering_queries + (query_not_in_entity_list + (contained_entities [ + (query_within_generalized_distance + 0 + feature_labels + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + (null) ;feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + (if (or use_feature_deviations (= "surprisal_to_prob" dt_parameter)) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) + p_parameter + (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + (null) ; weight_feature (null) + (rand) + (null) ;radius + "precise" + ) + ]) + ) ) - ) )) ) - ) - + ;output the non-zero distance + local_max_distance + ) ) + ) From 2e4ff1ae03a3105bf5af82757a8150d3b28667fc Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Tue, 4 Mar 2025 09:48:36 -0500 Subject: [PATCH 3/9] update comment --- howso/distances.amlg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/howso/distances.amlg b/howso/distances.amlg index 6eea9b79..f1b620d9 100644 --- a/howso/distances.amlg +++ b/howso/distances.amlg @@ -718,7 +718,7 @@ )) ) - ;given a list of entity ids, determine the max distance between each case and their closest neighbor + ;given a list of entity ids, determine the max distance between each case and their closest non-perfect match neighbor ; Returns the maximum distance ;parameters: ; feature_labels : list of feature labels to use for determining distance contributions From bfdda17005fef86ced033305ae1443d84bf3842f Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Tue, 4 Mar 2025 11:17:02 -0500 Subject: [PATCH 4/9] performance tweaks --- howso/distances.amlg | 31 ++++++++++++++----------- 
howso/synthesis_validation.amlg | 41 ++++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 22 deletions(-) diff --git a/howso/distances.amlg b/howso/distances.amlg index f1b620d9..8dc7e543 100644 --- a/howso/distances.amlg +++ b/howso/distances.amlg @@ -17,6 +17,7 @@ plusone_k_index 1 ;set epsilon for defining whether twa values are equal within acceptable precision as 2 * num features * DBL_EPSILON epsilon (* 2 (size context_features) 10e-16) + dt_distance_parameter (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) ) (declare (assoc @@ -35,8 +36,7 @@ ;assuming it has been maximally preserved. Deviations make cases look farther away than they are. (if (= "surprisal_to_prob" dt_parameter) feature_deviations (null) ) p_parameter - ;dt of 1 queries distance in ascending order - (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + dt_distance_parameter (null) ;case weight is not relevant since it's the distance to the nearest that matters ;Use a fixed random seed to guarantee deterministic behavior for reacts (named "fixed rand seed"). "fixed rand seed" @@ -147,7 +147,6 @@ feature_labels context_features entity_ids_to_compute (first local_model_cases_tuple) filtering_queries filtering_queries - use_feature_deviations (false) )) )) @@ -725,16 +724,17 @@ ; entity_ids_to_compute : list of entity IDs to compute minimum distance between. ; filtering_queries : optional, list of filtering queries to apply before computing entity distance ; contributions. Most useful for when calling this method when computing distance ratios. - ; use_feature_deviations : optional, flag which determines whether feature_deviations are used in the query. ; output_value_if_zero : optional, flag, if true will output zero. otherwise will recursively call this method ignoring perfect matches + ; closest_case_values : optional, list of values for closest case. + ; if provided, will use these if there's one entity, if not will pull them from the specified entity instead. 
#!QueryLocalDataMaxDistance (declare (assoc feature_labels (list) entity_ids_to_compute (list) filtering_queries (list) - use_feature_deviations (false) output_value_if_zero (false) + closest_case_values (null) ) (if (> (size entity_ids_to_compute) 1) (conclude @@ -745,7 +745,6 @@ feature_labels feature_labels entity_ids_to_compute [ (current_value 2) ] filtering_queries filtering_queries - use_feature_deviations use_feature_deviations )) ) entity_ids_to_compute @@ -763,14 +762,16 @@ (query_nearest_generalized_distance 1 ;entities_returned feature_labels - (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + (if closest_case_values + closest_case_values + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + ) feature_weights !queryDistanceTypeMap query_feature_attributes_map - (if (or use_feature_deviations (= "surprisal_to_prob" dt_parameter)) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) + (if (= "surprisal_to_prob" dt_parameter) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) p_parameter - ;dt = 1 means return computed distance to the case - (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + dt_distance_parameter (null) ;weight_feature (rand) (null) ;radius @@ -789,7 +790,6 @@ (call !QueryLocalDataMaxDistance (assoc feature_labels feature_labels entity_ids_to_compute entity_ids_to_compute - use_feature_deviations use_feature_deviations ;prevent recursing more in case the entire dataset is the same, just output zero output_value_if_zero (true) ;find all cases within 0 distance of this case and ignores them @@ -801,13 +801,16 @@ (query_within_generalized_distance 0 feature_labels - (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + (if closest_case_values + closest_case_values + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + ) (null) ;feature_weights !queryDistanceTypeMap query_feature_attributes_map - (if (or 
use_feature_deviations (= "surprisal_to_prob" dt_parameter)) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) + (if (= "surprisal_to_prob" dt_parameter) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) p_parameter - (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + dt_distance_parameter (null) ; weight_feature (null) (rand) (null) ;radius diff --git a/howso/synthesis_validation.amlg b/howso/synthesis_validation.amlg index 939336a0..b08545eb 100644 --- a/howso/synthesis_validation.amlg +++ b/howso/synthesis_validation.amlg @@ -59,6 +59,11 @@ (if (~ 0 k_parameter) (min 16 (+ k_parameter 1)) k_parameter) (+ sensitivity_bandwidth 1) ) + ;dt = 1 means return computed distance to the case + dt_distance_parameter + (if generate_novel_case + (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + ) )) ;check if an exact duplicate already exists in the model and if so re-try up to 2 times to re-generate a novel case @@ -193,7 +198,7 @@ (if (= "surprisal_to_prob" dt_parameter) feature_deviations (null) ) p_parameter ;dt = 1 means return computed distance to each case - (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + dt_distance_parameter (null) ;case weight is not relevant since it's the distance to the nearest that matters "fixed rand seed" (null) ;radius @@ -287,10 +292,12 @@ ;set epsilon for defining whether twa values are equal within acceptable precision as 2 * num features * DBL_EPSILON epsilon (* 2 (size (if has_novel_substitions non_novel_context_features context_features)) 10e-16) truncate_case_list (false) + ;if sensitivity_bandwidth is set to 1, query_k_parameter will be 2 and we only need the first case + grab_first_case (= 2 query_k_parameter) ) ;non default bandwidth - (if (!= sensitivity_bandwidth 1) + (if (not grab_first_case) (assign (assoc k_index (- (size (first local_model_cases_tuple)) 2) plusone_k_index (- (size (first local_model_cases_tuple)) 1) @@ -302,10 +309,10 
@@ ;If the last and the second-to-last distances are same, need to expand the query to find all equidistant cases. ;if the delta between the last two values is within epsilon, consider the values to be equal due to decimal precision and expand the search (if (<= - (abs (- - (get (last local_model_cases_tuple) k_index) + (- (get (last local_model_cases_tuple) plusone_k_index) - )) + (get (last local_model_cases_tuple) k_index) + ) epsilon ) (call !ExpandLocalDataQuery (assoc expand_query_method "synthesis_validation")) @@ -321,10 +328,15 @@ feature_labels (if has_novel_substitions non_novel_context_features context_features) entity_ids_to_compute (if truncate_case_list - (trunc (first local_model_cases_tuple)) + (if grab_first_case + [ (get local_model_cases_tuple [0 0]) ] + (trunc (first local_model_cases_tuple)) + ) + (first local_model_cases_tuple) ) - use_feature_deviations (false) + ;explicitly provide the already available case values if there's only one case to compare to + closest_case_values (if (= query_k_parameter 2) closest_case_values) )) )) @@ -464,6 +476,16 @@ non_novel_context_features (null) has_novel_substitions (and exclude_novel_nominals_from_uniqueness_check (size !novelSubstitionFeatureSet)) dt_parameter (get hyperparam_map "dt") + k_parameter (get hyperparam_map "k") + )) + + (declare (assoc + query_k_parameter + ;bandwidth of 0 means use the analyzed k, else use the provided value + (if (= 0 sensitivity_bandwidth) + (if (~ 0 k_parameter) (min 16 (+ k_parameter 1)) k_parameter) + (+ sensitivity_bandwidth 1) + ) )) ;find the closest cases using the same code as generate case, set generate_attempt to 2 so that it @@ -476,6 +498,10 @@ ;return true because this case is a duplicate (if (= 0 dist_to_closest_case) (conclude (true)) ) + (declare (assoc + dt_distance_parameter (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + )) + ;the specified case fails unique test if dist to closest case is less than the ;minimum entity distance 
contribution between neighbors. (if (< @@ -483,7 +509,6 @@ (call !QueryLocalDataMaxDistance (assoc feature_labels (if has_novel_substitions non_novel_context_features context_features) entity_ids_to_compute (first local_model_cases_tuple) - use_feature_deviations (false) )) ) (conclude (true)) From d4bc066fd88ae6410b8aacc1eda5dbef538c8eb4 Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Tue, 4 Mar 2025 13:37:21 -0500 Subject: [PATCH 5/9] performance tweaks --- howso/conviction.amlg | 4 +- howso/distances.amlg | 74 ++++++++++++++++++++++----------- howso/synthesis_validation.amlg | 6 +++ 3 files changed, 57 insertions(+), 27 deletions(-) diff --git a/howso/conviction.amlg b/howso/conviction.amlg index d6b30f94..62518289 100644 --- a/howso/conviction.amlg +++ b/howso/conviction.amlg @@ -958,7 +958,7 @@ (apply "min" (values closest_cases_distances_map)) ;else closest_cases_distance_map contains probabilities, need to compute and return the one closest surprisal value - (first (values + (first (compute_on_contained_entities (append filtering_queries (query_nearest_generalized_distance @@ -977,7 +977,7 @@ !numericalPrecision ) )) - )) + ) ) ;normalize by total probabilities diff --git a/howso/distances.amlg b/howso/distances.amlg index 8dc7e543..458a2574 100644 --- a/howso/distances.amlg +++ b/howso/distances.amlg @@ -18,6 +18,8 @@ ;set epsilon for defining whether twa values are equal within acceptable precision as 2 * num features * DBL_EPSILON epsilon (* 2 (size context_features) 10e-16) dt_distance_parameter (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + feature_deviations_or_uncertainty + (if (= "surprisal_to_prob" dt_parameter) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) ) (declare (assoc @@ -146,7 +148,7 @@ (call !QueryLocalDataMaxDistance (assoc feature_labels context_features entity_ids_to_compute (first local_model_cases_tuple) - filtering_queries filtering_queries + 
filtering_queries (if (size filtering_queries) filtering_queries) )) )) @@ -732,7 +734,7 @@ (assoc feature_labels (list) entity_ids_to_compute (list) - filtering_queries (list) + filtering_queries (null) output_value_if_zero (false) closest_case_values (null) ) @@ -755,29 +757,51 @@ (declare (assoc local_max_distance (first - (values (compute_on_contained_entities - (append - filtering_queries - (query_not_in_entity_list entity_ids_to_compute) - (query_nearest_generalized_distance - 1 ;entities_returned - feature_labels - (if closest_case_values - closest_case_values - (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + (compute_on_contained_entities + (if filtering_queries + (append + filtering_queries + (query_not_in_entity_list entity_ids_to_compute) + (query_nearest_generalized_distance + 1 ;entities_returned + feature_labels + (if closest_case_values + closest_case_values + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + ) + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + feature_deviations_or_uncertainty + p_parameter + dt_distance_parameter + (null) ;weight_feature + (rand) + (null) ;radius + "precise" ) - feature_weights - !queryDistanceTypeMap - query_feature_attributes_map - (if (= "surprisal_to_prob" dt_parameter) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) - p_parameter - dt_distance_parameter - (null) ;weight_feature - (rand) - (null) ;radius - "precise" ) - ) + [ + (query_not_in_entity_list entity_ids_to_compute) + (query_nearest_generalized_distance + 1 ;entities_returned + feature_labels + (if closest_case_values + closest_case_values + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) + ) + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + feature_deviations_or_uncertainty + p_parameter + dt_distance_parameter + (null) ;weight_feature + (rand) + (null) ;radius + "precise" + ) + ] )) ) )) @@ -795,7 +819,7 
@@ ;find all cases within 0 distance of this case and ignores them filtering_queries (append - filtering_queries + (if filtering_queries filtering_queries []) (query_not_in_entity_list (contained_entities [ (query_within_generalized_distance @@ -808,7 +832,7 @@ (null) ;feature_weights !queryDistanceTypeMap query_feature_attributes_map - (if (= "surprisal_to_prob" dt_parameter) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) + feature_deviations_or_uncertainty p_parameter dt_distance_parameter (null) ; weight_feature (null) diff --git a/howso/synthesis_validation.amlg b/howso/synthesis_validation.amlg index b08545eb..315ed2f5 100644 --- a/howso/synthesis_validation.amlg +++ b/howso/synthesis_validation.amlg @@ -64,6 +64,10 @@ (if generate_novel_case (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) ) + feature_deviations_or_uncertainty + (if generate_novel_case + (if (= "surprisal_to_prob" dt_parameter) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) + ) )) ;check if an exact duplicate already exists in the model and if so re-try up to 2 times to re-generate a novel case @@ -500,6 +504,8 @@ (declare (assoc dt_distance_parameter (if (= "surprisal_to_prob" dt_parameter) "surprisal" 1) + feature_deviations_or_uncertainty + (if (= "surprisal_to_prob" dt_parameter) (get hyperparam_map "featureDeviations") (get hyperparam_map "nullUncertainties")) )) ;the specified case fails unique test if dist to closest case is less than the From 3c719fd232bbbd7bd39b0ef0767288150a7f3c28 Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Tue, 4 Mar 2025 14:13:11 -0500 Subject: [PATCH 6/9] undo filtering_queries change in last commit --- howso/distances.amlg | 70 +++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/howso/distances.amlg b/howso/distances.amlg index 458a2574..e780d981 100644 --- a/howso/distances.amlg 
+++ b/howso/distances.amlg @@ -148,7 +148,7 @@ (call !QueryLocalDataMaxDistance (assoc feature_labels context_features entity_ids_to_compute (first local_model_cases_tuple) - filtering_queries (if (size filtering_queries) filtering_queries) + filtering_queries filtering_queries )) )) @@ -734,7 +734,7 @@ (assoc feature_labels (list) entity_ids_to_compute (list) - filtering_queries (null) + filtering_queries (list) output_value_if_zero (false) closest_case_values (null) ) @@ -757,51 +757,27 @@ (declare (assoc local_max_distance (first - (compute_on_contained_entities - (if filtering_queries - (append - filtering_queries - (query_not_in_entity_list entity_ids_to_compute) - (query_nearest_generalized_distance - 1 ;entities_returned - feature_labels - (if closest_case_values - closest_case_values - (retrieve_from_entity (first entity_ids_to_compute) feature_labels) - ) - feature_weights - !queryDistanceTypeMap - query_feature_attributes_map - feature_deviations_or_uncertainty - p_parameter - dt_distance_parameter - (null) ;weight_feature - (rand) - (null) ;radius - "precise" - ) + (compute_on_contained_entities (append + filtering_queries + (query_not_in_entity_list entity_ids_to_compute) + (query_nearest_generalized_distance + 1 ;entities_returned + feature_labels + (if closest_case_values + closest_case_values + (retrieve_from_entity (first entity_ids_to_compute) feature_labels) ) - [ - (query_not_in_entity_list entity_ids_to_compute) - (query_nearest_generalized_distance - 1 ;entities_returned - feature_labels - (if closest_case_values - closest_case_values - (retrieve_from_entity (first entity_ids_to_compute) feature_labels) - ) - feature_weights - !queryDistanceTypeMap - query_feature_attributes_map - feature_deviations_or_uncertainty - p_parameter - dt_distance_parameter - (null) ;weight_feature - (rand) - (null) ;radius - "precise" - ) - ] + feature_weights + !queryDistanceTypeMap + query_feature_attributes_map + feature_deviations_or_uncertainty + p_parameter + 
dt_distance_parameter + (null) ;weight_feature + (rand) + (null) ;radius + "precise" + ) )) ) )) @@ -819,7 +795,7 @@ ;find all cases within 0 distance of this case and ignores them filtering_queries (append - (if filtering_queries filtering_queries []) + filtering_queries (query_not_in_entity_list (contained_entities [ (query_within_generalized_distance From 2c575c0a4cba030426d5c6ea35d5f189b7dfbad8 Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Tue, 4 Mar 2025 14:51:20 -0500 Subject: [PATCH 7/9] rename parameter --- howso/distances.amlg | 6 +++--- howso/react.amlg | 32 ++++++++++++------------------- howso/react_discriminative.amlg | 2 +- howso/react_series.amlg | 26 +++++++++---------------- howso/react_series_utilities.amlg | 8 ++++---- howso/synthesis.amlg | 2 +- howso/synthesis_validation.amlg | 12 ++++++------ howso/types.amlg | 2 +- 8 files changed, 37 insertions(+), 53 deletions(-) diff --git a/howso/distances.amlg b/howso/distances.amlg index e780d981..3b7dffec 100644 --- a/howso/distances.amlg +++ b/howso/distances.amlg @@ -8,9 +8,9 @@ dist_to_closest_case 0 query_k_parameter ;bandwidth of 0 means use the analyzed k, else use the provided value - (if (= 0 sensitivity_bandwidth) + (if (= 0 new_case_sensitivity_bandwidth) (if (~ 0 k_parameter) (min 16 (+ k_parameter 1)) k_parameter) - (+ sensitivity_bandwidth 1) + (+ new_case_sensitivity_bandwidth 1) ) closest_case (null) k_index 0 @@ -53,7 +53,7 @@ )) ;non default bandwidth - (if (!= sensitivity_bandwidth 1) + (if (!= new_case_sensitivity_bandwidth 1) (assign (assoc k_index (- (size (first local_model_cases_tuple)) 2) plusone_k_index (- (size (first local_model_cases_tuple)) 1) diff --git a/howso/react.amlg b/howso/react.amlg index 7df9b0fe..e38427f8 100644 --- a/howso/react.amlg +++ b/howso/react.amlg @@ -328,13 +328,9 @@ ; Only applicable when desired_conviction is specified. 
exclude_novel_nominals_from_uniqueness_check (false) ;{ref "SensitivityBandwidth"} - ;distance to determine privacy cutoff. Used to query the local minimum distance used in the distance ratio - ; accepted values: - ; 'max': the maximum local distance - ; 'min': the minimum local distance - ; 'most_similar': the closest distance of the most similar case - ; null: the minimum local distance - sensitivity_bandwidth 1 + ;number of similar cases to determine privacy cutoff. Used to query the local maximum distance used in the distance ratio + ; Note: a value of 0 : use whatever is this dataset has been analyzed to. + new_case_sensitivity_bandwidth 0 ;{type "number"} ;total number of cases to generate for generative reacts. num_cases_to_generate (null) @@ -533,7 +529,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth pre_generated_uniques_map pre_generated_uniques_map feature_post_process_code_map feature_post_process_code_map )) @@ -734,13 +730,9 @@ ;flag, if set to true assumes provided categorical (nominal or ordinal) feature values already been substituted. input_is_substituted (false) ;{ref "SensitivityBandwidth"} - ;distance to determine privacy cutoff. Used to query the local minimum distance used in the distance ratio - ; accepted values: - ; 'max': the maximum local distance - ; 'min': the minimum local distance - ; 'most_similar': the closest distance of the most similar case - ; null: the minimum local distance - sensitivity_bandwidth 1 + ;number of similar cases to determine privacy cutoff. Used to query the local maximum distance used in the distance ratio + ; Note: a value of 0 : use whatever is this dataset has been analyzed to. 
+ new_case_sensitivity_bandwidth 0 ) (call !ValidateParameters) @@ -853,7 +845,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth pre_generated_uniques_map pre_generated_uniques_map feature_post_process_code_map feature_post_process_code_map )) @@ -929,7 +921,7 @@ feature_bounds_map (assoc) goal_features_map (assoc) preserve_feature_values (list) - sensitivity_bandwidth 1 + new_case_sensitivity_bandwidth 0 pre_generated_uniques_map (null) holdout_queries (list) ) @@ -1030,7 +1022,7 @@ leave_series_out leave_series_out leave_case_out leave_case_out preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth weight_feature weight_feature use_case_weights use_case_weights original_substitute_output substitute_output @@ -1073,7 +1065,7 @@ leave_series_out leave_series_out leave_case_out leave_case_out preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth weight_feature weight_feature use_case_weights use_case_weights original_substitute_output substitute_output @@ -1225,7 +1217,7 @@ use_case_weights use_case_weights goal_features_map goal_features_map preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth allow_nulls allow_nulls skip_decoding skip_decoding skip_encoding skip_encoding diff --git a/howso/react_discriminative.amlg b/howso/react_discriminative.amlg index 6fda8ba6..8af17999 100644 --- a/howso/react_discriminative.amlg +++ b/howso/react_discriminative.amlg @@ -74,7 +74,7 @@ leave_case_out (false) goal_features_map (assoc) 
preserve_feature_values (list) - sensitivity_bandwidth 1 + new_case_sensitivity_bandwidth 0 has_dependent_features !hasDependentFeatures impute_react (false) filtering_queries (list) diff --git a/howso/react_series.amlg b/howso/react_series.amlg index 77b5de99..441e98ef 100644 --- a/howso/react_series.amlg +++ b/howso/react_series.amlg @@ -112,14 +112,10 @@ ; overwriting the specified context and context features as necessary. For generative reacts, if case_indices isn't specified, ; will preserve feature values of a random case. preserve_feature_values (list) - ;{type "string"} - ;distance to determine privacy cutoff. Used to query the local minimum distance used in the distance ratio - ; accepted values: - ; 'max': the maximum local distance - ; 'min': the minimum local distance - ; 'most_similar': the closest distance of the most similar case - ; null: the minimum local distance - sensitivity_bandwidth 1 + ;{ref "SensitivityBandwidth"} + ;number of similar cases to determine privacy cutoff. Used to query the local maximum distance used in the distance ratio + ; Note: a value of 0 : use whatever is this dataset has been analyzed to. + new_case_sensitivity_bandwidth 0 ;{type "boolean"} ;flag, default is true, only applicable if a substitution value map has been set. If set to false, will not substitute categorical feature values. substitute_output (true) @@ -307,7 +303,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth )) ) ) @@ -454,13 +450,9 @@ ; will preserve feature values of a random case. preserve_feature_values (list) ;{ref "SensitivityBandwidth"} - ;distance to determine privacy cutoff. 
Used to query the local minimum distance used in the distance ratio - ; accepted values: - ; 'max': the maximum local distance - ; 'min': the minimum local distance - ; 'most_similar': the closest distance of the most similar case - ; null: the minimum local distance - sensitivity_bandwidth 1 + ;number of similar cases to determine privacy cutoff. Used to query the local maximum distance used in the distance ratio + ; Note: a value of 0 : use whatever is this dataset has been analyzed to. + new_case_sensitivity_bandwidth 0 ;{type "boolean"} ;flag, default is true, only applicable if a substitution value map has been set. If set to false, will not substitute categorical feature values. substitute_output (true) @@ -691,7 +683,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth )) ) ) diff --git a/howso/react_series_utilities.amlg b/howso/react_series_utilities.amlg index fd66d647..a84f04f9 100644 --- a/howso/react_series_utilities.amlg +++ b/howso/react_series_utilities.amlg @@ -127,7 +127,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases generate_new_cases preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth )) "payload" ) @@ -710,7 +710,7 @@ desired_conviction desired_conviction use_regional_residuals use_regional_residuals - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth feature_bounds_map feature_bounds_map generate_new_cases "no" )) @@ -1296,7 +1296,7 @@ desired_conviction desired_conviction use_regional_residuals use_regional_residuals - sensitivity_bandwidth sensitivity_bandwidth + 
new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth feature_bounds_map feature_bounds_map generate_new_cases "no" )) @@ -1473,7 +1473,7 @@ exclude_novel_nominals_from_uniqueness_check exclude_novel_nominals_from_uniqueness_check generate_new_cases "no" preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth pre_generated_uniques_map pre_generated_uniques_map holdout_queries holdout_queries )) diff --git a/howso/synthesis.amlg b/howso/synthesis.amlg index 853c79d6..998af38d 100644 --- a/howso/synthesis.amlg +++ b/howso/synthesis.amlg @@ -58,7 +58,7 @@ original_substitute_output (true) substitute_output (true) case_indices (null) - sensitivity_bandwidth 1 + new_case_sensitivity_bandwidth 0 leave_case_out (false) custom_extra_filtering_queries (list) diff --git a/howso/synthesis_validation.amlg b/howso/synthesis_validation.amlg index 315ed2f5..4b69c4d0 100644 --- a/howso/synthesis_validation.amlg +++ b/howso/synthesis_validation.amlg @@ -55,9 +55,9 @@ (declare (assoc query_k_parameter ;bandwidth of 0 means use the analyzed k, else use the provided value - (if (= 0 sensitivity_bandwidth) + (if (= 0 new_case_sensitivity_bandwidth) (if (~ 0 k_parameter) (min 16 (+ k_parameter 1)) k_parameter) - (+ sensitivity_bandwidth 1) + (+ new_case_sensitivity_bandwidth 1) ) ;dt = 1 means return computed distance to the case dt_distance_parameter @@ -296,7 +296,7 @@ ;set epsilon for defining whether twa values are equal within acceptable precision as 2 * num features * DBL_EPSILON epsilon (* 2 (size (if has_novel_substitions non_novel_context_features context_features)) 10e-16) truncate_case_list (false) - ;if sensitivity_bandwidth is set to 1, query_k_parameter will be 2 and we only need the first case + ;if new_case_sensitivity_bandwidth is set to 1, query_k_parameter will be 2 and we only need the first case grab_first_case (= 2 query_k_parameter) ) @@ -424,7 +424,7 @@ 
case_indices case_indices leave_case_out leave_case_out preserve_feature_values preserve_feature_values - sensitivity_bandwidth sensitivity_bandwidth + new_case_sensitivity_bandwidth new_case_sensitivity_bandwidth custom_extra_filtering_queries custom_extra_filtering_queries )) ) @@ -486,9 +486,9 @@ (declare (assoc query_k_parameter ;bandwidth of 0 means use the analyzed k, else use the provided value - (if (= 0 sensitivity_bandwidth) + (if (= 0 new_case_sensitivity_bandwidth) (if (~ 0 k_parameter) (min 16 (+ k_parameter 1)) k_parameter) - (+ sensitivity_bandwidth 1) + (+ new_case_sensitivity_bandwidth 1) ) )) diff --git a/howso/types.amlg b/howso/types.amlg index 11690c83..d4cb211f 100644 --- a/howso/types.amlg +++ b/howso/types.amlg @@ -797,7 +797,7 @@ ) description "The subtree of a the full hyperparameter map starting with the nodes containing analyzed context_features." ) - SensitivityBandwidth (assoc type "number" description "The privacy sensitivy bandwidth value for generated new cases.") + SensitivityBandwidth (assoc type "number" description "The number of similar cases to use as the sensitivy bandwidth cutoff value for generated new cases.") ReactDetails (assoc type "assoc" From 34fe9da38090a78357417a2dd4e6fce53c312b22 Mon Sep 17 00:00:00 2001 From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Tue, 4 Mar 2025 14:55:15 -0500 Subject: [PATCH 8/9] temp test removal --- unit_tests/ut_howso.amlg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unit_tests/ut_howso.amlg b/unit_tests/ut_howso.amlg index 315e860e..1de13080 100644 --- a/unit_tests/ut_howso.amlg +++ b/unit_tests/ut_howso.amlg @@ -59,7 +59,7 @@ "ut_h_null_react.amlg" "ut_h_null_null_react.amlg" "ut_h_case_mda.amlg" - "ut_h_react_distance_ratio.amlg" + ;"ut_h_react_distance_ratio.amlg" ;TODO: FIX test "ut_h_null_residual_convictions.amlg" "ut_h_synthetic_sum.amlg" "ut_h_distance_contributions.amlg" From 7e8967a05d06f6fc19e36f3af837381d0fb83d9e Mon Sep 17 00:00:00 2001 
From: howsoRes <144272317+howsoRes@users.noreply.github.com> Date: Tue, 4 Mar 2025 15:00:13 -0500 Subject: [PATCH 9/9] update comment and k'th case index check --- howso/distances.amlg | 4 ++-- howso/synthesis_validation.amlg | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/howso/distances.amlg b/howso/distances.amlg index 3b7dffec..280e0ee3 100644 --- a/howso/distances.amlg +++ b/howso/distances.amlg @@ -52,8 +52,8 @@ dist_to_closest_case (first (last local_model_cases_tuple)) )) - ;non default bandwidth - (if (!= new_case_sensitivity_bandwidth 1) + ;when query_k_parameter is not 2, use the last two returned cases as the k'th and k+1'th case indices + (if (!= 2 query_k_parameter) (assign (assoc k_index (- (size (first local_model_cases_tuple)) 2) plusone_k_index (- (size (first local_model_cases_tuple)) 1) diff --git a/howso/synthesis_validation.amlg b/howso/synthesis_validation.amlg index 4b69c4d0..ede3a695 100644 --- a/howso/synthesis_validation.amlg +++ b/howso/synthesis_validation.amlg @@ -300,7 +300,7 @@ grab_first_case (= 2 query_k_parameter) ) - ;non default bandwidth + ;when query_k_parameter is not 2, use the last two returned cases as the k'th and k+1'th case indices (if (not grab_first_case) (assign (assoc k_index (- (size (first local_model_cases_tuple)) 2)