calogica · lookslikeitsnot · May 17, 2023 · May 18, 2023 · May 18, 2023 · May 19, 2023
diff --git a/README.md b/README.md
@@ -1009,8 +1009,12 @@ tests:
       value_set: [0.5]
       top_n: 1
       quote_values: true # (Optional. Default is 'true'.)
-      data_type: "decimal" # (Optional. Default is 'decimal')
+      data_type: "decimal"  # (Optional. Defaults to the adapter-specific equivalent of 'decimal', with a scale, as provided by dbt.
+                            # Using decimal/numeric without a scale may give unexpected results on Snowflake, where the scale
+                            # defaults to 0 and values are therefore rounded.)
       strictly: false # (Optional. Default is 'false'. Adds an 'or equal to' to the comparison operator for min/max)
+      ties_okay: true # (Optional. Default is 'false'. If true, the expectation will succeed if values outside
+                       # the designated set are as common as (but not more common than) the designated values)
 ```
 
 ### [expect_column_max_to_be_between](macros/schema_tests/aggregate_functions/expect_column_max_to_be_between.sql)

diff --git a/integration_tests/models/schema_tests/data_test.sql b/integration_tests/models/schema_tests/data_test.sql
@@ -1,8 +1,8 @@
 select
     1 as idx,
     '2020-10-21' as date_col,
-    cast(0 as {{ dbt.type_float() }}) as col_numeric_a,
-    cast(1 as {{ dbt.type_float() }}) as col_numeric_b,
+    cast(0 as {{ dbt.type_numeric() }}) as col_numeric_a,
+    cast(1 as {{ dbt.type_numeric() }}) as col_numeric_b,
     'a' as col_string_a,
     'b' as col_string_b,
     cast(null as {{ dbt.type_string() }}) as col_null,
@@ -13,8 +13,8 @@ union all
 select
     2 as idx,
     '2020-10-22' as date_col,
-    1 as col_numeric_a,
-    0 as col_numeric_b,
+    cast(1 as {{ dbt.type_numeric() }}) as col_numeric_a,
+    cast(0 as {{ dbt.type_numeric() }}) as col_numeric_b,
     'b' as col_string_a,
     'ab' as col_string_b,
     null as col_null,
@@ -25,8 +25,8 @@ union all
 select
     3 as idx,
     '2020-10-23' as date_col,
-    0.5 as col_numeric_a,
-    0.5 as col_numeric_b,
+    cast(0.5 as {{ dbt.type_numeric() }}) as col_numeric_a,
+    cast(0.5 as {{ dbt.type_numeric() }}) as col_numeric_b,
     'c' as col_string_a,
     'abc' as col_string_b,
     null as col_null,
@@ -37,8 +37,8 @@ union all
 select
     4 as idx,
     '2020-10-23' as date_col,
-    0.5 as col_numeric_a,
-    0.5 as col_numeric_b,
+    cast(0.5 as {{ dbt.type_numeric() }}) as col_numeric_a,
+    cast(0.5 as {{ dbt.type_numeric() }}) as col_numeric_b,
     'c' as col_string_a,
     'abcd' as col_string_b,
     null as col_null,

diff --git a/integration_tests/models/schema_tests/schema.yml b/integration_tests/models/schema_tests/schema.yml
@@ -505,6 +505,63 @@ models:
               value_set: [0.5]
               top_n: 1
               quote_values: false
+              # Expect success if all most common values at all n levels are in set 
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [0.5, 0, 1]
+              top_n: 2
+              quote_values: false
+              # Expect failure if not all most common values at all n levels are in set 
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [0.5, 0]
+              top_n: 2
+              quote_values: false
+              config:
+                error_if: "=0"
+                warn_if: "<>1"
+              # Expect success if some of the most common values at all n levels are in set and ties_okay is true
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [0.5, 0]
+              top_n: 2
+              ties_okay: true
+              quote_values: false
+              # Expect success if any value at the top 2 most common levels is in set and ties_okay is true
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [0]
+              top_n: 2
+              ties_okay: true
+              quote_values: false
+              # Expect success if any value at the most common level is in set and ties_okay is true
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [0.5]
+              top_n: 2
+              ties_okay: true
+              quote_values: false
+            # Expect error if value is in column but not most common
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [1]
+              top_n: 1
+              quote_values: false
+              config:
+                error_if: "=0"
+                warn_if: "<>1"
+            # Expect error if value is in column but not most common and ties_okay is true
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [1]
+              top_n: 1
+              ties_okay: true
+              quote_values: false
+              config:
+                error_if: "=0"
+                warn_if: "<>1"
+             # Expect error if value not in column at any level
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: [123456789]
+              top_n: >
+                (select count(*) from {{ref('data_test')}})
+              quote_values: false
+              config:
+                error_if: "=0"
+                warn_if: "<>3"
           - dbt_expectations.expect_column_values_to_be_increasing:
               sort_column: col_numeric_a
               strictly: false
@@ -538,6 +595,65 @@ models:
           - dbt_expectations.expect_column_values_to_not_be_in_set:
               value_set: ['a','c']
               quote_values: true
+            # Expect error if not all most common values are in the set
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['b']
+              top_n: 1
+              config:
+                error_if: "=0"
+                warn_if: "<3"
+            # Expect success if not all most common values are in the set but ties_okay is set
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['b']
+              top_n: 1
+              ties_okay: true
+            # Expect error if none of the most common values are in the set and ties_okay is set
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['invalid_value']
+              top_n: 1
+              ties_okay: true
+              config:
+                error_if: "=0"
+                warn_if: "<4"
+            # Expect success if not all most common values are in the set but ties_okay is set
+            # and the set contains extra values 
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['b', 'invalid_value']
+              top_n: 1
+              ties_okay: true
+            # Expect success if not all most common values are in the set but ties_okay is set
+            # and value is not first one of the column naturally ordered
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['ab']
+              top_n: 1
+              ties_okay: true
+            # Expect success if all most common values are in the set
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['b', 'ab', 'abc', 'abcd']
+              top_n: 1
+            # Expect success if all most common values are in the set 
+            # and the set contains extra values 
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['b', 'ab', 'abc', 'abcd', 'invalid_value']
+              top_n: 1
+            # Expect error if none of the most common values are in the set 
+            # and the set contains extra values 
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['invalid_value1', 'invalid_value2', 'invalid_value3', 'invalid_value4', 'invalid_value5']
+              top_n: 1
+              config:
+                error_if: "=0"
+                warn_if: "<4"
+            # Expect error if none of the values at any frequency level are in the set,
+            # even though ties_okay is set
+          - dbt_expectations.expect_column_most_common_value_to_be_in_set:
+              value_set: ['invalid_value1', 'invalid_value2', 'invalid_value3', 'invalid_value4', 'invalid_value5']
+              top_n: >
+                (select count(*) from {{ref('data_test')}})
+              ties_okay: true
+              config:
+                error_if: "=0"
+                warn_if: "<4"
           - dbt_expectations.expect_column_value_lengths_to_be_between:
               min_value: 1
               max_value: 4

diff --git a/macros/schema_tests/aggregate_functions/expect_column_most_common_value_to_be_in_set.sql b/macros/schema_tests/aggregate_functions/expect_column_most_common_value_to_be_in_set.sql
@@ -3,12 +3,15 @@
                                                        value_set,
                                                        top_n,
                                                        quote_values=True,
-                                                       data_type="decimal",
-                                                       row_condition=None
+                                                       data_type=None,
+                                                       row_condition=None,
+                                                       ties_okay=False
                                                        ) -%}
-
+    {# On Snowflake, a bare 'decimal' default (instead of dbt.type_numeric())
+        has a scale of 0, so values are rounded when cast #}
+    {% set data_type = dbt.type_numeric() if not data_type else data_type %}
     {{ adapter.dispatch('test_expect_column_most_common_value_to_be_in_set', 'dbt_expectations') (
-            model, column_name, value_set, top_n, quote_values, data_type, row_condition
+            model, column_name, value_set, top_n, quote_values, data_type, row_condition, ties_okay
         ) }}
 
 {%- endtest %}
@@ -19,9 +22,10 @@
                                                                       top_n,
                                                                       quote_values,
                                                                       data_type,
-                                                                      row_condition
+                                                                      row_condition,
+                                                                      ties_okay
                                                                       ) %}
-
+{% set data_type = data_type %}
 with value_counts as (
 
     select
@@ -48,7 +52,7 @@ value_counts_ranked as (
 
     select
         *,
-        row_number() over(order by value_count desc) as value_count_rank
+        rank() over(order by value_count desc) as value_count_rank
     from
         value_counts
 
@@ -60,7 +64,7 @@ value_count_top_n as (
     from
         value_counts_ranked
     where
-        value_count_rank = {{ top_n }}
+        value_count_rank <= {{ top_n }}
 
 ),
 set_values as (
@@ -83,15 +87,44 @@ unique_set_values as (
         set_values
 
 ),
-validation_errors as (
-    -- values from the model that are not in the set
+most_common_values_not_in_set as (
     select
         value_field
     from
         value_count_top_n
     where
         value_field not in (select value_field from unique_set_values)
-
+),
+most_common_values_in_set as (
+    select 
+        value_field 
+    from 
+        value_count_top_n 
+    {{ dbt.except() }}
+    select 
+        value_field 
+    from 
+        most_common_values_not_in_set
+),
+validation_errors as (
+    {% if ties_okay -%}
+    select 
+        * 
+    from 
+        most_common_values_not_in_set
+    where
+        {# 
+            If the intersection between the most common values and the values in the set is not empty, 
+            succeed. Otherwise fail the test and select all the most common values from the column.
+        #}
+        (
+            select count(*) 
+            from most_common_values_in_set
+        ) = 0
+    {%- else -%}
+    select * 
+    from most_common_values_not_in_set
+    {%- endif -%}
 )
 
 select *