From 988642bb7c458123d8ea4565c929d4f09efec2a3 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 13 Jan 2026 17:38:03 +0000 Subject: [PATCH 01/34] First pass --- dbt/models/default/default.vw_pin_sale.sql | 43 +++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 14430d981..5f6e7a567 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -367,7 +367,48 @@ SELECT -- If there is no override, default to sv_is_outlier WHEN sales_val.sv_is_outlier IS NOT NULL THEN sales_val.sv_is_outlier - END AS is_outlier + END AS is_outlier, + -- Combined outlier reasons: model SV reasons + override “trigger” fieldnames +ARRAY_DISTINCT( + CONCAT( + -- Model reasons (sv_outlier_reason1-3) + FILTER( + ARRAY[ + sales_val.sv_outlier_reason1, + sales_val.sv_outlier_reason2, + sales_val.sv_outlier_reason3 + ], + r -> r IS NOT NULL AND TRIM(r) <> '' + ), + + -- Override triggers (only those that *would* make the sale an outlier) + FILTER( + ARRAY[ + CASE + WHEN COALESCE(flag_override.is_arms_length = FALSE, FALSE) + THEN 'is_arms_length' + END, + CASE + WHEN COALESCE(flag_override.is_flip = TRUE, FALSE) + THEN 'is_flip' + END, + CASE + WHEN COALESCE(flag_override.has_class_change = TRUE, FALSE) + THEN 'has_class_change' + END, + CASE + WHEN COALESCE(flag_override.has_characteristic_change = 'yes_major', FALSE) + THEN 'has_characteristic_change' + END, + CASE + WHEN COALESCE(flag_override.requires_field_check = TRUE, FALSE) + THEN 'requires_field_check' + END + ], + r -> r IS NOT NULL + ) + ) +) AS outlier_reason FROM unique_sales LEFT JOIN mydec_sales ON unique_sales.doc_no = mydec_sales.doc_no From 13f850e2d0c2caa779d2e9dac816e8ff0d324360 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Tue, 13 Jan 2026 22:17:11 +0000 Subject: [PATCH 02/34] Edit comments --- dbt/models/default/default.vw_pin_sale.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 5f6e7a567..313ee6383 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -368,10 +368,10 @@ SELECT WHEN sales_val.sv_is_outlier IS NOT NULL THEN sales_val.sv_is_outlier END AS is_outlier, - -- Combined outlier reasons: model SV reasons + override “trigger” fieldnames + -- Combined outlier reasons: model SV reasons + manual override reasons ARRAY_DISTINCT( CONCAT( - -- Model reasons (sv_outlier_reason1-3) + -- Sales val statistical model reasons (sv_outlier_reason1-3) FILTER( ARRAY[ sales_val.sv_outlier_reason1, @@ -381,7 +381,7 @@ ARRAY_DISTINCT( r -> r IS NOT NULL AND TRIM(r) <> '' ), - -- Override triggers (only those that *would* make the sale an outlier) + -- Manual override triggers FILTER( ARRAY[ CASE From d66b74e1858be1d4c1d467711b9ca493e6e93bf7 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 14 Jan 2026 20:41:57 +0000 Subject: [PATCH 03/34] Add prefix and remove snake case --- dbt/models/default/default.vw_pin_sale.sql | 79 +++++++++++----------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 313ee6383..788799467 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -369,46 +369,49 @@ SELECT THEN sales_val.sv_is_outlier END AS is_outlier, -- Combined outlier reasons: model SV reasons + manual override reasons -ARRAY_DISTINCT( - CONCAT( - -- Sales val statistical model reasons (sv_outlier_reason1-3) - FILTER( - ARRAY[ - sales_val.sv_outlier_reason1, - sales_val.sv_outlier_reason2, - sales_val.sv_outlier_reason3 - ], - r -> r IS NOT NULL AND TRIM(r) <> '' - ), + ARRAY_DISTINCT( + CONCAT( + -- Sales val statistical model reasons (sv_outlier_reason1-3) + FILTER( + ARRAY[ + CONCAT('SV pipeline: ', sales_val.sv_outlier_reason1), + CONCAT('SV pipeline: ', sales_val.sv_outlier_reason2), + CONCAT('SV pipeline: ', sales_val.sv_outlier_reason3) + ], + r -> r IS NOT NULL AND TRIM(r) <> 'SV pipeline:' + ), - -- Manual override triggers - FILTER( - ARRAY[ - CASE - WHEN COALESCE(flag_override.is_arms_length = FALSE, FALSE) - THEN 'is_arms_length' - END, - CASE - WHEN COALESCE(flag_override.is_flip = TRUE, FALSE) - THEN 'is_flip' - END, - CASE - WHEN COALESCE(flag_override.has_class_change = TRUE, FALSE) - THEN 'has_class_change' - END, - CASE - WHEN COALESCE(flag_override.has_characteristic_change = 'yes_major', FALSE) - THEN 'has_characteristic_change' - END, - CASE - WHEN COALESCE(flag_override.requires_field_check = TRUE, FALSE) - THEN 'requires_field_check' - END - ], - r -> r IS NOT NULL + -- Manual override triggers (human-readable) + FILTER( + ARRAY[ + IF( + COALESCE(flag_override.is_arms_length = FALSE, FALSE), + 'Analyst: Arms length' + ), + IF( + COALESCE(flag_override.is_flip = TRUE, FALSE), + 'Analyst: Flip' + ), + IF( + COALESCE(flag_override.has_class_change = TRUE, FALSE), + 'Analyst: Class change' + ), + IF( + COALESCE( + flag_override.has_characteristic_change = 'yes_major', + FALSE + ), + 'Analyst: Characteristic change' + ), + IF( + COALESCE(flag_override.requires_field_check = TRUE, FALSE), + 'Analyst: Requires field check' + ) + ], + r -> r IS NOT NULL + ) ) - ) -) AS outlier_reason + ) AS outlier_reason FROM unique_sales LEFT JOIN mydec_sales ON unique_sales.doc_no = mydec_sales.doc_no From 2b1362b764963c6272a3fd57d8c5d0da9dcb9013 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 14 Jan 2026 20:54:16 +0000 Subject: [PATCH 04/34] Make arms length negative --- dbt/models/default/default.vw_pin_sale.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 788799467..0a0490cd9 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -386,7 +386,7 @@ SELECT ARRAY[ IF( COALESCE(flag_override.is_arms_length = FALSE, FALSE), - 'Analyst: Arms length' + 'Analyst: Non-arms length' ), IF( COALESCE(flag_override.is_flip = TRUE, FALSE), From aae4e6df732c6111eec6247a43199a812d504fc2 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 14 Jan 2026 21:16:20 +0000 Subject: [PATCH 05/34] Place analyst reasons before sv reasons --- dbt/models/default/default.vw_pin_sale.sql | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 0a0490cd9..7183c5d8b 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -368,20 +368,10 @@ SELECT WHEN sales_val.sv_is_outlier IS NOT NULL THEN sales_val.sv_is_outlier END AS is_outlier, - -- Combined outlier reasons: model SV reasons + manual override reasons + -- Combined outlier reasons: manual override reasons + model SV reasons ARRAY_DISTINCT( CONCAT( - -- Sales val statistical model reasons (sv_outlier_reason1-3) - FILTER( - ARRAY[ - CONCAT('SV pipeline: ', sales_val.sv_outlier_reason1), - CONCAT('SV pipeline: ', sales_val.sv_outlier_reason2), - CONCAT('SV pipeline: ', sales_val.sv_outlier_reason3) - ], - r -> r IS NOT NULL AND TRIM(r) <> 'SV pipeline:' - ), - - -- Manual override triggers (human-readable) + -- Manual override triggers (human-readable) FIRST FILTER( ARRAY[ IF( @@ -409,6 +399,16 @@ SELECT ) ], r -> r IS NOT NULL + ), + + -- Sales val statistical model reasons (sv_outlier_reason1-3) SECOND + FILTER( + ARRAY[ + CONCAT('SV pipeline: ', sales_val.sv_outlier_reason1), + CONCAT('SV pipeline: ', sales_val.sv_outlier_reason2), + CONCAT('SV pipeline: ', sales_val.sv_outlier_reason3) + ], + r -> r IS NOT NULL AND TRIM(r) <> 'SV pipeline:' ) ) ) AS outlier_reason From db05fa877e70eacf99ac02c0c09871e8663042e1 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Wed, 14 Jan 2026 21:17:51 +0000 Subject: [PATCH 06/34] Comments and linting --- dbt/models/default/default.vw_pin_sale.sql | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 7183c5d8b..2dd169165 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -371,7 +371,7 @@ SELECT -- Combined outlier reasons: manual override reasons + model SV reasons ARRAY_DISTINCT( CONCAT( - -- Manual override triggers (human-readable) FIRST + -- Manual analyst override triggers FILTER( ARRAY[ IF( @@ -388,27 +388,30 @@ SELECT ), IF( COALESCE( - flag_override.has_characteristic_change = 'yes_major', + flag_override.has_characteristic_change + = 'yes_major', FALSE ), 'Analyst: Characteristic change' ), IF( - COALESCE(flag_override.requires_field_check = TRUE, FALSE), + COALESCE( + flag_override.requires_field_check = TRUE, FALSE + ), 'Analyst: Requires field check' ) ], r -> r IS NOT NULL ), - -- Sales val statistical model reasons (sv_outlier_reason1-3) SECOND + -- Sales val statistical model reasons (sv_outlier_reason1-3) FILTER( ARRAY[ CONCAT('SV pipeline: ', sales_val.sv_outlier_reason1), CONCAT('SV pipeline: ', sales_val.sv_outlier_reason2), CONCAT('SV pipeline: ', sales_val.sv_outlier_reason3) ], - r -> r IS NOT NULL AND TRIM(r) <> 'SV pipeline:' + r -> r IS NOT NULL AND TRIM(r) != 'SV pipeline:' ) ) ) AS outlier_reason From 9d16c8067e46248adcb1c07c51476011a3be5e07 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 15 Jan 2026 20:49:22 +0000 Subject: [PATCH 07/34] Add source_is_outlier --- dbt/models/default/default.vw_pin_sale.sql | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 2dd169165..01b457dc1 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -414,7 +414,22 @@ SELECT r -> r IS NOT NULL AND TRIM(r) != 'SV pipeline:' ) ) - ) AS outlier_reason + ) AS outlier_reason, + -- Logic that mirrots the is_outlier_field but lets us know explicity + -- if the is_outlier column determination is an analyst override or + -- an algorithmic fallback. + CASE + WHEN + flag_override.is_arms_length IS NOT NULL + OR flag_override.is_flip IS NOT NULL + OR flag_override.has_class_change IS NOT NULL + OR flag_override.has_characteristic_change IS NOT NULL + OR flag_override.requires_field_check IS NOT NULL + THEN 'analyst' + + WHEN sales_val.sv_is_outlier IS NOT NULL + THEN 'algorithm' + END AS source_is_outlier FROM unique_sales LEFT JOIN mydec_sales ON unique_sales.doc_no = mydec_sales.doc_no From 408d6068557efa3cf32ca8a882c505ff7caf1f90 Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 15 Jan 2026 21:13:13 +0000 Subject: [PATCH 08/34] Add docs --- dbt/models/default/schema/default.vw_pin_sale.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 9ce4e3ffd..1c7fa6208 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -25,11 +25,11 @@ models: description: Indicator for whether or not the observation uses the MyDec sale date - name: is_outlier description: | - The final determination indicating whether a sale is a statistical + The final determination indicating whether a sale is an outlier that should be excluded from model training. - Combines information from our sales validation model with information - from analysts who review our outliers for correctness. + Combines information from our statistical sales validation model with + information from analysts who review our outliers for correctness. - name: mydec_deed_type description: Deed type from MyDec, more granular than CCAO deed type - name: mydec_line_8_current_use @@ -72,6 +72,10 @@ models: description: '{{ doc("shared_column_nbhd_code") }}' - name: num_parcels_sale description: '{{ doc("shared_column_num_parcels_sale") }}' + - name: outlier_reason + description: This is an array colum that contains all of the distinct outlier reasons between analyst review and our + algorithmic sales validation model. A nuance is that there are some outlier reasons that don't necessarily + cause the `is_outlier` status to be true. - name: pin description: '{{ doc("shared_column_pin") }}' - name: requires_field_check @@ -92,6 +96,9 @@ models: description: '{{ doc("shared_column_sale_price") }}' - name: seller_name description: '{{ doc("shared_column_seller_name") }}' + - name: source_is_outlier + description: This field tells us if the determination in `is_outlier` was sourced from analyst review or from our algorithmic + sales validation pipeline. - name: sv_is_heuristic_outlier description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' - name: sv_is_outlier From eaf6287c50062ae7159610731b66ae07c5d7f87d Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Thu, 15 Jan 2026 21:29:20 +0000 Subject: [PATCH 09/34] Adjust comment --- dbt/models/default/default.vw_pin_sale.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 01b457dc1..ce5884392 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -415,7 +415,7 @@ SELECT ) ) ) AS outlier_reason, - -- Logic that mirrots the is_outlier_field but lets us know explicity + -- Logic similar to the is_outlier_field but lets us know explicity -- if the is_outlier column determination is an analyst override or -- an algorithmic fallback. CASE From 530e5c04943fda270639212a833ca049374c583b Mon Sep 17 00:00:00 2001 From: Michael Wagner Date: Fri, 16 Jan 2026 15:48:19 +0000 Subject: [PATCH 10/34] Tweak comment --- dbt/models/default/default.vw_pin_sale.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index ce5884392..e84d10dd7 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -415,9 +415,9 @@ SELECT ) ) ) AS outlier_reason, - -- Logic similar to the is_outlier_field but lets us know explicity - -- if the is_outlier column determination is an analyst override or - -- an algorithmic fallback. + -- Logic similar to the is_outlier field but lets us know explicity + -- if the is_outlier column determination is sourced from an analyst + -- override or an algorithmic fallback. CASE WHEN flag_override.is_arms_length IS NOT NULL From 634097cf5119cb6b65dae96ce6ecfd5cfff86e57 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 12:56:44 -0600 Subject: [PATCH 11/34] Refactor `default.vw_pin_sale.is_outlier` and `outlier_reason` logic --- dbt/models/default/default.vw_pin_sale.sql | 270 +++++++++++---------- dbt/models/sale/sale.vw_outlier.sql | 39 +++ 2 files changed, 176 insertions(+), 133 deletions(-) create mode 100644 dbt/models/sale/sale.vw_outlier.sql diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index e84d10dd7..42af67eb5 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -222,31 +222,114 @@ mydec_sales AS ( OR (year_of_sale > '2020') ), -max_version_flag AS ( +-- Pull outlier sale determinations from our sales validation pipeline, and +-- combine those fields to derive a final determination +outlier AS ( SELECT - meta_sale_document_num, - MAX(version) AS max_version - FROM {{ source('sale', 'flag') }} - GROUP BY meta_sale_document_num -), - -sales_val AS ( - SELECT - sf.meta_sale_document_num, - sf.sv_is_outlier, - sf.sv_is_ptax_outlier, - sf.sv_is_heuristic_outlier, - sf.sv_outlier_reason1, - sf.sv_outlier_reason2, - sf.sv_outlier_reason3, - sf.run_id AS sv_run_id, - sf.version AS sv_version - FROM - {{ source('sale', 'flag') }} - AS sf - INNER JOIN max_version_flag AS mv - ON sf.meta_sale_document_num = mv.meta_sale_document_num - AND sf.version = mv.max_version + *, + CASE + -- Our sales validation pipeline includes algorithmic sale flagging + -- alongside human review. If a human reviewed the sale, we want to + -- consider those results first, because we weight those + -- determinations more strongly than the algorithmic flags + WHEN has_review + THEN + CASE + -- If a reviewer found a major characteristic change, it + -- should always indicate an outlier, regardless of what + -- the algorithm found. This is because incorrect major + -- characteristics can bias our valuation models + WHEN review_has_major_characteristic_change + THEN + 'Review: Major Characteristic Change' + -- Flips and non-arm's-length sales only indicate outliers + -- if the algorithm found the sale price to be unusual. + -- This is because flips and non-arm's-length sales are + -- only problematic if the sale is unrepresentative of the + -- market; if the flip brings the property up to standard + -- for the market, or if a non-arm's-length sale is close + -- to market price, then the information from that sale is + -- still useful for our valuation models + WHEN has_flag AND flag_is_outlier + THEN + CASE + WHEN review_is_flip + THEN + 'Review: Flip' + WHEN NOT review_is_arms_length + THEN + 'Review: Non-Arms-Length' + ELSE + -- If we reach this branch, then the reviewer + -- did not find anything unusual about the + -- sale, which means the reviewer and the + -- algorithm disagree about whether the sale + -- is an outlier. In these cases, we choose to + -- trust the reviewer's determination + 'Review: Valid Sale' + END + ELSE + -- If we reach this branch, we can be confident that + -- the sale does not have major characteristic errors, + -- and that the sales algorithm did not flag the sale + -- as having an unusual sale price. In this case, it + -- doesn't matter if the reviewer found the sale to be + -- a flip or a non-arm's-length transaction, since the + -- sale is in a sense "typical" enough to contain + -- relevant information for our valuation models. + -- + -- Note that a sale can also reach this branch if its + -- algorithmic flag group did not have enough sales in + -- it to reach statistical significance, or if the + -- algorithm has not evaluated this sale yet. Though + -- these two conditions are semantically different from + -- the case where the algorithm evaluated the sale and + -- found it to have a typical sale price (i.e. it is + -- not an algorithmic outlier), we lump all of these + -- conditions together for the purpose of this + -- conditional branch, since we want to default to + -- including a sale if we don't have enough information + -- to decide if its sale price was atypical + 'Review: Valid Sale' + END + -- If a reviewer has not looked at the sale, but the algorithm + -- has evaluated it, then we will use the algorithm's decision + -- as the basis for our outlier determination. If neither a + -- reviewer nor the algorithm has evaluated the sale, then the + -- outer CASE statement will fall through and return null + WHEN + has_flag + THEN + CASE + WHEN flag_is_outlier + THEN + CASE + -- The validation algorithm produces its own + -- set of reasons, so if these exist, we want + -- to use them for the final outlier reason. + -- + -- It shouldn't be possible for the algorithm + -- to flag the sale as an outlier without + -- providing reasons for that decision, but + -- if this ever happens against our + -- expectation, we want to avoid a trailing + -- comma in the output + WHEN CARDINALITY(flag_outlier_reasons) > 0 + THEN + CONCAT( + 'Algorithm: Outlier Sale, ', + ARRAY_JOIN( + flag_outlier_reasons, ', ' + ) + ) + ELSE + 'Algorithm: Outlier Sale' + END + ELSE + 'Algorithm: Valid Sale' + END + END AS outlier_reason + FROM {{ ref('sale.vw_outlier') }} ) SELECT @@ -290,11 +373,6 @@ SELECT unique_sales.sale_filter_same_sale_within_365, unique_sales.sale_filter_less_than_10k, unique_sales.sale_filter_deed_type, - -- Our sales validation pipeline only validates sales past 2014 due to MyDec - -- limitations. Previous to that values for sv_is_outlier will be NULL, so - -- if we want to both exclude detected outliers and include sales prior to - -- 2014, we need to code everything NULL as FALSE. - COALESCE(sales_val.sv_is_outlier, FALSE) AS sale_filter_is_outlier, mydec_sales.mydec_deed_type, mydec_sales.sale_filter_ptax_flag, mydec_sales.mydec_property_advertised, @@ -328,112 +406,38 @@ SELECT mydec_sales.mydec_homestead_exemption_general_alternative, mydec_sales.mydec_homestead_exemption_senior_citizens, mydec_sales.mydec_homestead_exemption_senior_citizens_assessment_freeze, - sales_val.sv_is_outlier, - sales_val.sv_is_ptax_outlier, - sales_val.sv_is_heuristic_outlier, - sales_val.sv_outlier_reason1, - sales_val.sv_outlier_reason2, - sales_val.sv_outlier_reason3, - sales_val.sv_run_id, - sales_val.sv_version, - flag_override.is_arms_length, - flag_override.is_flip, - flag_override.has_class_change, - flag_override.has_characteristic_change, - flag_override.requires_field_check, + outlier.has_flag, + outlier.flag_is_outlier, + outlier.flag_is_ptax_outlier, + outlier.flag_is_heuristic_outlier, + outlier.flag_outlier_reasons, + outlier.flag_run_id, + outlier.flag_version, + outlier.has_review, + outlier.review_is_arms_length, + outlier.review_is_flip, + outlier.review_has_class_change, + outlier.review_has_characteristic_change, + outlier.outlier_reason, CASE - -- If there is an override, use override logic - -- If neither override nor sv_is_outlier is populated, leave null - WHEN - flag_override.is_arms_length IS NOT NULL - OR flag_override.is_flip IS NOT NULL - OR flag_override.has_class_change IS NOT NULL - OR flag_override.has_characteristic_change IS NOT NULL - OR flag_override.requires_field_check IS NOT NULL - THEN ( - -- COALESCE is required here because the boolean logic is - -- three-valued (TRUE / FALSE / NULL). When overrides exist - -- but some override columns are NULL, expressions like FALSE - -- OR NULL evaluate to NULL, which would incorrectly return - -- is_outlier = NULL instead of FALSE. - COALESCE(flag_override.is_arms_length = FALSE, FALSE) - OR COALESCE(flag_override.is_flip = TRUE, FALSE) - OR COALESCE(flag_override.has_class_change = TRUE, FALSE) - OR COALESCE( - flag_override.has_characteristic_change = 'yes_major', FALSE - ) - OR COALESCE(flag_override.requires_field_check = TRUE, FALSE) + WHEN outlier.outlier_reason IN ( + 'Review: Major Characteristic Change', + 'Review: Non-Arms-Length', + 'Review: Flip' + ) OR outlier.outlier_reason LIKE 'Algorithm: Outlier Sale%' + THEN TRUE + WHEN outlier.outlier_reason IN ( + 'Review: Valid Sale', + 'Algorithm: Valid Sale' ) - -- If there is no override, default to sv_is_outlier - WHEN sales_val.sv_is_outlier IS NOT NULL - THEN sales_val.sv_is_outlier - END AS is_outlier, - -- Combined outlier reasons: manual override reasons + model SV reasons - ARRAY_DISTINCT( - CONCAT( - -- Manual analyst override triggers - FILTER( - ARRAY[ - IF( - COALESCE(flag_override.is_arms_length = FALSE, FALSE), - 'Analyst: Non-arms length' - ), - IF( - COALESCE(flag_override.is_flip = TRUE, FALSE), - 'Analyst: Flip' - ), - IF( - COALESCE(flag_override.has_class_change = TRUE, FALSE), - 'Analyst: Class change' - ), - IF( - COALESCE( - flag_override.has_characteristic_change - = 'yes_major', - FALSE - ), - 'Analyst: Characteristic change' - ), - IF( - COALESCE( - flag_override.requires_field_check = TRUE, FALSE - ), - 'Analyst: Requires field check' - ) - ], - r -> r IS NOT NULL - ), - - -- Sales val statistical model reasons (sv_outlier_reason1-3) - FILTER( - ARRAY[ - CONCAT('SV pipeline: ', sales_val.sv_outlier_reason1), - CONCAT('SV pipeline: ', sales_val.sv_outlier_reason2), - CONCAT('SV pipeline: ', sales_val.sv_outlier_reason3) - ], - r -> r IS NOT NULL AND TRIM(r) != 'SV pipeline:' - ) - ) - ) AS outlier_reason, - -- Logic similar to the is_outlier field but lets us know explicity - -- if the is_outlier column determination is sourced from an analyst - -- override or an algorithmic fallback. - CASE - WHEN - flag_override.is_arms_length IS NOT NULL - OR flag_override.is_flip IS NOT NULL - OR flag_override.has_class_change IS NOT NULL - OR flag_override.has_characteristic_change IS NOT NULL - OR flag_override.requires_field_check IS NOT NULL - THEN 'analyst' - - WHEN sales_val.sv_is_outlier IS NOT NULL - THEN 'algorithm' - END AS source_is_outlier + THEN FALSE + -- Default to including the sale in our valuation models if neither a + -- human reviewer nor our algorithm has evaluated the sale + WHEN outlier.outlier_reason IS NULL + THEN FALSE + END AS sale_filter_is_outlier FROM unique_sales LEFT JOIN mydec_sales ON unique_sales.doc_no = mydec_sales.doc_no -LEFT JOIN sales_val - ON unique_sales.doc_no = sales_val.meta_sale_document_num -LEFT JOIN {{ source('sale', 'flag_override') }} AS flag_override - ON unique_sales.doc_no = flag_override.doc_no +LEFT JOIN outlier + ON unique_sales.doc_no = outlier.doc_no diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql new file mode 100644 index 000000000..5ba3d98cb --- /dev/null +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -0,0 +1,39 @@ +-- View that combines `sale.flag` and `sale.flag_override` to produce one +-- unified view of all sales validation information for a sale based on its +-- doc number +SELECT + COALESCE(flag.doc_no, review.doc_no) AS doc_no, + flag.sv_is_outlier IS NOT NULL AS has_flag, + COALESCE(flag.sv_is_outlier, FALSE) AS flag_is_outlier, + COALESCE(flag.sv_is_ptax_outlier, FALSE) AS flag_is_ptax_outlier, + COALESCE(flag.sv_is_heuristic_outlier, FALSE) AS flag_is_heuristic_outlier, + flag.sv_run_id AS flag_run_id, + flag.sv_version AS flag_version, + FILTER( + ARRAY[ + flag.sv_outlier_reason1, + flag.sv_outlier_reason2, + flag.sv_outlier_reason3 + ], + r -> r IS NOT NULL + ) AS flag_outlier_reasons, + NOT COALESCE( + review.has_class_change IS NULL + AND review.has_characteristic_change IS NULL + AND review.is_arms_length IS NULL + AND review.is_flip IS NULL, + FALSE + ) AS has_review, + COALESCE(review.is_arms_length, FALSE) AS review_is_arms_length, + COALESCE(review.is_flip, FALSE) AS review_is_flip, + COALESCE(review.has_class_change, FALSE) AS review_has_class_change, + COALESCE(review.has_characteristic_change, FALSE) + AS review_has_characteristic_change, + COALESCE( + review.has_class_change + OR review.has_characteristic_change = 'yes_major', + FALSE + ) AS review_has_major_characteristic_change +FROM {{ source('sale', 'flag') }} AS flag +FULL OUTER JOIN {{ source('sale', 'flag_override') }} AS review + ON flag.doc_no = review.doc_no From 36e49d1575582f24971f47c64952d38acdae56c2 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 13:17:46 -0600 Subject: [PATCH 12/34] Move `is_outlier` and `outlier_reason` logic from `default.vw_pin_sale` to `sale.vw_outlier` --- dbt/models/default/default.vw_pin_sale.sql | 132 +------------ dbt/models/sale/sale.vw_outlier.sql | 218 +++++++++++++++++---- 2 files changed, 186 insertions(+), 164 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 42af67eb5..a767e3d03 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -220,116 +220,6 @@ mydec_sales AS ( joined back onto unique_sales will create duplicates by pin/sale date. */ WHERE num_single_day_sales = 1 OR (year_of_sale > '2020') -), - --- Pull outlier sale determinations from our sales validation pipeline, and --- combine those fields to derive a final determination -outlier AS ( - SELECT - *, - CASE - -- Our sales validation pipeline includes algorithmic sale flagging - -- alongside human review. If a human reviewed the sale, we want to - -- consider those results first, because we weight those - -- determinations more strongly than the algorithmic flags - WHEN has_review - THEN - CASE - -- If a reviewer found a major characteristic change, it - -- should always indicate an outlier, regardless of what - -- the algorithm found. This is because incorrect major - -- characteristics can bias our valuation models - WHEN review_has_major_characteristic_change - THEN - 'Review: Major Characteristic Change' - -- Flips and non-arm's-length sales only indicate outliers - -- if the algorithm found the sale price to be unusual. - -- This is because flips and non-arm's-length sales are - -- only problematic if the sale is unrepresentative of the - -- market; if the flip brings the property up to standard - -- for the market, or if a non-arm's-length sale is close - -- to market price, then the information from that sale is - -- still useful for our valuation models - WHEN has_flag AND flag_is_outlier - THEN - CASE - WHEN review_is_flip - THEN - 'Review: Flip' - WHEN NOT review_is_arms_length - THEN - 'Review: Non-Arms-Length' - ELSE - -- If we reach this branch, then the reviewer - -- did not find anything unusual about the - -- sale, which means the reviewer and the - -- algorithm disagree about whether the sale - -- is an outlier. In these cases, we choose to - -- trust the reviewer's determination - 'Review: Valid Sale' - END - ELSE - -- If we reach this branch, we can be confident that - -- the sale does not have major characteristic errors, - -- and that the sales algorithm did not flag the sale - -- as having an unusual sale price. In this case, it - -- doesn't matter if the reviewer found the sale to be - -- a flip or a non-arm's-length transaction, since the - -- sale is in a sense "typical" enough to contain - -- relevant information for our valuation models. - -- - -- Note that a sale can also reach this branch if its - -- algorithmic flag group did not have enough sales in - -- it to reach statistical significance, or if the - -- algorithm has not evaluated this sale yet. Though - -- these two conditions are semantically different from - -- the case where the algorithm evaluated the sale and - -- found it to have a typical sale price (i.e. it is - -- not an algorithmic outlier), we lump all of these - -- conditions together for the purpose of this - -- conditional branch, since we want to default to - -- including a sale if we don't have enough information - -- to decide if its sale price was atypical - 'Review: Valid Sale' - END - -- If a reviewer has not looked at the sale, but the algorithm - -- has evaluated it, then we will use the algorithm's decision - -- as the basis for our outlier determination. If neither a - -- reviewer nor the algorithm has evaluated the sale, then the - -- outer CASE statement will fall through and return null - WHEN - has_flag - THEN - CASE - WHEN flag_is_outlier - THEN - CASE - -- The validation algorithm produces its own - -- set of reasons, so if these exist, we want - -- to use them for the final outlier reason. - -- - -- It shouldn't be possible for the algorithm - -- to flag the sale as an outlier without - -- providing reasons for that decision, but - -- if this ever happens against our - -- expectation, we want to avoid a trailing - -- comma in the output - WHEN CARDINALITY(flag_outlier_reasons) > 0 - THEN - CONCAT( - 'Algorithm: Outlier Sale, ', - ARRAY_JOIN( - flag_outlier_reasons, ', ' - ) - ) - ELSE - 'Algorithm: Outlier Sale' - END - ELSE - 'Algorithm: Valid Sale' - END - END AS outlier_reason - FROM {{ ref('sale.vw_outlier') }} ) SELECT @@ -373,6 +263,7 @@ SELECT unique_sales.sale_filter_same_sale_within_365, unique_sales.sale_filter_less_than_10k, unique_sales.sale_filter_deed_type, + outlier.is_outlier AS sale_filter_is_outlier, mydec_sales.mydec_deed_type, mydec_sales.sale_filter_ptax_flag, mydec_sales.mydec_property_advertised, @@ -418,26 +309,9 @@ SELECT outlier.review_is_flip, outlier.review_has_class_change, outlier.review_has_characteristic_change, - outlier.outlier_reason, - CASE - WHEN outlier.outlier_reason IN ( - 'Review: Major Characteristic Change', - 'Review: Non-Arms-Length', - 'Review: Flip' - ) OR outlier.outlier_reason LIKE 'Algorithm: Outlier Sale%' - THEN TRUE - WHEN outlier.outlier_reason IN ( - 'Review: Valid Sale', - 'Algorithm: Valid Sale' - ) - THEN FALSE - -- Default to including the sale in our valuation models if neither a - -- human reviewer nor our algorithm has evaluated the sale - WHEN outlier.outlier_reason IS NULL - THEN FALSE - END AS sale_filter_is_outlier + outlier.outlier_reason FROM unique_sales LEFT JOIN mydec_sales ON unique_sales.doc_no = mydec_sales.doc_no -LEFT JOIN outlier +LEFT JOIN {{ ref('sale.vw_outlier') }} AS outlier ON unique_sales.doc_no = outlier.doc_no diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql index 5ba3d98cb..27dfc02ae 100644 --- a/dbt/models/sale/sale.vw_outlier.sql +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -1,39 +1,187 @@ -- View that combines `sale.flag` and `sale.flag_override` to produce one -- unified view of all sales validation information for a sale based on its -- doc number + +-- Start by combining the algorithmic flags with the human reviewer +-- determinations. To make boolean logic simpler (avoiding conditions +-- unexpectedly evaluating to NULL), we coalesce all of the determination fields +-- to FALSE in cases where the sale has not been reviewed or flagged, and +-- instead provide the boolean columns `has_review` and `has_flag` so that +-- consumers can disambiguate FALSE determinations from missing review/flags +WITH flag_and_review AS ( + SELECT + COALESCE(flag.doc_no, review.doc_no) AS doc_no, + flag.sv_is_outlier IS NOT NULL AS has_flag, + COALESCE(flag.sv_is_outlier, FALSE) AS flag_is_outlier, + COALESCE(flag.sv_is_ptax_outlier, FALSE) AS flag_is_ptax_outlier, + COALESCE(flag.sv_is_heuristic_outlier, FALSE) + AS flag_is_heuristic_outlier, + flag.sv_run_id AS flag_run_id, + flag.sv_version AS flag_version, + FILTER( + ARRAY[ + flag.sv_outlier_reason1, + flag.sv_outlier_reason2, + flag.sv_outlier_reason3 + ], + r -> r IS NOT NULL + ) AS flag_outlier_reasons, + NOT COALESCE( + review.has_class_change IS NULL + AND review.has_characteristic_change IS NULL + AND review.is_arms_length IS NULL + AND review.is_flip IS NULL, + FALSE + ) AS has_review, + COALESCE(review.is_arms_length, FALSE) AS review_is_arms_length, + COALESCE(review.is_flip, FALSE) AS review_is_flip, + COALESCE(review.has_class_change, FALSE) AS review_has_class_change, + COALESCE(review.has_characteristic_change, FALSE) + AS review_has_characteristic_change, + COALESCE( + review.has_class_change + OR review.has_characteristic_change = 'yes_major', + FALSE + ) AS review_has_major_characteristic_change + FROM {{ source('sale', 'flag') }} AS flag + FULL OUTER JOIN {{ source('sale', 'flag_override') }} AS review + ON flag.doc_no = review.doc_no +), + +-- Compare algorithmic flags to human review in order to make a final +-- determination about whether the sale is an outlier. Since we need to explain +-- our outlier determinations to data consumers, we start by producing verbose +-- reasons for our outlier determinations, and then when we consume this CTE +-- we will cast these reasons to a boolean flag that is easier to filter +outlier_reason AS ( + SELECT + *, + CASE + -- Our sales validation pipeline includes algorithmic sale flagging + -- alongside human review. If a human reviewed the sale, we want to + -- consider those results first, because we weight those + -- determinations more strongly than the algorithmic flags + WHEN has_review + THEN + CASE + -- If a reviewer found a major characteristic change, it + -- should always indicate an outlier, regardless of what + -- the algorithm found. This is because incorrect major + -- characteristics can bias our valuation models + WHEN review_has_major_characteristic_change + THEN + 'Review: Major Characteristic Change' + -- Flips and non-arm's-length sales only indicate outliers + -- if the algorithm found the sale price to be unusual. + -- This is because flips and non-arm's-length sales are + -- only problematic if the sale is unrepresentative of the + -- market; if the flip brings the property up to standard + -- for the market, or if a non-arm's-length sale is close + -- to market price, then the information from that sale is + -- still useful for our valuation models + WHEN has_flag AND flag_is_outlier + THEN + CASE + WHEN review_is_flip + THEN + 'Review: Flip' + WHEN NOT review_is_arms_length + THEN + 'Review: Non-Arms-Length' + ELSE + -- If we reach this branch, then the reviewer + -- did not find anything unusual about the + -- sale, which means the reviewer and the + -- algorithm disagree about whether the sale + -- is an outlier. In these cases, we choose to + -- trust the reviewer's determination + 'Review: Valid Sale' + END + ELSE + -- If we reach this branch, we can be confident that + -- the sale does not have major characteristic errors, + -- and that the sales algorithm did not flag the sale + -- as having an unusual sale price. In this case, it + -- doesn't matter if the reviewer found the sale to be + -- a flip or a non-arm's-length transaction, since the + -- sale is in a sense "typical" enough to contain + -- relevant information for our valuation models. + -- + -- Note that a sale can also reach this branch if its + -- algorithmic flag group did not have enough sales in + -- it to reach statistical significance, or if the + -- algorithm has not evaluated this sale yet. Though + -- these two conditions are semantically different from + -- the case where the algorithm evaluated the sale and + -- found it to have a typical sale price (i.e. it is + -- not an algorithmic outlier), we lump all of these + -- conditions together for the purpose of this + -- conditional branch, since we want to default to + -- including a sale if we don't have enough information + -- to decide if its sale price was atypical + 'Review: Valid Sale' + END + -- If a reviewer has not looked at the sale, but the algorithm + -- has evaluated it, then we will use the algorithm's decision + -- as the basis for our outlier determination. If neither a + -- reviewer nor the algorithm has evaluated the sale, then the + -- outer CASE statement will fall through and return null + WHEN + has_flag + THEN + CASE + WHEN flag_is_outlier + THEN + CASE + -- The validation algorithm produces its own + -- set of reasons, so if these exist, we want + -- to use them for the final outlier reason. + -- + -- It shouldn't be possible for the algorithm + -- to flag the sale as an outlier without + -- providing reasons for that decision, but + -- if this ever happens against our + -- expectation, we want to avoid a trailing + -- comma in the output + WHEN CARDINALITY(flag_outlier_reasons) > 0 + THEN + CONCAT( + 'Algorithm: Outlier Sale, ', + ARRAY_JOIN( + flag_outlier_reasons, ', ' + ) + ) + ELSE + 'Algorithm: Outlier Sale' + END + ELSE + 'Algorithm: Valid Sale' + END + END AS outlier_reason + FROM flag_and_review +) + SELECT - COALESCE(flag.doc_no, review.doc_no) AS doc_no, - flag.sv_is_outlier IS NOT NULL AS has_flag, - COALESCE(flag.sv_is_outlier, FALSE) AS flag_is_outlier, - COALESCE(flag.sv_is_ptax_outlier, FALSE) AS flag_is_ptax_outlier, - COALESCE(flag.sv_is_heuristic_outlier, FALSE) AS flag_is_heuristic_outlier, - flag.sv_run_id AS flag_run_id, - flag.sv_version AS flag_version, - FILTER( - ARRAY[ - flag.sv_outlier_reason1, - flag.sv_outlier_reason2, - flag.sv_outlier_reason3 - ], - r -> r IS NOT NULL - ) AS flag_outlier_reasons, - NOT COALESCE( - review.has_class_change IS NULL - AND review.has_characteristic_change IS NULL - AND review.is_arms_length IS NULL - AND review.is_flip IS NULL, - FALSE - ) AS has_review, - COALESCE(review.is_arms_length, FALSE) AS review_is_arms_length, - COALESCE(review.is_flip, FALSE) AS review_is_flip, - COALESCE(review.has_class_change, FALSE) AS review_has_class_change, - COALESCE(review.has_characteristic_change, FALSE) - AS review_has_characteristic_change, - COALESCE( - review.has_class_change - OR review.has_characteristic_change = 'yes_major', - FALSE - ) AS review_has_major_characteristic_change -FROM {{ source('sale', 'flag') }} AS flag -FULL OUTER JOIN {{ source('sale', 'flag_override') }} AS review - ON flag.doc_no = review.doc_no + *, + -- Cast the verbose outlier reasons to a boolean flag for easier filtering + CASE + WHEN outlier_reason IN ( + 'Review: Major Characteristic Change', + 'Review: Non-Arms-Length', + 'Review: Flip' + ) OR outlier_reason LIKE 'Algorithm: Outlier Sale%' + THEN TRUE + WHEN outlier_reason IN ( + 'Review: Valid Sale', + 'Algorithm: Valid Sale' + ) + THEN FALSE + -- Default to considering the sale to be valid if neither a human + -- reviewer nor our algorithm has evaluated the sale. This should + -- only apply to sales that are so old they are not relevant for + -- modeling, or sales that are later than our modeling lien date + -- (in which case we do not need to flag or review them yet) + WHEN outlier_reason IS NULL + THEN FALSE + END AS is_outlier +FROM outlier_reason From 7d4ade578ab978cc7092c27dabf0bd35454f1171 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 14:53:20 -0600 Subject: [PATCH 13/34] Define `sale.vw_flag` view to record most recent flag for each reviewed sale --- dbt/models/sale/sale.vw_flag.sql | 23 +++++++++++++++++++++++ dbt/models/sale/sale.vw_outlier.sql | 8 ++++---- 2 files changed, 27 insertions(+), 4 deletions(-) create mode 100644 dbt/models/sale/sale.vw_flag.sql diff --git a/dbt/models/sale/sale.vw_flag.sql b/dbt/models/sale/sale.vw_flag.sql new file mode 100644 index 000000000..53536aa85 --- /dev/null +++ b/dbt/models/sale/sale.vw_flag.sql @@ -0,0 +1,23 @@ +-- View that derives the most recent version of flags for each sale in the +-- `sale.flag` table, which uses its `version` column as a type 2 +-- slowly changing dimension +SELECT + sf.meta_sale_document_num AS doc_no, + sf.sv_is_outlier, + sf.sv_is_ptax_outlier, + sf.sv_is_heuristic_outlier, + sf.sv_outlier_reason1, + sf.sv_outlier_reason2, + sf.sv_outlier_reason3, + sf.run_id, + sf.version +FROM {{ source('sale', 'flag') }} AS sf +INNER JOIN ( + SELECT + meta_sale_document_num, + MAX(version) AS max_version + FROM {{ source('sale', 'flag') }} + GROUP BY meta_sale_document_num +) AS mv + ON sf.meta_sale_document_num = mv.meta_sale_document_num + AND sf.version = mv.max_version diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql index 27dfc02ae..eca724fe6 100644 --- a/dbt/models/sale/sale.vw_outlier.sql +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -16,8 +16,8 @@ WITH flag_and_review AS ( COALESCE(flag.sv_is_ptax_outlier, FALSE) AS flag_is_ptax_outlier, COALESCE(flag.sv_is_heuristic_outlier, FALSE) AS flag_is_heuristic_outlier, - flag.sv_run_id AS flag_run_id, - flag.sv_version AS flag_version, + flag.run_id AS flag_run_id, + flag.version AS flag_version, FILTER( ARRAY[ flag.sv_outlier_reason1, @@ -36,14 +36,14 @@ WITH flag_and_review AS ( COALESCE(review.is_arms_length, FALSE) AS review_is_arms_length, COALESCE(review.is_flip, FALSE) AS review_is_flip, COALESCE(review.has_class_change, FALSE) AS review_has_class_change, - COALESCE(review.has_characteristic_change, FALSE) + COALESCE(review.has_characteristic_change, 'no') AS review_has_characteristic_change, COALESCE( review.has_class_change OR review.has_characteristic_change = 'yes_major', FALSE ) AS review_has_major_characteristic_change - FROM {{ source('sale', 'flag') }} AS flag + FROM {{ ref('sale.vw_flag') }} AS flag FULL OUTER JOIN {{ source('sale', 'flag_override') }} AS review ON flag.doc_no = review.doc_no ), From 7b456841ec0713b2ab9f84e900fb0cd58735623a Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 14:55:13 -0600 Subject: [PATCH 14/34] Temporarily disable main branch restriction for `build_and_test_dbt` PR workflow --- .github/workflows/build_and_test_dbt.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test_dbt.yaml b/.github/workflows/build_and_test_dbt.yaml index 9840f3f52..31f6bca1b 100644 --- a/.github/workflows/build_and_test_dbt.yaml +++ b/.github/workflows/build_and_test_dbt.yaml @@ -2,7 +2,6 @@ name: build-and-test-dbt on: pull_request: - branches: [master] push: branches: [master] From 49578122c91480c5930d355032bd2174e88faace Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 15:41:38 -0600 Subject: [PATCH 15/34] Rename `sale.flag_override` table to `sale.flag_review` --- dbt/models/sale/docs.md | 4 ++-- dbt/models/sale/sale.vw_outlier.sql | 4 ++-- dbt/models/sale/schema.yml | 6 +++--- .../sale/{sale-flag_override.R => sale-flag_review.R} | 2 +- .../sale/{sale-flag_override.R => sale-flag_review.R} | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) rename etl/scripts-ccao-data-raw-us-east-1/sale/{sale-flag_override.R => sale-flag_review.R} (90%) rename etl/scripts-ccao-data-warehouse-us-east-1/sale/{sale-flag_override.R => sale-flag_review.R} (98%) diff --git a/dbt/models/sale/docs.md b/dbt/models/sale/docs.md index 012e936ad..d734b8352 100644 --- a/dbt/models/sale/docs.md +++ b/dbt/models/sale/docs.md @@ -11,9 +11,9 @@ should be possible to reconstruct using the other sales validation tables: **Primary Key**: `meta_sale_document_number`, `run_id`, `version` {% enddocs %} -# flag_override +# flag_review -{% docs flag_override %} +{% docs flag_review %} Data built by manual review by analysts that determine whether or not we should include sales in the model. diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql index eca724fe6..4f453c26d 100644 --- a/dbt/models/sale/sale.vw_outlier.sql +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -1,4 +1,4 @@ --- View that combines `sale.flag` and `sale.flag_override` to produce one +-- View that combines `sale.flag` and `sale.flag_review` to produce one -- unified view of all sales validation information for a sale based on its -- doc number @@ -44,7 +44,7 @@ WITH flag_and_review AS ( FALSE ) AS review_has_major_characteristic_change FROM {{ ref('sale.vw_flag') }} AS flag - FULL OUTER JOIN {{ source('sale', 'flag_override') }} AS review + FULL OUTER JOIN {{ source('sale', 'flag_review') }} AS review ON flag.doc_no = review.doc_no ), diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index 5f9f06544..447bf5888 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -50,8 +50,8 @@ sources: meta: description: flag is unique by meta_sale_document_num and version - - name: flag_override - description: '{{ doc("flag_override") }}' + - name: flag_review + description: '{{ doc("flag_review") }}' columns: - name: doc_no description: '{{ doc("shared_column_document_number") }}' @@ -77,7 +77,7 @@ sources: a planned change to the home sometime soon in the future. data_tests: - unique_combination_of_columns: - name: sale_flag_override_unique_by_doc_no + name: sale_flag_review_unique_by_doc_no combination_of_columns: - doc_no diff --git a/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_override.R b/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_review.R similarity index 90% rename from etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_override.R rename to etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_review.R index a7f90082b..36591327f 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_override.R +++ b/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_review.R @@ -8,7 +8,7 @@ library(writexl) src_dir <- "O:/CCAODATA/data/sale" # Output dir -s3_dir <- "s3://ccao-data-raw-us-east-1/sale/flag_override/" +s3_dir <- "s3://ccao-data-raw-us-east-1/sale/flag_review/" # List Excel files excel_files <- list.files( diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_override.R b/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_review.R similarity index 98% rename from etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_override.R rename to etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_review.R index 8595bf932..1b4b344b8 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_override.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_review.R @@ -17,7 +17,7 @@ source("utils.R") objs <- get_bucket( bucket = "ccao-data-raw-us-east-1", - prefix = "sale/flag_override/" + prefix = "sale/flag_review/" ) tmp_dir <- tempdir() @@ -171,7 +171,7 @@ dfs_ready_to_write <- purrr::imap( ) # Output dir -out_dir <- "s3://ccao-data-warehouse-us-east-1/sale/flag_override/" +out_dir <- "s3://ccao-data-warehouse-us-east-1/sale/flag_review/" purrr::iwalk( dfs_ready_to_write, From 2a7f67ec1b5dba7967f076b0e1b4bfb0e47fd6af Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 15:43:08 -0600 Subject: [PATCH 16/34] WIP document new sale views --- dbt/models/sale/docs.md | 24 ++++++++++++++++ dbt/models/sale/schema.yml | 57 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/dbt/models/sale/docs.md b/dbt/models/sale/docs.md index d734b8352..945619ff2 100644 --- a/dbt/models/sale/docs.md +++ b/dbt/models/sale/docs.md @@ -66,6 +66,19 @@ including the statistical bounds, groupings, window sizes, etc. **Primary Key**: `run_id` {% enddocs %} +# vw_flag + +{% docs vw_flag %} +PIN-level sales validation flags created by +[model-sales-val](https://github.com/ccao-data/model-sales-val). + +This view derives the most recent version of flags for each sale in the +`sale.flag` table, which uses its `version` column as a type 2 +slowly changing dimension. As such, this view is unique by `doc_no`. + +**Primary Key**: `doc_no` +{% enddocs %} + # vw_flag_group {% docs vw_flag_group %} @@ -92,3 +105,14 @@ with iasWorld. **Primary Key**: `salekey`, `run_id` {% enddocs %} + +# vw_outlier + +{% docs vw_outlier %} + +View that combines `sale.flag` and `sale.flag_review` to produce one +unified view of all sales validation information for a sale based on its +doc number. + +**Primary Key**: `doc_no` +{% enddocs %} diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index 447bf5888..bf942dfee 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -245,6 +245,42 @@ sources: mydec is unique by document_number for non multi-sales models: + - name: vw_flag + description: '{{ doc("vw_flag") }}' + + columns: + - name: doc_no + description: '{{ doc("shared_column_document_number") }}' + - name: run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: sv_is_heuristic_outlier + description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' + - name: sv_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' + - name: sv_is_ptax_outlier + description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' + - name: sv_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: version + description: '{{ doc("shared_column_sv_version") }}' + + data_tests: + - row_count: + name: sale_vw_flag_row_count_matches_sale_flag + equals: > + ( + SELECT COUNT(DISTINCT(meta_sale_document_num)) + FROM {{ source('sale.flag') }} + ) + - unique_combination_of_columns: + name: sale_vw_flag_unique_by_doc_no + combination_of_columns: + - doc_no + - name: sale.vw_ias_salesval_upload description: '{{ doc("vw_ias_salesval_upload") }}' data_tests: @@ -269,6 +305,7 @@ models: description: '{{ doc("shared_column_sv_outlier_reason") }}' - name: sv_outlier_reason3 description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sale.vw_flag_group description: '{{ doc("vw_flag_group") }}' @@ -325,3 +362,23 @@ models: # more details: https://github.com/ccao-data/data-architecture/pull/913#discussion_r2475427865 # If there are any more duplicates, we will be pushed over 4 and this test will flag it error_if: ">4" + + - name: sale.vw_outlier + description: '{{ doc("vw_outlier") }}' + + columns: + - name: doc_no + description: '{{ doc("shared_column_document_number") }}' + - name: flag_is_heuristic_outlier + - name: flag_is_outlier + - name: flag_is_ptax_outlier + - name: flag_outlier_reasons + - name: flag_run_id + - name: flag_version + - name: has_flag + - name: has_review + - name: review_has_characteristic_change + - name: review_has_class_change + - name: review_has_major_characteristic_change + - name: review_is_arms_length + - name: review_is_flip From 8f3615b2c630e69ceb2cd3144f1f251ef050a861 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 16:39:17 -0600 Subject: [PATCH 17/34] Document new outlier columns and views --- .../default/schema/default.vw_pin_sale.yml | 41 +++++++----- dbt/models/sale/docs.md | 12 ++++ dbt/models/sale/schema.yml | 23 +++++++ dbt/models/shared_columns.md | 63 +++++++++++++++++++ 4 files changed, 124 insertions(+), 15 deletions(-) diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 1c7fa6208..01d6a6df7 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -11,14 +11,22 @@ models: description: '{{ doc("shared_column_deed_type") }}' - name: doc_no description: '{{ doc("shared_column_document_number") }}' - - name: has_characteristic_change - description: '{{ doc("shared_column_has_characteristic_change") }}' - - name: has_class_change - description: '{{ doc("shared_column_has_class_change") }}' - - name: is_arms_length - description: '{{ doc("shared_column_is_arms_length") }}' - - name: is_flip - description: '{{ doc("shared_column_is_flip") }}' + - name: flag_is_heuristic_outlier + description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' + - name: flag_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' + - name: flag_is_ptax_outlier + description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' + - name: flag_outlier_reasons + description: Array of combined algorithm flag reasons + - name: flag_run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: flag_version + description: '{{ doc("shared_column_sv_version") }}' + - name: has_flag + description: '{{ doc("shared_column_has_flag") }}' + - name: has_review + description: '{{ doc("shared_column_has_review") }}' - name: is_multisale description: '{{ doc("shared_column_sale_is_multisale") }}' - name: is_mydec_date @@ -73,19 +81,25 @@ models: - name: num_parcels_sale description: '{{ doc("shared_column_num_parcels_sale") }}' - name: outlier_reason - description: This is an array colum that contains all of the distinct outlier reasons between analyst review and our - algorithmic sales validation model. A nuance is that there are some outlier reasons that don't necessarily - cause the `is_outlier` status to be true. + description: '{{ doc("shared_column_outlier_reason") }}' - name: pin description: '{{ doc("shared_column_pin") }}' - name: requires_field_check description: '{{ doc("shared_column_requires_field_check") }}' + - name: review_has_characteristic_change + description: '{{ doc("shared_column_has_characteristic_change") }}' + - name: review_has_class_change + description: '{{ doc("shared_column_has_class_change") }}' + - name: review_is_arms_length + description: '{{ doc("shared_column_is_arms_length") }}' + - name: review_is_flip + description: '{{ doc("shared_column_is_flip") }}' - name: sale_date description: '{{ doc("shared_column_sale_date") }}' - name: sale_filter_deed_type description: Remove quit claim, executor, beneficiary and missing deed types - name: sale_filter_is_outlier - description: Same as `sv_is_outlier`, but `NULL` values are replaced with `FALSE` + description: '{{ doc("shared_column_is_outlier") }}' - name: sale_filter_less_than_10k description: Indicator for whether sale is less than $10K FMW - name: sale_filter_same_sale_within_365 @@ -96,9 +110,6 @@ models: description: '{{ doc("shared_column_sale_price") }}' - name: seller_name description: '{{ doc("shared_column_seller_name") }}' - - name: source_is_outlier - description: This field tells us if the determination in `is_outlier` was sourced from analyst review or from our algorithmic - sales validation pipeline. - name: sv_is_heuristic_outlier description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' - name: sv_is_outlier diff --git a/dbt/models/sale/docs.md b/dbt/models/sale/docs.md index 945619ff2..2211f3edf 100644 --- a/dbt/models/sale/docs.md +++ b/dbt/models/sale/docs.md @@ -114,5 +114,17 @@ View that combines `sale.flag` and `sale.flag_review` to produce one unified view of all sales validation information for a sale based on its doc number. +**Nuance**: Unlike the constituent tables `sale.flag` and `sale.flag_review`, +the determination columns in this view (like `flag_is_outlier` or +`review_has_class_change`) will never be null, even if the sale was not flagged +or was not reviewed by an analyst. This is intended to ease the process of using +these columns for boolean logic, so that we never have to handle the case where +a boolean comparison could return null unexpectedly. However, it introduces the +potential for confusion, in that a sale could have a not-null value for a +determination column that does not actually correspond to a decision made by +our algorithm or a reviewer. To determine whether a value in a determination +column corresponds to a real decision made by our algorithm or a reviewer, use the +`has_flag` and `has_review` columns. + **Primary Key**: `doc_no` {% enddocs %} diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index bf942dfee..3206ed2d3 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -370,15 +370,38 @@ models: - name: doc_no description: '{{ doc("shared_column_document_number") }}' - name: flag_is_heuristic_outlier + description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' - name: flag_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' - name: flag_is_ptax_outlier + description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' - name: flag_outlier_reasons + description: Array of combined algorithm flag reasons - name: flag_run_id + description: '{{ doc("shared_column_sv_run_id") }}' - name: flag_version + description: '{{ doc("shared_column_sv_version") }}' - name: has_flag + description: '{{ doc("shared_column_has_flag") }}' - name: has_review + description: '{{ doc("shared_column_has_review") }}' + - name: is_outlier + description: '{{ doc("shared_column_is_outlier") }}' + - name: outlier_reason + description: '{{ doc("shared_column_outlier_reason") }}' - name: review_has_characteristic_change + description: '{{ doc("shared_column_has_characteristic_change") }}' - name: review_has_class_change + description: '{{ doc("shared_column_has_class_change") }}' - name: review_has_major_characteristic_change + description: | + True if `has_class_change` is true or if `has_characteristic_change` is + `"yes_major"`, false otherwise. + + This is a convenience column intended to make the outlier determination + logic cleaner in downstream views, where "major class change" can + reflect either one of the conditions described above. - name: review_is_arms_length + description: '{{ doc("shared_column_is_arms_length") }}' - name: review_is_flip + description: '{{ doc("shared_column_is_flip") }}' diff --git a/dbt/models/shared_columns.md b/dbt/models/shared_columns.md index 98077d7fe..4d7980ae5 100644 --- a/dbt/models/shared_columns.md +++ b/dbt/models/shared_columns.md @@ -1692,6 +1692,29 @@ Analyst determination of whether or not the property had an incorrect class classification at time of sale. {% enddocs %} +## has_flag + +{% docs shared_column_has_flag %} +Whether the sale has algorithm flags in the `sale.flag` table. + +If this column is `TRUE`, then all of the `flag_*` columns will +contain useful, reliable information from the `sale.flag` table. +If this column is `FALSE`, however, then our sales validation +algorithm has not evaluated this sale, and you should not rely +on those columns. +{% enddocs %} + +## has_review + +{% docs shared_column_has_review %} +Whether the sale has an analyst review in the `sale.flag_review` table. + +If this column is `TRUE`, then all of the `review_*` columns will +contain useful, reliable information from the `sale.flag_review` table. +If this column is `FALSE`, however, then an analyst has not reviewed +this sale, and you should not rely on those columns. +{% enddocs %} + ## is_arms_length {% docs shared_column_is_arms_length %} @@ -1706,6 +1729,46 @@ Analyst determination of whether or not the sale was a "flip". {% enddocs %} +## is_outlier + +{% docs shared_column_is_outlier %} +Whether this sale is an outlier, based on the combined information from our +sales validation algorithm and human reviewers. + +This column will default to `FALSE` if the sale has not been reviewed by our +sales validation algorithm or any human reviewers. +{% enddocs %} + +## outlier_reason + +{% docs shared_column_outlier_reason %} +Human-readable description of the reason behind this sale's outlier status. + +This column will be null if the sale has not been evaluated for outlier status +by our sales validation algorithm or our analyst review process. + +Possible values for this variable are: + +- `Review: Major Characteristic Change`: Outlier due to analyst reviewer finding + a major characteristic problem +- `Review: Non-Arms-Length`: Outlier due to analyst reviewer finding the + transaction was not arm's-length, and the algorithm finding the sale price to + be unusual +- `Review: Flip`: Outlier due to analyst reviewer finding the sale to be a flip, + and the algorithm finding the sale price to be unusual +- `Review: Valid Sale`: Non-outlier due to analyst reviewer finding the sale to + be arm's-length and open market with correct characteristics +- `Algorithm: Outlier Sale`: Outlier due to our sales validation algorithm + finding one or more issues with the sale. The exact reasons for the + algorithm's decision will be appended to the end of this value as a + comma-separated string, like + `Algorithm: Outlier Sale, High Price, Statistical Anomaly` +- `Algorithm: Valid Sale`: Non-outlier due to our sales validation algorithm + finding no problems with the sale +- Null: The sale has not been evaluated by our sales validation algorithm or by + an analyst reviewer +{% enddocs %} + ## requires_field_check {% docs shared_column_requires_field_check %} From 9fe890517ce39881f222f0070b1d52e093073010 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 16:44:04 -0600 Subject: [PATCH 18/34] Fix syntax errors in sale.vw_flag definition --- dbt/models/sale/schema.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index 3206ed2d3..3c6ee4bb4 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -245,7 +245,7 @@ sources: mydec is unique by document_number for non multi-sales models: - - name: vw_flag + - name: sale.vw_flag description: '{{ doc("vw_flag") }}' columns: @@ -274,7 +274,7 @@ models: equals: > ( SELECT COUNT(DISTINCT(meta_sale_document_num)) - FROM {{ source('sale.flag') }} + FROM {{ source('sale', 'flag') }} ) - unique_combination_of_columns: name: sale_vw_flag_unique_by_doc_no From e14b4515a7144995b7fb683a7a03b9eb577f2d08 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 17:01:04 -0600 Subject: [PATCH 19/34] Add data tests for `sale.vw_outlier` --- dbt/models/sale/schema.yml | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index 3c6ee4bb4..a7b0b25d8 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -387,8 +387,25 @@ models: description: '{{ doc("shared_column_has_review") }}' - name: is_outlier description: '{{ doc("shared_column_is_outlier") }}' + data_tests: + - expression_is_true: + name: sale_vw_outlier_is_outlier_is_false_when_outlier_reason_is_null + expression: = FALSE + config: + where: outlier_reason IS NULL + - not_null: + name: sale_vw_outlier_is_outlier_not_null - name: outlier_reason description: '{{ doc("shared_column_outlier_reason") }}' + data_tests: + - is_null: + name: sale_vw_outlier_outlier_reason_is_null_when_not_has_flag_and_not_has_review + config: + where: NOT (has_flag OR has_review) + - not_null: + name: sale_vw_outlier_outlier_reason_not_null_when_is_outlier_or_has_flag_or_has_review + config: + where: is_outlier OR has_flag OR has_review - name: review_has_characteristic_change description: '{{ doc("shared_column_has_characteristic_change") }}' - name: review_has_class_change @@ -405,3 +422,22 @@ models: description: '{{ doc("shared_column_is_arms_length") }}' - name: review_is_flip description: '{{ doc("shared_column_is_flip") }}' + + data_tests: + - row_count: + name: sale_vw_outlier_row_count_matches_sale_flag_and_sale_review + equals: > + ( + SELECT COUNT(*) + FROM ( + SELECT DISTINCT(meta_sale_document_num) AS doc_no + FROM {{ source('sale', 'flag') }} + UNION + SELECT DISTINCT(doc_no) AS doc_no + FROM {{ source('sale', 'flag_review') }} + ) + ) + - unique_combination_of_columns: + name: sale_vw_outlier_unique_by_doc_no + combination_of_columns: + - doc_no From cf1b1a3dbd5ac7ecada7ccf7f96a8f7fd2e1a8f1 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 18:10:43 -0600 Subject: [PATCH 20/34] Fix a few more spots with `is_outlier` --- dbt/models/default/default.vw_pin_sale.sql | 1 + .../default/schema/default.vw_pin_sale.yml | 19 ++----------------- .../reporting/reporting.vw_market_tracker.sql | 2 +- .../sale/sale.vw_ias_salesval_upload.sql | 13 +------------ 4 files changed, 5 insertions(+), 30 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index a767e3d03..2f3fe2086 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -309,6 +309,7 @@ SELECT outlier.review_is_flip, outlier.review_has_class_change, outlier.review_has_characteristic_change, + outlier.is_outlier, outlier.outlier_reason FROM unique_sales LEFT JOIN mydec_sales diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 01d6a6df7..2c4b6b071 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -32,12 +32,7 @@ models: - name: is_mydec_date description: Indicator for whether or not the observation uses the MyDec sale date - name: is_outlier - description: | - The final determination indicating whether a sale is an - outlier that should be excluded from model training. - - Combines information from our statistical sales validation model with - information from analysts who review our outliers for correctness. + description: '{{ doc("shared_column_is_outlier") }}' - name: mydec_deed_type description: Deed type from MyDec, more granular than CCAO deed type - name: mydec_line_8_current_use @@ -99,7 +94,7 @@ models: - name: sale_filter_deed_type description: Remove quit claim, executor, beneficiary and missing deed types - name: sale_filter_is_outlier - description: '{{ doc("shared_column_is_outlier") }}' + description: Deprecated, prefer `is_outlier`. - name: sale_filter_less_than_10k description: Indicator for whether sale is less than $10K FMW - name: sale_filter_same_sale_within_365 @@ -110,16 +105,6 @@ models: description: '{{ doc("shared_column_sale_price") }}' - name: seller_name description: '{{ doc("shared_column_seller_name") }}' - - name: sv_is_heuristic_outlier - description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' - - name: sv_is_outlier - description: '{{ doc("shared_column_sv_is_outlier") }}' - - name: sv_is_ptax_outlier - description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' - - name: sv_run_id - description: '{{ doc("shared_column_sv_run_id") }}' - - name: sv_version - description: '{{ doc("shared_column_sv_version") }}' - name: township_code description: '{{ doc("shared_column_township_code") }}' - name: year diff --git a/dbt/models/reporting/reporting.vw_market_tracker.sql b/dbt/models/reporting/reporting.vw_market_tracker.sql index d5a853ae7..f780b8f9f 100644 --- a/dbt/models/reporting/reporting.vw_market_tracker.sql +++ b/dbt/models/reporting/reporting.vw_market_tracker.sql @@ -66,7 +66,7 @@ SELECT vps.sale_price, vps.sale_date, vps.sale_filter_is_outlier, - vps.sv_is_outlier, + vps.flag_is_outlier AS sv_is_outlier, vps.is_multisale, vps.sale_filter_same_sale_within_365, vps.sale_filter_less_than_10k, diff --git a/dbt/models/sale/sale.vw_ias_salesval_upload.sql b/dbt/models/sale/sale.vw_ias_salesval_upload.sql index aa5d95a10..9b7176940 100644 --- a/dbt/models/sale/sale.vw_ias_salesval_upload.sql +++ b/dbt/models/sale/sale.vw_ias_salesval_upload.sql @@ -5,14 +5,6 @@ WITH ias_sales AS ( FROM {{ source('iasworld', 'sales') }} WHERE cur = 'Y' AND deactivat IS NULL -), - -max_version AS ( - SELECT - meta_sale_document_num, - MAX(version) AS max_version - FROM {{ source('sale', 'flag') }} - GROUP BY meta_sale_document_num ) SELECT @@ -23,8 +15,5 @@ SELECT sf.sv_outlier_reason3, sf.run_id FROM ias_sales -INNER JOIN {{ source('sale', 'flag') }} AS sf +INNER JOIN {{ ref('sale.vw_flag') }} AS sf ON ias_sales.instruno_clean = sf.meta_sale_document_num -INNER JOIN max_version AS mv - ON sf.meta_sale_document_num = mv.meta_sale_document_num - AND sf.version = mv.max_version; From 27c75d6ebc9065e995eed236190f08ceb345749e Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 18:32:42 -0600 Subject: [PATCH 21/34] Fixup market tracker and vw_ias_salesval_upload with vw_pin_sale changes --- dbt/models/reporting/reporting.vw_market_tracker.sql | 6 +++--- dbt/models/sale/sale.vw_ias_salesval_upload.sql | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dbt/models/reporting/reporting.vw_market_tracker.sql b/dbt/models/reporting/reporting.vw_market_tracker.sql index f780b8f9f..426c82b3e 100644 --- a/dbt/models/reporting/reporting.vw_market_tracker.sql +++ b/dbt/models/reporting/reporting.vw_market_tracker.sql @@ -71,9 +71,9 @@ SELECT vps.sale_filter_same_sale_within_365, vps.sale_filter_less_than_10k, vps.sale_filter_deed_type, - vps.sv_outlier_reason1, - vps.sv_outlier_reason2, - vps.sv_outlier_reason3, + vps.flag_outlier_reasons[1] AS sv_outlier_reason1, + vps.flag_outlier_reasons[2] AS sv_outlier_reason2, + vps.flag_outlier_reasons[3] AS sv_outlier_reason3, -- Cards 1 and 2 are card numbers for the first two residential cards for a -- parcel. They will not always be values of 1 and 2. vrc1.card AS card1, diff --git a/dbt/models/sale/sale.vw_ias_salesval_upload.sql b/dbt/models/sale/sale.vw_ias_salesval_upload.sql index 9b7176940..9f83f8d16 100644 --- a/dbt/models/sale/sale.vw_ias_salesval_upload.sql +++ b/dbt/models/sale/sale.vw_ias_salesval_upload.sql @@ -1,7 +1,7 @@ WITH ias_sales AS ( SELECT salekey, - NULLIF(REPLACE(instruno, 'D', ''), '') AS instruno_clean + NULLIF(REPLACE(instruno, 'D', ''), '') AS doc_no FROM {{ source('iasworld', 'sales') }} WHERE cur = 'Y' AND deactivat IS NULL @@ -16,4 +16,4 @@ SELECT sf.run_id FROM ias_sales INNER JOIN {{ ref('sale.vw_flag') }} AS sf - ON ias_sales.instruno_clean = sf.meta_sale_document_num + ON ias_sales.doc_no = sf.doc_no From abb4c083f4cd8037a7f59a8f2f4d671416d314a3 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 18:38:36 -0600 Subject: [PATCH 22/34] Revert "Temporarily disable main branch restriction for `build_and_test_dbt` PR workflow" This reverts commit 7b456841ec0713b2ab9f84e900fb0cd58735623a. --- .github/workflows/build_and_test_dbt.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test_dbt.yaml b/.github/workflows/build_and_test_dbt.yaml index 31f6bca1b..9840f3f52 100644 --- a/.github/workflows/build_and_test_dbt.yaml +++ b/.github/workflows/build_and_test_dbt.yaml @@ -2,6 +2,7 @@ name: build-and-test-dbt on: pull_request: + branches: [master] push: branches: [master] From 786e5b9cad78fa7a64f429208624183d9b78c4e6 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 19:20:19 -0600 Subject: [PATCH 23/34] Avoid errors in reporting.vw_market_tracker --- dbt/models/reporting/reporting.vw_market_tracker.sql | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dbt/models/reporting/reporting.vw_market_tracker.sql b/dbt/models/reporting/reporting.vw_market_tracker.sql index 426c82b3e..d4dc33011 100644 --- a/dbt/models/reporting/reporting.vw_market_tracker.sql +++ b/dbt/models/reporting/reporting.vw_market_tracker.sql @@ -71,9 +71,12 @@ SELECT vps.sale_filter_same_sale_within_365, vps.sale_filter_less_than_10k, vps.sale_filter_deed_type, - vps.flag_outlier_reasons[1] AS sv_outlier_reason1, - vps.flag_outlier_reasons[2] AS sv_outlier_reason2, - vps.flag_outlier_reasons[3] AS sv_outlier_reason3, + {%- for idx in [1, 2, 3] %} + CASE + WHEN CARDINALITY(vps.flag_outlier_reasons >= {{ idx }}) + THEN vps.flag_outlier_reasons[{{ idx }}] + END AS sv_outlier_reason{{ idx }}, + {% endfor %} -- Cards 1 and 2 are card numbers for the first two residential cards for a -- parcel. They will not always be values of 1 and 2. vrc1.card AS card1, From d1d8371909eeb8a57ee3aba2860d12aef6ad41c7 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 19:23:37 -0600 Subject: [PATCH 24/34] Fix a couple docs --- dbt/models/default/schema/default.vw_pin_sale.yml | 2 +- dbt/models/sale/docs.md | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 2c4b6b071..07c9c8d03 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -94,7 +94,7 @@ models: - name: sale_filter_deed_type description: Remove quit claim, executor, beneficiary and missing deed types - name: sale_filter_is_outlier - description: Deprecated, prefer `is_outlier`. + description: Deprecated, prefer `is_outlier` - name: sale_filter_less_than_10k description: Indicator for whether sale is less than $10K FMW - name: sale_filter_same_sale_within_365 diff --git a/dbt/models/sale/docs.md b/dbt/models/sale/docs.md index 2211f3edf..068a36164 100644 --- a/dbt/models/sale/docs.md +++ b/dbt/models/sale/docs.md @@ -73,8 +73,9 @@ PIN-level sales validation flags created by [model-sales-val](https://github.com/ccao-data/model-sales-val). This view derives the most recent version of flags for each sale in the -`sale.flag` table, which uses its `version` column as a type 2 -slowly changing dimension. As such, this view is unique by `doc_no`. +`sale.flag` table, which uses its `version` column as a [type 2 slowly changing +dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row). +As such, this view is unique by `doc_no`. **Primary Key**: `doc_no` {% enddocs %} From 642737b7c2b23f6b886b345b897a56b54a44e0f3 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 23 Jan 2026 19:24:10 -0600 Subject: [PATCH 25/34] Fix error with `CARDINALITY` call in vw_market_tracker --- dbt/models/reporting/reporting.vw_market_tracker.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/models/reporting/reporting.vw_market_tracker.sql b/dbt/models/reporting/reporting.vw_market_tracker.sql index d4dc33011..40a2ab1b3 100644 --- a/dbt/models/reporting/reporting.vw_market_tracker.sql +++ b/dbt/models/reporting/reporting.vw_market_tracker.sql @@ -73,7 +73,7 @@ SELECT vps.sale_filter_deed_type, {%- for idx in [1, 2, 3] %} CASE - WHEN CARDINALITY(vps.flag_outlier_reasons >= {{ idx }}) + WHEN CARDINALITY(vps.flag_outlier_reasons) >= {{ idx }} THEN vps.flag_outlier_reasons[{{ idx }}] END AS sv_outlier_reason{{ idx }}, {% endfor %} From 34e3b3731d6f66cc9fa230f0db97b5f7e827d6ef Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Mon, 26 Jan 2026 12:11:35 -0600 Subject: [PATCH 26/34] Fix small logic bug and add unit tests to `sale.vw_outlier` --- dbt/models/sale/sale.vw_outlier.sql | 2 +- dbt/models/sale/schema.yml | 135 ++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql index 4f453c26d..860acc8ff 100644 --- a/dbt/models/sale/sale.vw_outlier.sql +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -33,7 +33,7 @@ WITH flag_and_review AS ( AND review.is_flip IS NULL, FALSE ) AS has_review, - COALESCE(review.is_arms_length, FALSE) AS review_is_arms_length, + COALESCE(review.is_arms_length, TRUE) AS review_is_arms_length, COALESCE(review.is_flip, FALSE) AS review_is_flip, COALESCE(review.has_class_change, FALSE) AS review_has_class_change, COALESCE(review.has_characteristic_change, 'no') diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index a7b0b25d8..27f49b0d4 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -441,3 +441,138 @@ models: name: sale_vw_outlier_unique_by_doc_no combination_of_columns: - doc_no + +unit_tests: + - name: sale_vw_outlier_outlier_reason_produces_correct_values + description: View should produce the correct outlier reasons + model: sale.vw_outlier + given: + - input: ref("sale.vw_flag") + rows: + - doc_no: flag_outlier_null_no_review + - doc_no: flag_outlier_null_has_class_change + - doc_no: flag_outlier_null_has_major_characteristic_change + - doc_no: flag_outlier_null_has_minor_characteristic_change + - doc_no: flag_outlier_null_no_characteristic_change + - doc_no: flag_outlier_null_is_not_arms_length + - doc_no: flag_outlier_null_is_flip + - doc_no: flag_outlier_false_no_review + sv_is_outlier: false + - doc_no: flag_outlier_false_has_class_change + sv_is_outlier: false + - doc_no: flag_outlier_false_has_major_characteristic_change + sv_is_outlier: false + - doc_no: flag_outlier_false_has_minor_characteristic_change + sv_is_outlier: false + - doc_no: flag_outlier_false_no_characteristic_change + sv_is_outlier: false + - doc_no: flag_outlier_false_is_not_arms_length + sv_is_outlier: false + - doc_no: flag_outlier_false_is_flip + sv_is_outlier: false + - doc_no: flag_outlier_true_no_review_no_reasons + sv_is_outlier: true + - doc_no: flag_outlier_true_no_review_with_reasons + sv_is_outlier: true + sv_outlier_reason1: 'High Price' + sv_outlier_reason2: 'Statistical Anomaly' + - doc_no: flag_outlier_true_has_class_change + sv_is_outlier: true + - doc_no: flag_outlier_true_has_major_characteristic_change + sv_is_outlier: true + - doc_no: flag_outlier_true_has_minor_characteristic_change + sv_is_outlier: true + - doc_no: flag_outlier_true_no_characteristic_change + sv_is_outlier: true + - doc_no: flag_outlier_true_is_not_arms_length + sv_is_outlier: true + - doc_no: flag_outlier_true_is_flip + sv_is_outlier: true + - input: source("sale", "flag_review") + rows: + - doc_no: flag_outlier_null_no_review + - doc_no: flag_outlier_null_has_class_change + has_class_change: true + - doc_no: flag_outlier_null_has_major_characteristic_change + has_characteristic_change: yes_major + - doc_no: flag_outlier_null_has_minor_characteristic_change + has_characteristic_change: yes_minor + - doc_no: flag_outlier_null_no_characteristic_change + has_characteristic_change: no + - doc_no: flag_outlier_null_is_not_arms_length + is_arms_length: false + - doc_no: flag_outlier_null_is_flip + is_flip: true + - doc_no: flag_outlier_false_no_review + - doc_no: flag_outlier_false_has_class_change + has_class_change: true + - doc_no: flag_outlier_false_has_major_characteristic_change + has_characteristic_change: yes_major + - doc_no: flag_outlier_false_has_minor_characteristic_change + has_characteristic_change: yes_minor + - doc_no: flag_outlier_false_no_characteristic_change + has_characteristic_change: no + - doc_no: flag_outlier_false_is_not_arms_length + is_arms_length: false + - doc_no: flag_outlier_false_is_flip + is_flip: true + - doc_no: flag_outlier_true_no_review_no_reasons + - doc_no: flag_outlier_true_no_review_with_reasons + - doc_no: flag_outlier_true_has_class_change + has_class_change: true + - doc_no: flag_outlier_true_has_major_characteristic_change + has_characteristic_change: yes_major + - doc_no: flag_outlier_true_has_minor_characteristic_change + has_characteristic_change: yes_minor + - doc_no: flag_outlier_true_no_characteristic_change + has_characteristic_change: no + - doc_no: flag_outlier_true_is_not_arms_length + is_arms_length: false + - doc_no: flag_outlier_true_is_flip + is_flip: true + expect: + rows: + - doc_no: flag_outlier_null_no_review + outlier_reason: null + - doc_no: flag_outlier_null_has_class_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_null_has_major_characteristic_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_null_has_minor_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_no_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_is_not_arms_length + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_is_flip + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_no_review + outlier_reason: 'Algorithm: Valid Sale' + - doc_no: flag_outlier_false_has_class_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_false_has_major_characteristic_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_false_has_minor_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_no_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_is_not_arms_length + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_is_flip + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_no_review_no_reasons + outlier_reason: 'Algorithm: Outlier Sale' + - doc_no: flag_outlier_true_no_review_with_reasons + outlier_reason: 'Algorithm: Outlier Sale, High Price, Statistical Anomaly' + - doc_no: flag_outlier_true_has_class_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_true_has_major_characteristic_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_true_has_minor_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_no_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_is_not_arms_length + outlier_reason: 'Review: Non-Arms-Length' + - doc_no: flag_outlier_true_is_flip + outlier_reason: 'Review: Flip' From e512f5a9178f1aa0c3e64a95255267ba188ccfd1 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Mon, 26 Jan 2026 12:16:24 -0600 Subject: [PATCH 27/34] Fix unit test indentation for yamllint --- dbt/models/sale/schema.yml | 88 +++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index 27f49b0d4..bfecb316f 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -532,47 +532,47 @@ unit_tests: is_flip: true expect: rows: - - doc_no: flag_outlier_null_no_review - outlier_reason: null - - doc_no: flag_outlier_null_has_class_change - outlier_reason: 'Review: Major Characteristic Change' - - doc_no: flag_outlier_null_has_major_characteristic_change - outlier_reason: 'Review: Major Characteristic Change' - - doc_no: flag_outlier_null_has_minor_characteristic_change - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_null_no_characteristic_change - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_null_is_not_arms_length - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_null_is_flip - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_false_no_review - outlier_reason: 'Algorithm: Valid Sale' - - doc_no: flag_outlier_false_has_class_change - outlier_reason: 'Review: Major Characteristic Change' - - doc_no: flag_outlier_false_has_major_characteristic_change - outlier_reason: 'Review: Major Characteristic Change' - - doc_no: flag_outlier_false_has_minor_characteristic_change - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_false_no_characteristic_change - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_false_is_not_arms_length - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_false_is_flip - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_true_no_review_no_reasons - outlier_reason: 'Algorithm: Outlier Sale' - - doc_no: flag_outlier_true_no_review_with_reasons - outlier_reason: 'Algorithm: Outlier Sale, High Price, Statistical Anomaly' - - doc_no: flag_outlier_true_has_class_change - outlier_reason: 'Review: Major Characteristic Change' - - doc_no: flag_outlier_true_has_major_characteristic_change - outlier_reason: 'Review: Major Characteristic Change' - - doc_no: flag_outlier_true_has_minor_characteristic_change - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_true_no_characteristic_change - outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_true_is_not_arms_length - outlier_reason: 'Review: Non-Arms-Length' - - doc_no: flag_outlier_true_is_flip - outlier_reason: 'Review: Flip' + - doc_no: flag_outlier_null_no_review + outlier_reason: null + - doc_no: flag_outlier_null_has_class_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_null_has_major_characteristic_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_null_has_minor_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_no_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_is_not_arms_length + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_is_flip + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_no_review + outlier_reason: 'Algorithm: Valid Sale' + - doc_no: flag_outlier_false_has_class_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_false_has_major_characteristic_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_false_has_minor_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_no_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_is_not_arms_length + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_is_flip + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_no_review_no_reasons + outlier_reason: 'Algorithm: Outlier Sale' + - doc_no: flag_outlier_true_no_review_with_reasons + outlier_reason: 'Algorithm: Outlier Sale, High Price, Statistical Anomaly' + - doc_no: flag_outlier_true_has_class_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_true_has_major_characteristic_change + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_true_has_minor_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_no_characteristic_change + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_is_not_arms_length + outlier_reason: 'Review: Non-Arms-Length' + - doc_no: flag_outlier_true_is_flip + outlier_reason: 'Review: Flip' From b6562bc414618c2f1b35a057884c68f8e1a09cbd Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Tue, 27 Jan 2026 15:28:53 -0600 Subject: [PATCH 28/34] Add algorithm flags to flip/non-arms-length reviewed sale outlier reasons --- dbt/models/sale/sale.vw_outlier.sql | 38 ++++++++-- dbt/models/sale/schema.yml | 104 +++++++++++++++++++++++++--- 2 files changed, 127 insertions(+), 15 deletions(-) diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql index 860acc8ff..4efd41f48 100644 --- a/dbt/models/sale/sale.vw_outlier.sql +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -48,6 +48,13 @@ WITH flag_and_review AS ( ON flag.doc_no = review.doc_no ), +{% set price_outlier_reasons = [ + "High price", + "High price per square foot", + "Low price", + "Low price per square foot", +] -%} + -- Compare algorithmic flags to human review in order to make a final -- determination about whether the sale is an outlier. Since we need to explain -- our outlier determinations to data consumers, we start by producing verbose @@ -84,10 +91,28 @@ outlier_reason AS ( CASE WHEN review_is_flip THEN - 'Review: Flip' + CASE + {%- for price_outlier_reason in price_outlier_reasons %} -- noqa: LT05 + WHEN CONTAINS( + flag_outlier_reasons, + '{{ price_outlier_reason }}' + ) + THEN 'Review: Flip, Algorithm: {{ price_outlier_reason }}' -- noqa: LT05 + {%- endfor %} + ELSE 'Review: Flip' + END WHEN NOT review_is_arms_length THEN - 'Review: Non-Arms-Length' + CASE + {%- for price_outlier_reason in price_outlier_reasons %} -- noqa: LT05 + WHEN CONTAINS( + flag_outlier_reasons, + '{{ price_outlier_reason }}' + ) + THEN 'Review: Non-Arms-Length, Algorithm: {{ price_outlier_reason }}' -- noqa: LT05 + {%- endfor %} + ELSE 'Review: Non-Arms-Length' + END ELSE -- If we reach this branch, then the reviewer -- did not find anything unusual about the @@ -165,11 +190,10 @@ SELECT *, -- Cast the verbose outlier reasons to a boolean flag for easier filtering CASE - WHEN outlier_reason IN ( - 'Review: Major Characteristic Change', - 'Review: Non-Arms-Length', - 'Review: Flip' - ) OR outlier_reason LIKE 'Algorithm: Outlier Sale%' + WHEN outlier_reason = 'Review: Major Characteristic Change' + OR outlier_reason LIKE 'Review: Non-Arms-Length%' + OR outlier_reason LIKE 'Review: Flip%' + OR outlier_reason LIKE 'Algorithm: Outlier Sale%' THEN TRUE WHEN outlier_reason IN ( 'Review: Valid Sale', diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index bfecb316f..b7be7ff73 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -443,8 +443,8 @@ models: - doc_no unit_tests: - - name: sale_vw_outlier_outlier_reason_produces_correct_values - description: View should produce the correct outlier reasons + - name: sale_vw_outlier_produces_correct_outliers_and_reasons + description: View should produce correct outliers with associated reasons model: sale.vw_outlier given: - input: ref("sale.vw_flag") @@ -484,10 +484,36 @@ unit_tests: sv_is_outlier: true - doc_no: flag_outlier_true_no_characteristic_change sv_is_outlier: true - - doc_no: flag_outlier_true_is_not_arms_length + - doc_no: flag_outlier_true_is_not_arms_length_no_price_reasons sv_is_outlier: true - - doc_no: flag_outlier_true_is_flip + - doc_no: flag_outlier_true_is_not_arms_length_high_price sv_is_outlier: true + sv_outlier_reason1: High price per square foot # View should ignore + sv_outlier_reason2: High price + - doc_no: flag_outlier_true_is_not_arms_length_high_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason2: High price per square foot + - doc_no: flag_outlier_true_is_not_arms_length_low_price + sv_is_outlier: true + sv_outlier_reason3: Low price + - doc_no: flag_outlier_true_is_not_arms_length_low_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason1: Statistical Anomaly # View should ignore + sv_outlier_reason2: Low price per square foot + - doc_no: flag_outlier_true_is_flip_no_price_reasons + sv_is_outlier: true + - doc_no: flag_outlier_true_is_flip_high_price + sv_is_outlier: true + sv_outlier_reason1: High price + - doc_no: flag_outlier_true_is_flip_high_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason1: High price per square foot + - doc_no: flag_outlier_true_is_flip_low_price + sv_is_outlier: true + sv_outlier_reason1: Low price + - doc_no: flag_outlier_true_is_flip_low_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason1: Low price per square foot - input: source("sale", "flag_review") rows: - doc_no: flag_outlier_null_no_review @@ -526,53 +552,115 @@ unit_tests: has_characteristic_change: yes_minor - doc_no: flag_outlier_true_no_characteristic_change has_characteristic_change: no - - doc_no: flag_outlier_true_is_not_arms_length + - doc_no: flag_outlier_true_is_not_arms_length_no_price_reasons is_arms_length: false - - doc_no: flag_outlier_true_is_flip + - doc_no: flag_outlier_true_is_not_arms_length_high_price + is_arms_length: false + - doc_no: flag_outlier_true_is_not_arms_length_high_price_per_sq_ft + is_arms_length: false + - doc_no: flag_outlier_true_is_not_arms_length_low_price + is_arms_length: false + - doc_no: flag_outlier_true_is_not_arms_length_low_price_per_sq_ft + is_arms_length: false + - doc_no: flag_outlier_true_is_flip_no_price_reasons + is_flip: true + - doc_no: flag_outlier_true_is_flip_high_price + is_flip: true + - doc_no: flag_outlier_true_is_flip_high_price_per_sq_ft + is_flip: true + - doc_no: flag_outlier_true_is_flip_low_price + is_flip: true + - doc_no: flag_outlier_true_is_flip_low_price_per_sq_ft is_flip: true expect: rows: - doc_no: flag_outlier_null_no_review + is_outlier: false outlier_reason: null - doc_no: flag_outlier_null_has_class_change + is_outlier: true outlier_reason: 'Review: Major Characteristic Change' - doc_no: flag_outlier_null_has_major_characteristic_change + is_outlier: true outlier_reason: 'Review: Major Characteristic Change' - doc_no: flag_outlier_null_has_minor_characteristic_change + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_null_no_characteristic_change + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_null_is_not_arms_length + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_null_is_flip + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_false_no_review + is_outlier: false outlier_reason: 'Algorithm: Valid Sale' - doc_no: flag_outlier_false_has_class_change + is_outlier: true outlier_reason: 'Review: Major Characteristic Change' - doc_no: flag_outlier_false_has_major_characteristic_change + is_outlier: true outlier_reason: 'Review: Major Characteristic Change' - doc_no: flag_outlier_false_has_minor_characteristic_change + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_false_no_characteristic_change + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_false_is_not_arms_length + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_false_is_flip + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_true_no_review_no_reasons + is_outlier: true outlier_reason: 'Algorithm: Outlier Sale' - doc_no: flag_outlier_true_no_review_with_reasons + is_outlier: true outlier_reason: 'Algorithm: Outlier Sale, High Price, Statistical Anomaly' - doc_no: flag_outlier_true_has_class_change + is_outlier: true outlier_reason: 'Review: Major Characteristic Change' - doc_no: flag_outlier_true_has_major_characteristic_change + is_outlier: true outlier_reason: 'Review: Major Characteristic Change' - doc_no: flag_outlier_true_has_minor_characteristic_change + is_outlier: false outlier_reason: 'Review: Valid Sale' - doc_no: flag_outlier_true_no_characteristic_change + is_outlier: false outlier_reason: 'Review: Valid Sale' - - doc_no: flag_outlier_true_is_not_arms_length + - doc_no: flag_outlier_true_is_not_arms_length_no_price_reasons + is_outlier: true outlier_reason: 'Review: Non-Arms-Length' - - doc_no: flag_outlier_true_is_flip + - doc_no: flag_outlier_true_is_not_arms_length_high_price + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: High price' + - doc_no: flag_outlier_true_is_not_arms_length_high_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: High price per square foot' + - doc_no: flag_outlier_true_is_not_arms_length_low_price + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: Low price' + - doc_no: flag_outlier_true_is_not_arms_length_low_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: Low price per square foot' + - doc_no: flag_outlier_true_is_flip_no_price_reasons + is_outlier: true outlier_reason: 'Review: Flip' + - doc_no: flag_outlier_true_is_flip_high_price + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: High price' + - doc_no: flag_outlier_true_is_flip_high_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: High price per square foot' + - doc_no: flag_outlier_true_is_flip_low_price + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: Low price' + - doc_no: flag_outlier_true_is_flip_low_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: Low price per square foot' From 4b0f4fe2b645f7f2a796ea85cc85a065ba699e5d Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Tue, 27 Jan 2026 17:25:42 -0600 Subject: [PATCH 29/34] Refactor outlier reasons for more intuitive use in downstream consumers --- dbt/models/default/default.vw_pin_sale.sql | 5 ++- .../default/schema/default.vw_pin_sale.yml | 8 +++-- .../reporting/reporting.vw_market_tracker.sql | 9 ++--- dbt/models/sale/sale.vw_outlier.sql | 36 ++++++++++++++++--- dbt/models/sale/schema.yml | 22 ++++++------ 5 files changed, 57 insertions(+), 23 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 2f3fe2086..3db981b70 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -301,7 +301,9 @@ SELECT outlier.flag_is_outlier, outlier.flag_is_ptax_outlier, outlier.flag_is_heuristic_outlier, - outlier.flag_outlier_reasons, + outlier.flag_outlier_reason1, + outlier.flag_outlier_reason2, + outlier.flag_outlier_reason3, outlier.flag_run_id, outlier.flag_version, outlier.has_review, @@ -309,6 +311,7 @@ SELECT outlier.review_is_flip, outlier.review_has_class_change, outlier.review_has_characteristic_change, + outlier.review_json, outlier.is_outlier, outlier.outlier_reason FROM unique_sales diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 07c9c8d03..012be8387 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -17,8 +17,12 @@ models: description: '{{ doc("shared_column_sv_is_outlier") }}' - name: flag_is_ptax_outlier description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' - - name: flag_outlier_reasons - description: Array of combined algorithm flag reasons + - name: flag_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' - name: flag_run_id description: '{{ doc("shared_column_sv_run_id") }}' - name: flag_version diff --git a/dbt/models/reporting/reporting.vw_market_tracker.sql b/dbt/models/reporting/reporting.vw_market_tracker.sql index 40a2ab1b3..51a6cabdf 100644 --- a/dbt/models/reporting/reporting.vw_market_tracker.sql +++ b/dbt/models/reporting/reporting.vw_market_tracker.sql @@ -71,12 +71,9 @@ SELECT vps.sale_filter_same_sale_within_365, vps.sale_filter_less_than_10k, vps.sale_filter_deed_type, - {%- for idx in [1, 2, 3] %} - CASE - WHEN CARDINALITY(vps.flag_outlier_reasons) >= {{ idx }} - THEN vps.flag_outlier_reasons[{{ idx }}] - END AS sv_outlier_reason{{ idx }}, - {% endfor %} + vps.flag_outlier_reason1 AS sv_outlier_reason1, + vps.flag_outlier_reason2 AS sv_outlier_reason2, + vps.flag_outlier_reason3 AS sv_outlier_reason3, -- Cards 1 and 2 are card numbers for the first two residential cards for a -- parcel. They will not always be values of 1 and 2. vrc1.card AS card1, diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql index 4efd41f48..20c7a6954 100644 --- a/dbt/models/sale/sale.vw_outlier.sql +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -16,8 +16,12 @@ WITH flag_and_review AS ( COALESCE(flag.sv_is_ptax_outlier, FALSE) AS flag_is_ptax_outlier, COALESCE(flag.sv_is_heuristic_outlier, FALSE) AS flag_is_heuristic_outlier, - flag.run_id AS flag_run_id, - flag.version AS flag_version, + flag.sv_outlier_reason1 AS flag_outlier_reason1, + flag.sv_outlier_reason2 AS flag_outlier_reason2, + flag.sv_outlier_reason3 AS flag_outlier_reason3, + -- Convenience column that combines all flag reasons into one array + -- for easier logic in queries that consume this CTE. We don't select + -- this column in the output of this view because it's redundant FILTER( ARRAY[ flag.sv_outlier_reason1, @@ -26,6 +30,8 @@ WITH flag_and_review AS ( ], r -> r IS NOT NULL ) AS flag_outlier_reasons, + flag.run_id AS flag_run_id, + flag.version AS flag_version, NOT COALESCE( review.has_class_change IS NULL AND review.has_characteristic_change IS NULL @@ -42,7 +48,13 @@ WITH flag_and_review AS ( review.has_class_change OR review.has_characteristic_change = 'yes_major', FALSE - ) AS review_has_major_characteristic_change + ) AS review_has_major_characteristic_change, + JSON_OBJECT( -- noqa: disable=CP02,RF02 + 'is_arms_length' VALUE is_arms_length, + 'is_flip' VALUE is_flip, + 'has_class_change' VALUE has_class_change, + 'has_characteristic_change' VALUE has_characteristic_change + ) AS review_json -- noqa: enable=CP02,RF02 FROM {{ ref('sale.vw_flag') }} AS flag FULL OUTER JOIN {{ source('sale', 'flag_review') }} AS review ON flag.doc_no = review.doc_no @@ -187,7 +199,23 @@ outlier_reason AS ( ) SELECT - *, + doc_no, + has_flag, + flag_is_outlier, + flag_is_ptax_outlier, + flag_is_heuristic_outlier, + flag_outlier_reason1, + flag_outlier_reason2, + flag_outlier_reason3, + flag_run_id, + flag_version, + has_review, + review_is_arms_length, + review_is_flip, + review_has_class_change, + review_has_characteristic_change, + review_json, + outlier_reason, -- Cast the verbose outlier reasons to a boolean flag for easier filtering CASE WHEN outlier_reason = 'Review: Major Characteristic Change' diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index b7be7ff73..b76d45c35 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -375,8 +375,12 @@ models: description: '{{ doc("shared_column_sv_is_outlier") }}' - name: flag_is_ptax_outlier description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' - - name: flag_outlier_reasons - description: Array of combined algorithm flag reasons + - name: flag_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' - name: flag_run_id description: '{{ doc("shared_column_sv_run_id") }}' - name: flag_version @@ -410,18 +414,16 @@ models: description: '{{ doc("shared_column_has_characteristic_change") }}' - name: review_has_class_change description: '{{ doc("shared_column_has_class_change") }}' - - name: review_has_major_characteristic_change - description: | - True if `has_class_change` is true or if `has_characteristic_change` is - `"yes_major"`, false otherwise. - - This is a convenience column intended to make the outlier determination - logic cleaner in downstream views, where "major class change" can - reflect either one of the conditions described above. - name: review_is_arms_length description: '{{ doc("shared_column_is_arms_length") }}' - name: review_is_flip description: '{{ doc("shared_column_is_flip") }}' + - name: review_json + description: | + Archival object documenting the combined state of all review fields. + + The schema for this object is not guaranteed to be consistent over + time, so do not use it for serious analysis or programming tasks. data_tests: - row_count: From e0da4d9210f5c5db753a5fe1c4af46d7518dec82 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 28 Jan 2026 10:36:03 -0600 Subject: [PATCH 30/34] Factor out `review_json` docs to `shared_columns` and add it to `default.vw_pin_sale` --- dbt/models/default/schema/default.vw_pin_sale.yml | 2 ++ dbt/models/sale/schema.yml | 6 +----- dbt/models/shared_columns.md | 9 +++++++++ 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 012be8387..91624a8a4 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -93,6 +93,8 @@ models: description: '{{ doc("shared_column_is_arms_length") }}' - name: review_is_flip description: '{{ doc("shared_column_is_flip") }}' + - name: review_json + description: '{{ doc("shared_column_review_json") }}' - name: sale_date description: '{{ doc("shared_column_sale_date") }}' - name: sale_filter_deed_type diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index b76d45c35..a5f74a11e 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -419,11 +419,7 @@ models: - name: review_is_flip description: '{{ doc("shared_column_is_flip") }}' - name: review_json - description: | - Archival object documenting the combined state of all review fields. - - The schema for this object is not guaranteed to be consistent over - time, so do not use it for serious analysis or programming tasks. + description: '{{ doc("shared_column_review_json") }}' data_tests: - row_count: diff --git a/dbt/models/shared_columns.md b/dbt/models/shared_columns.md index 38d8b9231..4b20e1e99 100644 --- a/dbt/models/shared_columns.md +++ b/dbt/models/shared_columns.md @@ -1785,6 +1785,15 @@ in this column suggests that the analyst believes there has been completed work on the home. {% enddocs %} +## review_json + +{% docs shared_column_review_json %} +Archival object documenting the combined state of all sale review fields. + +The schema for this object is not guaranteed to be consistent over +time, so do not use it for serious analysis or programming tasks. +{% enddocs %} + ## sv_is_heuristic_outlier {% docs shared_column_sv_is_heuristic_outlier %} From af0dcfce5234dd9ce9989cbb0141ab5c4fb286ae Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 28 Jan 2026 10:42:55 -0600 Subject: [PATCH 31/34] Clarify comment on `is_outlier` logic in `sale.vw_outlier` --- dbt/models/sale/sale.vw_outlier.sql | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql index 20c7a6954..6ad9edbb3 100644 --- a/dbt/models/sale/sale.vw_outlier.sql +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -216,7 +216,10 @@ SELECT review_has_characteristic_change, review_json, outlier_reason, - -- Cast the verbose outlier reasons to a boolean flag for easier filtering + -- Cast the verbose outlier reasons to a boolean flag for easier filtering. + -- See the comments on the logic that produces the outlier reason column + -- above to clarify why these particular values indicate outliers or + -- valid sales CASE WHEN outlier_reason = 'Review: Major Characteristic Change' OR outlier_reason LIKE 'Review: Non-Arms-Length%' From 79dd34db4aacf28ad0f514b1bc98608bdb508fa3 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 28 Jan 2026 11:40:30 -0600 Subject: [PATCH 32/34] Document price outlier component of flip/non-arms-length `outlier_reason` values --- dbt/models/shared_columns.md | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/dbt/models/shared_columns.md b/dbt/models/shared_columns.md index 4b20e1e99..31155d5d7 100644 --- a/dbt/models/shared_columns.md +++ b/dbt/models/shared_columns.md @@ -1758,18 +1758,23 @@ Possible values for this variable are: - `Review: Major Characteristic Change`: Outlier due to analyst reviewer finding a major characteristic problem -- `Review: Non-Arms-Length`: Outlier due to analyst reviewer finding the - transaction was not arm's-length, and the algorithm finding the sale price to - be unusual -- `Review: Flip`: Outlier due to analyst reviewer finding the sale to be a flip, - and the algorithm finding the sale price to be unusual +- `Review: Non-Arms-Length, Algorithm: {High price|Low price|High price per square foot|Low price per square foot}`: + Outlier due to analyst reviewer finding the transaction was not arm's-length, + and the algorithm finding the sale price to be unusual. The string following + `Algorithm: ` indicates the specific reason that caused the algorithm to flag + the sale as having an unusual sale price. +- `Review: Flip, Algorithm: {High price|Low price|High price per square foot|Low price per square foot}`: + Outlier due to analyst reviewer finding the sale to be a flip, + and the algorithm finding the sale price to be unusual. The string following + `Algorithm: ` indicates the specific reason that caused the algorithm to flag + the sale as having an unusual sale price. - `Review: Valid Sale`: Non-outlier due to analyst reviewer finding the sale to - be arm's-length and open market with correct characteristics + be arm's-length and open-market with correct characteristics - `Algorithm: Outlier Sale`: Outlier due to our sales validation algorithm finding one or more issues with the sale. The exact reasons for the algorithm's decision will be appended to the end of this value as a comma-separated string, like - `Algorithm: Outlier Sale, High Price, Statistical Anomaly` + `Algorithm: Outlier Sale, High Price, Statistical Anomaly`. - `Algorithm: Valid Sale`: Non-outlier due to our sales validation algorithm finding no problems with the sale - Null: The sale has not been evaluated by our sales validation algorithm or by From 4608468f27ed1d05b697c550ac1e0858b5dc18b8 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Wed, 28 Jan 2026 11:42:34 -0600 Subject: [PATCH 33/34] Document `sv_*` columns on `model.training_data`, for extra clarity --- dbt/models/model/schema.yml | 15 +++++++++++++++ dbt/models/shared_columns.md | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/dbt/models/model/schema.yml b/dbt/models/model/schema.yml index 298e7134f..80493c29c 100644 --- a/dbt/models/model/schema.yml +++ b/dbt/models/model/schema.yml @@ -240,6 +240,21 @@ models: config: tags: - load_manual + columns: + - name: sv_is_outlier + description: '{{ doc("shared_column_is_outlier") }}' + - name: sv_outlier_reason + description: '{{ doc("shared_column_outlier_reason") }}' + - name: sv_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_review_json + description: '{{ doc("shared_column_review_json") }}' + - name: sv_run_id + description: '{{ doc("shared_column_sv_run_id") }}' tests: - unique_combination_of_columns: name: model_training_data_unique_card_doc_number_run_id diff --git a/dbt/models/shared_columns.md b/dbt/models/shared_columns.md index 31155d5d7..6b5a65053 100644 --- a/dbt/models/shared_columns.md +++ b/dbt/models/shared_columns.md @@ -1841,8 +1841,8 @@ See [model-sales-val](https://github.com/ccao-data/model-sales-val) for more det {% docs shared_column_sv_outlier_reason %} One of three possible reasons that a sale is -flagged as on outlier. The priority for -sv_outlier_reason$n column filling is +flagged as an outlier by our algorithmic sales validation pipeline. +The priority for `sv_outlier_reason$n` column filling is ptax outlier > price outlier > characteristic outlier. See the [model-sales-val](https://github.com/ccao-data/model-sales-val) From 220bcbd97640d8235a6e19efa0613f55c93686b0 Mon Sep 17 00:00:00 2001 From: Jean Cochrane Date: Fri, 30 Jan 2026 15:37:32 -0600 Subject: [PATCH 34/34] Ensure `has_flag`, `has_review`, and `is_outlier` are never null in `default.vw_pin_sale` --- dbt/models/default/default.vw_pin_sale.sql | 8 ++++---- dbt/models/default/schema/default.vw_pin_sale.yml | 9 +++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 3db981b70..bcc8bc9e5 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -263,7 +263,7 @@ SELECT unique_sales.sale_filter_same_sale_within_365, unique_sales.sale_filter_less_than_10k, unique_sales.sale_filter_deed_type, - outlier.is_outlier AS sale_filter_is_outlier, + COALESCE(outlier.is_outlier, FALSE) AS sale_filter_is_outlier, mydec_sales.mydec_deed_type, mydec_sales.sale_filter_ptax_flag, mydec_sales.mydec_property_advertised, @@ -297,7 +297,7 @@ SELECT mydec_sales.mydec_homestead_exemption_general_alternative, mydec_sales.mydec_homestead_exemption_senior_citizens, mydec_sales.mydec_homestead_exemption_senior_citizens_assessment_freeze, - outlier.has_flag, + COALESCE(outlier.has_flag, FALSE) AS has_flag, outlier.flag_is_outlier, outlier.flag_is_ptax_outlier, outlier.flag_is_heuristic_outlier, @@ -306,13 +306,13 @@ SELECT outlier.flag_outlier_reason3, outlier.flag_run_id, outlier.flag_version, - outlier.has_review, + COALESCE(outlier.has_review, FALSE) AS has_review, outlier.review_is_arms_length, outlier.review_is_flip, outlier.review_has_class_change, outlier.review_has_characteristic_change, outlier.review_json, - outlier.is_outlier, + COALESCE(outlier.is_outlier, FALSE) AS is_outlier, outlier.outlier_reason FROM unique_sales LEFT JOIN mydec_sales diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 91624a8a4..c6f7bfcdd 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -29,14 +29,23 @@ models: description: '{{ doc("shared_column_sv_version") }}' - name: has_flag description: '{{ doc("shared_column_has_flag") }}' + data_tests: + - not_null: + name: default_vw_pin_sale_has_flag_not_null - name: has_review description: '{{ doc("shared_column_has_review") }}' + data_tests: + - not_null: + name: default_vw_pin_sale_has_review_not_null - name: is_multisale description: '{{ doc("shared_column_sale_is_multisale") }}' - name: is_mydec_date description: Indicator for whether or not the observation uses the MyDec sale date - name: is_outlier description: '{{ doc("shared_column_is_outlier") }}' + data_tests: + - not_null: + name: default_vw_pin_sale_is_outlier_not_null - name: mydec_deed_type description: Deed type from MyDec, more granular than CCAO deed type - name: mydec_line_8_current_use