diff --git a/dbt/models/default/default.vw_pin_sale.sql b/dbt/models/default/default.vw_pin_sale.sql index 14430d981..bcc8bc9e5 100644 --- a/dbt/models/default/default.vw_pin_sale.sql +++ b/dbt/models/default/default.vw_pin_sale.sql @@ -220,33 +220,6 @@ mydec_sales AS ( joined back onto unique_sales will create duplicates by pin/sale date. */ WHERE num_single_day_sales = 1 OR (year_of_sale > '2020') -), - -max_version_flag AS ( - SELECT - meta_sale_document_num, - MAX(version) AS max_version - FROM {{ source('sale', 'flag') }} - GROUP BY meta_sale_document_num -), - -sales_val AS ( - SELECT - sf.meta_sale_document_num, - sf.sv_is_outlier, - sf.sv_is_ptax_outlier, - sf.sv_is_heuristic_outlier, - sf.sv_outlier_reason1, - sf.sv_outlier_reason2, - sf.sv_outlier_reason3, - sf.run_id AS sv_run_id, - sf.version AS sv_version - FROM - {{ source('sale', 'flag') }} - AS sf - INNER JOIN max_version_flag AS mv - ON sf.meta_sale_document_num = mv.meta_sale_document_num - AND sf.version = mv.max_version ) SELECT @@ -290,11 +263,7 @@ SELECT unique_sales.sale_filter_same_sale_within_365, unique_sales.sale_filter_less_than_10k, unique_sales.sale_filter_deed_type, - -- Our sales validation pipeline only validates sales past 2014 due to MyDec - -- limitations. Previous to that values for sv_is_outlier will be NULL, so - -- if we want to both exclude detected outliers and include sales prior to - -- 2014, we need to code everything NULL as FALSE. - COALESCE(sales_val.sv_is_outlier, FALSE) AS sale_filter_is_outlier, + COALESCE(outlier.is_outlier, FALSE) AS sale_filter_is_outlier, mydec_sales.mydec_deed_type, mydec_sales.sale_filter_ptax_flag, mydec_sales.mydec_property_advertised, @@ -328,50 +297,25 @@ SELECT mydec_sales.mydec_homestead_exemption_general_alternative, mydec_sales.mydec_homestead_exemption_senior_citizens, mydec_sales.mydec_homestead_exemption_senior_citizens_assessment_freeze, - sales_val.sv_is_outlier, - sales_val.sv_is_ptax_outlier, - sales_val.sv_is_heuristic_outlier, - sales_val.sv_outlier_reason1, - sales_val.sv_outlier_reason2, - sales_val.sv_outlier_reason3, - sales_val.sv_run_id, - sales_val.sv_version, - flag_override.is_arms_length, - flag_override.is_flip, - flag_override.has_class_change, - flag_override.has_characteristic_change, - flag_override.requires_field_check, - CASE - -- If there is an override, use override logic - -- If neither override nor sv_is_outlier is populated, leave null - WHEN - flag_override.is_arms_length IS NOT NULL - OR flag_override.is_flip IS NOT NULL - OR flag_override.has_class_change IS NOT NULL - OR flag_override.has_characteristic_change IS NOT NULL - OR flag_override.requires_field_check IS NOT NULL - THEN ( - -- COALESCE is required here because the boolean logic is - -- three-valued (TRUE / FALSE / NULL). When overrides exist - -- but some override columns are NULL, expressions like FALSE - -- OR NULL evaluate to NULL, which would incorrectly return - -- is_outlier = NULL instead of FALSE. - COALESCE(flag_override.is_arms_length = FALSE, FALSE) - OR COALESCE(flag_override.is_flip = TRUE, FALSE) - OR COALESCE(flag_override.has_class_change = TRUE, FALSE) - OR COALESCE( - flag_override.has_characteristic_change = 'yes_major', FALSE - ) - OR COALESCE(flag_override.requires_field_check = TRUE, FALSE) - ) - -- If there is no override, default to sv_is_outlier - WHEN sales_val.sv_is_outlier IS NOT NULL - THEN sales_val.sv_is_outlier - END AS is_outlier + COALESCE(outlier.has_flag, FALSE) AS has_flag, + outlier.flag_is_outlier, + outlier.flag_is_ptax_outlier, + outlier.flag_is_heuristic_outlier, + outlier.flag_outlier_reason1, + outlier.flag_outlier_reason2, + outlier.flag_outlier_reason3, + outlier.flag_run_id, + outlier.flag_version, + COALESCE(outlier.has_review, FALSE) AS has_review, + outlier.review_is_arms_length, + outlier.review_is_flip, + outlier.review_has_class_change, + outlier.review_has_characteristic_change, + outlier.review_json, + COALESCE(outlier.is_outlier, FALSE) AS is_outlier, + outlier.outlier_reason FROM unique_sales LEFT JOIN mydec_sales ON unique_sales.doc_no = mydec_sales.doc_no -LEFT JOIN sales_val - ON unique_sales.doc_no = sales_val.meta_sale_document_num -LEFT JOIN {{ source('sale', 'flag_override') }} AS flag_override - ON unique_sales.doc_no = flag_override.doc_no +LEFT JOIN {{ ref('sale.vw_outlier') }} AS outlier + ON unique_sales.doc_no = outlier.doc_no diff --git a/dbt/models/default/schema/default.vw_pin_sale.yml b/dbt/models/default/schema/default.vw_pin_sale.yml index 9ce4e3ffd..c6f7bfcdd 100644 --- a/dbt/models/default/schema/default.vw_pin_sale.yml +++ b/dbt/models/default/schema/default.vw_pin_sale.yml @@ -11,25 +11,41 @@ models: description: '{{ doc("shared_column_deed_type") }}' - name: doc_no description: '{{ doc("shared_column_document_number") }}' - - name: has_characteristic_change - description: '{{ doc("shared_column_has_characteristic_change") }}' - - name: has_class_change - description: '{{ doc("shared_column_has_class_change") }}' - - name: is_arms_length - description: '{{ doc("shared_column_is_arms_length") }}' - - name: is_flip - description: '{{ doc("shared_column_is_flip") }}' + - name: flag_is_heuristic_outlier + description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' + - name: flag_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' + - name: flag_is_ptax_outlier + description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' + - name: flag_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: flag_version + description: '{{ doc("shared_column_sv_version") }}' + - name: has_flag + description: '{{ doc("shared_column_has_flag") }}' + data_tests: + - not_null: + name: default_vw_pin_sale_has_flag_not_null + - name: has_review + description: '{{ doc("shared_column_has_review") }}' + data_tests: + - not_null: + name: default_vw_pin_sale_has_review_not_null - name: is_multisale description: '{{ doc("shared_column_sale_is_multisale") }}' - name: is_mydec_date description: Indicator for whether or not the observation uses the MyDec sale date - name: is_outlier - description: | - The final determination indicating whether a sale is a statistical - outlier that should be excluded from model training. - - Combines information from our sales validation model with information - from analysts who review our outliers for correctness. + description: '{{ doc("shared_column_is_outlier") }}' + data_tests: + - not_null: + name: default_vw_pin_sale_is_outlier_not_null - name: mydec_deed_type description: Deed type from MyDec, more granular than CCAO deed type - name: mydec_line_8_current_use @@ -72,16 +88,28 @@ models: description: '{{ doc("shared_column_nbhd_code") }}' - name: num_parcels_sale description: '{{ doc("shared_column_num_parcels_sale") }}' + - name: outlier_reason + description: '{{ doc("shared_column_outlier_reason") }}' - name: pin description: '{{ doc("shared_column_pin") }}' - name: requires_field_check description: '{{ doc("shared_column_requires_field_check") }}' + - name: review_has_characteristic_change + description: '{{ doc("shared_column_has_characteristic_change") }}' + - name: review_has_class_change + description: '{{ doc("shared_column_has_class_change") }}' + - name: review_is_arms_length + description: '{{ doc("shared_column_is_arms_length") }}' + - name: review_is_flip + description: '{{ doc("shared_column_is_flip") }}' + - name: review_json + description: '{{ doc("shared_column_review_json") }}' - name: sale_date description: '{{ doc("shared_column_sale_date") }}' - name: sale_filter_deed_type description: Remove quit claim, executor, beneficiary and missing deed types - name: sale_filter_is_outlier - description: Same as `sv_is_outlier`, but `NULL` values are replaced with `FALSE` + description: Deprecated, prefer `is_outlier` - name: sale_filter_less_than_10k description: Indicator for whether sale is less than $10K FMW - name: sale_filter_same_sale_within_365 @@ -92,16 +120,6 @@ models: description: '{{ doc("shared_column_sale_price") }}' - name: seller_name description: '{{ doc("shared_column_seller_name") }}' - - name: sv_is_heuristic_outlier - description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' - - name: sv_is_outlier - description: '{{ doc("shared_column_sv_is_outlier") }}' - - name: sv_is_ptax_outlier - description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' - - name: sv_run_id - description: '{{ doc("shared_column_sv_run_id") }}' - - name: sv_version - description: '{{ doc("shared_column_sv_version") }}' - name: township_code description: '{{ doc("shared_column_township_code") }}' - name: year diff --git a/dbt/models/model/schema.yml b/dbt/models/model/schema.yml index 298e7134f..80493c29c 100644 --- a/dbt/models/model/schema.yml +++ b/dbt/models/model/schema.yml @@ -240,6 +240,21 @@ models: config: tags: - load_manual + columns: + - name: sv_is_outlier + description: '{{ doc("shared_column_is_outlier") }}' + - name: sv_outlier_reason + description: '{{ doc("shared_column_outlier_reason") }}' + - name: sv_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_review_json + description: '{{ doc("shared_column_review_json") }}' + - name: sv_run_id + description: '{{ doc("shared_column_sv_run_id") }}' tests: - unique_combination_of_columns: name: model_training_data_unique_card_doc_number_run_id diff --git a/dbt/models/reporting/reporting.vw_market_tracker.sql b/dbt/models/reporting/reporting.vw_market_tracker.sql index d5a853ae7..51a6cabdf 100644 --- a/dbt/models/reporting/reporting.vw_market_tracker.sql +++ b/dbt/models/reporting/reporting.vw_market_tracker.sql @@ -66,14 +66,14 @@ SELECT vps.sale_price, vps.sale_date, vps.sale_filter_is_outlier, - vps.sv_is_outlier, + vps.flag_is_outlier AS sv_is_outlier, vps.is_multisale, vps.sale_filter_same_sale_within_365, vps.sale_filter_less_than_10k, vps.sale_filter_deed_type, - vps.sv_outlier_reason1, - vps.sv_outlier_reason2, - vps.sv_outlier_reason3, + vps.flag_outlier_reason1 AS sv_outlier_reason1, + vps.flag_outlier_reason2 AS sv_outlier_reason2, + vps.flag_outlier_reason3 AS sv_outlier_reason3, -- Cards 1 and 2 are card numbers for the first two residential cards for a -- parcel. They will not always be values of 1 and 2. vrc1.card AS card1, diff --git a/dbt/models/sale/docs.md b/dbt/models/sale/docs.md index 012e936ad..068a36164 100644 --- a/dbt/models/sale/docs.md +++ b/dbt/models/sale/docs.md @@ -11,9 +11,9 @@ should be possible to reconstruct using the other sales validation tables: **Primary Key**: `meta_sale_document_number`, `run_id`, `version` {% enddocs %} -# flag_override +# flag_review -{% docs flag_override %} +{% docs flag_review %} Data built by manual review by analysts that determine whether or not we should include sales in the model. @@ -66,6 +66,20 @@ including the statistical bounds, groupings, window sizes, etc. **Primary Key**: `run_id` {% enddocs %} +# vw_flag + +{% docs vw_flag %} +PIN-level sales validation flags created by +[model-sales-val](https://github.com/ccao-data/model-sales-val). + +This view derives the most recent version of flags for each sale in the +`sale.flag` table, which uses its `version` column as a [type 2 slowly changing +dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row). +As such, this view is unique by `doc_no`. + +**Primary Key**: `doc_no` +{% enddocs %} + # vw_flag_group {% docs vw_flag_group %} @@ -92,3 +106,26 @@ with iasWorld. **Primary Key**: `salekey`, `run_id` {% enddocs %} + +# vw_outlier + +{% docs vw_outlier %} + +View that combines `sale.flag` and `sale.flag_review` to produce one +unified view of all sales validation information for a sale based on its +doc number. + +**Nuance**: Unlike the constituent tables `sale.flag` and `sale.flag_review`, +the determination columns in this view (like `flag_is_outlier` or +`review_has_class_change`) will never be null, even if the sale was not flagged +or was not reviewed by an analyst. This is intended to ease the process of using +these columns for boolean logic, so that we never have to handle the case where +a boolean comparison could return null unexpectedly. However, it introduces the +potential for confusion, in that a sale could have a not-null value for a +determination column that does not actually correspond to a decision made by +our algorithm or a reviewer. To determine whether a value in a determination +column corresponds to a real decision made by our algorithm or a reviewer, use the +`has_flag` and `has_review` columns. + +**Primary Key**: `doc_no` +{% enddocs %} diff --git a/dbt/models/sale/sale.vw_flag.sql b/dbt/models/sale/sale.vw_flag.sql new file mode 100644 index 000000000..53536aa85 --- /dev/null +++ b/dbt/models/sale/sale.vw_flag.sql @@ -0,0 +1,23 @@ +-- View that derives the most recent version of flags for each sale in the +-- `sale.flag` table, which uses its `version` column as a type 2 +-- slowly changing dimension +SELECT + sf.meta_sale_document_num AS doc_no, + sf.sv_is_outlier, + sf.sv_is_ptax_outlier, + sf.sv_is_heuristic_outlier, + sf.sv_outlier_reason1, + sf.sv_outlier_reason2, + sf.sv_outlier_reason3, + sf.run_id, + sf.version +FROM {{ source('sale', 'flag') }} AS sf +INNER JOIN ( + SELECT + meta_sale_document_num, + MAX(version) AS max_version + FROM {{ source('sale', 'flag') }} + GROUP BY meta_sale_document_num +) AS mv + ON sf.meta_sale_document_num = mv.meta_sale_document_num + AND sf.version = mv.max_version diff --git a/dbt/models/sale/sale.vw_ias_salesval_upload.sql b/dbt/models/sale/sale.vw_ias_salesval_upload.sql index aa5d95a10..9f83f8d16 100644 --- a/dbt/models/sale/sale.vw_ias_salesval_upload.sql +++ b/dbt/models/sale/sale.vw_ias_salesval_upload.sql @@ -1,18 +1,10 @@ WITH ias_sales AS ( SELECT salekey, - NULLIF(REPLACE(instruno, 'D', ''), '') AS instruno_clean + NULLIF(REPLACE(instruno, 'D', ''), '') AS doc_no FROM {{ source('iasworld', 'sales') }} WHERE cur = 'Y' AND deactivat IS NULL -), - -max_version AS ( - SELECT - meta_sale_document_num, - MAX(version) AS max_version - FROM {{ source('sale', 'flag') }} - GROUP BY meta_sale_document_num ) SELECT @@ -23,8 +15,5 @@ SELECT sf.sv_outlier_reason3, sf.run_id FROM ias_sales -INNER JOIN {{ source('sale', 'flag') }} AS sf - ON ias_sales.instruno_clean = sf.meta_sale_document_num -INNER JOIN max_version AS mv - ON sf.meta_sale_document_num = mv.meta_sale_document_num - AND sf.version = mv.max_version; +INNER JOIN {{ ref('sale.vw_flag') }} AS sf + ON ias_sales.doc_no = sf.doc_no diff --git a/dbt/models/sale/sale.vw_outlier.sql b/dbt/models/sale/sale.vw_outlier.sql new file mode 100644 index 000000000..6ad9edbb3 --- /dev/null +++ b/dbt/models/sale/sale.vw_outlier.sql @@ -0,0 +1,242 @@ +-- View that combines `sale.flag` and `sale.flag_review` to produce one +-- unified view of all sales validation information for a sale based on its +-- doc number + +-- Start by combining the algorithmic flags with the human reviewer +-- determinations. To make boolean logic simpler (avoiding conditions +-- unexpectedly evaluating to NULL), we coalesce all of the determination fields +-- to FALSE in cases where the sale has not been reviewed or flagged, and +-- instead provide the boolean columns `has_review` and `has_flag` so that +-- consumers can disambiguate FALSE determinations from missing review/flags +WITH flag_and_review AS ( + SELECT + COALESCE(flag.doc_no, review.doc_no) AS doc_no, + flag.sv_is_outlier IS NOT NULL AS has_flag, + COALESCE(flag.sv_is_outlier, FALSE) AS flag_is_outlier, + COALESCE(flag.sv_is_ptax_outlier, FALSE) AS flag_is_ptax_outlier, + COALESCE(flag.sv_is_heuristic_outlier, FALSE) + AS flag_is_heuristic_outlier, + flag.sv_outlier_reason1 AS flag_outlier_reason1, + flag.sv_outlier_reason2 AS flag_outlier_reason2, + flag.sv_outlier_reason3 AS flag_outlier_reason3, + -- Convenience column that combines all flag reasons into one array + -- for easier logic in queries that consume this CTE. We don't select + -- this column in the output of this view because it's redundant + FILTER( + ARRAY[ + flag.sv_outlier_reason1, + flag.sv_outlier_reason2, + flag.sv_outlier_reason3 + ], + r -> r IS NOT NULL + ) AS flag_outlier_reasons, + flag.run_id AS flag_run_id, + flag.version AS flag_version, + NOT COALESCE( + review.has_class_change IS NULL + AND review.has_characteristic_change IS NULL + AND review.is_arms_length IS NULL + AND review.is_flip IS NULL, + FALSE + ) AS has_review, + COALESCE(review.is_arms_length, TRUE) AS review_is_arms_length, + COALESCE(review.is_flip, FALSE) AS review_is_flip, + COALESCE(review.has_class_change, FALSE) AS review_has_class_change, + COALESCE(review.has_characteristic_change, 'no') + AS review_has_characteristic_change, + COALESCE( + review.has_class_change + OR review.has_characteristic_change = 'yes_major', + FALSE + ) AS review_has_major_characteristic_change, + JSON_OBJECT( -- noqa: disable=CP02,RF02 + 'is_arms_length' VALUE is_arms_length, + 'is_flip' VALUE is_flip, + 'has_class_change' VALUE has_class_change, + 'has_characteristic_change' VALUE has_characteristic_change + ) AS review_json -- noqa: enable=CP02,RF02 + FROM {{ ref('sale.vw_flag') }} AS flag + FULL OUTER JOIN {{ source('sale', 'flag_review') }} AS review + ON flag.doc_no = review.doc_no +), + +{% set price_outlier_reasons = [ + "High price", + "High price per square foot", + "Low price", + "Low price per square foot", +] -%} + +-- Compare algorithmic flags to human review in order to make a final +-- determination about whether the sale is an outlier. Since we need to explain +-- our outlier determinations to data consumers, we start by producing verbose +-- reasons for our outlier determinations, and then when we consume this CTE +-- we will cast these reasons to a boolean flag that is easier to filter +outlier_reason AS ( + SELECT + *, + CASE + -- Our sales validation pipeline includes algorithmic sale flagging + -- alongside human review. If a human reviewed the sale, we want to + -- consider those results first, because we weight those + -- determinations more strongly than the algorithmic flags + WHEN has_review + THEN + CASE + -- If a reviewer found a major characteristic change, it + -- should always indicate an outlier, regardless of what + -- the algorithm found. This is because incorrect major + -- characteristics can bias our valuation models + WHEN review_has_major_characteristic_change + THEN + 'Review: Major Characteristic Change' + -- Flips and non-arm's-length sales only indicate outliers + -- if the algorithm found the sale price to be unusual. + -- This is because flips and non-arm's-length sales are + -- only problematic if the sale is unrepresentative of the + -- market; if the flip brings the property up to standard + -- for the market, or if a non-arm's-length sale is close + -- to market price, then the information from that sale is + -- still useful for our valuation models + WHEN has_flag AND flag_is_outlier + THEN + CASE + WHEN review_is_flip + THEN + CASE + {%- for price_outlier_reason in price_outlier_reasons %} -- noqa: LT05 + WHEN CONTAINS( + flag_outlier_reasons, + '{{ price_outlier_reason }}' + ) + THEN 'Review: Flip, Algorithm: {{ price_outlier_reason }}' -- noqa: LT05 + {%- endfor %} + ELSE 'Review: Flip' + END + WHEN NOT review_is_arms_length + THEN + CASE + {%- for price_outlier_reason in price_outlier_reasons %} -- noqa: LT05 + WHEN CONTAINS( + flag_outlier_reasons, + '{{ price_outlier_reason }}' + ) + THEN 'Review: Non-Arms-Length, Algorithm: {{ price_outlier_reason }}' -- noqa: LT05 + {%- endfor %} + ELSE 'Review: Non-Arms-Length' + END + ELSE + -- If we reach this branch, then the reviewer + -- did not find anything unusual about the + -- sale, which means the reviewer and the + -- algorithm disagree about whether the sale + -- is an outlier. In these cases, we choose to + -- trust the reviewer's determination + 'Review: Valid Sale' + END + ELSE + -- If we reach this branch, we can be confident that + -- the sale does not have major characteristic errors, + -- and that the sales algorithm did not flag the sale + -- as having an unusual sale price. In this case, it + -- doesn't matter if the reviewer found the sale to be + -- a flip or a non-arm's-length transaction, since the + -- sale is in a sense "typical" enough to contain + -- relevant information for our valuation models. + -- + -- Note that a sale can also reach this branch if its + -- algorithmic flag group did not have enough sales in + -- it to reach statistical significance, or if the + -- algorithm has not evaluated this sale yet. Though + -- these two conditions are semantically different from + -- the case where the algorithm evaluated the sale and + -- found it to have a typical sale price (i.e. it is + -- not an algorithmic outlier), we lump all of these + -- conditions together for the purpose of this + -- conditional branch, since we want to default to + -- including a sale if we don't have enough information + -- to decide if its sale price was atypical + 'Review: Valid Sale' + END + -- If a reviewer has not looked at the sale, but the algorithm + -- has evaluated it, then we will use the algorithm's decision + -- as the basis for our outlier determination. If neither a + -- reviewer nor the algorithm has evaluated the sale, then the + -- outer CASE statement will fall through and return null + WHEN + has_flag + THEN + CASE + WHEN flag_is_outlier + THEN + CASE + -- The validation algorithm produces its own + -- set of reasons, so if these exist, we want + -- to use them for the final outlier reason. + -- + -- It shouldn't be possible for the algorithm + -- to flag the sale as an outlier without + -- providing reasons for that decision, but + -- if this ever happens against our + -- expectation, we want to avoid a trailing + -- comma in the output + WHEN CARDINALITY(flag_outlier_reasons) > 0 + THEN + CONCAT( + 'Algorithm: Outlier Sale, ', + ARRAY_JOIN( + flag_outlier_reasons, ', ' + ) + ) + ELSE + 'Algorithm: Outlier Sale' + END + ELSE + 'Algorithm: Valid Sale' + END + END AS outlier_reason + FROM flag_and_review +) + +SELECT + doc_no, + has_flag, + flag_is_outlier, + flag_is_ptax_outlier, + flag_is_heuristic_outlier, + flag_outlier_reason1, + flag_outlier_reason2, + flag_outlier_reason3, + flag_run_id, + flag_version, + has_review, + review_is_arms_length, + review_is_flip, + review_has_class_change, + review_has_characteristic_change, + review_json, + outlier_reason, + -- Cast the verbose outlier reasons to a boolean flag for easier filtering. + -- See the comments on the logic that produces the outlier reason column + -- above to clarify why these particular values indicate outliers or + -- valid sales + CASE + WHEN outlier_reason = 'Review: Major Characteristic Change' + OR outlier_reason LIKE 'Review: Non-Arms-Length%' + OR outlier_reason LIKE 'Review: Flip%' + OR outlier_reason LIKE 'Algorithm: Outlier Sale%' + THEN TRUE + WHEN outlier_reason IN ( + 'Review: Valid Sale', + 'Algorithm: Valid Sale' + ) + THEN FALSE + -- Default to considering the sale to be valid if neither a human + -- reviewer nor our algorithm has evaluated the sale. This should + -- only apply to sales that are so old they are not relevant for + -- modeling, or sales that are later than our modeling lien date + -- (in which case we do not need to flag or review them yet) + WHEN outlier_reason IS NULL + THEN FALSE + END AS is_outlier +FROM outlier_reason diff --git a/dbt/models/sale/schema.yml b/dbt/models/sale/schema.yml index 5f9f06544..a5f74a11e 100644 --- a/dbt/models/sale/schema.yml +++ b/dbt/models/sale/schema.yml @@ -50,8 +50,8 @@ sources: meta: description: flag is unique by meta_sale_document_num and version - - name: flag_override - description: '{{ doc("flag_override") }}' + - name: flag_review + description: '{{ doc("flag_review") }}' columns: - name: doc_no description: '{{ doc("shared_column_document_number") }}' @@ -77,7 +77,7 @@ sources: a planned change to the home sometime soon in the future. data_tests: - unique_combination_of_columns: - name: sale_flag_override_unique_by_doc_no + name: sale_flag_review_unique_by_doc_no combination_of_columns: - doc_no @@ -245,6 +245,42 @@ sources: mydec is unique by document_number for non multi-sales models: + - name: sale.vw_flag + description: '{{ doc("vw_flag") }}' + + columns: + - name: doc_no + description: '{{ doc("shared_column_document_number") }}' + - name: run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: sv_is_heuristic_outlier + description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' + - name: sv_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' + - name: sv_is_ptax_outlier + description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' + - name: sv_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sv_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: version + description: '{{ doc("shared_column_sv_version") }}' + + data_tests: + - row_count: + name: sale_vw_flag_row_count_matches_sale_flag + equals: > + ( + SELECT COUNT(DISTINCT(meta_sale_document_num)) + FROM {{ source('sale', 'flag') }} + ) + - unique_combination_of_columns: + name: sale_vw_flag_unique_by_doc_no + combination_of_columns: + - doc_no + - name: sale.vw_ias_salesval_upload description: '{{ doc("vw_ias_salesval_upload") }}' data_tests: @@ -269,6 +305,7 @@ models: description: '{{ doc("shared_column_sv_outlier_reason") }}' - name: sv_outlier_reason3 description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: sale.vw_flag_group description: '{{ doc("vw_flag_group") }}' @@ -325,3 +362,303 @@ models: # more details: https://github.com/ccao-data/data-architecture/pull/913#discussion_r2475427865 # If there are any more duplicates, we will be pushed over 4 and this test will flag it error_if: ">4" + + - name: sale.vw_outlier + description: '{{ doc("vw_outlier") }}' + + columns: + - name: doc_no + description: '{{ doc("shared_column_document_number") }}' + - name: flag_is_heuristic_outlier + description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}' + - name: flag_is_outlier + description: '{{ doc("shared_column_sv_is_outlier") }}' + - name: flag_is_ptax_outlier + description: '{{ doc("shared_column_sv_is_ptax_outlier") }}' + - name: flag_outlier_reason1 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason2 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_outlier_reason3 + description: '{{ doc("shared_column_sv_outlier_reason") }}' + - name: flag_run_id + description: '{{ doc("shared_column_sv_run_id") }}' + - name: flag_version + description: '{{ doc("shared_column_sv_version") }}' + - name: has_flag + description: '{{ doc("shared_column_has_flag") }}' + - name: has_review + description: '{{ doc("shared_column_has_review") }}' + - name: is_outlier + description: '{{ doc("shared_column_is_outlier") }}' + data_tests: + - expression_is_true: + name: sale_vw_outlier_is_outlier_is_false_when_outlier_reason_is_null + expression: = FALSE + config: + where: outlier_reason IS NULL + - not_null: + name: sale_vw_outlier_is_outlier_not_null + - name: outlier_reason + description: '{{ doc("shared_column_outlier_reason") }}' + data_tests: + - is_null: + name: sale_vw_outlier_outlier_reason_is_null_when_not_has_flag_and_not_has_review + config: + where: NOT (has_flag OR has_review) + - not_null: + name: sale_vw_outlier_outlier_reason_not_null_when_is_outlier_or_has_flag_or_has_review + config: + where: is_outlier OR has_flag OR has_review + - name: review_has_characteristic_change + description: '{{ doc("shared_column_has_characteristic_change") }}' + - name: review_has_class_change + description: '{{ doc("shared_column_has_class_change") }}' + - name: review_is_arms_length + description: '{{ doc("shared_column_is_arms_length") }}' + - name: review_is_flip + description: '{{ doc("shared_column_is_flip") }}' + - name: review_json + description: '{{ doc("shared_column_review_json") }}' + + data_tests: + - row_count: + name: sale_vw_outlier_row_count_matches_sale_flag_and_sale_review + equals: > + ( + SELECT COUNT(*) + FROM ( + SELECT DISTINCT(meta_sale_document_num) AS doc_no + FROM {{ source('sale', 'flag') }} + UNION + SELECT DISTINCT(doc_no) AS doc_no + FROM {{ source('sale', 'flag_review') }} + ) + ) + - unique_combination_of_columns: + name: sale_vw_outlier_unique_by_doc_no + combination_of_columns: + - doc_no + +unit_tests: + - name: sale_vw_outlier_produces_correct_outliers_and_reasons + description: View should produce correct outliers with associated reasons + model: sale.vw_outlier + given: + - input: ref("sale.vw_flag") + rows: + - doc_no: flag_outlier_null_no_review + - doc_no: flag_outlier_null_has_class_change + - doc_no: flag_outlier_null_has_major_characteristic_change + - doc_no: flag_outlier_null_has_minor_characteristic_change + - doc_no: flag_outlier_null_no_characteristic_change + - doc_no: flag_outlier_null_is_not_arms_length + - doc_no: flag_outlier_null_is_flip + - doc_no: flag_outlier_false_no_review + sv_is_outlier: false + - doc_no: flag_outlier_false_has_class_change + sv_is_outlier: false + - doc_no: flag_outlier_false_has_major_characteristic_change + sv_is_outlier: false + - doc_no: flag_outlier_false_has_minor_characteristic_change + sv_is_outlier: false + - doc_no: flag_outlier_false_no_characteristic_change + sv_is_outlier: false + - doc_no: flag_outlier_false_is_not_arms_length + sv_is_outlier: false + - doc_no: flag_outlier_false_is_flip + sv_is_outlier: false + - doc_no: flag_outlier_true_no_review_no_reasons + sv_is_outlier: true + - doc_no: flag_outlier_true_no_review_with_reasons + sv_is_outlier: true + sv_outlier_reason1: 'High Price' + sv_outlier_reason2: 'Statistical Anomaly' + - doc_no: flag_outlier_true_has_class_change + sv_is_outlier: true + - doc_no: flag_outlier_true_has_major_characteristic_change + sv_is_outlier: true + - doc_no: flag_outlier_true_has_minor_characteristic_change + sv_is_outlier: true + - doc_no: flag_outlier_true_no_characteristic_change + sv_is_outlier: true + - doc_no: flag_outlier_true_is_not_arms_length_no_price_reasons + sv_is_outlier: true + - doc_no: flag_outlier_true_is_not_arms_length_high_price + sv_is_outlier: true + sv_outlier_reason1: High price per square foot # View should ignore + sv_outlier_reason2: High price + - doc_no: flag_outlier_true_is_not_arms_length_high_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason2: High price per square foot + - doc_no: flag_outlier_true_is_not_arms_length_low_price + sv_is_outlier: true + sv_outlier_reason3: Low price + - doc_no: flag_outlier_true_is_not_arms_length_low_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason1: Statistical Anomaly # View should ignore + sv_outlier_reason2: Low price per square foot + - doc_no: flag_outlier_true_is_flip_no_price_reasons + sv_is_outlier: true + - doc_no: flag_outlier_true_is_flip_high_price + sv_is_outlier: true + sv_outlier_reason1: High price + - doc_no: flag_outlier_true_is_flip_high_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason1: High price per square foot + - doc_no: flag_outlier_true_is_flip_low_price + sv_is_outlier: true + sv_outlier_reason1: Low price + - doc_no: flag_outlier_true_is_flip_low_price_per_sq_ft + sv_is_outlier: true + sv_outlier_reason1: Low price per square foot + - input: source("sale", "flag_review") + rows: + - doc_no: flag_outlier_null_no_review + - doc_no: flag_outlier_null_has_class_change + has_class_change: true + - doc_no: flag_outlier_null_has_major_characteristic_change + has_characteristic_change: yes_major + - doc_no: flag_outlier_null_has_minor_characteristic_change + has_characteristic_change: yes_minor + - doc_no: flag_outlier_null_no_characteristic_change + has_characteristic_change: no + - doc_no: flag_outlier_null_is_not_arms_length + is_arms_length: false + - doc_no: flag_outlier_null_is_flip + is_flip: true + - doc_no: flag_outlier_false_no_review + - doc_no: flag_outlier_false_has_class_change + has_class_change: true + - doc_no: flag_outlier_false_has_major_characteristic_change + has_characteristic_change: yes_major + - doc_no: flag_outlier_false_has_minor_characteristic_change + has_characteristic_change: yes_minor + - doc_no: flag_outlier_false_no_characteristic_change + has_characteristic_change: no + - doc_no: flag_outlier_false_is_not_arms_length + is_arms_length: false + - doc_no: flag_outlier_false_is_flip + is_flip: true + - doc_no: flag_outlier_true_no_review_no_reasons + - doc_no: flag_outlier_true_no_review_with_reasons + - doc_no: flag_outlier_true_has_class_change + has_class_change: true + - doc_no: flag_outlier_true_has_major_characteristic_change + has_characteristic_change: yes_major + - doc_no: flag_outlier_true_has_minor_characteristic_change + has_characteristic_change: yes_minor + - doc_no: flag_outlier_true_no_characteristic_change + has_characteristic_change: no + - doc_no: flag_outlier_true_is_not_arms_length_no_price_reasons + is_arms_length: false + - doc_no: flag_outlier_true_is_not_arms_length_high_price + is_arms_length: false + - doc_no: flag_outlier_true_is_not_arms_length_high_price_per_sq_ft + is_arms_length: false + - doc_no: flag_outlier_true_is_not_arms_length_low_price + is_arms_length: false + - doc_no: flag_outlier_true_is_not_arms_length_low_price_per_sq_ft + is_arms_length: false + - doc_no: flag_outlier_true_is_flip_no_price_reasons + is_flip: true + - doc_no: flag_outlier_true_is_flip_high_price + is_flip: true + - doc_no: flag_outlier_true_is_flip_high_price_per_sq_ft + is_flip: true + - doc_no: flag_outlier_true_is_flip_low_price + is_flip: true + - doc_no: flag_outlier_true_is_flip_low_price_per_sq_ft + is_flip: true + expect: + rows: + - doc_no: flag_outlier_null_no_review + is_outlier: false + outlier_reason: null + - doc_no: flag_outlier_null_has_class_change + is_outlier: true + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_null_has_major_characteristic_change + is_outlier: true + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_null_has_minor_characteristic_change + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_no_characteristic_change + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_is_not_arms_length + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_null_is_flip + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_no_review + is_outlier: false + outlier_reason: 'Algorithm: Valid Sale' + - doc_no: flag_outlier_false_has_class_change + is_outlier: true + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_false_has_major_characteristic_change + is_outlier: true + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_false_has_minor_characteristic_change + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_no_characteristic_change + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_is_not_arms_length + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_false_is_flip + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_no_review_no_reasons + is_outlier: true + outlier_reason: 'Algorithm: Outlier Sale' + - doc_no: flag_outlier_true_no_review_with_reasons + is_outlier: true + outlier_reason: 'Algorithm: Outlier Sale, High Price, Statistical Anomaly' + - doc_no: flag_outlier_true_has_class_change + is_outlier: true + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_true_has_major_characteristic_change + is_outlier: true + outlier_reason: 'Review: Major Characteristic Change' + - doc_no: flag_outlier_true_has_minor_characteristic_change + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_no_characteristic_change + is_outlier: false + outlier_reason: 'Review: Valid Sale' + - doc_no: flag_outlier_true_is_not_arms_length_no_price_reasons + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length' + - doc_no: flag_outlier_true_is_not_arms_length_high_price + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: High price' + - doc_no: flag_outlier_true_is_not_arms_length_high_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: High price per square foot' + - doc_no: flag_outlier_true_is_not_arms_length_low_price + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: Low price' + - doc_no: flag_outlier_true_is_not_arms_length_low_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Non-Arms-Length, Algorithm: Low price per square foot' + - doc_no: flag_outlier_true_is_flip_no_price_reasons + is_outlier: true + outlier_reason: 'Review: Flip' + - doc_no: flag_outlier_true_is_flip_high_price + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: High price' + - doc_no: flag_outlier_true_is_flip_high_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: High price per square foot' + - doc_no: flag_outlier_true_is_flip_low_price + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: Low price' + - doc_no: flag_outlier_true_is_flip_low_price_per_sq_ft + is_outlier: true + outlier_reason: 'Review: Flip, Algorithm: Low price per square foot' diff --git a/dbt/models/shared_columns.md b/dbt/models/shared_columns.md index 2509ecd87..6b5a65053 100644 --- a/dbt/models/shared_columns.md +++ b/dbt/models/shared_columns.md @@ -1699,6 +1699,29 @@ Analyst determination of whether or not the property had an incorrect class classification at time of sale. {% enddocs %} +## has_flag + +{% docs shared_column_has_flag %} +Whether the sale has algorithm flags in the `sale.flag` table. + +If this column is `TRUE`, then all of the `flag_*` columns will +contain useful, reliable information from the `sale.flag` table. +If this column is `FALSE`, however, then our sales validation +algorithm has not evaluated this sale, and you should not rely +on those columns. +{% enddocs %} + +## has_review + +{% docs shared_column_has_review %} +Whether the sale has an analyst review in the `sale.flag_review` table. + +If this column is `TRUE`, then all of the `review_*` columns will +contain useful, reliable information from the `sale.flag_review` table. +If this column is `FALSE`, however, then an analyst has not reviewed +this sale, and you should not rely on those columns. +{% enddocs %} + ## is_arms_length {% docs shared_column_is_arms_length %} @@ -1713,6 +1736,51 @@ Analyst determination of whether or not the sale was a "flip". {% enddocs %} +## is_outlier + +{% docs shared_column_is_outlier %} +Whether this sale is an outlier, based on the combined information from our +sales validation algorithm and human reviewers. + +This column will default to `FALSE` if the sale has not been reviewed by our +sales validation algorithm or any human reviewers. +{% enddocs %} + +## outlier_reason + +{% docs shared_column_outlier_reason %} +Human-readable description of the reason behind this sale's outlier status. + +This column will be null if the sale has not been evaluated for outlier status +by our sales validation algorithm or our analyst review process. + +Possible values for this variable are: + +- `Review: Major Characteristic Change`: Outlier due to analyst reviewer finding + a major characteristic problem +- `Review: Non-Arms-Length, Algorithm: {High price|Low price|High price per square foot|Low price per square foot}`: + Outlier due to analyst reviewer finding the transaction was not arm's-length, + and the algorithm finding the sale price to be unusual. The string following + `Algorithm: ` indicates the specific reason that caused the algorithm to flag + the sale as having an unusual sale price. +- `Review: Flip, Algorithm: {High price|Low price|High price per square foot|Low price per square foot}`: + Outlier due to analyst reviewer finding the sale to be a flip, + and the algorithm finding the sale price to be unusual. The string following + `Algorithm: ` indicates the specific reason that caused the algorithm to flag + the sale as having an unusual sale price. +- `Review: Valid Sale`: Non-outlier due to analyst reviewer finding the sale to + be arm's-length and open-market with correct characteristics +- `Algorithm: Outlier Sale`: Outlier due to our sales validation algorithm + finding one or more issues with the sale. The exact reasons for the + algorithm's decision will be appended to the end of this value as a + comma-separated string, like + `Algorithm: Outlier Sale, High Price, Statistical Anomaly`. +- `Algorithm: Valid Sale`: Non-outlier due to our sales validation algorithm + finding no problems with the sale +- Null: The sale has not been evaluated by our sales validation algorithm or by + an analyst reviewer +{% enddocs %} + ## requires_field_check {% docs shared_column_requires_field_check %} @@ -1722,6 +1790,15 @@ in this column suggests that the analyst believes there has been completed work on the home. {% enddocs %} +## review_json + +{% docs shared_column_review_json %} +Archival object documenting the combined state of all sale review fields. + +The schema for this object is not guaranteed to be consistent over +time, so do not use it for serious analysis or programming tasks. +{% enddocs %} + ## sv_is_heuristic_outlier {% docs shared_column_sv_is_heuristic_outlier %} @@ -1764,8 +1841,8 @@ See [model-sales-val](https://github.com/ccao-data/model-sales-val) for more det {% docs shared_column_sv_outlier_reason %} One of three possible reasons that a sale is -flagged as on outlier. The priority for -sv_outlier_reason$n column filling is +flagged as an outlier by our algorithmic sales validation pipeline. +The priority for `sv_outlier_reason$n` column filling is ptax outlier > price outlier > characteristic outlier. See the [model-sales-val](https://github.com/ccao-data/model-sales-val) diff --git a/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_override.R b/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_review.R similarity index 90% rename from etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_override.R rename to etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_review.R index a7f90082b..36591327f 100644 --- a/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_override.R +++ b/etl/scripts-ccao-data-raw-us-east-1/sale/sale-flag_review.R @@ -8,7 +8,7 @@ library(writexl) src_dir <- "O:/CCAODATA/data/sale" # Output dir -s3_dir <- "s3://ccao-data-raw-us-east-1/sale/flag_override/" +s3_dir <- "s3://ccao-data-raw-us-east-1/sale/flag_review/" # List Excel files excel_files <- list.files( diff --git a/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_override.R b/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_review.R similarity index 98% rename from etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_override.R rename to etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_review.R index 50da6626c..e42220cb8 100644 --- a/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_override.R +++ b/etl/scripts-ccao-data-warehouse-us-east-1/sale/sale-flag_review.R @@ -17,7 +17,7 @@ source("utils.R") objs <- get_bucket( bucket = "ccao-data-raw-us-east-1", - prefix = "sale/flag_override/" + prefix = "sale/flag_review/" ) tmp_dir <- tempdir() @@ -228,7 +228,7 @@ dfs_ready_to_write <- purrr::imap( ) # Output dir -out_dir <- "s3://ccao-data-warehouse-us-east-1/sale/flag_override/" +out_dir <- "s3://ccao-data-warehouse-us-east-1/sale/flag_review/" purrr::iwalk( dfs_ready_to_write,