Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
988642b
First pass
wagnerlmichael Jan 13, 2026
13f850e
Edit comments
wagnerlmichael Jan 13, 2026
d66b74e
Add prefix and remove snake case
wagnerlmichael Jan 14, 2026
2b1362b
Make arms length negative
wagnerlmichael Jan 14, 2026
aae4e6d
Place analyst reasons before sv reasons
wagnerlmichael Jan 14, 2026
db05fa8
Comments and linting
wagnerlmichael Jan 14, 2026
9d16c80
Add source_is_outlier
wagnerlmichael Jan 15, 2026
408d606
Add docs
wagnerlmichael Jan 15, 2026
eaf6287
Adjust comment
wagnerlmichael Jan 15, 2026
530e5c0
Tweak comment
wagnerlmichael Jan 16, 2026
634097c
Refactor `default.vw_pin_sale.is_outlier` and `outlier_reason` logic
jeancochrane Jan 23, 2026
36e49d1
Move `is_outlier` and `outlier_reason` logic from `default.vw_pin_sal…
jeancochrane Jan 23, 2026
7d4ade5
Define `sale.vw_flag` view to record most recent flag for each review…
jeancochrane Jan 23, 2026
7b45684
Temporarily disable main branch restriction for `build_and_test_dbt` …
jeancochrane Jan 23, 2026
4957812
Rename `sale.flag_override` table to `sale.flag_review`
jeancochrane Jan 23, 2026
2a7f67e
WIP document new sale views
jeancochrane Jan 23, 2026
8f3615b
Document new outlier columns and views
jeancochrane Jan 23, 2026
9fe8905
Fix syntax errors in sale.vw_flag definition
jeancochrane Jan 23, 2026
e14b451
Add data tests for `sale.vw_outlier`
jeancochrane Jan 23, 2026
cf1b1a3
Fix a few more spots with `is_outlier`
jeancochrane Jan 24, 2026
27c75d6
Fixup market tracker and vw_ias_salesval_upload with vw_pin_sale changes
jeancochrane Jan 24, 2026
e9bbaf1
Merge branch 'master' into jeancochrane/fixup-is-outlier
jeancochrane Jan 24, 2026
abb4c08
Revert "Temporarily disable main branch restriction for `build_and_te…
jeancochrane Jan 24, 2026
786e5b9
Avoid errors in reporting.vw_market_tracker
jeancochrane Jan 24, 2026
d1d8371
Fix a couple docs
jeancochrane Jan 24, 2026
642737b
Fix error with `CARDINALITY` call in vw_market_tracker
jeancochrane Jan 24, 2026
34e3b37
Fix small logic bug and add unit tests to `sale.vw_outlier`
jeancochrane Jan 26, 2026
e512f5a
Fix unit test indentation for yamllint
jeancochrane Jan 26, 2026
b6562bc
Add algorithm flags to flip/non-arms-length reviewed sale outlier rea…
jeancochrane Jan 27, 2026
4b0f4fe
Refactor outlier reasons for more intuitive use in downstream consumers
jeancochrane Jan 27, 2026
e0da4d9
Factor out `review_json` docs to `shared_columns` and add it to `defa…
jeancochrane Jan 28, 2026
af0dcfc
Clarify comment on `is_outlier` logic in `sale.vw_outlier`
jeancochrane Jan 28, 2026
79dd34d
Document price outlier component of flip/non-arms-length `outlier_rea…
jeancochrane Jan 28, 2026
4608468
Document `sv_*` columns on `model.training_data`, for extra clarity
jeancochrane Jan 28, 2026
220bcbd
Ensure `has_flag`, `has_review`, and `is_outlier` are never null in `…
jeancochrane Jan 30, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 20 additions & 76 deletions dbt/models/default/default.vw_pin_sale.sql
Original file line number Diff line number Diff line change
Expand Up @@ -220,33 +220,6 @@ mydec_sales AS (
joined back onto unique_sales will create duplicates by pin/sale date. */
WHERE num_single_day_sales = 1
OR (year_of_sale > '2020')
),

max_version_flag AS (
SELECT
meta_sale_document_num,
MAX(version) AS max_version
FROM {{ source('sale', 'flag') }}
GROUP BY meta_sale_document_num
),

sales_val AS (
SELECT
sf.meta_sale_document_num,
sf.sv_is_outlier,
sf.sv_is_ptax_outlier,
sf.sv_is_heuristic_outlier,
sf.sv_outlier_reason1,
sf.sv_outlier_reason2,
sf.sv_outlier_reason3,
sf.run_id AS sv_run_id,
sf.version AS sv_version
FROM
{{ source('sale', 'flag') }}
AS sf
INNER JOIN max_version_flag AS mv
ON sf.meta_sale_document_num = mv.meta_sale_document_num
AND sf.version = mv.max_version
)

SELECT
Expand Down Expand Up @@ -290,11 +263,7 @@ SELECT
unique_sales.sale_filter_same_sale_within_365,
unique_sales.sale_filter_less_than_10k,
unique_sales.sale_filter_deed_type,
-- Our sales validation pipeline only validates sales past 2014 due to MyDec
-- limitations. Previous to that values for sv_is_outlier will be NULL, so
-- if we want to both exclude detected outliers and include sales prior to
-- 2014, we need to code everything NULL as FALSE.
COALESCE(sales_val.sv_is_outlier, FALSE) AS sale_filter_is_outlier,
COALESCE(outlier.is_outlier, FALSE) AS sale_filter_is_outlier,
mydec_sales.mydec_deed_type,
mydec_sales.sale_filter_ptax_flag,
mydec_sales.mydec_property_advertised,
Expand Down Expand Up @@ -328,50 +297,25 @@ SELECT
mydec_sales.mydec_homestead_exemption_general_alternative,
mydec_sales.mydec_homestead_exemption_senior_citizens,
mydec_sales.mydec_homestead_exemption_senior_citizens_assessment_freeze,
sales_val.sv_is_outlier,
sales_val.sv_is_ptax_outlier,
sales_val.sv_is_heuristic_outlier,
sales_val.sv_outlier_reason1,
sales_val.sv_outlier_reason2,
sales_val.sv_outlier_reason3,
sales_val.sv_run_id,
sales_val.sv_version,
flag_override.is_arms_length,
flag_override.is_flip,
flag_override.has_class_change,
flag_override.has_characteristic_change,
flag_override.requires_field_check,
CASE
-- If there is an override, use override logic
-- If neither override nor sv_is_outlier is populated, leave null
WHEN
flag_override.is_arms_length IS NOT NULL
OR flag_override.is_flip IS NOT NULL
OR flag_override.has_class_change IS NOT NULL
OR flag_override.has_characteristic_change IS NOT NULL
OR flag_override.requires_field_check IS NOT NULL
THEN (
-- COALESCE is required here because the boolean logic is
-- three-valued (TRUE / FALSE / NULL). When overrides exist
-- but some override columns are NULL, expressions like FALSE
-- OR NULL evaluate to NULL, which would incorrectly return
-- is_outlier = NULL instead of FALSE.
COALESCE(flag_override.is_arms_length = FALSE, FALSE)
OR COALESCE(flag_override.is_flip = TRUE, FALSE)
OR COALESCE(flag_override.has_class_change = TRUE, FALSE)
OR COALESCE(
flag_override.has_characteristic_change = 'yes_major', FALSE
)
OR COALESCE(flag_override.requires_field_check = TRUE, FALSE)
)
-- If there is no override, default to sv_is_outlier
WHEN sales_val.sv_is_outlier IS NOT NULL
THEN sales_val.sv_is_outlier
END AS is_outlier
COALESCE(outlier.has_flag, FALSE) AS has_flag,
outlier.flag_is_outlier,
outlier.flag_is_ptax_outlier,
outlier.flag_is_heuristic_outlier,
outlier.flag_outlier_reason1,
outlier.flag_outlier_reason2,
outlier.flag_outlier_reason3,
outlier.flag_run_id,
outlier.flag_version,
COALESCE(outlier.has_review, FALSE) AS has_review,
outlier.review_is_arms_length,
outlier.review_is_flip,
outlier.review_has_class_change,
outlier.review_has_characteristic_change,
outlier.review_json,
COALESCE(outlier.is_outlier, FALSE) AS is_outlier,
outlier.outlier_reason
FROM unique_sales
LEFT JOIN mydec_sales
ON unique_sales.doc_no = mydec_sales.doc_no
LEFT JOIN sales_val
ON unique_sales.doc_no = sales_val.meta_sale_document_num
LEFT JOIN {{ source('sale', 'flag_override') }} AS flag_override
ON unique_sales.doc_no = flag_override.doc_no
LEFT JOIN {{ ref('sale.vw_outlier') }} AS outlier
ON unique_sales.doc_no = outlier.doc_no
68 changes: 43 additions & 25 deletions dbt/models/default/schema/default.vw_pin_sale.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,41 @@ models:
description: '{{ doc("shared_column_deed_type") }}'
- name: doc_no
description: '{{ doc("shared_column_document_number") }}'
- name: has_characteristic_change
description: '{{ doc("shared_column_has_characteristic_change") }}'
- name: has_class_change
description: '{{ doc("shared_column_has_class_change") }}'
- name: is_arms_length
description: '{{ doc("shared_column_is_arms_length") }}'
- name: is_flip
description: '{{ doc("shared_column_is_flip") }}'
- name: flag_is_heuristic_outlier
description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}'
- name: flag_is_outlier
description: '{{ doc("shared_column_sv_is_outlier") }}'
- name: flag_is_ptax_outlier
description: '{{ doc("shared_column_sv_is_ptax_outlier") }}'
- name: flag_outlier_reason1
description: '{{ doc("shared_column_sv_outlier_reason") }}'
- name: flag_outlier_reason2
description: '{{ doc("shared_column_sv_outlier_reason") }}'
- name: flag_outlier_reason3
description: '{{ doc("shared_column_sv_outlier_reason") }}'
- name: flag_run_id
description: '{{ doc("shared_column_sv_run_id") }}'
- name: flag_version
description: '{{ doc("shared_column_sv_version") }}'
- name: has_flag
description: '{{ doc("shared_column_has_flag") }}'
data_tests:
- not_null:
name: default_vw_pin_sale_has_flag_not_null
- name: has_review
description: '{{ doc("shared_column_has_review") }}'
data_tests:
- not_null:
name: default_vw_pin_sale_has_review_not_null
- name: is_multisale
description: '{{ doc("shared_column_sale_is_multisale") }}'
- name: is_mydec_date
description: Indicator for whether or not the observation uses the MyDec sale date
- name: is_outlier
description: |
The final determination indicating whether a sale is a statistical
outlier that should be excluded from model training.

Combines information from our sales validation model with information
from analysts who review our outliers for correctness.
description: '{{ doc("shared_column_is_outlier") }}'
data_tests:
- not_null:
name: default_vw_pin_sale_is_outlier_not_null
- name: mydec_deed_type
description: Deed type from MyDec, more granular than CCAO deed type
- name: mydec_line_8_current_use
Expand Down Expand Up @@ -72,16 +88,28 @@ models:
description: '{{ doc("shared_column_nbhd_code") }}'
- name: num_parcels_sale
description: '{{ doc("shared_column_num_parcels_sale") }}'
- name: outlier_reason
description: '{{ doc("shared_column_outlier_reason") }}'
- name: pin
description: '{{ doc("shared_column_pin") }}'
- name: requires_field_check
description: '{{ doc("shared_column_requires_field_check") }}'
- name: review_has_characteristic_change
description: '{{ doc("shared_column_has_characteristic_change") }}'
- name: review_has_class_change
description: '{{ doc("shared_column_has_class_change") }}'
- name: review_is_arms_length
description: '{{ doc("shared_column_is_arms_length") }}'
- name: review_is_flip
description: '{{ doc("shared_column_is_flip") }}'
- name: review_json
description: '{{ doc("shared_column_review_json") }}'
- name: sale_date
description: '{{ doc("shared_column_sale_date") }}'
- name: sale_filter_deed_type
description: Remove quit claim, executor, beneficiary and missing deed types
- name: sale_filter_is_outlier
description: Same as `sv_is_outlier`, but `NULL` values are replaced with `FALSE`
description: Deprecated, prefer `is_outlier`
- name: sale_filter_less_than_10k
description: Indicator for whether sale is less than $10K FMV
- name: sale_filter_same_sale_within_365
Expand All @@ -92,16 +120,6 @@ models:
description: '{{ doc("shared_column_sale_price") }}'
- name: seller_name
description: '{{ doc("shared_column_seller_name") }}'
- name: sv_is_heuristic_outlier
description: '{{ doc("shared_column_sv_is_heuristic_outlier") }}'
- name: sv_is_outlier
description: '{{ doc("shared_column_sv_is_outlier") }}'
- name: sv_is_ptax_outlier
description: '{{ doc("shared_column_sv_is_ptax_outlier") }}'
- name: sv_run_id
description: '{{ doc("shared_column_sv_run_id") }}'
- name: sv_version
description: '{{ doc("shared_column_sv_version") }}'
- name: township_code
description: '{{ doc("shared_column_township_code") }}'
- name: year
Expand Down
15 changes: 15 additions & 0 deletions dbt/models/model/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,21 @@ models:
config:
tags:
- load_manual
columns:
- name: sv_is_outlier
description: '{{ doc("shared_column_is_outlier") }}'
- name: sv_outlier_reason
description: '{{ doc("shared_column_outlier_reason") }}'
- name: sv_outlier_reason1
description: '{{ doc("shared_column_sv_outlier_reason") }}'
- name: sv_outlier_reason2
description: '{{ doc("shared_column_sv_outlier_reason") }}'
- name: sv_outlier_reason3
description: '{{ doc("shared_column_sv_outlier_reason") }}'
- name: sv_review_json
description: '{{ doc("shared_column_review_json") }}'
- name: sv_run_id
description: '{{ doc("shared_column_sv_run_id") }}'
tests:
- unique_combination_of_columns:
name: model_training_data_unique_card_doc_number_run_id
Expand Down
8 changes: 4 additions & 4 deletions dbt/models/reporting/reporting.vw_market_tracker.sql
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,14 @@ SELECT
vps.sale_price,
vps.sale_date,
vps.sale_filter_is_outlier,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jeancochrane should this become is_outlier?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sale_filter_is_outlier is now just an alias for is_outlier, in order to support backwards-compatibility for downstream consumers that are still relying on sale_filter_is_outlier (like the market tracker, I believe). It would probably be helpful in the long term to switch downstream consumers to is_outlier and remove this legacy field, but I don't think it's an urgent task.

COALESCE(outlier.is_outlier, FALSE) AS sale_filter_is_outlier,

COALESCE(outlier.is_outlier, FALSE) AS is_outlier,

vps.sv_is_outlier,
vps.flag_is_outlier AS sv_is_outlier,
vps.is_multisale,
vps.sale_filter_same_sale_within_365,
vps.sale_filter_less_than_10k,
vps.sale_filter_deed_type,
vps.sv_outlier_reason1,
vps.sv_outlier_reason2,
vps.sv_outlier_reason3,
vps.flag_outlier_reason1 AS sv_outlier_reason1,
vps.flag_outlier_reason2 AS sv_outlier_reason2,
vps.flag_outlier_reason3 AS sv_outlier_reason3,
-- Cards 1 and 2 are card numbers for the first two residential cards for a
-- parcel. They will not always be values of 1 and 2.
vrc1.card AS card1,
Expand Down
41 changes: 39 additions & 2 deletions dbt/models/sale/docs.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ should be possible to reconstruct using the other sales validation tables:
**Primary Key**: `meta_sale_document_num`, `run_id`, `version`
{% enddocs %}

# flag_override
# flag_review

{% docs flag_override %}
{% docs flag_review %}
Data built from manual reviews in which analysts determine whether or not
we should include sales in the model.

Expand Down Expand Up @@ -66,6 +66,20 @@ including the statistical bounds, groupings, window sizes, etc.
**Primary Key**: `run_id`
{% enddocs %}

# vw_flag

{% docs vw_flag %}
Sale-level validation flags created by
[model-sales-val](https://github.com/ccao-data/model-sales-val).

This view derives the most recent version of flags for each sale in the
`sale.flag` table, which uses its `version` column as a [type 2 slowly changing
dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension#Type_2:_add_new_row).
As such, this view is unique by `doc_no`.

**Primary Key**: `doc_no`
{% enddocs %}

# vw_flag_group

{% docs vw_flag_group %}
Expand All @@ -92,3 +106,26 @@ with iasWorld.

**Primary Key**: `salekey`, `run_id`
{% enddocs %}

# vw_outlier

{% docs vw_outlier %}

View that combines `sale.flag` and `sale.flag_review` to produce one
unified view of all sales validation information for a sale based on its
doc number.

**Nuance**: Unlike the constituent tables `sale.flag` and `sale.flag_review`,
the determination columns in this view (like `flag_is_outlier` or
`review_has_class_change`) will never be null, even if the sale was not flagged
or was not reviewed by an analyst. This makes these columns easier to use in
boolean logic, since a boolean comparison on them can never return null
unexpectedly. However, it also introduces the potential for confusion: a sale
can have a non-null value in a determination column that does not actually
correspond to a decision made by our algorithm or a reviewer. To determine
whether a value in a determination column corresponds to a real decision made
by our algorithm or a reviewer, use the `has_flag` and `has_review` columns.

**Primary Key**: `doc_no`
{% enddocs %}
23 changes: 23 additions & 0 deletions dbt/models/sale/sale.vw_flag.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
-- Latest sales validation flags for each sale document.
--
-- `sale.flag` treats its `version` column as a type 2 slowly changing
-- dimension, so one document can have several flag rows. This view keeps only
-- the rows whose `version` equals the highest version recorded for the same
-- document.
--
-- NOTE(review): uniqueness by `doc_no` assumes a document never has two rows
-- sharing its highest `version` (e.g. from different runs) -- confirm.
WITH latest_version AS (
    -- Highest flag version recorded for each sale document
    SELECT
        meta_sale_document_num,
        MAX(version) AS max_version
    FROM {{ source('sale', 'flag') }}
    GROUP BY meta_sale_document_num
)

SELECT
    flag.meta_sale_document_num AS doc_no,
    flag.sv_is_outlier,
    flag.sv_is_ptax_outlier,
    flag.sv_is_heuristic_outlier,
    flag.sv_outlier_reason1,
    flag.sv_outlier_reason2,
    flag.sv_outlier_reason3,
    flag.run_id,
    flag.version
FROM {{ source('sale', 'flag') }} AS flag
-- Restrict each document to the rows carrying its most recent version
INNER JOIN latest_version
    ON flag.meta_sale_document_num = latest_version.meta_sale_document_num
    AND flag.version = latest_version.max_version
17 changes: 3 additions & 14 deletions dbt/models/sale/sale.vw_ias_salesval_upload.sql
Original file line number Diff line number Diff line change
@@ -1,18 +1,10 @@
WITH ias_sales AS (
SELECT
salekey,
NULLIF(REPLACE(instruno, 'D', ''), '') AS instruno_clean
NULLIF(REPLACE(instruno, 'D', ''), '') AS doc_no
FROM {{ source('iasworld', 'sales') }}
WHERE cur = 'Y'
AND deactivat IS NULL
),

max_version AS (
SELECT
meta_sale_document_num,
MAX(version) AS max_version
FROM {{ source('sale', 'flag') }}
GROUP BY meta_sale_document_num
)

SELECT
Expand All @@ -23,8 +15,5 @@ SELECT
sf.sv_outlier_reason3,
sf.run_id
FROM ias_sales
INNER JOIN {{ source('sale', 'flag') }} AS sf
ON ias_sales.instruno_clean = sf.meta_sale_document_num
INNER JOIN max_version AS mv
ON sf.meta_sale_document_num = mv.meta_sale_document_num
AND sf.version = mv.max_version;
INNER JOIN {{ ref('sale.vw_flag') }} AS sf
ON ias_sales.doc_no = sf.doc_no
Loading
Loading