Skip to content

Commit

Permalink
initial commit from previous repo
Browse files Browse the repository at this point in the history
  • Loading branch information
adamribaudo-velir committed Mar 24, 2022
1 parent 21e8ca3 commit 555c137
Show file tree
Hide file tree
Showing 16 changed files with 294 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

target/
dbt_packages/
logs/
30 changes: 29 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,29 @@
# dbt-ga4
DBT guide to package creation: https://docs.getdbt.com/docs/guides/building-packages
DBT project structure notes: https://discourse.getdbt.com/t/how-we-structure-our-dbt-projects/355

To connect to BigQuery using OAuth, see instructions here: https://docs.getdbt.com/reference/warehouse-profiles/bigquery-profile#local-oauth-gcloud-setup

# Configuration Instructions

Create the following variables scoped to the ga4 package in your dbt_project.yml
- project (defaults to "bigquery-public-data")
- dataset (defaults to "ga4_obfuscated_sample_ecommerce")

# TODO

- Macro to extract hostname from URL
- Create staging tables for the following events:
- scroll
- first_visit
- view_promotion
- click
- add_to_cart
- purchase
- Full event reference: https://developers.google.com/analytics/devguides/collection/ga4/reference/events

- Create stg_sessions model
- Create stg_users model

- Recreate Fivetran ga3 models with ga4 data

- Convert basic unnesting operations into macros
Empty file added analyses/.gitkeep
Empty file.
30 changes: 30 additions & 0 deletions dbt_project.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: 'ga4'
version: '1.0.0'
config-version: 2
profile: 'ga4'
model-paths: ["models"]
analysis-paths: ["analyses"]
test-paths: ["tests"]
seed-paths: ["seeds"]
macro-paths: ["macros"]
snapshot-paths: ["snapshots"]

target-path: "target" # directory which will store compiled SQL files
clean-targets: # directories to be removed by `dbt clean`
- "target"
- "dbt_packages"

vars:
start_date: "20201230" # Defines the earliest GA4 _TABLE_SUFFIX to load into base events model. 20201230 produces about 1GB of data scanned.
project: "bigquery-public-data"
dataset: "ga4_obfuscated_sample_ecommerce"

# Configuring models
# Full documentation: https://docs.getdbt.com/docs/configuring-models

# In this example config, we tell dbt to build all models in the example/ directory
# as tables. These settings can be overridden in the individual model files
# using the `{{ config(...) }}` macro.
models:
ga4:
+materialized: view
Empty file added macros/.gitkeep
Empty file.
31 changes: 31 additions & 0 deletions models/staging/ga4/base/base_ga4__events.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
--BigQuery does not cache wildcard queries that scan across sharded tables which means it's best to materialize the raw event data as a partitioned table so that future queries benefit from caching
{{
config(
materialized = 'incremental',
incremental_strategy = 'insert_overwrite',
partition_by={
"field": "event_date_dt",
"data_type": "date",
}
)
}}

with source as (
select *
from {{ source('ga4', 'events') }}
where cast(_table_suffix as int64) >= {{var('start_date')}}
{% if is_incremental() %}
-- Incrementally add new events. Filters on _TABLE_SUFFIX using the max event_date_dt value found
-- See https://docs.getdbt.com/reference/resource-configs/bigquery-configs#the-insert_overwrite-strategy
and parse_date('%Y%m%d',_TABLE_SUFFIX) >= _dbt_max_partition
{% endif %}
),
renamed as (
select
parse_date('%Y%m%d',event_date) as event_date_dt,
*
EXCEPT (event_date) -- remove event date to ensure usage of event_date_dt which is partitioned
from source
)

select * from renamed
37 changes: 37 additions & 0 deletions models/staging/ga4/base/base_ga4__events_static_partition.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
-- If jobs are running daily, it may make sense to avoid the scanning necessary to determine the max partition date. Instead, a static incremental range can be set and this data will be overwritten/inserted at every incremental run.

{% set partitions_to_replace = [
'current_date()',
'date_sub(current_date(), interval 1 day)'
] %}

--BigQuery does not cache wildcard queries that scan across sharded tables which means it's best to materialize the raw event data as a partitioned table so that future queries benefit from caching
{{
config(
materialized = 'incremental',
incremental_strategy = 'insert_overwrite',
partition_by={
"field": "event_date_dt",
"data_type": "date",
}
)
}}

with source as (
select *
from {{ source('ga4', 'events') }}
where cast(_table_suffix as int64) >= {{var('start_date')}}
{% if is_incremental() %}
-- recalculate yesterday + today
and parse_date('%Y%m%d',_TABLE_SUFFIX) in ({{ partitions_to_replace | join(',') }})
{% endif %}
),
renamed as (
select
parse_date('%Y%m%d',event_date) as event_date_dt,
*
EXCEPT (event_date) -- remove event date to ensure usage of event_date_dt which is partitioned
from source
)

select * from renamed
46 changes: 46 additions & 0 deletions models/staging/ga4/events/stg_ga4__page_view.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
with page_view_with_params as (
select
event_date_dt,
user_id,
user_pseudo_id,
event_timestamp,
event_name, params,
traffic_source
FROM {{ref('base_ga4__events')}},
UNNEST(event_params) as params
where event_name = 'page_view'
and (params.key = 'page_location' or
params.key = 'ga_session_id' or
params.key = 'ga_session_number' or
params.key = 'entrances' or
params.key = 'page_title' or
params.key = 'page_referrer')
),
pivoted as (
select
event_date_dt,
user_pseudo_id,
user_id,
event_timestamp,
event_name,
traffic_source.name as traffic_source_name,
traffic_source.source as traffic_source_source,
traffic_source.medium as traffic_source_medium,
MAX(if(params.key = "page_location", params.value.string_value, NULL)) as page_location,
MAX(if(params.key = "ga_session_id", params.value.int_value, NULL)) as ga_session_id,
MAX(if(params.key = "ga_session_number", params.value.int_value, NULL)) as ga_session_number,
MAX(if(params.key = "entrances", params.value.int_value, 0)) as entrances,
MAX(if(params.key = "page_title", params.value.string_value, NULL)) as page_title,
MAX(if(params.key = "page_referrer", params.value.string_value, NULL)) as page_referrer

from page_view_with_params
group by 1,2,3,4,5,6,7,8
)

select
*,
case
when ga_session_number = 1 then TRUE
else FALSE
end as is_new_user
from pivoted
56 changes: 56 additions & 0 deletions models/staging/ga4/events/stg_ga4__purchase.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
with purchase_with_params as (
select
event_date_dt,
user_id,
user_pseudo_id,
event_timestamp,
event_name, params,
traffic_source
FROM {{ref('base_ga4__events')}},
UNNEST(event_params) as params
where event_name = 'purchase' -- Pull only 'purchase' events
and
(
params.key = 'page_location' or
params.key = 'ga_session_id' or
params.key = 'ga_session_number' or
params.key = 'page_referrer' or
params.key = 'currency' or
params.key = 'value' or
params.key = 'payment_type' or
params.key = 'coupon' or
params.key = 'transaction_id'
)
),
pivoted as (
select
event_date_dt,
user_pseudo_id,
user_id,
event_timestamp,
event_name,
traffic_source.name as traffic_source_name,
traffic_source.source as traffic_source_source,
traffic_source.medium as traffic_source_medium,
MAX(if(params.key = "page_location", params.value.string_value, NULL)) as page_location,
MAX(if(params.key = "ga_session_id", params.value.int_value, NULL)) as ga_session_id,
MAX(if(params.key = "ga_session_number", params.value.int_value, NULL)) as ga_session_number,
MAX(if(params.key = "page_referrer", params.value.string_value, NULL)) as page_referrer,
MAX(if(params.key = "coupon", params.value.string_value, NULL)) as coupon,
MAX(if(params.key = "transaction_id", params.value.string_value, NULL)) as transaction_id,
MAX(if(params.key = "currency", params.value.string_value, NULL)) as currency,
MAX(if(params.key = "payment_type", params.value.string_value, NULL)) as payment_type,
MAX(if(params.key = "value", params.value.float_value, NULL)) as value
-- TODO how to handle items array?

from purchase_with_params
group by 1,2,3,4,5,6,7,8
)

select
*,
case
when ga_session_number = 1 then TRUE
else FALSE
end as is_new_user
from pivoted
38 changes: 38 additions & 0 deletions models/staging/ga4/events/stg_ga4__session_start.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
with session_start_with_params as (
select
event_date_dt,
user_pseudo_id,
user_id,
event_timestamp,
event_name,
params,
traffic_source
FROM {{ref('base_ga4__events')}},
UNNEST(event_params) as params
where event_name = 'session_start'
and (params.key = 'page_location' or params.key = 'ga_session_id' or params.key = 'ga_session_number')
),
pivoted as (
select
event_date_dt,
user_pseudo_id,
user_id,
event_timestamp,
event_name,
traffic_source.name as traffic_source_name,
traffic_source.source as traffic_source_source,
traffic_source.medium as traffic_source_medium,
MAX(if(params.key = "page_location", params.value.string_value, NULL)) as page_location,
MAX(if(params.key = "ga_session_id", params.value.int_value, NULL)) as ga_session_id,
MAX(if(params.key = "ga_session_number", params.value.int_value, NULL)) as ga_session_number,
from session_start_with_params
group by 1,2,3,4,5,6,7,8
)

select
*,
case
when ga_session_number = 1 then TRUE
else FALSE
end as is_new_user
from pivoted
10 changes: 10 additions & 0 deletions models/staging/ga4/src_ga4.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
version: 2

sources:
- name: ga4
database: "{{var('project','bigquery-public-data')}}" #Default to bigquery-public-data
schema: "{{var('dataset', 'ga4_obfuscated_sample_ecommerce')}}" #Default to ga4_obfuscated_sample_ecommerce
tables:
- name: events
identifier: events_*

11 changes: 11 additions & 0 deletions models/staging/ga4/stg_ga4.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
version: 2

models:
- name: base_ga4__events
description: Base events model that pulls all fields from raw data. Resulting table is partitioned on event_date and is useful in that BQ queries can be cached against this table, but not against wildcard searches from the original tables which are sharded on date.
- name: stg_ga4__page_view
description: GA4 events filtered to only show 'page_view' events. Pivots common event parameters into separate columns.
- name: stg_ga4__session_start
description: GA4 events filtered to only show 'session_start' events. Pivots common event parameters into separate columns.
- name: stg_ga4__purchase
description: GA4 events filtered to only show 'purchase' events. Pivots common event parameters into separate columns.
Empty file added seeds/.gitkeep
Empty file.
Empty file added snapshots/.gitkeep
Empty file.
Empty file added tests/.gitkeep
Empty file.

0 comments on commit 555c137

Please sign in to comment.