Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Asia weighted sampling #1150

Merged
merged 4 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 40 additions & 123 deletions nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,19 +77,19 @@ builds:
region: Africa
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Africa since pandemic start
asia_1m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
subsampling_scheme: nextstrain_region_asia_1m
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past month
asia_2m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
subsampling_scheme: nextstrain_region_asia_2m
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 2 months
asia_6m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
subsampling_scheme: nextstrain_region_asia_6m
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia over the past 6 months
asia_all-time:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
subsampling_scheme: nextstrain_region_asia_all_time
region: Asia
title: Evolution SARS-CoV-2 relative to clade 21L reference virus with subsampling focused on Asia since pandemic start
europe_1m:
Expand Down Expand Up @@ -280,31 +280,18 @@ subsampling:
exclude: "--exclude-where 'region={region}'"

# Custom subsampling logic for region Asia over 1m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_1m:
nextstrain_region_asia_1m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -313,22 +300,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 1M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 1M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
Expand All @@ -337,31 +313,18 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 2m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_2m:
nextstrain_region_asia_2m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -370,22 +333,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division week"
max_sequences: 1200
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 2M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division week"
max_sequences: 800
max_date: "--min-date 2M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country week"
Expand All @@ -394,31 +346,18 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 6m
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_6m:
nextstrain_region_asia_6m:
# Early focal samples for Asia
asia_early:
group_by: "division year month"
max_sequences: 300
max_date: "--max-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Early focal samples for China
china_early:
group_by: "division year month"
max_sequences: 200
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=China'"
# Early focal samples for India
india_early:
group_by: "division year month"
max_sequences: 200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 700
max_date: "--max-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Early contextual samples from the rest of the world
context_early:
group_by: "country year month"
Expand All @@ -427,22 +366,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "division year month"
max_sequences: 1200
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 6M"
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Recent focal samples for China
china_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=China'"
# Recent focal samples for India
india_recent:
group_by: "division year month"
max_sequences: 800
max_date: "--min-date 6M"
exclude: "--exclude-where 'country!=India'"
exclude: "--exclude-where 'region!=Asia'"
# Recent contextual samples from the rest of the world
context_recent:
group_by: "country year month"
Expand All @@ -451,27 +379,16 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over all-time
# Grouping by division
# Separating three buckets for China, India and elsewhere
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of focal to context
# 3:2:2 proportions of Asia, China, India
nextstrain_region_asia_grouped_by_division_all_time:
nextstrain_region_asia_all_time:
# Focal samples for Asia
asia:
group_by: "division year month"
max_sequences: 1500
exclude: "--exclude-where 'region!=Asia' 'country=China' 'country=India'"
# Focal samples for China
china:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=China'"
# Focal samples for India
india:
group_by: "division year month"
max_sequences: 1000
exclude: "--exclude-where 'country!=India'"
group_by: "country year month"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 3500
exclude: "--exclude-where 'region!=Asia'"
# Contextual samples from the rest of the world
context:
group_by: "country year month"
Expand Down
23 changes: 10 additions & 13 deletions nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,19 @@ builds:
region: Africa
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Africa since pandemic start
asia_1m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_1m
subsampling_scheme: nextstrain_region_asia_1m
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past month
asia_2m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_2m
subsampling_scheme: nextstrain_region_asia_2m
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 2 months
asia_6m:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_6m
subsampling_scheme: nextstrain_region_asia_6m
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia over the past 6 months
asia_all-time:
subsampling_scheme: nextstrain_region_asia_grouped_by_division_all_time
subsampling_scheme: nextstrain_region_asia_all_time
region: Asia
title: Genomic epidemiology of SARS-CoV-2 with subsampling focused on Asia since pandemic start
europe_1m:
Expand Down Expand Up @@ -272,12 +272,11 @@ subsampling:
exclude: "--exclude-where 'region={region}'"

# Custom subsampling logic for region Asia over 1m
# Grouping by division
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_1m:
nextstrain_region_asia_1m:
# Early focal samples for Asia
asia_early:
group_by: "country year month"
Expand All @@ -293,7 +292,7 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "country year month"
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 1M"
Expand All @@ -306,12 +305,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 2m
# Grouping by division
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_2m:
nextstrain_region_asia_2m:
# Early focal samples for Asia
asia_early:
group_by: "country year month"
Expand All @@ -327,7 +325,7 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"
# Recent focal samples for Asia
asia_recent:
group_by: "country year month"
group_by: "country week"
group_by_weights: "defaults/population_weights.tsv"
max_sequences: 2800
min_date: "--min-date 2M"
Expand All @@ -340,12 +338,11 @@ subsampling:
exclude: "--exclude-where 'region=Asia'"

# Custom subsampling logic for region Asia over 6m
# Grouping by division
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of recent to early
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_6m:
nextstrain_region_asia_6m:
# Early focal samples for Asia
asia_early:
group_by: "country year month"
Expand Down Expand Up @@ -377,7 +374,7 @@ subsampling:
# Grouping by country weighted by population size
# 4375 total
# 4:1 ratio of focal to context
nextstrain_region_asia_grouped_by_division_all_time:
nextstrain_region_asia_all_time:
# Focal samples for Asia
asia:
group_by: "country year month"
Expand Down
Loading