Skip to content

Commit

Permalink
Reorganize config and rules around filter/subsampling
Browse files Browse the repository at this point in the history
Subsampling is a form of filtering, but there is a conceptual
distinction that is apparent in the rules and configs used in the ncov
workflow, which I've mirrored here.

Put all filtering-related rules in a "filter" config, which configures a
rule with the same name.

Put all subsampling-related rules in a "subsampling" config which also
configures a rule with the same name.
  • Loading branch information
victorlin committed Sep 21, 2023
1 parent e0a236b commit 91482e8
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 34 deletions.
10 changes: 5 additions & 5 deletions config/config_hmpxv1.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
exclude: "config/exclude_accessions_mpxv.txt"
reference: "config/reference.fasta"
genemap: "config/genemap.gff"
genbank_reference: "config/reference.gb"
Expand All @@ -17,14 +16,15 @@ display_strain_field: "strain"
build_name: "hmpxv1"
auspice_name: "mpox_clade-IIb"

## filter
min_date: 2017
min_length: 100000
filter:
exclude: "config/exclude_accessions_mpxv.txt"
min_date: 2017
min_length: 100000


### Set 1: Non-B.1 sequences: use all
### Set 2: B.1 sequences: small sample across year/country, maybe month
filter:
subsample:
non_b1:
group_by: "--group-by lineage year country"
sequences_per_group: "--sequences-per-group 50"
Expand Down
9 changes: 5 additions & 4 deletions config/config_hmpxv1_big.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
exclude: "config/exclude_accessions_mpxv.txt"
reference: "config/reference.fasta"
genemap: "config/genemap.gff"
genbank_reference: "config/reference.gb"
Expand All @@ -17,10 +16,12 @@ display_strain_field: "strain"
build_name: "hmpxv1_big"
auspice_name: "mpox_lineage-B.1"

## filter
min_date: 2022
min_length: 180000
filter:
exclude: "config/exclude_accessions_mpxv.txt"
min_date: 2022
min_length: 180000

subsample:
b1:
group_by: "--group-by year month country"
sequences_per_group: "--subsample-max-sequences 5000"
Expand Down
10 changes: 5 additions & 5 deletions config/config_mpxv.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
exclude: "config/exclude_accessions_mpxv.txt"
reference: "config/reference.fasta"
genemap: "config/genemap.gff"
genbank_reference: "config/reference.gb"
Expand All @@ -17,13 +16,14 @@ display_strain_field: "strain"
build_name: "mpxv"
auspice_name: "mpox_all-clades"

## filter
min_date: 1950
min_length: 100000
filter:
exclude: "config/exclude_accessions_mpxv.txt"
min_date: 1950
min_length: 100000

### Set 1: Non-B.1 sequences: use all
### Set 2: B.1 sequences: small sample across year/country, maybe month
filter:
subsample:
non_b1:
group_by: "--group-by clade year country"
sequences_per_group: "--sequences-per-group 50"
Expand Down
42 changes: 22 additions & 20 deletions workflow/snakemake_rules/core.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,51 +13,53 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream.
"""


rule exclude_bad:
rule filter:
"""
Removing strains that do not satisfy certain requirements.
"""
input:
sequences="data/sequences.fasta",
metadata="data/metadata.tsv",
exclude=config["exclude"],
output:
sequences=build_dir + "/{build_name}/good_sequences.fasta",
metadata=build_dir + "/{build_name}/good_metadata.tsv",
log=build_dir + "/{build_name}/good_filter.log",
params:
min_date=config["min_date"],
min_length=config["min_length"],
exclude=config["filter"]["exclude"],
min_date=config["filter"]["min_date"],
min_length=config["filter"]["min_length"],
strain_id=config["strain_id_field"],
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--output-sequences {output.sequences} \
--output-metadata {output.metadata} \
--exclude {params.exclude} \
--min-date {params.min_date} \
--min-length {params.min_length} \
--exclude-where QC_rare_mutations=bad \
--output-log {output.log}
"""


rule filter:
rule subsample:
input:
sequences=rules.exclude_bad.output.sequences,
metadata=rules.exclude_bad.output.metadata,
sequences=rules.filter.output.sequences,
metadata=rules.filter.output.metadata,
output:
strains=build_dir + "/{build_name}/{subset}_strains.txt",
log=build_dir + "/{build_name}/{subset}_filter.log",
strains=build_dir + "/{build_name}/{sample}_strains.txt",
log=build_dir + "/{build_name}/{sample}_filter.log",
params:
group_by=lambda w: config["filter"][w.subset]["group_by"],
sequences_per_group=lambda w: config["filter"][w.subset]["sequences_per_group"],
other_filters=lambda w: config["filter"][w.subset].get("other_filters", ""),
exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['filter'][w.subset]['exclude_lineages']])}"
if "exclude_lineages" in config["filter"][w.subset]
group_by=lambda w: config["subsample"][w.sample]["group_by"],
sequences_per_group=lambda w: config["subsample"][w.sample][
"sequences_per_group"
],
other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""),
exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}"
if "exclude_lineages" in config["subsample"][w.sample]
else "",
strain_id=config["strain_id_field"],
shell:
Expand All @@ -75,14 +77,14 @@ rule filter:
"""


rule filter_merge:
rule combine_samples:
input:
strains=lambda w: [
f"{build_dir}/{w.build_name}/{subset}_strains.txt"
for subset in config["filter"]
f"{build_dir}/{w.build_name}/{sample}_strains.txt"
for sample in config["subsample"]
],
sequences=rules.exclude_bad.output.sequences,
metadata=rules.exclude_bad.output.metadata,
sequences=rules.filter.output.sequences,
metadata=rules.filter.output.metadata,
include="config/include_{build_name}.txt",
output:
sequences=build_dir + "/{build_name}/filtered.fasta",
Expand Down

0 comments on commit 91482e8

Please sign in to comment.