diff --git a/config/config_hmpxv1.yaml b/config/config_hmpxv1.yaml index fc42dad8..0edb53c2 100644 --- a/config/config_hmpxv1.yaml +++ b/config/config_hmpxv1.yaml @@ -1,4 +1,3 @@ -exclude: "config/exclude_accessions_mpxv.txt" reference: "config/reference.fasta" genemap: "config/genemap.gff" genbank_reference: "config/reference.gb" @@ -17,14 +16,15 @@ display_strain_field: "strain" build_name: "hmpxv1" auspice_name: "mpox_clade-IIb" -## filter -min_date: 2017 -min_length: 100000 +filter: + exclude: "config/exclude_accessions_mpxv.txt" + min_date: 2017 + min_length: 100000 ### Set 1: Non-B.1 sequences: use all ### Set 2: B.1 sequences: small sample across year/country, maybe month -filter: +subsample: non_b1: group_by: "--group-by lineage year country" sequences_per_group: "--sequences-per-group 50" diff --git a/config/config_hmpxv1_big.yaml b/config/config_hmpxv1_big.yaml index 6844f8b4..2b32589c 100644 --- a/config/config_hmpxv1_big.yaml +++ b/config/config_hmpxv1_big.yaml @@ -1,4 +1,3 @@ -exclude: "config/exclude_accessions_mpxv.txt" reference: "config/reference.fasta" genemap: "config/genemap.gff" genbank_reference: "config/reference.gb" @@ -17,10 +16,12 @@ display_strain_field: "strain" build_name: "hmpxv1_big" auspice_name: "mpox_lineage-B.1" -## filter -min_date: 2022 -min_length: 180000 filter: + exclude: "config/exclude_accessions_mpxv.txt" + min_date: 2022 + min_length: 180000 + +subsample: b1: group_by: "--group-by year month country" sequences_per_group: "--subsample-max-sequences 5000" diff --git a/config/config_mpxv.yaml b/config/config_mpxv.yaml index 34bdebd7..ba93fccf 100644 --- a/config/config_mpxv.yaml +++ b/config/config_mpxv.yaml @@ -1,4 +1,3 @@ -exclude: "config/exclude_accessions_mpxv.txt" reference: "config/reference.fasta" genemap: "config/genemap.gff" genbank_reference: "config/reference.gb" @@ -17,13 +16,14 @@ display_strain_field: "strain" build_name: "mpxv" auspice_name: "mpox_all-clades" -## filter -min_date: 1950 -min_length: 100000 +filter: + exclude: "config/exclude_accessions_mpxv.txt" + min_date: 1950 + min_length: 100000 ### Set 1: Non-B.1 sequences: use all ### Set 2: B.1 sequences: small sample across year/country, maybe month -filter: +subsample: non_b1: group_by: "--group-by clade year country" sequences_per_group: "--sequences-per-group 50" diff --git a/workflow/snakemake_rules/core.smk b/workflow/snakemake_rules/core.smk index 262da22c..dbe8aa7b 100644 --- a/workflow/snakemake_rules/core.smk +++ b/workflow/snakemake_rules/core.smk @@ -13,21 +13,21 @@ In addition, `build_dir` and `auspice_dir` need to be defined upstream. """ -rule exclude_bad: +rule filter: """ Removing strains that do not satisfy certain requirements. """ input: sequences="data/sequences.fasta", metadata="data/metadata.tsv", - exclude=config["exclude"], output: sequences=build_dir + "/{build_name}/good_sequences.fasta", metadata=build_dir + "/{build_name}/good_metadata.tsv", log=build_dir + "/{build_name}/good_filter.log", params: - min_date=config["min_date"], - min_length=config["min_length"], + exclude=config["filter"]["exclude"], + min_date=config["filter"]["min_date"], + min_length=config["filter"]["min_length"], strain_id=config["strain_id_field"], shell: """ @@ -35,9 +35,9 @@ rule exclude_bad: --sequences {input.sequences} \ --metadata {input.metadata} \ --metadata-id-columns {params.strain_id} \ - --exclude {input.exclude} \ --output-sequences {output.sequences} \ --output-metadata {output.metadata} \ + --exclude {params.exclude} \ --min-date {params.min_date} \ --min-length {params.min_length} \ --exclude-where QC_rare_mutations=bad \ @@ -45,19 +45,21 @@ rule exclude_bad: """ -rule filter: +rule subsample: input: - sequences=rules.exclude_bad.output.sequences, - metadata=rules.exclude_bad.output.metadata, + sequences=rules.filter.output.sequences, + metadata=rules.filter.output.metadata, output: - strains=build_dir + "/{build_name}/{subset}_strains.txt", - log=build_dir + "/{build_name}/{subset}_filter.log", + strains=build_dir + "/{build_name}/{sample}_strains.txt", + log=build_dir + "/{build_name}/{sample}_filter.log", params: - group_by=lambda w: config["filter"][w.subset]["group_by"], - sequences_per_group=lambda w: config["filter"][w.subset]["sequences_per_group"], - other_filters=lambda w: config["filter"][w.subset].get("other_filters", ""), - exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['filter'][w.subset]['exclude_lineages']])}" - if "exclude_lineages" in config["filter"][w.subset] + group_by=lambda w: config["subsample"][w.sample]["group_by"], + sequences_per_group=lambda w: config["subsample"][w.sample][ + "sequences_per_group" + ], + other_filters=lambda w: config["subsample"][w.sample].get("other_filters", ""), + exclude=lambda w: f"--exclude-where {' '.join([f'lineage={l}' for l in config['subsample'][w.sample]['exclude_lineages']])}" + if "exclude_lineages" in config["subsample"][w.sample] else "", strain_id=config["strain_id_field"], shell: @@ -75,14 +77,14 @@ rule filter: """ -rule filter_merge: +rule combine_samples: input: strains=lambda w: [ - f"{build_dir}/{w.build_name}/{subset}_strains.txt" - for subset in config["filter"] + f"{build_dir}/{w.build_name}/{sample}_strains.txt" + for sample in config["subsample"] ], - sequences=rules.exclude_bad.output.sequences, - metadata=rules.exclude_bad.output.metadata, + sequences=rules.filter.output.sequences, + metadata=rules.filter.output.metadata, include="config/include_{build_name}.txt", output: sequences=build_dir + "/{build_name}/filtered.fasta",