# config.yaml

# Pre-built matutils mutation-annotated tree, taken from
# http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/
# The URL below pins the build dated 2022-11-21.
mat_tree: 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2022/11/21/public-2022-11-21.all.masked.nextclade.pangolin.pb.gz'

# Reference genome FASTA and gene-annotation GTF (both gzip-compressed downloads).
# NOTE(review): this comment previously also promised the "location of spike
# coding sequence", but no such key is defined alongside these two entries.
ref_fasta: 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/wuhCor1.fa.gz'
ref_gtf: 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/genes/ncbiGenes.gtf.gz'

# Minimum clade size: only nextstrain clades with at least this many samples in
# the mutation-annotated tree are kept.
min_clade_samples: 10000

# Named sample subsets. Each value is a regex; samples are assigned to a subset
# based on whether they start with a match to that regex.
sample_subsets:
  all: '.'  # regex to match anything
  USA: 'USA'
  England: 'England'

# JSON of founder genotypes for nextstrain clades, from Richard Neher.
# The URL is pinned to a specific commit of the neherlab/SC2_variant_rates repo.
clade_founder_json: 'https://raw.githubusercontent.com/neherlab/SC2_variant_rates/7e738194a8c6592082f1caa9a6ca70cb68289790/data/clade_gts.json'

# Branch filters used when counting mutations. A branch is excluded if it has
# more than `max_nt_mutations` nucleotide mutations, more than
# `max_reversions_to_ref` reversions to the reference, or more than
# `max_reversions_to_clade_founder` reversions to the clade founder.
max_nt_mutations: 4
max_reversions_to_ref: 1
max_reversions_to_clade_founder: 1

# Whether to exclude nucleotide mutations that go from the reference to the clade
# founder, along with their reversions. These sites have a higher than normal
# error rate because missing bases get called as the reference.
exclude_ref_to_founder_muts: true

# Sites to exclude (set to null for no exclusions)
sites_to_exclude:
  # Sites in Table S1 of https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1009175
  - 153
  - 1149
  - 2198
  - 3145
  - 3564
  - 3778
  - 4050
  - 6255
  - 8022
  - 8790
  - 13402
  - 13947
  - 22802
  - 24389
  - 24390
  - 24933
  # Sites specified for exclusion because they have an extremely high mutation
  # frequency in some clade
  - 5629  # T5629G is much higher (~5% of all) in clade 20A than any other mutation.
  - 6851  # C6851A and its reversion are top two mutations in 20C at ~5% and ~3% of all mutations
  - 7328  # ~6% of all mutations in clade 21I, also highly mutated (~4% of all) in several other clades
  - 28095  # ~11% of all mutations in clade 20I
  - 29362  # ~30% of all mutations in clade 21C

# For the analysis of 4-fold synonymous mutation spectra/rates, only keep clade
# subsets with at least this many non-excluded mutation counts.
synonymous_spectra_min_counts: 5000

# Mapping from each Nsp to its inclusive [start, end] range in Orf1ab numbering,
# taken from
# https://github.com/theosanderson/Codon2Nucleotide/blob/main/src/App.js
# NOTE(review): the original comment described these as "nucleotide start in
# Orf1ab", but the ranges (1 through 7096) look like codon positions within
# Orf1ab rather than nucleotide coordinates -- confirm against the linked source.
# There is no nsp11 entry: the ranges run contiguously from nsp10 to nsp12.
orf1ab_to_nsps:
  nsp1: [1, 180]
  nsp2: [181, 818]
  nsp3: [819, 2763]
  nsp4: [2764, 3263]
  'nsp5 (Mpro)': [3264, 3569]
  nsp6: [3570, 3859]
  nsp7: [3860, 3942]
  nsp8: [3943, 4140]
  nsp9: [4141, 4253]
  nsp10: [4254, 4392]
  'nsp12 (RdRp)': [4393, 5324]
  nsp13: [5325, 5925]
  nsp14: [5926, 6452]
  nsp15: [6453, 6798]
  nsp16: [6799, 7096]
  
# Pseudocount used when calculating amino-acid fitnesses
fitness_pseudocount: 0.5

# Initial cutoff on the minimum expected count required to show fitness values
min_expected_count: 20

# Only plot correlation among clades when there are at least this many expected
# counts. Written as a plain integer rather than `1e6`: YAML 1.1 loaders such as
# PyYAML do not recognize `1e6` as a number (no decimal point, unsigned exponent)
# and would load it as the string "1e6".
clade_corr_min_count: 1000000