forked from jbloomlab/SARS2-mut-fitness
-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
88 lines (76 loc) · 3.11 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Pre-built mutation-annotated tree (matUtils input), one dated snapshot taken from
# http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/
mat_tree: 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/UShER_SARS-CoV-2/2022/11/21/public-2022-11-21.all.masked.nextclade.pangolin.pb.gz'
# Reference genome sequence (FASTA) and gene annotation (GTF).
# NOTE(review): the earlier comment also mentioned the "location of spike coding
# sequence", but no key here provides it -- confirm whether that is still needed.
ref_fasta: 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/wuhCor1.fa.gz'
ref_gtf: 'http://hgdownload.soe.ucsc.edu/goldenPath/wuhCor1/bigZips/genes/ncbiGenes.gtf.gz'
# Minimum clade size: only keep nextstrain clades with at least this many samples
# in the mutation-annotated tree (`mat_tree`); smaller clades are dropped.
min_clade_samples: 10000
# Subsets of samples to analyze. Each entry maps a subset name to a regex; a sample
# belongs to the subset when its name starts with a match to that regex.
# The entries must be indented under `sample_subsets`; at column 0 they would be
# parsed as unrelated top-level keys and `sample_subsets` itself would be null.
sample_subsets:
  all: '.'  # regex to match anything
  USA: USA
  England: England
# JSON of founder genotypes for each nextstrain clade, from Richard Neher
# (the URL is pinned to a specific commit of neherlab/SC2_variant_rates)
clade_founder_json: 'https://raw.githubusercontent.com/neherlab/SC2_variant_rates/7e738194a8c6592082f1caa9a6ca70cb68289790/data/clade_gts.json'
# Branch filters applied when counting mutations. A branch is excluded if it has
# more than `max_nt_mutations` nucleotide mutations, more than
# `max_reversions_to_ref` reversions to the reference, or more than
# `max_reversions_to_clade_founder` reversions to the clade founder.
max_nt_mutations: 4
max_reversions_to_ref: 1
max_reversions_to_clade_founder: 1
# If true, exclude nucleotide mutations from reference to clade founder and their
# reversions. These sites have higher than normal error rates because missing
# bases tend to be called as the reference.
exclude_ref_to_founder_muts: true
# Nucleotide sites to exclude from the analysis (set to null for no exclusions)
sites_to_exclude:
  # Sites listed in Table S1 of
  # https://journals.plos.org/plosgenetics/article?id=10.1371/journal.pgen.1009175
  - 153
  - 1149
  - 2198
  - 3145
  - 3564
  - 3778
  - 4050
  - 6255
  - 8022
  - 8790
  - 13402
  - 13947
  - 22802
  - 24389
  - 24390
  - 24933
  # Sites excluded because they have an extremely high mutation frequency
  # in at least one clade
  - 5629  # T5629G is much higher (~5% of all) in clade 20A than any other mutation.
  - 6851  # C6851A and its reversion are top two mutations in 20C at ~5% and ~3% of all mutations
  - 7328  # ~6% of all mutations in clade 21I, also highly mutated (~4% of all) in several other clades
  - 28095  # ~11% of all mutations in clade 20I
  - 29362  # ~30% of all mutations in clade 21C
# For the analysis of 4-fold synonymous mutation spectra/rates, only keep clade
# subsets that have at least this many non-excluded mutation counts.
synonymous_spectra_min_counts: 5000
# Mapping from Orf1ab numbering to the individual nsps: each entry is
# nsp name -> [start, end] within Orf1ab. Taken from
# https://github.com/theosanderson/Codon2Nucleotide/blob/main/src/App.js
# NOTE(review): the earlier comment described these as "nucleotide start in Orf1ab",
# but the ranges look like codon (amino-acid) positions -- confirm against the source.
# The entries must be indented under `orf1ab_to_nsps`; at column 0 they would be
# parsed as unrelated top-level keys and `orf1ab_to_nsps` itself would be null.
orf1ab_to_nsps:
  nsp1: [1, 180]
  nsp2: [181, 818]
  nsp3: [819, 2763]
  nsp4: [2764, 3263]
  'nsp5 (Mpro)': [3264, 3569]
  nsp6: [3570, 3859]
  nsp7: [3860, 3942]
  nsp8: [3943, 4140]
  nsp9: [4141, 4253]
  nsp10: [4254, 4392]
  'nsp12 (RdRp)': [4393, 5324]
  nsp13: [5325, 5925]
  nsp14: [5926, 6452]
  nsp15: [6453, 6798]
  nsp16: [6799, 7096]
# Pseudocount used when calculating amino-acid fitnesses
fitness_pseudocount: 0.5
# Initial (default) cutoff on the minimum expected count a mutation must have
# for its fitness value to be shown
min_expected_count: 20
# Only plot the correlation among clades when they have at least this many expected counts.
# Written with an explicit decimal point and exponent sign: a bare `1e6` is read as
# the string "1e6" (not a number) by YAML 1.1 loaders such as PyYAML.
clade_corr_min_count: 1.0e+6