From b90db1a3a2b0348ef736de9ec10a64a74e8ee462 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Thu, 5 Oct 2023 12:29:04 +0200 Subject: [PATCH 01/27] Update titles --- template.qmd | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/template.qmd b/template.qmd index fc9bb8e..d9c3cfc 100644 --- a/template.qmd +++ b/template.qmd @@ -86,7 +86,7 @@ cor.mat <- cor(vcf) cor.mat[is.na(cor.mat)] <- 0 ``` -## Samples summary +## Summary of the target samples dataset ```{r, echo= F,message= F} gt(table) %>% cols_label( @@ -109,7 +109,7 @@ opt_table_font( ) ``` -## Evidence for chronic infection +## Evidence for single, serially-sampled infection @@ -140,7 +140,7 @@ With the neighbur-joining tree, partistic distances to root has been correlated ![**Figure 5.** Partistic distance to root in the neighbour-joining tree against days since first sample. Red line indicates the adjusted linear regression model.](`r params$tempest`) -## Characterizing within-host evolution +## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection ### Number of polymorphic sites @@ -187,7 +187,7 @@ heatmaply_cor( ``` ![**Figure 10.** Heatmap for pairwise correlation between frquencies for alternative alleles.]() -### dn/ds over time +### Non-synonymous to synonymous rate ratio over time ![**Figure 11.** Sinonymous mutations per sinonymous site **(dS)** and non sinonymous mutations mutations per non sinonymous site **(dN)** over time. The values are adjusted by the allele frequency.](`r params$evo`) From 7bf4590f805933de52544c1d2f138337548c5513 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Thu, 5 Oct 2023 12:33:34 +0200 Subject: [PATCH 02/27] Update 2.1 footnote --- template.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.qmd b/template.qmd index d9c3cfc..aec0410 100644 --- a/template.qmd +++ b/template.qmd @@ -116,7 +116,7 @@ opt_table_font( ### Lineage admixture The most probably lineage admixture for each sample has been calculated with software [Freyja](https://github.com/andersen-lab/Freyja). -![**Figure 1.** Lineage composition of samples according to Freyja. Samples are ordered by collection date.](`r params$freyja`) +![**Figure 1.** Lineage admixture calculated with Freyja for each sample. SSamples in the X-axis are ordered chronologically, from more ancient to newer.](`r params$freyja`) ### Nucleotide diversity comparison From 31e0194fcd5a3c3f8e22c52d552664a99db04c51 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Thu, 5 Oct 2023 12:42:09 +0200 Subject: [PATCH 03/27] Update p.values reporting in nucleotide diversity --- workflow/scripts/report/diversity_plot.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/workflow/scripts/report/diversity_plot.R b/workflow/scripts/report/diversity_plot.R index eb23c89..ab51c0d 100644 --- a/workflow/scripts/report/diversity_plot.R +++ b/workflow/scripts/report/diversity_plot.R @@ -122,13 +122,14 @@ ggsave( ) # DATA JSON ##### +p.value <- ifelse(st$p.value >= 0.05, pvalue.norm,pvalue.emp) list.div <- list( "diversity" = diversity, - "p.value" = ifelse(st$p.value >= 0.05, pvalue.norm,pvalue.emp), - "normal.pvalue" = st$p.value, - "norm.text" = ifelse(st$p.value >= 0.05, "","not"), - "type.test" = ifelse(st$p.value >= 0.05, "","empirical"), + "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"), + "normal.pvalue" = ifelse(st$p.value >= 0.001, p.value, "< 0.001"), + "norm.text" = ifelse(st$p.value >= 0.05, "", "not"), + "type.test" = ifelse(st$p.value >= 0.05, "", "empirical"), "boot.reps" = snakemake@params[["bootstrap_reps"]], "sample.size" = length(study_aln) ) From 864a66f8ab1fd3b4a64cd89d9d4217127f10a514 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 11:37:42 +0200 Subject: [PATCH 04/27] Format for nucleotide diversity --- template.qmd | 4 ++-- workflow/scripts/report/diversity_plot.R | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/template.qmd b/template.qmd index aec0410..3f7fad9 100644 --- a/template.qmd +++ b/template.qmd @@ -122,9 +122,9 @@ The most probably lineage admixture for each sample has been calculated with sof Nucleotide diversity is has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*). -The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]]`$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. +The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]] `$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. -![**Figure 2.** Distribution of nucleotide diversity pi for 1000 randomly selected groups from context sequences. Red line indicates the pi value for the studied samples.](`r params$div`) +![**Figure 2.** Analysis of the nucleotide diversity (π). The orange line describe a normal distribution with the same mean and standard deviation as the distribution of π values obtained from $`r div_values[["boot.reps"]]`$ samples of $`r div_values[["sample.size"]]`$ sequences from the context. The red vertical line indicate the π value for the studied samples.](`r params$div`) ### Phylogeny and temporal signal diff --git a/workflow/scripts/report/diversity_plot.R b/workflow/scripts/report/diversity_plot.R index ab51c0d..aeceee6 100644 --- a/workflow/scripts/report/diversity_plot.R +++ b/workflow/scripts/report/diversity_plot.R @@ -125,7 +125,7 @@ ggsave( p.value <- ifelse(st$p.value >= 0.05, pvalue.norm,pvalue.emp) list.div <- list( - "diversity" = diversity, + "diversity" = format(diversity, scientific = TRUE), "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"), "normal.pvalue" = ifelse(st$p.value >= 0.001, p.value, "< 0.001"), "norm.text" = ifelse(st$p.value >= 0.05, "", "not"), From c78a1955ee9998b6f4905a1d93b0ba98b9b1491e Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 12:30:25 +0200 Subject: [PATCH 05/27] Add ufboot and sh-alrt values to report --- template.qmd | 2 +- workflow/scripts/report/phylo_plots.R | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/template.qmd b/template.qmd index 3f7fad9..26a55b0 100644 --- a/template.qmd +++ b/template.qmd @@ -128,7 +128,7 @@ The nucleotide diversity value for the studied samples is $`r div_values[["diver ### Phylogeny and temporal signal -A maximum likelihood tree has been adjusted with the context sequences: +A maximum likelihood tree has been adjusted with the context sequences. The studied samples `r stats[["monophyly"]]` monophyletic. The clade that contains all the studied samples is supported by a **UFBoot** score of $`r stats[["boot"]]`$% and a **SH-aLRT** score of $`r stats[["alrt"]]`$%. ![**Figure 3.** Maximum-likelihood tree whith studied and context sequence. Studied sequence are represented in red.](`r params$tree_ml`) diff --git a/workflow/scripts/report/phylo_plots.R b/workflow/scripts/report/phylo_plots.R index 1e3e996..1a3340c 100644 --- a/workflow/scripts/report/phylo_plots.R +++ b/workflow/scripts/report/phylo_plots.R @@ -53,8 +53,12 @@ study_names <- read.dna( snakemake@input[["study_fasta"]], format = "fasta", as.matrix = FALSE, - ) %>% - names() + ) + +study_names <- study_names[!startsWith(names(study_names), snakemake@config[["ALIGNMENT_REFERENCE"]])] + +study_names <- names(study_names) + # Obtain sample names ordered by CollectionDate date_order <- read_csv(snakemake@params[["metadata"]]) %>% @@ -268,10 +272,18 @@ tempest %>% # TEMPEST STATATS model <- lm(distance ~ date_interval, data = tempest) +# TREE STATS +study.node <- tree_ml$node.label[study.mrca - length(tip.color)] +monophyletic <- ifelse(is.monophyletic(tree_ml, study_names), "are", "are not") + + list( "sub_rate" = model$coefficients[[2]] * 365, "r2" = summary(model)$r.squared[[1]], - "pvalue" = cor.test(tempest$distance, tempest$date_interval)$p.value + "pvalue" = cor.test(tempest$distance, tempest$date_interval)$p.value, + "boot" = strsplit(study.node, "/")[[1]][2] %>% as.numeric(), + "alrt" = strsplit(study.node, "/")[[1]][1] %>% as.numeric(), + "monophyly"= monophyletic ) %>% toJSON() %>% write(snakemake@output[["json"]]) From 8ab38c4d572d865533b99c3e8eb80715f57bab9b Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 13:13:18 +0200 Subject: [PATCH 06/27] Update trees aesthetics --- config/design_plots.R | 31 ++++++------ workflow/scripts/report/phylo_plots.R | 69 +++++++++++++-------------- 2 files changed, 47 insertions(+), 53 deletions(-) diff --git a/config/design_plots.R b/config/design_plots.R index 39e1d9c..477f88e 100644 --- a/config/design_plots.R +++ b/config/design_plots.R @@ -37,28 +37,25 @@ gene_colors = c(M = "#B4D4B4", ORF7 = "#AA88CB", ORF10 = "#CACB5D") -snp_colors = c("C-26029-A" = "#e69138", - "G-21985-T" = "#ffd966", - "C-29421-T" = "#935fca", - "C-23043-T" = "#7c1d6f", - "C-28854-T" = "#ff4639" - ,"C-5178-T" = "#d5a6bd" - ,"C-4230-A" = "#1e3f9d" - ,"C-16375-T" = "#1EA0AE" - , "A-11923-C" = "#64acee" - ,"G-12761-A" = "#084c61" - ,"C-6033-T" = "chartreuse4" - ,"C-4230-T" = "#93c47d", - "Other" = "gray40") tree_colors = c( - tip_label = "blue", - Bootstrap_pass = "#ff6600", - alrt_pass = "#ffbf51", - boot_alrt_pass = "red" + tip_label = "#D944AA", + boot_alrt_pass = "#64acee" ) + +node.size <- c( + tip_label = 2, + boot_alrt_pass = 0.8 +) + +node.alpha <- c( + tip_label = 0.6, + boot_alrt_pass = 0.7 +) + + NV_colors <- c( Frameshift = "#568D63", "In frame" = "black", diff --git a/workflow/scripts/report/phylo_plots.R b/workflow/scripts/report/phylo_plots.R index 1a3340c..087b955 100644 --- a/workflow/scripts/report/phylo_plots.R +++ b/workflow/scripts/report/phylo_plots.R @@ -16,21 +16,13 @@ log_threshold(INFO) source(snakemake@params[["design"]]) # legend thresholds for ml tree + legend.names <- c( tip_label = "Studied samples", - Bootstrap_pass = sprintf( - "bootstrap >= %s", - snakemake@params[["boot_th"]] - ), - alrt_pass = sprintf( - "aLRT >= %s", - snakemake@params[["alrt_th"]] - ), - boot_alrt_pass = sprintf( - "bootstrap >= %s & aLRT >= %s ", - snakemake@params[["boot_th"]], - snakemake@params[["alrt_th"]] - ) + boot_alrt_pass = sprintf("UFBoot ≥ %s%s & SH-aLRT ≥ %s%s ", + snakemake@params[["boot_th"]], "%", + snakemake@params[["alrt_th"]], "%" + ) ) @@ -216,9 +208,7 @@ aLRT.mask <- aLRT.values >= snakemake@params[["alrt_th"]] boot.mask <- bootstrap.values >= snakemake@params[["boot_th"]] node.color <- case_when( - aLRT.mask & boot.mask ~ "boot_alrt_pass", - !aLRT.mask & boot.mask ~ "Bootstrap_pass", - aLRT.mask & !boot.mask ~ "alrt_pass" + aLRT.mask & boot.mask ~ "boot_alrt_pass" ) # MRCA for studied samples @@ -226,26 +216,33 @@ study.mrca <- getMRCA(tree_ml, study_names) log_info("Plotting M-L tree with context samples") p <- ggtree(tree_ml, layout = "circular") + - geom_highlight(node = study.mrca, colour = "red", fill = "red", alpha = 0.2) + - geom_point( - aes( - color = c(tip.color, node.color), # First node points in the tree are tip points, then internal nodes - shape = c(rep("tip", length(tree_ml$tip.label)), rep("node", length(tree_ml$node.label))) - ), - show.legend = TRUE - ) + - geom_treescale(x = 0.0008) + - geom_rootedge(0.0005) + - xlim(-0.0008, NA) + - scale_shape_manual(values = c( - tip = 1, - node = 20 - ), guide = "none") + - labs(color = "Class") + - scale_color_manual(values = tree_colors, - na.value = NA, - labels = legend.names - ) + geom_highlight(node = study.mrca, colour = "red", fill = "red", alpha = 0) + + geom_point( + aes( + color = c(tip.color, node.color), # First node points in the tree are tip points, then internal nodes + size = c(rep("tip_label", length(tree_ml$tip.label)), rep("boot_alrt_pass", length(tree_ml$node.label))), + alpha = c(rep("tip_label", length(tree_ml$tip.label)), rep("boot_alrt_pass", length(tree_ml$node.label))) + ), + show.legend = TRUE + ) + + geom_treescale(x = 0.0008) + + geom_rootedge(0.0005) + + xlim(-0.0008, NA) + + scale_size_manual( + name = "Class", + values = node.size, + labels = legend.names + ) + + scale_alpha_manual( + name = "Class", + values = node.alpha, + labels = legend.names + ) + + labs(color = "Class") + + scale_color_manual(values = tree_colors, + na.value = NA, + labels = legend.names + ) ggsave( filename = snakemake@output[["tree_ml"]], From 408083de9c392dae21c101186a216f94e5cb9a46 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 13:19:18 +0200 Subject: [PATCH 07/27] Update phylogeny footnotes --- template.qmd | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/template.qmd b/template.qmd index 26a55b0..2ba50a7 100644 --- a/template.qmd +++ b/template.qmd @@ -130,15 +130,15 @@ The nucleotide diversity value for the studied samples is $`r div_values[["diver A maximum likelihood tree has been adjusted with the context sequences. The studied samples `r stats[["monophyly"]]` monophyletic. The clade that contains all the studied samples is supported by a **UFBoot** score of $`r stats[["boot"]]`$% and a **SH-aLRT** score of $`r stats[["alrt"]]`$%. -![**Figure 3.** Maximum-likelihood tree whith studied and context sequence. Studied sequence are represented in red.](`r params$tree_ml`) +![**Figure 3.** Maximum-likelihood phylogeny with 1000 support replicates of target datasets with their context samples. The clade that contains the studied samples is squared in red.](`r params$tree_ml`) In addition, phylogeny has been reconstructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data. -![**Figure 4.** Neighbour-joining tree for studied samples.](`r params$tree`) +![**Figure 4.** Neighbor-joining trees constructed with the pairwise allele frequency-weighted distances.](`r params$tree`) With the neighbur-joining tree, partistic distances to root has been correlated with time obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. -![**Figure 5.** Partistic distance to root in the neighbour-joining tree against days since first sample. Red line indicates the adjusted linear regression model.](`r params$tempest`) +![**Figure 5.** Scatterplot showing the relationship between root-to-tip distances and the number of days passed since the first sample. The red line show the linear model fit.](`r params$tempest`) ## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection From 4332ae6d317d29d31f82cb62a8b7f96e03c91f6b Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 13:22:41 +0200 Subject: [PATCH 08/27] Flip diversity and phyo order --- template.qmd | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/template.qmd b/template.qmd index 2ba50a7..f8eb17f 100644 --- a/template.qmd +++ b/template.qmd @@ -118,14 +118,6 @@ The most probably lineage admixture for each sample has been calculated with sof ![**Figure 1.** Lineage admixture calculated with Freyja for each sample. SSamples in the X-axis are ordered chronologically, from more ancient to newer.](`r params$freyja`) -### Nucleotide diversity comparison - -Nucleotide diversity is has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*). - -The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]] `$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. - -![**Figure 2.** Analysis of the nucleotide diversity (π). The orange line describe a normal distribution with the same mean and standard deviation as the distribution of π values obtained from $`r div_values[["boot.reps"]]`$ samples of $`r div_values[["sample.size"]]`$ sequences from the context. The red vertical line indicate the π value for the studied samples.](`r params$div`) - ### Phylogeny and temporal signal A maximum likelihood tree has been adjusted with the context sequences. The studied samples `r stats[["monophyly"]]` monophyletic. The clade that contains all the studied samples is supported by a **UFBoot** score of $`r stats[["boot"]]`$% and a **SH-aLRT** score of $`r stats[["alrt"]]`$%. @@ -136,10 +128,19 @@ In addition, phylogeny has been reconstructed using pairwise distances between t ![**Figure 4.** Neighbor-joining trees constructed with the pairwise allele frequency-weighted distances.](`r params$tree`) -With the neighbur-joining tree, partistic distances to root has been correlated with time obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. +With the neighbor-joining tree, partistic distances to root has been correlated with time obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. ![**Figure 5.** Scatterplot showing the relationship between root-to-tip distances and the number of days passed since the first sample. The red line show the linear model fit.](`r params$tempest`) +### Nucleotide diversity comparison + +Nucleotide diversity is has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*). + +The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]] `$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. + +![**Figure 2.** Analysis of the nucleotide diversity (π). The orange line describe a normal distribution with the same mean and standard deviation as the distribution of π values obtained from $`r div_values[["boot.reps"]]`$ samples of $`r div_values[["sample.size"]]`$ sequences from the context. The red vertical line indicate the π value for the studied samples.](`r params$div`) + + ## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection ### Number of polymorphic sites From 355bcc48d623db78573c364429489bf53c38d2f2 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 13:48:49 +0200 Subject: [PATCH 09/27] Update whithin-host evo section --- template.qmd | 24 ++++++++++++------------ workflow/scripts/report/NV_description.R | 4 +++- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/template.qmd b/template.qmd index f8eb17f..14f9e75 100644 --- a/template.qmd +++ b/template.qmd @@ -122,15 +122,15 @@ The most probably lineage admixture for each sample has been calculated with sof A maximum likelihood tree has been adjusted with the context sequences. The studied samples `r stats[["monophyly"]]` monophyletic. The clade that contains all the studied samples is supported by a **UFBoot** score of $`r stats[["boot"]]`$% and a **SH-aLRT** score of $`r stats[["alrt"]]`$%. -![**Figure 3.** Maximum-likelihood phylogeny with 1000 support replicates of target datasets with their context samples. The clade that contains the studied samples is squared in red.](`r params$tree_ml`) +![**Figure 2.** Maximum-likelihood phylogeny with 1000 support replicates of target datasets with their context samples. The clade that contains the studied samples is squared in red.](`r params$tree_ml`) In addition, phylogeny has been reconstructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data. -![**Figure 4.** Neighbor-joining trees constructed with the pairwise allele frequency-weighted distances.](`r params$tree`) +![**Figure 3.** Neighbor-joining trees constructed with the pairwise allele frequency-weighted distances.](`r params$tree`) With the neighbor-joining tree, partistic distances to root has been correlated with time obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. -![**Figure 5.** Scatterplot showing the relationship between root-to-tip distances and the number of days passed since the first sample. The red line show the linear model fit.](`r params$tempest`) +![**Figure 4.** Scatterplot showing the relationship between root-to-tip distances and the number of days passed since the first sample. The red line show the linear model fit.](`r params$tempest`) ### Nucleotide diversity comparison @@ -138,7 +138,7 @@ Nucleotide diversity is has been calculated for $`r div_values[["boot.reps"]]`$ The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]] `$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. -![**Figure 2.** Analysis of the nucleotide diversity (π). The orange line describe a normal distribution with the same mean and standard deviation as the distribution of π values obtained from $`r div_values[["boot.reps"]]`$ samples of $`r div_values[["sample.size"]]`$ sequences from the context. The red vertical line indicate the π value for the studied samples.](`r params$div`) +![**Figure 5.** Analysis of the nucleotide diversity (π). The orange line describe a normal distribution with the same mean and standard deviation as the distribution of π values obtained from $`r div_values[["boot.reps"]]`$ samples of $`r div_values[["sample.size"]]`$ sequences from the context. The red vertical line indicate the π value for the studied samples.](`r params$div`) ## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection @@ -147,31 +147,31 @@ The nucleotide diversity value for the studied samples is $`r div_values[["diver Sites with minor allele frequency > 0.05 are considered as polymorphic sites. -![**Figure 6.** Number of polymorphisms over time for the studied samples.](`r params$fig_cor_snp`) +![**Figure 6.** Number of polymorphic sites depending on collection date. The red line shows the linear model fit.](`r params$fig_cor_snp`) ### Description for within-host nucleotide variants -`r n_SNV` different SNV and `r n_INDELS` INDELS had been detected along the genome. In the **Figure 7a** is shown their distribution along the genome and the variation in allele frequencies in the studied samples. +`r n_SNV` different SNV and `r n_INDELS` INDELS had been detected along the genome. In the **Figure 7** is shown their distribution along the genome and the variation in allele frequencies in the studied samples. ::: {.panel-tabset} ## Whole genome -![**Figure 7a.** A\) Proportion of polymorphic sites along the genome calculated with 1000nt windows. B\) Nucleotide variants (NV) in the genomes of the studied samples. The shape and color indicates the type of NV. Samples are ordered by collection date.](`r params$SNV`) +![**Figure 7.** Summary of the intra-host accumulation of nucleotide variants (NV), using the reconstructed dataset ancestor as reference. A) Nucleotide variants per site along the SARS-CoV-2 genome. Relative abundance of NVs is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. Labels indicate the coding regions of the non structural proteins (NSP) within ORF1ab. B) Genome variation along the genome for each sample. The Y-axis displays samples in chronological order, with the earliest collection date at the bottom, and the latest, at the top.](`r params$SNV`) ## Spike sequence -![**Figure 7b.** A\) Proportion of polymorphic sites along the spike nucleotide sequence calculated with 1000nt windows. B\) Nucleotide variants (NV) in the spike sequences of the studied samples. The shape and color indicates the type of NV. Samples are ordered by collection date.](`r params$SNV_s`) +![**Figure 8.** Summary of the intra-host accumulation of nucleotide variants (NV) in the spike sequence, using the reconstructed dataset ancestor as reference. A) Nucleotide variants per site along the spike sequence. Relative abundance of NVs is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. B) Genome variation along the spike sequence for each sample. The Y-axis displays samples in chronological order, with the earliest collection date at the bottom, and the latest, at the top.](`r params$SNV_s`) ::: ### Time dependency for the within-host nucleotide variants The correlation with time is calculated for the allele frequency of each NV. -![**Figure 8.** *Pearson* r and adjusted p-value (-log10) for the correlation with time for each SNV. Red line indicates a p-value of 0.05.](`r params$volcano`) +![**Figure 9.** Pearson’s correlation coefficients and BH-adjusted p-values for all detected nucleotide variants. Red dashed line indicates adjusted p = 0.05. Labeled dots represent nucleotide variants correlated with time (adjusted p < 0.05).](`r params$volcano`) The significant correlated NV are swown in detail: -![**Figure 9.** Trend over time for the significantly correlated with time SNVs and for positions with more than 1 alternative allele. ](`r params$panel`) +![**Figure 10.** Time series of the relative allele frequencies for nucleotide variants (NV) with a significant time correlation, and for those that share their site in the genome. Each subplot depicts the progression of the allele frequencies in time for a given genome position.](`r params$panel`) ### Correlation between alternative alleles @@ -186,11 +186,11 @@ heatmaply_cor( column_text_angle = 90) ``` -![**Figure 10.** Heatmap for pairwise correlation between frquencies for alternative alleles.]() +![**Figure 10.** Hierarchically clustered heatmap of the pairwise Pearson’s correlation coefficients between the time series of allele frequencies in the case study.]() ### Non-synonymous to synonymous rate ratio over time -![**Figure 11.** Sinonymous mutations per sinonymous site **(dS)** and non sinonymous mutations mutations per non sinonymous site **(dN)** over time. The values are adjusted by the allele frequency.](`r params$evo`) +![**Figure 11.** Time series of dN and dS. Each point corresponds to a different sample, sorted in chronological order.](`r params$evo`) diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R index b1d0d6e..ee17d8e 100644 --- a/workflow/scripts/report/NV_description.R +++ b/workflow/scripts/report/NV_description.R @@ -470,7 +470,9 @@ n_snv <- length(unique(vcf$SNP)) - n_indels list( "INDELS" = n_indels, - "SNV" = n_snv + "SNV" = n_snv, + "window" = snakemake@params[["window"]], + "step" = snakemake@params[["step"]] ) %>% toJSON() %>% write(snakemake@output[["json"]]) From f2bd17c4b2f37566a047feeabcd5fbc9c8a32af9 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 14:37:17 +0200 Subject: [PATCH 10/27] Update header --- config/report.styles.css | 4 ++++ template.qmd | 4 +--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/config/report.styles.css b/config/report.styles.css index 43f6c4b..66e9724 100644 --- a/config/report.styles.css +++ b/config/report.styles.css @@ -8,6 +8,10 @@ background-origin: content-box; } +.title { + font-family: Montserrat; + color: #FFFFFF +} html { font-family: Montserrat; color: #000000; diff --git a/template.qmd b/template.qmd index 14f9e75..1cc7bec 100644 --- a/template.qmd +++ b/template.qmd @@ -1,7 +1,5 @@ --- -title: "Case-study-SARS-CoV-2" -author: "Jordi Sevilla" -institute: "PathoGenOmics-lab" +title: "VIPERA: report for the studied samples" date: last-modified date-format: "DD-MM-YYYY" title-block-banner: true From d943931de705b8e2b04ebb7e5b145aceec108313 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 14:41:50 +0200 Subject: [PATCH 11/27] Update p-value format in temp est --- workflow/scripts/report/phylo_plots.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/report/phylo_plots.R b/workflow/scripts/report/phylo_plots.R index 087b955..068dd6d 100644 --- a/workflow/scripts/report/phylo_plots.R +++ b/workflow/scripts/report/phylo_plots.R @@ -277,7 +277,10 @@ monophyletic <- ifelse(is.monophyletic(tree_ml, study_names), "are", "are not") list( "sub_rate" = model$coefficients[[2]] * 365, "r2" = summary(model)$r.squared[[1]], - "pvalue" = cor.test(tempest$distance, tempest$date_interval)$p.value, + "pvalue" = ifelse(cor.test(tempest$distance, tempest$date_interval)$p.value < 0.001, + "< 0.001", + cor.test(tempest$distance, tempest$date_interval)$p.value + ), "boot" = strsplit(study.node, "/")[[1]][2] %>% as.numeric(), "alrt" = strsplit(study.node, "/")[[1]][1] %>% as.numeric(), "monophyly"= monophyletic From c1f49aea430b871e7ab1184afe1267846cf740ce Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 15:08:21 +0200 Subject: [PATCH 12/27] Format number of poly sites --- template.qmd | 2 +- workflow/scripts/report/NV_description.R | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/template.qmd b/template.qmd index 1cc7bec..4a23250 100644 --- a/template.qmd +++ b/template.qmd @@ -143,7 +143,7 @@ The nucleotide diversity value for the studied samples is $`r div_values[["diver ### Number of polymorphic sites -Sites with minor allele frequency > 0.05 are considered as polymorphic sites. +Sites with minor allele frequency > 0.05 are considered as polymorphic sites. Collection date explains the number of polymorphic sites with an $R^2$ of **`r nv.counts[["r2"]]`** with a p-value of $`r nv.counts[["value"]]`$. ![**Figure 6.** Number of polymorphic sites depending on collection date. The red line shows the linear model fit.](`r params$fig_cor_snp`) diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R index ee17d8e..3d35559 100644 --- a/workflow/scripts/report/NV_description.R +++ b/workflow/scripts/report/NV_description.R @@ -360,7 +360,7 @@ ggsave( # Figure for nº of heterozygus sites for each sample log_info("Plotting nº of heterozygus sites for each sample") -figur_SNP_time <- vcf_snp %>% +figur_SNP_table <- vcf_snp %>% filter(ALT_FREQ <= 0.95) %>% select(!GFF_FEATURE) %>% unique() %>% @@ -373,7 +373,9 @@ figur_SNP_time <- vcf_snp %>% CollectionDate = min(as.Date(CollectionDate)), n = n_distinct(POS) ) %>% - ungroup() %>% + ungroup() + +figur_SNP_time <- figur_SNP_table %>% ggplot() + aes( x = CollectionDate, @@ -385,7 +387,6 @@ figur_SNP_time <- vcf_snp %>% alpha = 0.6 ) + geom_point() + - stat_cor(geom = "label") + labs( x = "Date", y = "Nº of polimorphic sites" @@ -467,12 +468,15 @@ n_indels <- vcf %>% length() n_snv <- length(unique(vcf$SNP)) - n_indels - +model <- lm(n ~ CollectionDate, data = figur_SNP_table) +cortest <- cor.test(figur_SNP_table$n, as.numeric(figur_SNP_table$CollectionDate)) list( "INDELS" = n_indels, "SNV" = n_snv, "window" = snakemake@params[["window"]], - "step" = snakemake@params[["step"]] + "step" = snakemake@params[["step"]], + "r2" = summary(model)$r.squared[[1]], + "value" = ifelse(cortest$p.value < 0.001, "< 0.001", cortest$p.value) ) %>% toJSON() %>% write(snakemake@output[["json"]]) From 5d86ad6a9152f502983a877b8a570cdde17792e0 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 16:12:02 +0200 Subject: [PATCH 13/27] Typos and text correction --- template.qmd | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/template.qmd b/template.qmd index 4a23250..e0b804c 100644 --- a/template.qmd +++ b/template.qmd @@ -112,7 +112,7 @@ opt_table_font( ### Lineage admixture -The most probably lineage admixture for each sample has been calculated with software [Freyja](https://github.com/andersen-lab/Freyja). +The most probable lineage admixture for each sample has been calculated using the software [Freyja](https://github.com/andersen-lab/Freyja). ![**Figure 1.** Lineage admixture calculated with Freyja for each sample. SSamples in the X-axis are ordered chronologically, from more ancient to newer.](`r params$freyja`) @@ -122,17 +122,17 @@ A maximum likelihood tree has been adjusted with the context sequences. The stud ![**Figure 2.** Maximum-likelihood phylogeny with 1000 support replicates of target datasets with their context samples. The clade that contains the studied samples is squared in red.](`r params$tree_ml`) -In addition, phylogeny has been reconstructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data. +In addition, a neighbor-joining tree has been constructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data. ![**Figure 3.** Neighbor-joining trees constructed with the pairwise allele frequency-weighted distances.](`r params$tree`) -With the neighbor-joining tree, partistic distances to root has been correlated with time obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. +With the neighbor-joining tree, patristic distances to root have been correlated with time, obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. ![**Figure 4.** Scatterplot showing the relationship between root-to-tip distances and the number of days passed since the first sample. The red line show the linear model fit.](`r params$tempest`) ### Nucleotide diversity comparison -Nucleotide diversity is has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*). +Nucleotide diversity has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*). The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]] `$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. @@ -149,7 +149,7 @@ Sites with minor allele frequency > 0.05 are considered as polymorphic sites. C ### Description for within-host nucleotide variants -`r n_SNV` different SNV and `r n_INDELS` INDELS had been detected along the genome. In the **Figure 7** is shown their distribution along the genome and the variation in allele frequencies in the studied samples. +A total of `r n_SNV` different SNVs and `r n_INDELS` INDELs have been detected along the genome. **Figure 7** shows their distribution along the genome and the variation in allele frequencies in the studied samples. ::: {.panel-tabset} @@ -167,13 +167,13 @@ The correlation with time is calculated for the allele frequency of each NV. ![**Figure 9.** Pearson’s correlation coefficients and BH-adjusted p-values for all detected nucleotide variants. Red dashed line indicates adjusted p = 0.05. Labeled dots represent nucleotide variants correlated with time (adjusted p < 0.05).](`r params$volcano`) -The significant correlated NV are swown in detail: +The significantly correlated NV are shown in detail: ![**Figure 10.** Time series of the relative allele frequencies for nucleotide variants (NV) with a significant time correlation, and for those that share their site in the genome. Each subplot depicts the progression of the allele frequencies in time for a given genome position.](`r params$panel`) ### Correlation between alternative alleles -To see posible interactions between mutations, pairwise correlation between the alternative frequencies are calculated. +To see possible interactions between mutations, pairwise correlation between the alternative frequencies are calculated. ```{r, echo= F,message= F, fig.align='center'} From 2a8bb8b06073f7e88ca7e6127e0c097bf416c65d Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 16:13:49 +0200 Subject: [PATCH 14/27] Add window and step info to the report --- workflow/rules/report.smk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 59363d4..47ca0b0 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -70,7 +70,9 @@ rule general_NV_description: samples = expand("{sample}", sample = iter_samples()), design = config["PLOTS"], nsp = config["NSP"], - metadata = config["METADATA"] + metadata = config["METADATA"], + window = 1000, + step = 1 input: window = OUTDIR/f"{OUTPUT_NAME}.window.csv", vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv" From 08ab9b16147cd807dd469f55f1c971ab1db54937 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 17:18:28 +0200 Subject: [PATCH 15/27] Reverse NV legend --- workflow/scripts/report/NV_description.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R index 3d35559..228fab3 100644 --- a/workflow/scripts/report/NV_description.R +++ b/workflow/scripts/report/NV_description.R @@ -198,6 +198,9 @@ variants <- vcf %>% color = "Classification", alpha = "Frequency", fill = "Region" + ) + + guides( + fill = guide_legend(reverse = TRUE) ) @@ -333,7 +336,10 @@ variants_spike <- vcf %>% color = "Classification", alpha = "Frequency", fill = "Region" - ) + ) + + guides( + fill = guide_legend(reverse = TRUE) + ) From 2c33a973c84773cbe5d960a8c7d176813604be46 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 19:24:08 +0200 Subject: [PATCH 16/27] Discrete colors for snps --- workflow/scripts/report/snp_plots.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/scripts/report/snp_plots.R b/workflow/scripts/report/snp_plots.R index 3f34717..2453153 100644 --- a/workflow/scripts/report/snp_plots.R +++ b/workflow/scripts/report/snp_plots.R @@ -6,7 +6,9 @@ library(ggrepel) library(logger) log_threshold(INFO) - +# Get colors +color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = TRUE)] +color = color[grep('white', color, invert = TRUE)] # Import file with plots style source(snakemake@params[["design"]]) @@ -221,7 +223,7 @@ panel <- vcf %>% y = ALT_FREQ, color = SNP ) + - scale_color_viridis_d() + + scale_color_manual(values = sample(color, length(subset))) + geom_point() + geom_line() + theme( From 5a2000c76d02f144f23950c8a575df812ce86703 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Tue, 10 Oct 2023 20:11:12 +0200 Subject: [PATCH 17/27] Dont remove window table --- workflow/rules/report.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 47ca0b0..b06a567 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -20,7 +20,7 @@ rule window: vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", gb = OUTDIR/"reference.gb" output: - window_df = temp(OUTDIR/f"{OUTPUT_NAME}.window.csv"), + window_df = REPORT_DIR_TABLES/f"{OUTPUT_NAME}.window.csv", log: LOGDIR / "window" / "log.txt" script: @@ -74,7 +74,7 @@ rule general_NV_description: window = 1000, step = 1 input: - window = OUTDIR/f"{OUTPUT_NAME}.window.csv", + window = REPORT_DIR_TABLES/f"{OUTPUT_NAME}.window.csv", vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv" output: fig = report((REPORT_DIR_PLOTS/"figure_7a.png").resolve()), From 92728a0a7a8fe55c6cd4385b252fb667e025c5bc Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 09:30:00 +0200 Subject: [PATCH 18/27] Do not remove primary files --- workflow/rules/fetch.smk | 6 +++--- workflow/rules/report.smk | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk index 6a1d46d..f7472da 100644 --- a/workflow/rules/fetch.smk +++ b/workflow/rules/fetch.smk @@ -17,7 +17,7 @@ rule fetch_alignment_gb: params: ref = config["ALIGNMENT_REFERENCE"] output: - fasta = temp(OUTDIR/"reference.gb") + fasta = OUTDIR/"reference.gb" log: LOGDIR / "fetch_alignment_reference" / "log.txt" shell: @@ -47,7 +47,7 @@ rule fetch_alignment_annotation: params: ref = config["ALIGNMENT_REFERENCE"] output: - temp(OUTDIR/"reference.gff3") + OUTDIR/"reference.gff3" log: LOGDIR / "fetch_alignment_annotation" / "log.txt" shell: @@ -61,6 +61,6 @@ rule fetch_problematic_vcf: log: LOGDIR / "fetch_problematic_vcf" / "log.txt" output: - temp(OUTDIR / "problematic_sites.vcf") + OUTDIR / "problematic_sites.vcf" shell: "curl {params.url} -o {output} -s 2> {log}" diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index b06a567..95c8dc5 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -20,7 +20,7 @@ rule window: vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", gb = OUTDIR/"reference.gb" output: - window_df = REPORT_DIR_TABLES/f"{OUTPUT_NAME}.window.csv", + window_df = temp(OUDIR/f"{OUTPUT_NAME}.window.csv"), log: LOGDIR / "window" / "log.txt" script: @@ -74,7 +74,7 @@ rule general_NV_description: window = 1000, step = 1 input: - window = REPORT_DIR_TABLES/f"{OUTPUT_NAME}.window.csv", + window = OUTDIR/f"{OUTPUT_NAME}.window.csv", vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv" output: fig = report((REPORT_DIR_PLOTS/"figure_7a.png").resolve()), From 329f34a616fb8f24526ca75a14815861a76a94fd Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 09:30:47 +0200 Subject: [PATCH 19/27] Set default step as 50 --- workflow/rules/report.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 95c8dc5..3a0abc1 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -15,7 +15,7 @@ rule window: conda: "../envs/biopython.yaml" params: window = 1000, - step = 1 + step = 50 input: vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", gb = OUTDIR/"reference.gb" From a6904b0a263716bdfcb721d30c0a2c7e3ac4b299 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 09:31:36 +0200 Subject: [PATCH 20/27] Typo error --- workflow/rules/report.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 3a0abc1..1cb97a2 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -20,7 +20,7 @@ rule window: vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", gb = OUTDIR/"reference.gb" output: - window_df = temp(OUDIR/f"{OUTPUT_NAME}.window.csv"), + window_df = temp(OUTDIR/f"{OUTPUT_NAME}.window.csv"), log: LOGDIR / "window" / "log.txt" script: From 8d017477269f6ae5eb61b9fb61ee1c2f14b822a3 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 10:43:31 +0200 Subject: [PATCH 21/27] Set step 50 as default --- workflow/rules/report.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index 1cb97a2..c789a19 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -72,7 +72,7 @@ rule general_NV_description: nsp = config["NSP"], metadata = config["METADATA"], window = 1000, - step = 1 + step = 50 input: window = OUTDIR/f"{OUTPUT_NAME}.window.csv", vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv" From 2e92e8aacd120278dd35dbecc8df7fc5797c1888 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 11:13:44 +0200 Subject: [PATCH 22/27] Add window and step to general config --- config/config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/config.yaml b/config/config.yaml index 0265b52..81473ca 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -20,6 +20,9 @@ DEMIX: MIN_QUALITY: 20 COV_CUTOFF: 30 MIN_ABUNDANCE: 0.0001 +WINDOW: + WIDTH: 1000 + STEP: 50 GISAID_YAML: "config/gisaid.yaml" DIVERSITY_BOOTSTRAP_REPS: From e061b6411f8bb22a8d525b6fce9f4e9d8c436e71 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 11:14:01 +0200 Subject: [PATCH 23/27] Update dn/ds colors --- config/design_plots.R | 20 ++++++++++++++++---- workflow/scripts/report/evo_plots.R | 5 ++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/config/design_plots.R b/config/design_plots.R index 477f88e..3671098 100644 --- a/config/design_plots.R +++ b/config/design_plots.R @@ -22,8 +22,7 @@ theme_update(text = element_text(size = 16, family = "Montserrat"), - -# AJUSTES #### +# CONFIG gene_colors = c(M = "#B4D4B4", @@ -38,7 +37,7 @@ gene_colors = c(M = "#B4D4B4", ORF10 = "#CACB5D") - +# M-L tree colors and labels tree_colors = c( tip_label = "#D944AA", boot_alrt_pass = "#64acee" @@ -56,6 +55,8 @@ node.alpha <- c( ) +# Nucleotide variants classification colors and labels + NV_colors <- c( Frameshift = "#568D63", "In frame" = "black", @@ -70,4 +71,15 @@ NV_names <- c( Intergenic = "Intergenic", No = "Non synonymous", yes = "Synonymous" -) \ No newline at end of file +) + +# dn ds colors and labels +dnds.labels <- c( + dn = "dN", + ds = "dS" +) + +dnds.colors <- c( + dn = "#E53E47", + ds = "#2C47F5" +) diff --git a/workflow/scripts/report/evo_plots.R b/workflow/scripts/report/evo_plots.R index 14cf55b..c241646 100644 --- a/workflow/scripts/report/evo_plots.R +++ b/workflow/scripts/report/evo_plots.R @@ -81,7 +81,10 @@ plot <- vcf %>% ) + geom_point() + geom_line() + - scale_color_hue(labels = c("dN", "dS")) + + scale_color_manual( + labels = dnds.labels, + values = dnds.colors + ) + labs( y = "", x = "Time since first sample", From 6d0a99c77efebd5a613cbb202fbf124e1f28abff Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 11:14:36 +0200 Subject: [PATCH 24/27] add window and step to general config --- workflow/rules/report.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index c789a19..a05e455 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -14,8 +14,8 @@ rule heatmap: rule window: conda: "../envs/biopython.yaml" params: - window = 1000, - step = 50 + window = config["WINDOW"]["WIDTH"], + step = config["WINDOW"]["STEP"] input: vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", gb = OUTDIR/"reference.gb" @@ -71,8 +71,8 @@ rule general_NV_description: design = config["PLOTS"], nsp = config["NSP"], metadata = config["METADATA"], - window = 1000, - step = 50 + window = config["WINDOW"]["WIDTH"], + step = config["WINDOW"]["STEP"] input: window = OUTDIR/f"{OUTPUT_NAME}.window.csv", vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv" From 7f95b1c7693f948c9532cd62fbecfc8f69e26959 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 11:15:20 +0200 Subject: [PATCH 25/27] set gff as temp file --- workflow/rules/fetch.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk index f7472da..3f81ad0 100644 --- a/workflow/rules/fetch.smk +++ b/workflow/rules/fetch.smk @@ -47,7 +47,7 @@ rule fetch_alignment_annotation: params: ref = config["ALIGNMENT_REFERENCE"] output: - OUTDIR/"reference.gff3" + temp(OUTDIR/"reference.gff3") log: LOGDIR / "fetch_alignment_annotation" / "log.txt" shell: From 74a6fc71f9187d63eae59d9144b9a5caf7749334 Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 11:50:40 +0200 Subject: [PATCH 26/27] Set workflow version as subtitle --- config/report.styles.css | 4 ++++ template.qmd | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/config/report.styles.css b/config/report.styles.css index 66e9724..f81e0a9 100644 --- a/config/report.styles.css +++ b/config/report.styles.css @@ -12,6 +12,10 @@ font-family: Montserrat; color: #FFFFFF } +.quarto-title p { + font-family: Montserrat; + color: #FFFFFF +} html { font-family: Montserrat; color: #000000; diff --git a/template.qmd b/template.qmd index e0b804c..6c3909c 100644 --- a/template.qmd +++ b/template.qmd @@ -1,5 +1,6 @@ --- title: "VIPERA: report for the studied samples" +subtitle: "Workflow: `r params$workflow_version`" date: last-modified date-format: "DD-MM-YYYY" title-block-banner: true @@ -42,7 +43,6 @@ output-file: report.html -> Workflow: `r params$workflow_version` ```{r , echo= F,message= F, include = F} library(jsonlite) From 733dd8f8eb4c300705fb06e788772ff1f5ced6de Mon Sep 17 00:00:00 2001 From: Jordi Sevilla Date: Wed, 11 Oct 2023 11:57:55 +0200 Subject: [PATCH 27/27] Update title --- template.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/template.qmd b/template.qmd index 6c3909c..6b0d6cd 100644 --- a/template.qmd +++ b/template.qmd @@ -1,5 +1,5 @@ --- -title: "VIPERA: report for the studied samples" +title: "VIPERA report" subtitle: "Workflow: `r params$workflow_version`" date: last-modified date-format: "DD-MM-YYYY"