diff --git a/config/config.yaml b/config/config.yaml index 3860db2..1f19727 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -24,6 +24,9 @@ DEMIX: MIN_QUALITY: 20 COV_CUTOFF: 30 MIN_ABUNDANCE: 0.0001 +WINDOW: + WIDTH: 1000 + STEP: 50 GISAID_YAML: "config/gisaid.yaml" DIVERSITY_BOOTSTRAP_REPS: diff --git a/config/design_plots.R b/config/design_plots.R index 39e1d9c..3671098 100644 --- a/config/design_plots.R +++ b/config/design_plots.R @@ -22,8 +22,7 @@ theme_update(text = element_text(size = 16, family = "Montserrat"), - -# AJUSTES #### +# CONFIG gene_colors = c(M = "#B4D4B4", @@ -37,28 +36,27 @@ gene_colors = c(M = "#B4D4B4", ORF7 = "#AA88CB", ORF10 = "#CACB5D") -snp_colors = c("C-26029-A" = "#e69138", - "G-21985-T" = "#ffd966", - "C-29421-T" = "#935fca", - "C-23043-T" = "#7c1d6f", - "C-28854-T" = "#ff4639" - ,"C-5178-T" = "#d5a6bd" - ,"C-4230-A" = "#1e3f9d" - ,"C-16375-T" = "#1EA0AE" - , "A-11923-C" = "#64acee" - ,"G-12761-A" = "#084c61" - ,"C-6033-T" = "chartreuse4" - ,"C-4230-T" = "#93c47d", - "Other" = "gray40") - +# M-L tree colors and labels tree_colors = c( - tip_label = "blue", - Bootstrap_pass = "#ff6600", - alrt_pass = "#ffbf51", - boot_alrt_pass = "red" + tip_label = "#D944AA", + boot_alrt_pass = "#64acee" +) + + +node.size <- c( + tip_label = 2, + boot_alrt_pass = 0.8 +) + +node.alpha <- c( + tip_label = 0.6, + boot_alrt_pass = 0.7 ) + +# Nucleotide variants classification colors and labels + NV_colors <- c( Frameshift = "#568D63", "In frame" = "black", @@ -73,4 +71,15 @@ NV_names <- c( Intergenic = "Intergenic", No = "Non synonymous", yes = "Synonymous" -) \ No newline at end of file +) + +# dn ds colors and labels +dnds.labels <- c( + dn = "dN", + ds = "dS" +) + +dnds.colors <- c( + dn = "#E53E47", + ds = "#2C47F5" +) diff --git a/config/report.styles.css b/config/report.styles.css index 43f6c4b..f81e0a9 100644 --- a/config/report.styles.css +++ b/config/report.styles.css @@ -8,6 +8,14 @@ background-origin: content-box; } +.title { + font-family: Montserrat; + color: #FFFFFF +} +.quarto-title p { + font-family: Montserrat; + color: #FFFFFF +} html { font-family: Montserrat; color: #000000; diff --git a/template.qmd b/template.qmd index fc9bb8e..6b0d6cd 100644 --- a/template.qmd +++ b/template.qmd @@ -1,7 +1,6 @@ --- -title: "Case-study-SARS-CoV-2" -author: "Jordi Sevilla" -institute: "PathoGenOmics-lab" +title: "VIPERA report" +subtitle: "Workflow: `r params$workflow_version`" date: last-modified date-format: "DD-MM-YYYY" title-block-banner: true @@ -44,7 +43,6 @@ output-file: report.html -> Workflow: `r params$workflow_version` ```{r , echo= F,message= F, include = F} library(jsonlite) @@ -86,7 +84,7 @@ cor.mat <- cor(vcf) cor.mat[is.na(cor.mat)] <- 0 ``` -## Samples summary +## Summary of the target samples dataset ```{r, echo= F,message= F} gt(table) %>% cols_label( @@ -109,72 +107,73 @@ opt_table_font( ) ``` -## Evidence for chronic infection +## Evidence for single, serially-sampled infection ### Lineage admixture -The most probably lineage admixture for each sample has been calculated with software [Freyja](https://github.com/andersen-lab/Freyja). +The most probable lineage admixture for each sample has been calculated using the software [Freyja](https://github.com/andersen-lab/Freyja). -![**Figure 1.** Lineage composition of samples according to Freyja. Samples are ordered by collection date.](`r params$freyja`) +![**Figure 1.** Lineage admixture calculated with Freyja for each sample. SSamples in the X-axis are ordered chronologically, from more ancient to newer.](`r params$freyja`) -### Nucleotide diversity comparison +### Phylogeny and temporal signal -Nucleotide diversity is has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*). +A maximum likelihood tree has been adjusted with the context sequences. The studied samples `r stats[["monophyly"]]` monophyletic. The clade that contains all the studied samples is supported by a **UFBoot** score of $`r stats[["boot"]]`$% and a **SH-aLRT** score of $`r stats[["alrt"]]`$%. -The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]]`$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. +![**Figure 2.** Maximum-likelihood phylogeny with 1000 support replicates of target datasets with their context samples. The clade that contains the studied samples is squared in red.](`r params$tree_ml`) -![**Figure 2.** Distribution of nucleotide diversity pi for 1000 randomly selected groups from context sequences. Red line indicates the pi value for the studied samples.](`r params$div`) +In addition, a neighbor-joining tree has been constructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data. -### Phylogeny and temporal signal +![**Figure 3.** Neighbor-joining trees constructed with the pairwise allele frequency-weighted distances.](`r params$tree`) -A maximum likelihood tree has been adjusted with the context sequences: +With the neighbor-joining tree, patristic distances to root have been correlated with time, obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. -![**Figure 3.** Maximum-likelihood tree whith studied and context sequence. Studied sequence are represented in red.](`r params$tree_ml`) +![**Figure 4.** Scatterplot showing the relationship between root-to-tip distances and the number of days passed since the first sample. The red line show the linear model fit.](`r params$tempest`) + +### Nucleotide diversity comparison -In addition, phylogeny has been reconstructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data. +Nucleotide diversity has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*). -![**Figure 4.** Neighbour-joining tree for studied samples.](`r params$tree`) +The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]] `$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$. -With the neighbur-joining tree, partistic distances to root has been correlated with time obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year. +![**Figure 5.** Analysis of the nucleotide diversity (π). The orange line describe a normal distribution with the same mean and standard deviation as the distribution of π values obtained from $`r div_values[["boot.reps"]]`$ samples of $`r div_values[["sample.size"]]`$ sequences from the context. The red vertical line indicate the π value for the studied samples.](`r params$div`) -![**Figure 5.** Partistic distance to root in the neighbour-joining tree against days since first sample. Red line indicates the adjusted linear regression model.](`r params$tempest`) -## Characterizing within-host evolution +## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection ### Number of polymorphic sites -Sites with minor allele frequency > 0.05 are considered as polymorphic sites. +Sites with minor allele frequency > 0.05 are considered as polymorphic sites. Collection date explains the number of polymorphic sites with an $R^2$ of **`r nv.counts[["r2"]]`** with a p-value of $`r nv.counts[["value"]]`$. -![**Figure 6.** Number of polymorphisms over time for the studied samples.](`r params$fig_cor_snp`) +![**Figure 6.** Number of polymorphic sites depending on collection date. The red line shows the linear model fit.](`r params$fig_cor_snp`) ### Description for within-host nucleotide variants -`r n_SNV` different SNV and `r n_INDELS` INDELS had been detected along the genome. In the **Figure 7a** is shown their distribution along the genome and the variation in allele frequencies in the studied samples. +A total of `r n_SNV` different SNVs and `r n_INDELS` INDELs have been detected along the genome. **Figure 7** shows their distribution along the genome and the variation in allele frequencies in the studied samples. ::: {.panel-tabset} ## Whole genome -![**Figure 7a.** A\) Proportion of polymorphic sites along the genome calculated with 1000nt windows. B\) Nucleotide variants (NV) in the genomes of the studied samples. The shape and color indicates the type of NV. Samples are ordered by collection date.](`r params$SNV`) +![**Figure 7.** Summary of the intra-host accumulation of nucleotide variants (NV), using the reconstructed dataset ancestor as reference. A) Nucleotide variants per site along the SARS-CoV-2 genome. Relative abundance of NVs is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. Labels indicate the coding regions of the non structural proteins (NSP) within ORF1ab. B) Genome variation along the genome for each sample. The Y-axis displays samples in chronological order, with the earliest collection date at the bottom, and the latest, at the top.](`r params$SNV`) ## Spike sequence -![**Figure 7b.** A\) Proportion of polymorphic sites along the spike nucleotide sequence calculated with 1000nt windows. B\) Nucleotide variants (NV) in the spike sequences of the studied samples. The shape and color indicates the type of NV. Samples are ordered by collection date.](`r params$SNV_s`) +![**Figure 8.** Summary of the intra-host accumulation of nucleotide variants (NV) in the spike sequence, using the reconstructed dataset ancestor as reference. A) Nucleotide variants per site along the spike sequence. Relative abundance of NVs is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. B) Genome variation along the spike sequence for each sample. The Y-axis displays samples in chronological order, with the earliest collection date at the bottom, and the latest, at the top.](`r params$SNV_s`) ::: ### Time dependency for the within-host nucleotide variants The correlation with time is calculated for the allele frequency of each NV. -![**Figure 8.** *Pearson* r and adjusted p-value (-log10) for the correlation with time for each SNV. Red line indicates a p-value of 0.05.](`r params$volcano`) +![**Figure 9.** Pearson’s correlation coefficients and BH-adjusted p-values for all detected nucleotide variants. Red dashed line indicates adjusted p = 0.05. Labeled dots represent nucleotide variants correlated with time (adjusted p < 0.05).](`r params$volcano`) -The significant correlated NV are swown in detail: +The significantly correlated NV are shown in detail: -![**Figure 9.** Trend over time for the significantly correlated with time SNVs and for positions with more than 1 alternative allele. ](`r params$panel`) +![**Figure 10.** Time series of the relative allele frequencies for nucleotide variants (NV) with a significant time correlation, and for those that share their site in the genome. Each subplot depicts the progression of the allele frequencies in time for a given genome position.](`r params$panel`) ### Correlation between alternative alleles -To see posible interactions between mutations, pairwise correlation between the alternative frequencies are calculated. +To see possible interactions between mutations, pairwise correlation between the alternative frequencies are calculated. ```{r, echo= F,message= F, fig.align='center'} @@ -185,11 +184,11 @@ heatmaply_cor( column_text_angle = 90) ``` -![**Figure 10.** Heatmap for pairwise correlation between frquencies for alternative alleles.]() +![**Figure 10.** Hierarchically clustered heatmap of the pairwise Pearson’s correlation coefficients between the time series of allele frequencies in the case study.]() -### dn/ds over time +### Non-synonymous to synonymous rate ratio over time -![**Figure 11.** Sinonymous mutations per sinonymous site **(dS)** and non sinonymous mutations mutations per non sinonymous site **(dN)** over time. The values are adjusted by the allele frequency.](`r params$evo`) +![**Figure 11.** Time series of dN and dS. Each point corresponds to a different sample, sorted in chronological order.](`r params$evo`) diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk index d3f0759..f56d7ac 100644 --- a/workflow/rules/fetch.smk +++ b/workflow/rules/fetch.smk @@ -17,7 +17,7 @@ rule fetch_reference_gb: params: ref = config["ALIGNMENT_REFERENCE"] output: - fasta = temp(OUTDIR/"reference.gb") + fasta = OUTDIR/"reference.gb" log: LOGDIR / "fetch_reference_gb" / "log.txt" shell: @@ -61,6 +61,6 @@ rule fetch_problematic_vcf: log: LOGDIR / "fetch_problematic_vcf" / "log.txt" output: - temp(OUTDIR / "problematic_sites.vcf") + OUTDIR / "problematic_sites.vcf" shell: "curl {params.url} -o {output} -s 2> {log}" diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk index f138c31..cd9ca87 100644 --- a/workflow/rules/report.smk +++ b/workflow/rules/report.smk @@ -14,8 +14,8 @@ rule heatmap: rule window: conda: "../envs/biopython.yaml" params: - window = 1000, - step = 1 + window = config["WINDOW"]["WIDTH"], + step = config["WINDOW"]["STEP"] input: vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv", gb = OUTDIR/"reference.gb", @@ -71,7 +71,9 @@ rule general_NV_description: samples = expand("{sample}", sample = iter_samples()), design = config["PLOTS"], nsp = config["NSP"], - metadata = config["METADATA"] + metadata = config["METADATA"], + window = config["WINDOW"]["WIDTH"], + step = config["WINDOW"]["STEP"] input: window = OUTDIR/f"{OUTPUT_NAME}.window.csv", vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv" diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R index 6b72548..0a4b436 100644 --- a/workflow/scripts/report/NV_description.R +++ b/workflow/scripts/report/NV_description.R @@ -190,6 +190,9 @@ variants <- vcf %>% color = "Classification", alpha = "Frequency", fill = "Region" + ) + + guides( + fill = guide_legend(reverse = TRUE) ) @@ -325,7 +328,10 @@ variants_spike <- vcf %>% color = "Classification", alpha = "Frequency", fill = "Region" - ) + ) + + guides( + fill = guide_legend(reverse = TRUE) + ) @@ -352,7 +358,7 @@ ggsave( # Figure for nº of heterozygus sites for each sample log_info("Plotting nº of heterozygus sites for each sample") -figur_SNP_time <- vcf_snp %>% +figur_SNP_table <- vcf_snp %>% filter(ALT_FREQ <= 0.95) %>% left_join( read_csv(snakemake@params[["metadata"]]), @@ -363,7 +369,9 @@ figur_SNP_time <- vcf_snp %>% CollectionDate = min(as.Date(CollectionDate)), n = n_distinct(POS) ) %>% - ungroup() %>% + ungroup() + +figur_SNP_time <- figur_SNP_table %>% ggplot() + aes( x = CollectionDate, @@ -375,7 +383,6 @@ figur_SNP_time <- vcf_snp %>% alpha = 0.6 ) + geom_point() + - stat_cor(geom = "label") + labs( x = "Date", y = "Nº of polimorphic sites" @@ -457,10 +464,16 @@ n_indels <- vcf %>% length() n_snv <- length(unique(vcf$variant)) - n_indels +model <- lm(n ~ CollectionDate, data = figur_SNP_table) +cortest <- cor.test(figur_SNP_table$n, as.numeric(figur_SNP_table$CollectionDate)) list( "INDELS" = n_indels, - "SNV" = n_snv + "SNV" = n_snv, + "window" = snakemake@params[["window"]], + "step" = snakemake@params[["step"]], + "r2" = summary(model)$r.squared[[1]], + "value" = ifelse(cortest$p.value < 0.001, "< 0.001", cortest$p.value) ) %>% toJSON() %>% write(snakemake@output[["json"]]) diff --git a/workflow/scripts/report/diversity_plot.R b/workflow/scripts/report/diversity_plot.R index eb23c89..aeceee6 100644 --- a/workflow/scripts/report/diversity_plot.R +++ b/workflow/scripts/report/diversity_plot.R @@ -122,13 +122,14 @@ ggsave( ) # DATA JSON ##### +p.value <- ifelse(st$p.value >= 0.05, pvalue.norm,pvalue.emp) list.div <- list( - "diversity" = diversity, - "p.value" = ifelse(st$p.value >= 0.05, pvalue.norm,pvalue.emp), - "normal.pvalue" = st$p.value, - "norm.text" = ifelse(st$p.value >= 0.05, "","not"), - "type.test" = ifelse(st$p.value >= 0.05, "","empirical"), + "diversity" = format(diversity, scientific = TRUE), + "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"), + "normal.pvalue" = ifelse(st$p.value >= 0.001, p.value, "< 0.001"), + "norm.text" = ifelse(st$p.value >= 0.05, "", "not"), + "type.test" = ifelse(st$p.value >= 0.05, "", "empirical"), "boot.reps" = snakemake@params[["bootstrap_reps"]], "sample.size" = length(study_aln) ) diff --git a/workflow/scripts/report/evo_plots.R b/workflow/scripts/report/evo_plots.R index 6b82481..0b5ad84 100644 --- a/workflow/scripts/report/evo_plots.R +++ b/workflow/scripts/report/evo_plots.R @@ -77,7 +77,10 @@ plot <- vcf %>% ) + geom_point() + geom_line() + - scale_color_hue(labels = c("dN", "dS")) + + scale_color_manual( + labels = dnds.labels, + values = dnds.colors + ) + labs( y = "", x = "Time since first sample", diff --git a/workflow/scripts/report/phylo_plots.R b/workflow/scripts/report/phylo_plots.R index 1e3e996..068dd6d 100644 --- a/workflow/scripts/report/phylo_plots.R +++ b/workflow/scripts/report/phylo_plots.R @@ -16,21 +16,13 @@ log_threshold(INFO) source(snakemake@params[["design"]]) # legend thresholds for ml tree + legend.names <- c( tip_label = "Studied samples", - Bootstrap_pass = sprintf( - "bootstrap >= %s", - snakemake@params[["boot_th"]] - ), - alrt_pass = sprintf( - "aLRT >= %s", - snakemake@params[["alrt_th"]] - ), - boot_alrt_pass = sprintf( - "bootstrap >= %s & aLRT >= %s ", - snakemake@params[["boot_th"]], - snakemake@params[["alrt_th"]] - ) + boot_alrt_pass = sprintf("UFBoot ≥ %s%s & SH-aLRT ≥ %s%s ", + snakemake@params[["boot_th"]], "%", + snakemake@params[["alrt_th"]], "%" + ) ) @@ -53,8 +45,12 @@ study_names <- read.dna( snakemake@input[["study_fasta"]], format = "fasta", as.matrix = FALSE, - ) %>% - names() + ) + +study_names <- study_names[!startsWith(names(study_names), snakemake@config[["ALIGNMENT_REFERENCE"]])] + +study_names <- names(study_names) + # Obtain sample names ordered by CollectionDate date_order <- read_csv(snakemake@params[["metadata"]]) %>% @@ -212,9 +208,7 @@ aLRT.mask <- aLRT.values >= snakemake@params[["alrt_th"]] boot.mask <- bootstrap.values >= snakemake@params[["boot_th"]] node.color <- case_when( - aLRT.mask & boot.mask ~ "boot_alrt_pass", - !aLRT.mask & boot.mask ~ "Bootstrap_pass", - aLRT.mask & !boot.mask ~ "alrt_pass" + aLRT.mask & boot.mask ~ "boot_alrt_pass" ) # MRCA for studied samples @@ -222,26 +216,33 @@ study.mrca <- getMRCA(tree_ml, study_names) log_info("Plotting M-L tree with context samples") p <- ggtree(tree_ml, layout = "circular") + - geom_highlight(node = study.mrca, colour = "red", fill = "red", alpha = 0.2) + - geom_point( - aes( - color = c(tip.color, node.color), # First node points in the tree are tip points, then internal nodes - shape = c(rep("tip", length(tree_ml$tip.label)), rep("node", length(tree_ml$node.label))) - ), - show.legend = TRUE - ) + - geom_treescale(x = 0.0008) + - geom_rootedge(0.0005) + - xlim(-0.0008, NA) + - scale_shape_manual(values = c( - tip = 1, - node = 20 - ), guide = "none") + - labs(color = "Class") + - scale_color_manual(values = tree_colors, - na.value = NA, - labels = legend.names - ) + geom_highlight(node = study.mrca, colour = "red", fill = "red", alpha = 0) + + geom_point( + aes( + color = c(tip.color, node.color), # First node points in the tree are tip points, then internal nodes + size = c(rep("tip_label", length(tree_ml$tip.label)), rep("boot_alrt_pass", length(tree_ml$node.label))), + alpha = c(rep("tip_label", length(tree_ml$tip.label)), rep("boot_alrt_pass", length(tree_ml$node.label))) + ), + show.legend = TRUE + ) + + geom_treescale(x = 0.0008) + + geom_rootedge(0.0005) + + xlim(-0.0008, NA) + + scale_size_manual( + name = "Class", + values = node.size, + labels = legend.names + ) + + scale_alpha_manual( + name = "Class", + values = node.alpha, + labels = legend.names + ) + + labs(color = "Class") + + scale_color_manual(values = tree_colors, + na.value = NA, + labels = legend.names + ) ggsave( filename = snakemake@output[["tree_ml"]], @@ -268,10 +269,21 @@ tempest %>% # TEMPEST STATATS model <- lm(distance ~ date_interval, data = tempest) +# TREE STATS +study.node <- tree_ml$node.label[study.mrca - length(tip.color)] +monophyletic <- ifelse(is.monophyletic(tree_ml, study_names), "are", "are not") + + list( "sub_rate" = model$coefficients[[2]] * 365, "r2" = summary(model)$r.squared[[1]], - "pvalue" = cor.test(tempest$distance, tempest$date_interval)$p.value + "pvalue" = ifelse(cor.test(tempest$distance, tempest$date_interval)$p.value < 0.001, + "< 0.001", + cor.test(tempest$distance, tempest$date_interval)$p.value + ), + "boot" = strsplit(study.node, "/")[[1]][2] %>% as.numeric(), + "alrt" = strsplit(study.node, "/")[[1]][1] %>% as.numeric(), + "monophyly"= monophyletic ) %>% toJSON() %>% write(snakemake@output[["json"]]) diff --git a/workflow/scripts/report/snp_plots.R b/workflow/scripts/report/snp_plots.R index d895614..b08c208 100644 --- a/workflow/scripts/report/snp_plots.R +++ b/workflow/scripts/report/snp_plots.R @@ -6,7 +6,9 @@ library(ggrepel) library(logger) log_threshold(INFO) - +# Get colors +color = grDevices::colors()[grep('gr(a|e)y', grDevices::colors(), invert = TRUE)] +color = color[grep('white', color, invert = TRUE)] # Import file with plots style source(snakemake@params[["design"]]) @@ -195,7 +197,7 @@ panel <- vcf %>% y = ALT_FREQ, color = variant ) + - scale_color_viridis_d() + + scale_color_manual(values = sample(color, length(subset))) + geom_point() + geom_line() + theme(