Merge branch 'dev' into feature-minorrev

PathoGenOmics-Lab · Oct 11, 2023 · 0a25fbb · 0a25fbb
2 parents 5863d1b + 6a0cc80
commit 0a25fbb
Show file tree

Hide file tree

Showing 11 changed files with 162 additions and 110 deletions.
diff --git a/config/config.yaml b/config/config.yaml
@@ -24,6 +24,9 @@ DEMIX:
   MIN_QUALITY: 20
   COV_CUTOFF: 30
   MIN_ABUNDANCE: 0.0001
+WINDOW:
+  WIDTH: 1000
+  STEP: 50
 GISAID_YAML:
   "config/gisaid.yaml"
 DIVERSITY_BOOTSTRAP_REPS:

diff --git a/config/design_plots.R b/config/design_plots.R
@@ -22,8 +22,7 @@ theme_update(text = element_text(size = 16,  family = "Montserrat"),
 
 
 
-
-# AJUSTES ####
+# CONFIG
 
 
 gene_colors = c(M = "#B4D4B4",
@@ -37,28 +36,27 @@ gene_colors = c(M = "#B4D4B4",
                 ORF7 = "#AA88CB",
                 ORF10 = "#CACB5D")
 
-snp_colors = c("C-26029-A" = "#e69138",
-               "G-21985-T" = "#ffd966",
-               "C-29421-T" = "#935fca",
-               "C-23043-T" = "#7c1d6f",
-               "C-28854-T" = "#ff4639"
-               ,"C-5178-T" = "#d5a6bd"
-               ,"C-4230-A" = "#1e3f9d"
-               ,"C-16375-T" = "#1EA0AE"
-               , "A-11923-C" = "#64acee"
-               ,"G-12761-A" = "#084c61"
-               ,"C-6033-T" = "chartreuse4"
-               ,"C-4230-T" = "#93c47d",
-               "Other" = "gray40")
-
 
+# M-L tree colors and labels
 tree_colors = c(
-    tip_label = "blue",
-    Bootstrap_pass = "#ff6600",
-    alrt_pass = "#ffbf51",
-    boot_alrt_pass = "red"
+  tip_label = "#D944AA",
+  boot_alrt_pass = "#64acee"
+)
+
+
+node.size <- c(
+  tip_label = 2,
+  boot_alrt_pass = 0.8
+)
+
+node.alpha <- c(
+  tip_label = 0.6,
+  boot_alrt_pass = 0.7
 )
 
+
+# Nucleotide variants classification colors and labels
+
 NV_colors <- c(
     Frameshift = "#568D63",
     "In frame" = "black",
@@ -73,4 +71,15 @@ NV_names <- c(
     Intergenic = "Intergenic",
     No = "Non synonymous",
     yes = "Synonymous"
-)
+)
+
+# dn ds colors and labels
+dnds.labels <- c(
+  dn = "dN",
+  ds = "dS"
+)
+
+dnds.colors <- c(
+  dn = "#E53E47",
+  ds = "#2C47F5"
+)
diff --git a/config/report.styles.css b/config/report.styles.css
@@ -8,6 +8,14 @@
     background-origin: content-box; 
   } 
 
+.title {
+  font-family: Montserrat;
+  color: #FFFFFF
+}
+.quarto-title p {
+  font-family: Montserrat;
+  color: #FFFFFF
+}
 html {
   font-family: Montserrat;
   color: #000000;

diff --git a/template.qmd b/template.qmd
@@ -1,7 +1,6 @@
 ---
-title: "Case-study-SARS-CoV-2" 
-author: "Jordi Sevilla"
-institute: "PathoGenOmics-lab"
+title: "VIPERA report" 
+subtitle: "Workflow: `r params$workflow_version`"
 date: last-modified
 date-format: "DD-MM-YYYY"
 title-block-banner: true
@@ -44,7 +43,6 @@ output-file: report.html
   <link href="https://fonts.googleapis.com/css2?family=Montserrat&display=swap" rel="stylesheet">
 </head>
 
-> Workflow: `r params$workflow_version`
 
 ```{r , echo= F,message= F, include = F}
 library(jsonlite)
@@ -86,7 +84,7 @@ cor.mat <- cor(vcf)
 cor.mat[is.na(cor.mat)] <- 0
 ```
 
-## Samples summary
+## Summary of the target samples dataset 
 ```{r, echo= F,message= F}
 gt(table) %>%
 cols_label(
@@ -109,72 +107,73 @@ opt_table_font(
 )
 
 ```
-## Evidence for chronic infection 
+## Evidence for single, serially-sampled infection
 
 
 
 ### Lineage admixture
-The most probably lineage admixture for each sample has been calculated with software [Freyja](https://github.com/andersen-lab/Freyja).
+The most probable lineage admixture for each sample has been calculated using the software [Freyja](https://github.com/andersen-lab/Freyja).
 
-![**Figure 1.** Lineage composition of samples according to Freyja. Samples are ordered by collection date.](`r params$freyja`)
+![**Figure 1.** Lineage admixture calculated with Freyja for each sample. SSamples in the X-axis are ordered chronologically, from more ancient to newer.](`r params$freyja`)
 
-### Nucleotide diversity comparison
+### Phylogeny and temporal signal
 
-Nucleotide diversity is has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*).
+A maximum likelihood tree has been adjusted with the context sequences. The studied samples `r stats[["monophyly"]]` monophyletic. The clade that contains all the studied samples is supported by a **UFBoot** score of $`r stats[["boot"]]`$% and a **SH-aLRT** score of  $`r stats[["alrt"]]`$%.
 
-The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]]`$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$.
+![**Figure 2.** Maximum-likelihood phylogeny with 1000 support replicates of target datasets with their context samples. The clade that contains the studied samples is squared in red.](`r params$tree_ml`)
 
-![**Figure 2.** Distribution of nucleotide diversity pi for 1000 randomly selected groups from context sequences. Red line indicates the pi value for the studied samples.](`r params$div`)
+In addition, a neighbor-joining tree has been constructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data.
 
-### Phylogeny and temporal signal
+![**Figure 3.** Neighbor-joining trees constructed with the pairwise allele frequency-weighted distances.](`r params$tree`)
 
-A maximum likelihood tree has been adjusted with the context sequences:
+With the neighbor-joining tree, patristic distances to root have been correlated with time, obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year.
 
-![**Figure 3.** Maximum-likelihood tree whith studied and context sequence. Studied sequence are represented in red.](`r params$tree_ml`)
+![**Figure 4.** Scatterplot showing the relationship between root-to-tip distances and the number of days passed since the first sample. The red line show the linear model fit.](`r params$tempest`)
+
+### Nucleotide diversity comparison
 
-In addition, phylogeny has been reconstructed using pairwise distances between target samples taking into account allele frequencies detected in sequencing data.
+Nucleotide diversity has been calculated for $`r div_values[["boot.reps"]]`$ bootstrap replicates of $`r div_values[["sample.size"]]`$ samples in the context data. The distribution for the nuclotide diversity is assumed to `r div_values[["norm.text"]]` be normal with a p-value = $`r div_values[["normal.pvalue"]]`$ (*Shapiro-test*).
 
-![**Figure 4.** Neighbour-joining tree for studied samples.](`r params$tree`)
+The nucleotide diversity value for the studied samples is $`r div_values[["diversity"]] `$. Assuming the context samples to be patient-independent, the `r div_values[["type.test"]]` p-value for a reduced nucleotide diversity in the studied samples is $`r div_values[["p.value"]]`$.
 
-With the neighbur-joining tree, partistic distances to root has been correlated with time obtaining a $R^2$ of **`r correlation`** with a p-value of $`r p_value_lm`$. The estimated substitution rate is **`r sub_rate`** substitutions per year.
+![**Figure 5.** Analysis of the nucleotide diversity (π). The orange line describe a normal distribution with the same mean and standard deviation as the distribution of π values obtained from $`r div_values[["boot.reps"]]`$ samples of $`r div_values[["sample.size"]]`$ sequences from the context. The red vertical line indicate the π value for the studied samples.](`r params$div`)
 
-![**Figure 5.** Partistic distance to root in the neighbour-joining tree against days since first sample. Red line indicates the adjusted linear regression model.](`r params$tempest`)
 
-## Characterizing within-host evolution
+## Evolutionary trajectory of the serially-sampled SARS-CoV-2 infection
 
 ### Number of polymorphic sites
 
-Sites with minor allele frequency > 0.05  are considered as polymorphic sites.
+Sites with minor allele frequency > 0.05  are considered as polymorphic sites. Collection date explains the number of polymorphic sites with an $R^2$ of **`r nv.counts[["r2"]]`** with a p-value of $`r nv.counts[["value"]]`$.
 
-![**Figure 6.** Number of polymorphisms over time for the studied samples.](`r params$fig_cor_snp`)
+![**Figure 6.**  Number of polymorphic sites depending on collection date. The red line shows the linear model fit.](`r params$fig_cor_snp`)
 
 ### Description for within-host nucleotide variants 
 
-`r n_SNV` different SNV and `r n_INDELS` INDELS had been detected along the genome. In the **Figure 7a** is shown their distribution along the genome and the variation in allele frequencies in the studied samples.
+A total of `r n_SNV` different SNVs and `r n_INDELS` INDELs have been detected along the genome. **Figure 7** shows their distribution along the genome and the variation in allele frequencies in the studied samples.
 
 ::: {.panel-tabset}
 
 ## Whole genome
-![**Figure 7a.** A\) Proportion of polymorphic sites along the genome calculated with 1000nt windows. B\) Nucleotide variants (NV) in the genomes of the studied samples. The shape and color indicates the type of NV. Samples are ordered by collection date.](`r params$SNV`)
+![**Figure 7.** Summary of the intra-host accumulation of nucleotide variants (NV), using the reconstructed dataset ancestor as reference. A) Nucleotide variants per site along the SARS-CoV-2 genome. Relative abundance of NVs is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. Labels indicate the coding regions of the non structural proteins (NSP) within ORF1ab. B) Genome variation along the genome for each sample. The Y-axis displays samples in chronological order, with the earliest collection date at the bottom, and the latest, at the top.](`r params$SNV`)
 
 ## Spike sequence
 
-![**Figure 7b.** A\) Proportion of polymorphic sites along the spike nucleotide sequence calculated with 1000nt windows. B\) Nucleotide variants (NV) in the spike sequences of the studied samples. The shape and color indicates the type of NV. Samples are ordered by collection date.](`r params$SNV_s`)
+![**Figure 8.** Summary of the intra-host accumulation of nucleotide variants (NV) in the spike sequence, using the reconstructed dataset ancestor as reference. A) Nucleotide variants per site along the spike sequence. Relative abundance of NVs is calculated with a sliding window of width $`r nv.counts[["window"]]`$ nucleotides and a step of $`r nv.counts[["step"]]`$. B) Genome variation along the spike sequence for each sample. The Y-axis displays samples in chronological order, with the earliest collection date at the bottom, and the latest, at the top.](`r params$SNV_s`)
 :::
 
 ### Time dependency for the within-host nucleotide variants
 
 The correlation with time is calculated for the allele frequency of each NV.
 
-![**Figure 8.** *Pearson* r and adjusted p-value (-log10) for the correlation with time for each SNV. Red line indicates a p-value of 0.05.](`r params$volcano`)
+![**Figure 9.** Pearson’s correlation coefficients and BH-adjusted p-values for all detected nucleotide variants. Red dashed line indicates adjusted p = 0.05. Labeled dots represent nucleotide variants correlated with time (adjusted p < 0.05).](`r params$volcano`)
 
-The significant correlated NV are swown in detail:
+The significantly correlated NV are shown in detail:
 
-![**Figure 9.** Trend over time for the significantly correlated with time SNVs and for positions with more than 1 alternative allele. ](`r params$panel`)
+![**Figure 10.** Time series of the relative allele frequencies for nucleotide variants (NV) with a significant time correlation, and for those that share their site in the genome. Each subplot depicts the progression of the allele frequencies in time for a given genome position.](`r params$panel`)
 
 ### Correlation between alternative alleles
 
-To see posible interactions between mutations, pairwise correlation between the alternative frequencies are calculated.
+To see possible interactions between mutations, pairwise correlation between the alternative frequencies are calculated.
 
 ```{r, echo= F,message= F, fig.align='center'}
 
@@ -185,11 +184,11 @@ heatmaply_cor(
   column_text_angle = 90)
 
 ```
-![**Figure 10.** Heatmap for pairwise correlation between frquencies for alternative alleles.]()
+![**Figure 10.** Hierarchically clustered heatmap of the pairwise Pearson’s correlation coefficients between the time series of allele frequencies in the case study.]()
 
-### dn/ds over time
+### Non-synonymous to synonymous rate ratio over time
 
-![**Figure 11.** Sinonymous mutations per sinonymous site **(dS)** and non sinonymous mutations mutations per non sinonymous site **(dN)** over time. The values are adjusted by the allele frequency.](`r params$evo`)
+![**Figure 11.** Time series of dN and dS. Each point corresponds to a different sample, sorted in chronological order.](`r params$evo`)
 
 
 
diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk
@@ -17,7 +17,7 @@ rule fetch_reference_gb:
     params:
         ref = config["ALIGNMENT_REFERENCE"]
     output:
-        fasta = temp(OUTDIR/"reference.gb")
+        fasta = OUTDIR/"reference.gb"
     log:
         LOGDIR / "fetch_reference_gb" / "log.txt"
     shell:
@@ -61,6 +61,6 @@ rule fetch_problematic_vcf:
     log:
         LOGDIR / "fetch_problematic_vcf" / "log.txt"
     output:
-        temp(OUTDIR / "problematic_sites.vcf")
+        OUTDIR / "problematic_sites.vcf"
     shell:
         "curl {params.url} -o {output} -s 2> {log}"
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
@@ -14,8 +14,8 @@ rule heatmap:
 rule window:
     conda: "../envs/biopython.yaml"
     params:
-        window = 1000,
-        step = 1
+        window = config["WINDOW"]["WIDTH"],
+        step = config["WINDOW"]["STEP"]
     input:
         vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
         gb = OUTDIR/"reference.gb",
@@ -71,7 +71,9 @@ rule general_NV_description:
         samples = expand("{sample}", sample = iter_samples()),
         design = config["PLOTS"],
         nsp = config["NSP"],
-        metadata = config["METADATA"]
+        metadata = config["METADATA"],
+        window = config["WINDOW"]["WIDTH"],
+        step = config["WINDOW"]["STEP"]
     input:
         window = OUTDIR/f"{OUTPUT_NAME}.window.csv",
         vcf =  OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv"

diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R
@@ -190,6 +190,9 @@ variants <- vcf %>%
     color = "Classification",
     alpha = "Frequency",
     fill = "Region"
+    ) +
+    guides(
+      fill = guide_legend(reverse = TRUE)
     )
 
 
@@ -325,7 +328,10 @@ variants_spike <- vcf %>%
       color = "Classification",
       alpha = "Frequency",
       fill = "Region"
-      )
+      ) +
+    guides(
+      fill = guide_legend(reverse = TRUE)
+    )
 
 
 
@@ -352,7 +358,7 @@ ggsave(
 
 # Figure for nº of heterozygus sites for each sample
 log_info("Plotting nº of heterozygus sites for each sample")
-figur_SNP_time <- vcf_snp %>%
+figur_SNP_table <- vcf_snp %>%
   filter(ALT_FREQ <= 0.95) %>%
   left_join(
     read_csv(snakemake@params[["metadata"]]),
@@ -363,7 +369,9 @@ figur_SNP_time <- vcf_snp %>%
     CollectionDate = min(as.Date(CollectionDate)),
     n = n_distinct(POS)
     ) %>%
-  ungroup() %>%
+  ungroup()
+
+figur_SNP_time <- figur_SNP_table %>%
   ggplot() +
   aes(
     x = CollectionDate,
@@ -375,7 +383,6 @@ figur_SNP_time <- vcf_snp %>%
     alpha = 0.6
     ) +
   geom_point() +
-  stat_cor(geom = "label") +
   labs(
     x = "Date",
     y = "Nº of polimorphic sites"
@@ -457,10 +464,16 @@ n_indels <- vcf %>%
   length()
 
 n_snv <- length(unique(vcf$variant)) - n_indels
+model <- lm(n ~ CollectionDate, data = figur_SNP_table)
+cortest <- cor.test(figur_SNP_table$n, as.numeric(figur_SNP_table$CollectionDate))
 
 list(
   "INDELS" = n_indels,
-  "SNV" = n_snv
+  "SNV" = n_snv,
+  "window" = snakemake@params[["window"]],
+  "step"   = snakemake@params[["step"]],
+  "r2"     = summary(model)$r.squared[[1]],
+  "value"  = ifelse(cortest$p.value < 0.001, "< 0.001", cortest$p.value)
 ) %>%
 toJSON() %>%
 write(snakemake@output[["json"]])
diff --git a/workflow/scripts/report/diversity_plot.R b/workflow/scripts/report/diversity_plot.R
@@ -122,13 +122,14 @@ ggsave(
 )
 
 # DATA JSON #####
+p.value <- ifelse(st$p.value >= 0.05, pvalue.norm,pvalue.emp)
 
 list.div <- list(
-                "diversity" = diversity,
-                "p.value" = ifelse(st$p.value >= 0.05, pvalue.norm,pvalue.emp),
-                "normal.pvalue" = st$p.value,
-                "norm.text" = ifelse(st$p.value >= 0.05, "","not"),
-                "type.test" = ifelse(st$p.value >= 0.05, "","empirical"),
+                "diversity" = format(diversity, scientific = TRUE),
+                "p.value" = ifelse(p.value >= 0.001, p.value, "< 0.001"),
+                "normal.pvalue" = ifelse(st$p.value >= 0.001, p.value, "< 0.001"),
+                "norm.text" = ifelse(st$p.value >= 0.05, "", "not"),
+                "type.test" = ifelse(st$p.value >= 0.05, "", "empirical"),
                 "boot.reps" = snakemake@params[["bootstrap_reps"]],
                 "sample.size" = length(study_aln)
 )

diff --git a/workflow/scripts/report/evo_plots.R b/workflow/scripts/report/evo_plots.R
@@ -77,7 +77,10 @@ plot <- vcf %>%
   ) +
   geom_point() +
   geom_line() +
-  scale_color_hue(labels = c("dN", "dS")) +
+  scale_color_manual(
+    labels = dnds.labels,
+    values = dnds.colors
+    ) +
   labs(
     y = "",
     x = "Time since first sample",