Merge pull request #19 from PathoGenOmics-Lab/bug-solving

Bug solving
PathoGenOmics-Lab · Oct 2, 2023 · 414ccc8 · 414ccc8
2 parents 3e488e1 + 98eee45
commit 414ccc8
Show file tree

Hide file tree

Showing 8 changed files with 105 additions and 19 deletions.
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
@@ -14,7 +14,8 @@ rule heatmap:
 rule window:
     conda: "../envs/biopython.yaml"
     params:
-        step = 1000
+        window = 1000,
+        step = 1
     input:
         vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
         gb = OUTDIR/"reference.gb"
@@ -66,6 +67,7 @@ rule freyja_plot:
 rule general_NV_description:
     conda: "../envs/renv.yaml"
     params:
+        samples = expand("{sample}", sample = iter_samples()),
         design = config["PLOTS"],
         nsp = config["NSP"],
         metadata = config["METADATA"]

diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk
@@ -52,6 +52,19 @@ rule snps_to_ancestor:
         sed 's/'$ref'/'{wildcards.sample}'/g' {wildcards.sample}.tsv | cat > {output.tsv}
         """
 
+rule annotation:
+    threads:1
+    conda: "../envs/biopython.yaml"
+    shadow: "shallow"
+    input:
+        gb = OUTDIR/"reference.gb",
+        ref = OUTDIR/"reference.fasta"
+    output:
+        df = temp(OUTDIR/"annotation.csv")
+    log:
+        LOGDIR / "annotation" / "log.txt"
+    script:
+        "../scripts/report/get_annotation.py"
 
 rule format_tsv:
     threads:1
@@ -94,7 +107,8 @@ rule filter_tsv:
     threads: 1
     conda: "../envs/renv.yaml"
     input: 
-        tsv = OUTDIR/f"{OUTPUT_NAME}.masked.tsv"
+        tsv = OUTDIR/f"{OUTPUT_NAME}.masked.tsv",
+        annotation = OUTDIR/"annotation.csv"
     output:
         filtered_tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv"
     log:

diff --git a/workflow/scripts/filter_tsv.R b/workflow/scripts/filter_tsv.R
@@ -43,7 +43,17 @@ data <- mutate(
         TRUE             ~  "No"
     )
 )
+# Remove duplicated features
 
+data <- distinct(data, pick(!GFF_FEATURE), .keep_all = TRUE)
+
+# Change annotation to gb2seq annotation
+features <- read_csv(snakemake@input[["annotation"]])
+
+data <- data %>% 
+select(!GFF_FEATURE) %>%
+left_join(features) %>%
+rename(GFF_FEATURE = GEN)
 
 log_info("Saving results")
 write_tsv(

diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R
@@ -51,6 +51,7 @@ window <- read_csv(snakemake@input[["window"]])
 
 # Obtain sample names ordered by CollectionDate
 date_order <- read_csv(snakemake@params[["metadata"]]) %>%
+  filter(ID %in% snakemake@params[["samples"]]) %>%
   arrange(CollectionDate) %>%
   pull(ID) %>%
   unique()
@@ -102,14 +103,13 @@ vcf <- vcf %>%
       TRUE ~ "SNP"
       ),
     Class = case_when(
-      is.na(GFF_FEATURE) ~ "Intergenic",
+      GFF_FEATURE == "Intergenic" ~ "Intergenic",
       TRUE ~ synonimous
       ),
     POS = as.numeric(POS)
     ) %>%
   rowwise() %>%
   mutate(
-    gene = as.character(window[window$position == POS, "gen"]),
     indel_len = case_when(
       NV_class == "INDEL" &
         str_detect(SNP, fixed("--")) ~
@@ -119,7 +119,7 @@ vcf <- vcf %>%
         str_length(strsplit(SNP, "-+")[[1]][2])
       ),
     indel_class = case_when(
-      gene == "Intergenic" ~ "Intergenic",
+      GFF_FEATURE == "Intergenic" ~ "Intergenic",
       NV_class == "INDEL" &
         indel_len %% 3 == 0 ~ "In frame",
       NV_class == "INDEL" &
@@ -129,7 +129,7 @@ vcf <- vcf %>%
   ungroup() %>%
   mutate(
     group = case_when(
-      gene == "Intergenic" ~ "Intergenic",
+      GFF_FEATURE == "Intergenic" ~ "Intergenic",
       NV_class == "SNP" ~ Class,
       NV_class == "INDEL" ~ indel_class
       )
@@ -190,6 +190,7 @@ variants <- vcf %>%
     labels = NV_names,
     values = NV_colors
     ) +
+  scale_y_discrete(drop = FALSE) +
   labs(
     x = "SARS-CoV-2 genome position",
     y = "Sample",
@@ -308,7 +309,7 @@ window_plot_spike <- window %>%
 variants_spike <- vcf %>%
   filter(
     ALT_FREQ > 0,
-      POS %in% spike.pos
+      POS %in% c(min(spike.pos):max(spike.pos))
       ) %>%
   ggplot() +
   aes(
@@ -324,6 +325,7 @@ variants_spike <- vcf %>%
     labels = NV_names,
     values  = NV_colors
     ) +
+  scale_y_discrete(drop = FALSE) +
     labs(
       x = "SARS-CoV-2 genome position",
       y = "Sample",
@@ -334,6 +336,7 @@ variants_spike <- vcf %>%
       )
 
 
+
 figura_spike <- ggarrange(
   window_plot_spike,
   variants_spike,
@@ -368,7 +371,7 @@ figur_SNP_time <- vcf_snp %>%
   group_by(REGION) %>%
   summarise(
     CollectionDate = min(as.Date(CollectionDate)),
-    n = n()
+    n = n_distinct(POS)
     ) %>%
   ungroup() %>%
   ggplot() +
@@ -447,7 +450,7 @@ vcf_snp %>%
   group_by(REGION) %>%
   summarise(
     CollectionDate = min(as.Date(CollectionDate)),
-    n_PolymorphicSites = n()
+    n_PolymorphicSites = n_distinct(POS)
     ) %>%
   ungroup() %>%
   rename(sample = REGION) %>%
@@ -457,7 +460,12 @@ vcf_snp %>%
 
 # STATS FOR REPORTING
 
-n_indels <- filter(vcf, NV_class == "INDEL") %>% length()
+n_indels <- vcf %>%
+  filter(NV_class == "INDEL") %>%
+  pull(SNP) %>%
+  unique() %>%
+  length()
+
 n_snv <- length(unique(vcf$SNP)) - n_indels
 
 list(

diff --git a/workflow/scripts/report/get_annotation.py b/workflow/scripts/report/get_annotation.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+import logging
+import pandas as pd
+from gb2seq.features import Features
+from Bio import SeqIO
+
+replace_terry = {
+    "ORF1ab polyprotein": "orf1ab",
+    "ORF1a polyprotein": "orf1ab",
+    "surface glycoprotein": "S",
+    "ORF3a protein": "ORF3a",
+    "envelope protein": "E",
+    "membrane glycoprotein": "M",
+    "ORF6 protein": "ORF6",
+    "ORF7a protein": "ORF7",
+    "ORF7b": "ORF7",
+    "ORF8 protein": "ORF8",
+    "nucleocapsid phosphoprotein": "N",
+    "ORF10 protein": "ORF10"
+}
+
+def main():
+    logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+    ft = Features(snakemake.input.gb)
+
+    reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq)
+
+    positions = [x for x in range(len(reference))]
+
+    genes = []
+    for pos in positions:
+
+        if len(ft.getFeatureNames(pos)) == 0:
+            genes.append("Intergenic")
+        else:
+            genes.append(replace_terry[list(ft.getFeatureNames(pos))[0]])
+
+    df = pd.DataFrame({"POS":positions, "GEN": genes})
+
+    df.to_csv(snakemake.output.df,index= False)
+
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/workflow/scripts/report/heatmap.R b/workflow/scripts/report/heatmap.R
@@ -17,7 +17,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
 # Create SNP variable and select useful variables from vcf
 vcf <- vcf %>%
   mutate(
-    GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
     SNP = case_when(
       !is.na(REF_AA) ~ paste(
         GFF_FEATURE,

diff --git a/workflow/scripts/report/snp_plots.R b/workflow/scripts/report/snp_plots.R
@@ -37,7 +37,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
 # Simplify features names and create SNP variable
 vcf <- vcf %>%
   mutate(
-    GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
     SNP = case_when(
       !is.na(REF_AA) ~ paste(
         GFF_FEATURE,
@@ -47,6 +46,13 @@ vcf <- vcf %>%
         ALT_AA,
         sep = ""
         ),
+      GFF_FEATURE != "Intergenic" ~ paste(
+        GFF_FEATURE,
+        ":",
+        POS - 1,
+        "-",
+        ALT
+      ),
       TRUE ~ paste(
         REF,
         POS,
@@ -250,7 +256,7 @@ ggsave(
 # PLOT TABLES ####
 
 log_info("Saving coorelation table")
-cor.df %>%
+cor.df.fill %>%
   transmute(
     NV = snp,
     PearsonCor = cor,

diff --git a/workflow/scripts/window.py b/workflow/scripts/window.py
@@ -27,7 +27,7 @@ def get_polimorphic_sites(df:pd.DataFrame) -> set:
     return set(df.POS)
 
 
-def window_calculation(sites:set, step:int, genome_size:int, coord:str) -> pd.DataFrame:
+def window_calculation(sites:set, window:int, step:int, genome_size:int, coord:str) -> pd.DataFrame:
 
     ft = Features(coord) # Create Features object to obtain annotations
 
@@ -36,20 +36,20 @@ def window_calculation(sites:set, step:int, genome_size:int, coord:str) -> pd.Da
     genes = []
     lim_sup = genome_size + 1
 
-    for position in range(1, lim_sup):
+    for position in range(1, lim_sup, step):
 
         if len(ft.getFeatureNames(position)) == 0:
             genes.append("Intergenic")
         else:
             genes.append(list(ft.getFeatureNames(position))[0])
 
         # Add percent (excluding initial and final positions)
-        if position - step not in range(1, lim_sup):
+        if position - window not in range(1, lim_sup):
             pct.append(0)
         else:
             # Calculate nº of polimorphisms in the window
-            num_snp = len([ x for x in sites if x in range(position - step, position + 1) ]) # Calculate nº of polimorphisms in the window
-            pct.append(num_snp / step)
+            num_snp = len([ x for x in sites if x in range(position - window, position + 1) ]) # Calculate nº of polimorphisms in the window
+            pct.append(num_snp / window)
 
         positions.append(position)
 
@@ -66,7 +66,7 @@ def main():
 
     logging.info("Sliding window calculation of proportion of polimorphic sites")
 
-    frame = window_calculation(sites, snakemake.params.step, 29903, snakemake.input.gb)
+    frame = window_calculation(sites, snakemake.params.window, snakemake.params.step, 29903, snakemake.input.gb)
 
     logging.info("Saving results")
     frame.replace(replace_terry, inplace = True)