Add gb2seq annotation to all analyses

PathoGenOmics-Lab · Sep 27, 2023 · 78830fe · 78830fe
1 parent 8e3c1aa
commit 78830fe
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 8 deletions.
diff --git a/workflow/scripts/filter_tsv.R b/workflow/scripts/filter_tsv.R
@@ -47,6 +47,14 @@ data <- mutate(
 
 data <- distinct(data, pick(!GFF_FEATURE), .keep_all = TRUE)
 
+# Change annotation to gb2seq annotation
+features <- read_csv(snakemake@input[["annotation"]])
+
+data <- data %>% 
+select(!GFF_FEATURE) %>%
+left_join(features) %>%
+rename(GFF_FEATURE = GEN)
+
 log_info("Saving results")
 write_tsv(
     data,

diff --git a/workflow/scripts/report/NV_description.R b/workflow/scripts/report/NV_description.R
@@ -103,14 +103,13 @@ vcf <- vcf %>%
       TRUE ~ "SNP"
       ),
     Class = case_when(
-      is.na(GFF_FEATURE) ~ "Intergenic",
+      GFF_FEATURE == "Intergenic" ~ "Intergenic",
       TRUE ~ synonimous
       ),
     POS = as.numeric(POS)
     ) %>%
   rowwise() %>%
   mutate(
-    gene = as.character(window[window$position == POS, "gen"]),
     indel_len = case_when(
       NV_class == "INDEL" &
         str_detect(SNP, fixed("--")) ~
@@ -120,7 +119,7 @@ vcf <- vcf %>%
         str_length(strsplit(SNP, "-+")[[1]][2])
       ),
     indel_class = case_when(
-      gene == "Intergenic" ~ "Intergenic",
+      GFF_FEATURE == "Intergenic" ~ "Intergenic",
       NV_class == "INDEL" &
         indel_len %% 3 == 0 ~ "In frame",
       NV_class == "INDEL" &
@@ -130,7 +129,7 @@ vcf <- vcf %>%
   ungroup() %>%
   mutate(
     group = case_when(
-      gene == "Intergenic" ~ "Intergenic",
+      GFF_FEATURE == "Intergenic" ~ "Intergenic",
       NV_class == "SNP" ~ Class,
       NV_class == "INDEL" ~ indel_class
       )
@@ -371,7 +370,7 @@ figur_SNP_time <- vcf_snp %>%
   group_by(REGION) %>%
   summarise(
     CollectionDate = min(as.Date(CollectionDate)),
-    n = n()
+    n = n_distinct(POS)
     ) %>%
   ungroup() %>%
   ggplot() +
@@ -450,7 +449,7 @@ vcf_snp %>%
   group_by(REGION) %>%
   summarise(
     CollectionDate = min(as.Date(CollectionDate)),
-    n_PolymorphicSites = n()
+    n_PolymorphicSites = n_distinct(POS)
     ) %>%
   ungroup() %>%
   rename(sample = REGION) %>%

diff --git a/workflow/scripts/report/get_annotation.py b/workflow/scripts/report/get_annotation.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+
+import logging
+import pandas as pd
+from gb2seq.features import Features
+from Bio import SeqIO
+
+replace_terry = {
+    "ORF1ab polyprotein": "orf1ab",
+    "ORF1a polyprotein": "orf1ab",
+    "surface glycoprotein": "S",
+    "ORF3a protein": "ORF3a",
+    "envelope protein": "E",
+    "membrane glycoprotein": "M",
+    "ORF6 protein": "ORF6",
+    "ORF7a protein": "ORF7",
+    "ORF7b": "ORF7",
+    "ORF8 protein": "ORF8",
+    "nucleocapsid phosphoprotein": "N",
+    "ORF10 protein": "ORF10"
+}
+
+def main():
+    logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+    ft = Features(snakemake.input.gb)
+
+    reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq)
+
+    positions = [x for x in range(len(reference))]
+
+    genes = []
+    for pos in positions:
+
+        if len(ft.getFeatureNames(pos)) == 0:
+            genes.append("Intergenic")
+        else:
+            genes.append(replace_terry[list(ft.getFeatureNames(pos))[0]])
+
+    df = pd.DataFrame({"POS":positions, "GEN": genes})
+
+    df.to_csv(snakemake.output.df,index= False)
+
+
+if __name__ == "__main__":
+    main()
+
+
diff --git a/workflow/scripts/report/heatmap.R b/workflow/scripts/report/heatmap.R
@@ -17,7 +17,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
 # Create SNP variable and select useful variables from vcf
 vcf <- vcf %>%
   mutate(
-    GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
     SNP = case_when(
       !is.na(REF_AA) ~ paste(
         GFF_FEATURE,

diff --git a/workflow/scripts/report/snp_plots.R b/workflow/scripts/report/snp_plots.R
@@ -37,7 +37,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
 # Simplify features names and create SNP variable
 vcf <- vcf %>%
   mutate(
-    GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
     SNP = case_when(
       !is.na(REF_AA) ~ paste(
         GFF_FEATURE,
@@ -47,6 +46,13 @@ vcf <- vcf %>%
         ALT_AA,
         sep = ""
         ),
+      GFF_FEATURE != "Intergenic" ~ paste(
+        GFF_FEATURE,
+        ":",
+        POS - 1,
+        "-",
+        ALT
+      ),
       TRUE ~ paste(
         REF,
         POS,