Skip to content

Commit

Permalink
Add gb2seq annotation to all analyses
Browse files Browse the repository at this point in the history
  • Loading branch information
SeviJordi committed Sep 27, 2023
1 parent 8e3c1aa commit 78830fe
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 8 deletions.
8 changes: 8 additions & 0 deletions workflow/scripts/filter_tsv.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ data <- mutate(

data <- distinct(data, pick(!GFF_FEATURE), .keep_all = TRUE)

# Change annotation to gb2seq annotation
features <- read_csv(snakemake@input[["annotation"]])

data <- data %>%
select(!GFF_FEATURE) %>%
left_join(features) %>%
rename(GFF_FEATURE = GEN)

log_info("Saving results")
write_tsv(
data,
Expand Down
11 changes: 5 additions & 6 deletions workflow/scripts/report/NV_description.R
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,13 @@ vcf <- vcf %>%
TRUE ~ "SNP"
),
Class = case_when(
is.na(GFF_FEATURE) ~ "Intergenic",
GFF_FEATURE == "Intergenic" ~ "Intergenic",
TRUE ~ synonimous
),
POS = as.numeric(POS)
) %>%
rowwise() %>%
mutate(
gene = as.character(window[window$position == POS, "gen"]),
indel_len = case_when(
NV_class == "INDEL" &
str_detect(SNP, fixed("--")) ~
Expand All @@ -120,7 +119,7 @@ vcf <- vcf %>%
str_length(strsplit(SNP, "-+")[[1]][2])
),
indel_class = case_when(
gene == "Intergenic" ~ "Intergenic",
GFF_FEATURE == "Intergenic" ~ "Intergenic",
NV_class == "INDEL" &
indel_len %% 3 == 0 ~ "In frame",
NV_class == "INDEL" &
Expand All @@ -130,7 +129,7 @@ vcf <- vcf %>%
ungroup() %>%
mutate(
group = case_when(
gene == "Intergenic" ~ "Intergenic",
GFF_FEATURE == "Intergenic" ~ "Intergenic",
NV_class == "SNP" ~ Class,
NV_class == "INDEL" ~ indel_class
)
Expand Down Expand Up @@ -371,7 +370,7 @@ figur_SNP_time <- vcf_snp %>%
group_by(REGION) %>%
summarise(
CollectionDate = min(as.Date(CollectionDate)),
n = n()
n = n_distinct(POS)
) %>%
ungroup() %>%
ggplot() +
Expand Down Expand Up @@ -450,7 +449,7 @@ vcf_snp %>%
group_by(REGION) %>%
summarise(
CollectionDate = min(as.Date(CollectionDate)),
n_PolymorphicSites = n()
n_PolymorphicSites = n_distinct(POS)
) %>%
ungroup() %>%
rename(sample = REGION) %>%
Expand Down
47 changes: 47 additions & 0 deletions workflow/scripts/report/get_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python3

import logging
import pandas as pd
from gb2seq.features import Features
from Bio import SeqIO

replace_terry = {
"ORF1ab polyprotein": "orf1ab",
"ORF1a polyprotein": "orf1ab",
"surface glycoprotein": "S",
"ORF3a protein": "ORF3a",
"envelope protein": "E",
"membrane glycoprotein": "M",
"ORF6 protein": "ORF6",
"ORF7a protein": "ORF7",
"ORF7b": "ORF7",
"ORF8 protein": "ORF8",
"nucleocapsid phosphoprotein": "N",
"ORF10 protein": "ORF10"
}

def main():
logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
ft = Features(snakemake.input.gb)

reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq)

positions = [x for x in range(len(reference))]

genes = []
for pos in positions:

if len(ft.getFeatureNames(pos)) == 0:
genes.append("Intergenic")
else:
genes.append(replace_terry[list(ft.getFeatureNames(pos))[0]])

df = pd.DataFrame({"POS":positions, "GEN": genes})

df.to_csv(snakemake.output.df,index= False)


if __name__ == "__main__":
main()


1 change: 0 additions & 1 deletion workflow/scripts/report/heatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
# Create SNP variable and select useful variables from vcf
vcf <- vcf %>%
mutate(
GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
SNP = case_when(
!is.na(REF_AA) ~ paste(
GFF_FEATURE,
Expand Down
8 changes: 7 additions & 1 deletion workflow/scripts/report/snp_plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
# Simplify features names and create SNP variable
vcf <- vcf %>%
mutate(
GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
SNP = case_when(
!is.na(REF_AA) ~ paste(
GFF_FEATURE,
Expand All @@ -47,6 +46,13 @@ vcf <- vcf %>%
ALT_AA,
sep = ""
),
GFF_FEATURE != "Intergenic" ~ paste(
GFF_FEATURE,
":",
POS - 1,
"-",
ALT
),
TRUE ~ paste(
REF,
POS,
Expand Down

0 comments on commit 78830fe

Please sign in to comment.