Skip to content

Commit

Permalink
Merge pull request #19 from PathoGenOmics-Lab/bug-solving
Browse files Browse the repository at this point in the history
Bug solving
  • Loading branch information
ahmig authored Oct 2, 2023
2 parents 3e488e1 + 98eee45 commit 414ccc8
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 19 deletions.
4 changes: 3 additions & 1 deletion workflow/rules/report.smk
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ rule heatmap:
rule window:
conda: "../envs/biopython.yaml"
params:
step = 1000
window = 1000,
step = 1
input:
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
gb = OUTDIR/"reference.gb"
Expand Down Expand Up @@ -66,6 +67,7 @@ rule freyja_plot:
rule general_NV_description:
conda: "../envs/renv.yaml"
params:
samples = expand("{sample}", sample = iter_samples()),
design = config["PLOTS"],
nsp = config["NSP"],
metadata = config["METADATA"]
Expand Down
16 changes: 15 additions & 1 deletion workflow/rules/vaf.smk
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,19 @@ rule snps_to_ancestor:
sed 's/'$ref'/'{wildcards.sample}'/g' {wildcards.sample}.tsv | cat > {output.tsv}
"""

rule annotation:
threads:1
conda: "../envs/biopython.yaml"
shadow: "shallow"
input:
gb = OUTDIR/"reference.gb",
ref = OUTDIR/"reference.fasta"
output:
df = temp(OUTDIR/"annotation.csv")
log:
LOGDIR / "annotation" / "log.txt"
script:
"../scripts/report/get_annotation.py"

rule format_tsv:
threads:1
Expand Down Expand Up @@ -94,7 +107,8 @@ rule filter_tsv:
threads: 1
conda: "../envs/renv.yaml"
input:
tsv = OUTDIR/f"{OUTPUT_NAME}.masked.tsv"
tsv = OUTDIR/f"{OUTPUT_NAME}.masked.tsv",
annotation = OUTDIR/"annotation.csv"
output:
filtered_tsv = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv"
log:
Expand Down
10 changes: 10 additions & 0 deletions workflow/scripts/filter_tsv.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,17 @@ data <- mutate(
TRUE ~ "No"
)
)
# Remove duplicated features

data <- distinct(data, pick(!GFF_FEATURE), .keep_all = TRUE)

# Change annotation to gb2seq annotation
features <- read_csv(snakemake@input[["annotation"]])

data <- data %>%
select(!GFF_FEATURE) %>%
left_join(features) %>%
rename(GFF_FEATURE = GEN)

log_info("Saving results")
write_tsv(
Expand Down
24 changes: 16 additions & 8 deletions workflow/scripts/report/NV_description.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ window <- read_csv(snakemake@input[["window"]])

# Obtain sample names ordered by CollectionDate
date_order <- read_csv(snakemake@params[["metadata"]]) %>%
filter(ID %in% snakemake@params[["samples"]]) %>%
arrange(CollectionDate) %>%
pull(ID) %>%
unique()
Expand Down Expand Up @@ -102,14 +103,13 @@ vcf <- vcf %>%
TRUE ~ "SNP"
),
Class = case_when(
is.na(GFF_FEATURE) ~ "Intergenic",
GFF_FEATURE == "Intergenic" ~ "Intergenic",
TRUE ~ synonimous
),
POS = as.numeric(POS)
) %>%
rowwise() %>%
mutate(
gene = as.character(window[window$position == POS, "gen"]),
indel_len = case_when(
NV_class == "INDEL" &
str_detect(SNP, fixed("--")) ~
Expand All @@ -119,7 +119,7 @@ vcf <- vcf %>%
str_length(strsplit(SNP, "-+")[[1]][2])
),
indel_class = case_when(
gene == "Intergenic" ~ "Intergenic",
GFF_FEATURE == "Intergenic" ~ "Intergenic",
NV_class == "INDEL" &
indel_len %% 3 == 0 ~ "In frame",
NV_class == "INDEL" &
Expand All @@ -129,7 +129,7 @@ vcf <- vcf %>%
ungroup() %>%
mutate(
group = case_when(
gene == "Intergenic" ~ "Intergenic",
GFF_FEATURE == "Intergenic" ~ "Intergenic",
NV_class == "SNP" ~ Class,
NV_class == "INDEL" ~ indel_class
)
Expand Down Expand Up @@ -190,6 +190,7 @@ variants <- vcf %>%
labels = NV_names,
values = NV_colors
) +
scale_y_discrete(drop = FALSE) +
labs(
x = "SARS-CoV-2 genome position",
y = "Sample",
Expand Down Expand Up @@ -308,7 +309,7 @@ window_plot_spike <- window %>%
variants_spike <- vcf %>%
filter(
ALT_FREQ > 0,
POS %in% spike.pos
POS %in% c(min(spike.pos):max(spike.pos))
) %>%
ggplot() +
aes(
Expand All @@ -324,6 +325,7 @@ variants_spike <- vcf %>%
labels = NV_names,
values = NV_colors
) +
scale_y_discrete(drop = FALSE) +
labs(
x = "SARS-CoV-2 genome position",
y = "Sample",
Expand All @@ -334,6 +336,7 @@ variants_spike <- vcf %>%
)



figura_spike <- ggarrange(
window_plot_spike,
variants_spike,
Expand Down Expand Up @@ -368,7 +371,7 @@ figur_SNP_time <- vcf_snp %>%
group_by(REGION) %>%
summarise(
CollectionDate = min(as.Date(CollectionDate)),
n = n()
n = n_distinct(POS)
) %>%
ungroup() %>%
ggplot() +
Expand Down Expand Up @@ -447,7 +450,7 @@ vcf_snp %>%
group_by(REGION) %>%
summarise(
CollectionDate = min(as.Date(CollectionDate)),
n_PolymorphicSites = n()
n_PolymorphicSites = n_distinct(POS)
) %>%
ungroup() %>%
rename(sample = REGION) %>%
Expand All @@ -457,7 +460,12 @@ vcf_snp %>%

# STATS FOR REPORTING

n_indels <- filter(vcf, NV_class == "INDEL") %>% length()
n_indels <- vcf %>%
filter(NV_class == "INDEL") %>%
pull(SNP) %>%
unique() %>%
length()

n_snv <- length(unique(vcf$SNP)) - n_indels

list(
Expand Down
47 changes: 47 additions & 0 deletions workflow/scripts/report/get_annotation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python3

import logging
import pandas as pd
from gb2seq.features import Features
from Bio import SeqIO

replace_terry = {
"ORF1ab polyprotein": "orf1ab",
"ORF1a polyprotein": "orf1ab",
"surface glycoprotein": "S",
"ORF3a protein": "ORF3a",
"envelope protein": "E",
"membrane glycoprotein": "M",
"ORF6 protein": "ORF6",
"ORF7a protein": "ORF7",
"ORF7b": "ORF7",
"ORF8 protein": "ORF8",
"nucleocapsid phosphoprotein": "N",
"ORF10 protein": "ORF10"
}

def main():
logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
ft = Features(snakemake.input.gb)

reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq)

positions = [x for x in range(len(reference))]

genes = []
for pos in positions:

if len(ft.getFeatureNames(pos)) == 0:
genes.append("Intergenic")
else:
genes.append(replace_terry[list(ft.getFeatureNames(pos))[0]])

df = pd.DataFrame({"POS":positions, "GEN": genes})

df.to_csv(snakemake.output.df,index= False)


if __name__ == "__main__":
main()


1 change: 0 additions & 1 deletion workflow/scripts/report/heatmap.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
# Create SNP variable and select useful variables from vcf
vcf <- vcf %>%
mutate(
GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
SNP = case_when(
!is.na(REF_AA) ~ paste(
GFF_FEATURE,
Expand Down
10 changes: 8 additions & 2 deletions workflow/scripts/report/snp_plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ date_order <- read_csv(snakemake@params[["metadata"]]) %>%
# Simplify features names and create SNP variable
vcf <- vcf %>%
mutate(
GFF_FEATURE = gsub(":.*", "", GFF_FEATURE),
SNP = case_when(
!is.na(REF_AA) ~ paste(
GFF_FEATURE,
Expand All @@ -47,6 +46,13 @@ vcf <- vcf %>%
ALT_AA,
sep = ""
),
GFF_FEATURE != "Intergenic" ~ paste(
GFF_FEATURE,
":",
POS - 1,
"-",
ALT
),
TRUE ~ paste(
REF,
POS,
Expand Down Expand Up @@ -250,7 +256,7 @@ ggsave(
# PLOT TABLES ####

log_info("Saving coorelation table")
cor.df %>%
cor.df.fill %>%
transmute(
NV = snp,
PearsonCor = cor,
Expand Down
12 changes: 6 additions & 6 deletions workflow/scripts/window.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def get_polimorphic_sites(df:pd.DataFrame) -> set:
return set(df.POS)


def window_calculation(sites:set, step:int, genome_size:int, coord:str) -> pd.DataFrame:
def window_calculation(sites:set, window:int, step:int, genome_size:int, coord:str) -> pd.DataFrame:

ft = Features(coord) # Create Features object to obtain annotations

Expand All @@ -36,20 +36,20 @@ def window_calculation(sites:set, step:int, genome_size:int, coord:str) -> pd.Da
genes = []
lim_sup = genome_size + 1

for position in range(1, lim_sup):
for position in range(1, lim_sup, step):

if len(ft.getFeatureNames(position)) == 0:
genes.append("Intergenic")
else:
genes.append(list(ft.getFeatureNames(position))[0])

# Add percent (excluding initial and final positions)
if position - step not in range(1, lim_sup):
if position - window not in range(1, lim_sup):
pct.append(0)
else:
# Calculate nº of polimorphisms in the window
num_snp = len([ x for x in sites if x in range(position - step, position + 1) ]) # Calculate nº of polimorphisms in the window
pct.append(num_snp / step)
num_snp = len([ x for x in sites if x in range(position - window, position + 1) ]) # Calculate nº of polimorphisms in the window
pct.append(num_snp / window)

positions.append(position)

Expand All @@ -66,7 +66,7 @@ def main():

logging.info("Sliding window calculation of proportion of polimorphic sites")

frame = window_calculation(sites, snakemake.params.step, 29903, snakemake.input.gb)
frame = window_calculation(sites, snakemake.params.window, snakemake.params.step, 29903, snakemake.input.gb)

logging.info("Saving results")
frame.replace(replace_terry, inplace = True)
Expand Down

0 comments on commit 414ccc8

Please sign in to comment.