Skip to content

Extended vcf ann #6

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 29 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,41 @@
- **create** a interactive `html` for data exploration
- **create** a static image (`jpg`, `png`, `pdf`, `svg`) ready for publication
- **add** additional tracks (supported: `.vcf`, `.gb`, `.bed`)
- **annotate** tracks with coverage data and vcf with additional information if a `.gb` file is provided
- **annotate** tracks with additional information
- **export** annoated track data as tabular files (`.bed`, `.vcf`) or json (`.gb`)
- **developed** for viral genomics
- **customize** all plotting parameters

**Feel free to report any bugs or request new features as issues!**


## Automatic annotation

BAMdash automatically computes serveral statistics:

- if `-bs` is > 1 it computes the mean over the bin size in the coverage plot
- for each track it computes recovery and mean coverage (set `-c` for the min coverage) for each element in the track
- if a `*.vcf` is provided it annotates `TRANSITION`/`TRANSVERSION` and type of exchange (`SNP`, `DEL`, `INS)

If a `*.gb`and `*.vcf` is provided BAMdash computes the aminoacid exchange and the effect in the CDS (inspired by but not as powerful as [snpeff](http://pcingola.github.io/SnpEff/snpeff)). SNP and INDEL vcf annotation supports:

- `START_LOST`: INDEL or SNP start at the CDS and result in a start loss
- `STOP_LOST`: INDEL or SNP result in the loss of the stop codon
- `STOP_GAINED`: INDEL or SNP result in an additional stop codon
- `SYN`: SNP does not lead to an amino acid change
- `NON-SYN`: SNP leads to an amino acid change
- `AC_INSERTION`: INS that does not change already present amino acids
- `AC_CHANGE+AC_INSERTION`: INS where the affected codon is also non-syn
- `AC_DELETION`: DEL that does not change already present amino acids
- `AC_CHANGE+AC_DELETION`: DEL where the affected codon is also non-syn

The nomenclature for the aminoacid effect is pretty simplified:

- `A58Y` - Exchange at pos 58 from A to Y
- `A58YY`- Exchange at pos 58 from A to Y and insertion of an additional Y
- `FA58Y`- Exchange at pos 58 from A to Y and deletion of the prior F
- `A58fsX` - Frameshift at pos 58

## Example
<img src="./example.gif" alt="example" />

Expand Down
2 changes: 1 addition & 1 deletion bamdash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""interactively visualize coverage and tracks"""
_program = "bamdash"
__version__ = "0.1.2"
__version__ = "0.2"
163 changes: 129 additions & 34 deletions bamdash/scripts/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""
# BUILT INS
import statistics
import math

# LIBS
import pandas as pd
Expand Down Expand Up @@ -280,9 +281,122 @@ def bed_to_dict(bed_file, coverage_df, ref, min_cov):
return define_track_position(bed_dict)


def get_mutations(start, stop, cds, variant, seq):
"""
classifiy the mutation effect on the amino acid level
inspired by SNPeff but only for cds

:param start: start of the cds
:param stop: stop of the cds
:param cds: cds dict
:param variant: slice of variant df
:param seq: sequence of the ref

:return: amino acid change, mutation effect
"""

ac_exchange, ac_effect = "NONE", "NONE"

# get the CDS sequence, cds pos of the variant and the codon pos
if "codon_start" in cds:
codon_start = int(cds["codon_start"])
else:
codon_start = 1
# reverse complement depending on the cds direction
if cds["strand"] == "-":
seq_cds = seq[start + codon_start - 2:stop].reverse_complement()
cds_variant_pos = stop - variant["position"]
ref = str(Seq.Seq(variant["reference"]).reverse_complement())
mut = str(Seq.Seq(variant["mutation"]).reverse_complement())
else:
seq_cds = seq[start + codon_start - 2:stop]
cds_variant_pos = variant["position"] - start
ref, mut = variant["reference"], variant["mutation"]

# define codons -> get the codon with the mutation and the position in the codon
cds_codons = [list(seq_cds[i:i + 3]) for i in range(0, len(seq_cds), 3)]
mut_codon, codon_pos = math.floor(cds_variant_pos / 3), cds_variant_pos % 3 # index affected codon, codon pos
ref_ac = Seq.Seq("".join(cds_codons[mut_codon])).translate() # ref aminoacid

# define the mutation
if variant["type"] == "SNP":
# get mut aminoacid
cds_codons[mut_codon][codon_pos] = mut
alt_ac = Seq.Seq("".join(cds_codons[mut_codon])).translate()
# define mutation type
if ref_ac == alt_ac:
return ac_exchange, "SYN"
# get ac mutation
ac_exchange = f"{ref_ac}{mut_codon + 1}{alt_ac}"
# check different mutations types
if mut_codon == 0:
return ac_exchange, "START_LOST"
if alt_ac == "*":
return ac_exchange, "STOP_GAINED"
if ref_ac == "*" and alt_ac != "*":
return ac_exchange, "STOP_LOST"
# otherwise it is non-syn
return ac_exchange, "NON_SYN"

elif variant["type"] == "INS":
# 1st case: triplet insertion
if (len(mut) - 1) % 3 == 0:
# mutate the codon
cds_codons[mut_codon][codon_pos] = mut
ins = Seq.Seq("".join(cds_codons[mut_codon])).translate()
ac_exchange = f"{ref_ac}{mut_codon + 1}{ins}"
# check different mutations types
if mut_codon == 0 and ins[0] != ref_ac:
return ac_exchange, "START_LOST"
if ref_ac != "*" and "*" in ins[1:]:
return ac_exchange, "STOP_GAINED"
if ref_ac == "*" and "*" not in ins:
return ac_exchange, "STOP_LOST"
if ins[0] == ref_ac:
return ac_exchange, "AC_INSERTION"
if ins[0] != ref_ac:
return ac_exchange, "AC_CHANGE+AC_INSERTION"
# 2nd case: non-triplet insertion -> frameshift
else:
return f"{ref_ac}{mut_codon + 1}fsX", "FRAMESHIFT"

elif variant["type"] == "DEL":
# 1st case: triplet insertion
if (len(ref) - 1) % 3 == 0:
# check if it is the last codon position -> deletion of the next x amninoacids
if codon_pos == 2:
deletion = Seq.Seq("".join(cds_codons[mut_codon]) + ref[1:]).translate()
new_ac = ref_ac
# otherwise check which codons are affected and how the new codon looks like
else:
del_codons = cds_codons[mut_codon:int(mut_codon + 1 + (len(ref) - 1) / 3)] # affected codons
deletion = Seq.Seq("".join(sum(del_codons, []))).translate()
# define potential new amino acid by joining the respective parts of the first and last triplet
new_ac = Seq.Seq(
"".join(
del_codons[0][:codon_pos + 1] + del_codons[len(del_codons) - 1][codon_pos + 1:]
)
).translate()
ac_exchange = f"{deletion}{mut_codon + 1}{new_ac}"
# check different mutations types
if mut_codon == 0 and new_ac != "M":
return ac_exchange, "START_LOST"
if "*" in deletion and new_ac != "*":
return ac_exchange, "STOP_LOST"
if new_ac == ref_ac:
return ac_exchange, "AC_DELETION"
if new_ac != ref_ac:
return ac_exchange, "AC_CHANGE+AC_DELETION"
# 2nd case: non-triplet insertion
else:
ac_effect, ac_exchange = "FRAMESHIFT", f"{ref_ac}{mut_codon + 1}fsX"
# dummy return in case I missed something
return ac_exchange, ac_effect


def annotate_vcf_df(vcf_df, cds_dict, seq):
"""
annotate mutations for their as effect
annotate mutations for their aminoacid effect
:param vcf_df: dataframe with vcf data
:param cds_dict: dictionary with all coding sequences
:param seq: sequence of the reference
Expand All @@ -292,61 +406,42 @@ def annotate_vcf_df(vcf_df, cds_dict, seq):
# define the potential identifiers to look for
potential_cds_identifiers = ["protein_id", "gene", "locus_tag", "product"]

proteins, as_exchange, as_effect = [], [], []
proteins, ac_exchange, ac_effect = [], [], []

for variant in vcf_df.iterrows():
proteins_temp, as_exchange_temp, as_effect_temp = [], [], []
pos, mut_type, mut = variant[1]["position"], variant[1]["type"], variant[1]["mutation"]
proteins_temp, ac_exchange_temp, ac_effect_temp = [], [], []
# check each cds for potential mutations
for cds in cds_dict:
# check if a protein identifier is present
if not any(identifier in potential_cds_identifiers for identifier in cds_dict[cds]):
continue
start, stop = cds_dict[cds]["start"], cds_dict[cds]["stop"]
# check if pos is in range
if pos not in range(start, stop):
if variant[1]["position"] not in range(start, stop+1):
continue
# now we know the protein
for identifier in potential_cds_identifiers:
if identifier in cds_dict[cds]:
proteins_temp.append(cds_dict[cds][identifier])
break
# at the moment only check SNPs
if mut_type != "SNP":
as_exchange_temp.append("AMBIG"), as_effect_temp.append(variant[1]["type"])
continue
# now we can analyse for a potential as exchange
if "codon_start" in cds_dict[cds]:
codon_start = int(cds_dict[cds]["codon_start"])
else:
codon_start = 1
strand, seq_mut = cds_dict[cds]["strand"], str(seq)
# mutate the sequence and get the CDS nt seq
seq_mut = Seq.Seq(seq_mut[:pos-1] + mut + seq_mut[pos:])
seq_cds, seq_mut_cds = seq[start+codon_start-2:stop], seq_mut[start+codon_start-2:stop]
# translate strand depend
if strand == "+":
cds_trans, cds_mut_trans = seq_cds.translate(), seq_mut_cds.translate()
else:
cds_trans, cds_mut_trans = seq_cds.reverse_complement().translate(), seq_mut_cds.reverse_complement().translate()
# get the mutation by searching for a diff between string - prop a bit slow
mut_string = [f"{x}{i+1}{y}" for i, (x, y) in enumerate(zip(cds_trans, cds_mut_trans)) if x != y]
# now append to list
if not mut_string:
as_exchange_temp.append("NONE"), as_effect_temp.append("SYN")
else:
as_exchange_temp.append(mut_string[0]), as_effect_temp.append("NON-SYN")
# annotate mutations
amino_acid_mutations = get_mutations(start, stop, cds_dict[cds], variant[1], seq)
ac_exchange_temp.append(amino_acid_mutations[0]), ac_effect_temp.append(amino_acid_mutations[1])
# check if we did not find a protein
if not proteins_temp:
proteins.append("NONE"), as_exchange.append("NONE"), as_effect.append("NONE")
proteins.append("NONE"), ac_exchange.append("NONE"), ac_effect.append("NONE")
# else append all mutations found in all cds
elif len(proteins_temp) == 1:
proteins.append(proteins_temp[0]), as_exchange.append(as_exchange_temp[0]), as_effect.append(as_effect_temp[0])
proteins.append(proteins_temp[0])
ac_exchange.append(ac_exchange_temp[0])
ac_effect.append(ac_effect_temp[0])
else:
proteins.append(" | ".join(proteins_temp)), as_exchange.append(" | ".join(as_exchange_temp)), as_effect.append(" | ".join(as_effect_temp))
proteins.append(" | ".join(proteins_temp))
ac_exchange.append(" | ".join(ac_exchange_temp))
ac_effect.append(" | ".join(ac_effect_temp))

# annotate and return df
vcf_df["protein"], vcf_df["effect"], vcf_df["as mutation"] = proteins, as_effect, as_exchange
vcf_df["protein"], vcf_df["effect"], vcf_df["as mutation"] = proteins, ac_effect, ac_exchange
return vcf_df


Expand Down