Skip to content

Commit

Permalink
Merge pull request #184 from sanogenetics/feature/enhance-vcf-output
Browse files Browse the repository at this point in the history
Enhance VCF output
  • Loading branch information
apriha authored Aug 26, 2024
2 parents 162d86e + eaf5824 commit c702fa6
Show file tree
Hide file tree
Showing 7 changed files with 268 additions and 102 deletions.
1 change: 1 addition & 0 deletions src/snps/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
REFERENCE_SEQUENCE_CHROMS = tuple(str(i) for i in range(1, 23)) + ("X", "Y", "MT")
25 changes: 20 additions & 5 deletions src/snps/io/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def read(self):
elif re.match("^rs[0-9]*[, \t]{1}[1]", first_line):
d = self.read_generic(file, compression, skip=0)
elif "vcf" in comments.lower() or "##contig" in comments.lower():
d = self.read_vcf(file, compression, "vcf", self._rsids)
d = self.read_vcf(file, compression, "vcf", self._rsids, comments)
elif ("Genes for Good" in comments) | ("PLINK" in comments):
d = self.read_genes_for_good(file, compression)
elif "DNA.Land" in comments:
Expand Down Expand Up @@ -1355,7 +1355,7 @@ def parse(sep):

return self.read_helper("generic", parser)

def read_vcf(self, file, compression, provider, rsids=()):
def read_vcf(self, file, compression, provider, rsids=(), comments=""):
"""Read and parse VCF file.
Notes
Expand All @@ -1366,7 +1366,7 @@ def read_vcf(self, file, compression, provider, rsids=()):
* SNPs that are not annotated with an RSID are skipped
* If the VCF contains multiple samples, only the first sample is used to
lookup the genotype
* Insertions and deletions are skipped
* Precise insertions and deletions are skipped
* If a sample allele is not specified, the genotype is reported as NaN
* If a sample allele refers to a REF or ALT allele that is not specified,
the genotype is reported as NaN
Expand All @@ -1393,6 +1393,17 @@ def parser():

return (df, phased)

header_lines = comments.replace("\r\n", "\n").split("\n")

detected_company = ""
for line in header_lines:
if line.startswith("##detectedCompany="):
detected_company = line.split("=")[1].strip('"')
break

if detected_company:
provider = detected_company

return self.read_helper(provider, parser)

def _parse_vcf(self, buffer, rsids):
Expand Down Expand Up @@ -1441,9 +1452,13 @@ def _parse_vcf(self, buffer, rsids):
if len(alt.split(",")) > 1 and alt.split(",")[1] == "<NON_REF>":
alt = alt.split(",")[0]

ref_alt = [ref] + alt.split(",")
# Handle <INS> and <DEL> alleles (imprecise insertions and deletions)
ref_alt = [ref] + [
"I" if allele == "<INS>" else "D" if allele == "<DEL>" else allele
for allele in alt.split(",")
]

# skip insertions and deletions
# skip precise insertions and deletions
if sum(map(len, ref_alt)) > len(ref_alt):
continue

Expand Down
178 changes: 114 additions & 64 deletions src/snps/io/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pandas as pd

import snps
from snps.constants import REFERENCE_SEQUENCE_CHROMS
from snps.io import get_empty_snps_dataframe
from snps.utils import clean_str, get_utc_now, save_df_as_csv

Expand Down Expand Up @@ -120,14 +121,14 @@ def _write_csv(self):

filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}{ext}"

comment = (
f"# Source(s): {self._snps.source}\n"
f"# Build: {self._snps.build}\n"
f"# Build Detected: { self._snps.build_detected}\n"
f"# Phased: {self._snps.phased}\n"
f"# SNPs: {self._snps.count}\n"
f"# Chromosomes: {self._snps.chromosomes_summary}\n"
)
comment = [
f"# Source(s): {self._snps.source}",
f"# Build: {self._snps.build}",
f"# Build Detected: { self._snps.build_detected}",
f"# Phased: {self._snps.phased}",
f"# SNPs: {self._snps.count}",
f"# Chromosomes: {self._snps.chromosomes_summary}",
]
if "header" in self._kwargs:
if isinstance(self._kwargs["header"], bool):
if self._kwargs["header"]:
Expand All @@ -139,7 +140,7 @@ def _write_csv(self):
self._snps._snps,
self._snps._output_dir,
filename,
comment=comment,
comment="\n".join(comment) + "\n",
atomic=self._atomic,
**self._kwargs,
)
Expand All @@ -149,8 +150,8 @@ def _write_vcf(self):
References
----------
1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019,
https://samtools.github.io/hts-specs/VCFv4.2.pdf
1. The Variant Call Format (VCF) Version 4.3 Specification, 27 Nov 2022,
https://samtools.github.io/hts-specs/VCFv4.3.pdf
Returns
-------
Expand All @@ -163,61 +164,37 @@ def _write_vcf(self):
if not filename:
filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}{'.vcf'}"

comment = (
f"##fileformat=VCFv4.2\n"
f'##fileDate={get_utc_now().strftime("%Y%m%d")}\n'
f'##source="{self._snps.source}; snps v{snps.__version__}; https://pypi.org/project/snps/"\n'
)
comment = [
"##fileformat=VCFv4.3",
f'##fileDate={get_utc_now().strftime("%Y%m%d")}',
f'##source="snps v{snps.__version__}; https://pypi.org/project/snps/"',
f'##detectedCompany="{self._snps.source}"',
]

reference_sequence_chroms = (
"1",
"2",
"3",
"4",
"5",
"6",
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"17",
"18",
"19",
"20",
"21",
"22",
"X",
"Y",
"MT",
)
if self._snps.build_original:
comment.append(f"##detectedOriginalBuild={self._snps.build_original}")

if self._snps.determine_sex():
comment.append(f"##detectedSex={self._snps.determine_sex()}")

if self._vcf_qc_only or self._vcf_qc_filter:
chip_version = ""
if self._snps.chip_version:
chip_version = f" {self._snps.chip_version}"

if self._snps.chip:
comment.append(
f'##detectedChip="{self._snps.chip}{chip_version} per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040"'
)

df = self._snps.snps

p = self._snps._parallelizer
tasks = []

# skip insertions and deletions
df = df.drop(
df.loc[
df["genotype"].notnull()
& (
(df["genotype"].str[0] == "I")
| (df["genotype"].str[0] == "D")
| (df["genotype"].str[1] == "I")
| (df["genotype"].str[1] == "D")
)
].index
)

chroms_to_drop = []
for chrom in df["chrom"].unique():
if chrom not in reference_sequence_chroms:
if chrom not in REFERENCE_SEQUENCE_CHROMS:
chroms_to_drop.append(chrom)
continue

Expand All @@ -237,41 +214,66 @@ def _write_vcf(self):
if self._vcf_qc_only or self._vcf_qc_filter
else get_empty_snps_dataframe()
),
"sex": self._snps.determine_sex(),
}
)

# drop chromosomes without reference sequence data (e.g., unassigned PAR)
for chrom in chroms_to_drop:
df = df.drop(df.loc[df["chrom"] == chrom].index)

# Check for the presence of insertions or deletions
has_ins = df["genotype"].str.contains("I", na=False).any()
has_del = df["genotype"].str.contains("D", na=False).any()

# create the VCF representation for SNPs
results = p(self._create_vcf_representation, tasks)

contigs = []
vcf = [pd.DataFrame()]
discrepant_vcf_position = [pd.DataFrame()]
for result in list(results):
contigs.append(result["contig"])
if result["contig"]:
contigs.append(result["contig"])
vcf.append(result["vcf"])
discrepant_vcf_position.append(result["discrepant_vcf_position"])

vcf = pd.concat(vcf)
discrepant_vcf_position = pd.concat(discrepant_vcf_position)

comment += "".join(contigs)
comment.extend(contigs)

if has_del:
comment.append(
'##ALT=<ID=DEL,Description="Deletion relative to the reference">'
)
if has_ins:
comment.append(
'##ALT=<ID=INS,Description="Insertion of novel sequence relative to the reference">'
)

if has_ins or has_del:
comment.append(
'##INFO=<ID=SVTYPE,Number=.,Type=String,Description="Type of structural variant: INS (Insertion), DEL (Deletion)">'
)
comment.append(
'##INFO=<ID=IMPRECISE,Number=0,Type=Flag,Description="Imprecise structural variation">'
)

if self._vcf_qc_filter and self._snps.cluster:
comment += '##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">\n'
comment.append(
'##FILTER=<ID=lq,Description="Low quality SNP per Lu et al.: https://doi.org/10.1016/j.csbj.2021.06.040">'
)

comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"
comment.append('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">')
comment.append("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE")

return (
save_df_as_csv(
vcf,
self._snps._output_dir,
filename,
comment=comment,
comment="\n".join(comment) + "\n",
prepend_info=False,
header=False,
index=False,
Expand All @@ -288,6 +290,7 @@ def _create_vcf_representation(self, task):
snps = task["snps"]
cluster = task["cluster"]
low_quality_snps = task["low_quality_snps"]
sex = task["sex"]

if len(snps.loc[snps["genotype"].notnull()]) == 0:
return {
Expand All @@ -299,7 +302,7 @@ def _create_vcf_representation(self, task):
seqs = resources.get_reference_sequences(assembly, [chrom])
seq = seqs[chrom]

contig = f'##contig=<ID={seq.ID},URL={seq.url},length={seq.length},assembly={seq.build},md5={seq.md5},species="{seq.species}">\n'
contig = f'##contig=<ID={self._vcf_chrom_prefix}{seq.ID},URL={seq.url},length={seq.length},assembly={seq.build},md5={seq.md5},species="{seq.species}">'

if self._vcf_qc_only and cluster:
# drop low quality SNPs if SNPs object maps to a cluster
Expand Down Expand Up @@ -371,12 +374,27 @@ def _create_vcf_representation(self, task):
temp["REF"], temp["genotype"]
)

# Populate INFO field
df["INFO"] = df["ALT"].apply(self._compute_info)

temp = df.loc[df["genotype"].notnull()]

df.loc[df["genotype"].notnull(), "SAMPLE"] = np.vectorize(
self._compute_genotype
)(temp["REF"], temp["ALT"], temp["genotype"])

if sex == "Female":
haploid_chroms = ["Y", "MT"]
else:
haploid_chroms = ["X", "Y", "MT"]

# populate null values for haploid chromosomes
df.loc[
(df["SAMPLE"].isnull())
& (df["CHROM"].str.contains("|".join(haploid_chroms))),
"SAMPLE",
] = "."

df.loc[df["SAMPLE"].isnull(), "SAMPLE"] = "./."

del df["genotype"]
Expand All @@ -387,9 +405,18 @@ def _create_vcf_representation(self, task):
"discrepant_vcf_position": discrepant_vcf_position,
}

def _replace_genotype_indels(self, genotype):
# Replace 'I' and 'D' with '<INS>' and '<DEL>'
return [
"<INS>" if allele == "I" else "<DEL>" if allele == "D" else allele
for allele in genotype
]

def _compute_alt(self, ref, genotype):
genotype_alleles = list(set(genotype))

genotype_alleles = self._replace_genotype_indels(genotype_alleles)

if ref in genotype_alleles:
if len(genotype_alleles) == 1:
return self._vcf_alt_unavailable
Expand All @@ -401,6 +428,10 @@ def _compute_alt(self, ref, genotype):
return ",".join(genotype_alleles)

def _compute_genotype(self, ref, alt, genotype):
genotype = list(genotype)

genotype = self._replace_genotype_indels(genotype)

alleles = [ref]

if self._snps.phased:
Expand All @@ -417,3 +448,22 @@ def _compute_genotype(self, ref, alt, genotype):
)
else:
return f"{alleles.index(genotype[0])}"

def _compute_info(self, alt):
"""Generate the INFO field based on ALT values."""
if pd.isna(alt):
return "."

alt_values = alt.split(",")
svtypes = []
for alt_value in alt_values:
if alt_value == "<INS>":
svtypes.append("INS")
elif alt_value == "<DEL>":
svtypes.append("DEL")

if not svtypes:
return "."

svtype_str = ",".join(svtypes)
return f"SVTYPE={svtype_str};IMPRECISE" if svtype_str else "."
Loading

0 comments on commit c702fa6

Please sign in to comment.