Skip to content

Commit

Permalink
Added alternative to choose allele symbol when splitting variants
Browse files Browse the repository at this point in the history
  • Loading branch information
moonso committed Apr 23, 2015
1 parent 7f11c8f commit fea0e34
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 11 deletions.
12 changes: 8 additions & 4 deletions scripts/vcf_parser
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,11 @@ def print_version(ctx, param, value):
type=click.Path(exists=False),
help='Path to a outfile.'
)
@click.option('-v', '--verbose',
@click.option("--allele_symbol",
default='0',
help="The symbol that should be used when representing "\
"unobserved alleles. Default is '0'")
@click.option('-v', '--verbose',
is_flag=True,
help='Increase output verbosity.'
)
Expand Down Expand Up @@ -88,7 +92,7 @@ def print_version(ctx, param, value):
help="Set the level of log output."
)
def cli(variant_file, vep, split, outfile, verbose, silent, skip_info_check,
logfile, loglevel):
allele_symbol, logfile, loglevel):
"""
Tool for parsing vcf files.
Expand All @@ -107,11 +111,11 @@ def cli(variant_file, vep, split, outfile, verbose, silent, skip_info_check,
if variant_file == '-':
logger.info("Start parsing variants from stdin")
my_parser = vcf_parser.VCFParser(fsock=sys.stdin, split_variants=split,
skip_info_check=skip_info_check)
skip_info_check=skip_info_check, allele_symbol=allele_symbol)
else:
logger.info("Start parsing variants from file {0}".format(variant_file))
my_parser = vcf_parser.VCFParser(infile = variant_file,
split_variants=split, skip_info_check=skip_info_check)
split_variants=split, skip_info_check=skip_info_check, allele_symbol=allele_symbol)

start = datetime.now()
nr_of_variants = 0
Expand Down
17 changes: 13 additions & 4 deletions vcf_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@

class VCFParser(object):
"""docstring for VCFParser"""
def __init__(self, infile=None, fsock=None, split_variants=False, skip_info_check=False):
def __init__(self, infile=None, fsock=None, split_variants=False,
skip_info_check=False, allele_symbol='0'):
super(VCFParser, self).__init__()
self.logger = logging.getLogger(__name__)

Expand All @@ -99,6 +100,9 @@ def __init__(self, infile=None, fsock=None, split_variants=False, skip_info_chec

self.skip_info_check = skip_info_check
self.logger.info("Skip info check = {0}".format(self.skip_info_check))

self.allele_symbol = allele_symbol
self.logger.info("Allele symbol = {0}".format(self.allele_symbol))

self.logger.info("Initializing HeaderParser")
self.metadata = HeaderParser()
Expand Down Expand Up @@ -166,7 +170,8 @@ def add_variant(self, chrom, pos, rs_id, ref, alt, qual, filt, info, form=None,

# If there are multiple alternative alleles and split_variants is set, we must split the variant
else:
for splitted_variant in split_variants(variant, self.metadata):
for splitted_variant in split_variants(variant_dict=variant,
header_parser=self.metadata, allele_symbol=self.allele_symbol):
self.variants.append(splitted_variant)


Expand All @@ -185,7 +190,8 @@ def __iter__(self):
if not (self.split_variants and len(first_variant['ALT'].split(',')) > 1):
variants.append(first_variant)
else:
for splitted_variant in split_variants(first_variant, self.metadata):
for splitted_variant in split_variants(variant_dict=first_variant,
header_parser=self.metadata, allele_symbol=self.allele_symbol):
variants.append(splitted_variant)

for variant in variants:
Expand All @@ -208,7 +214,10 @@ def __iter__(self):
variants.append(variant)

else:
for splitted_variant in split_variants(variant, self.metadata):
for splitted_variant in split_variants(
variant_dict=variant,
header_parser=self.metadata,
allele_symbol=self.allele_symbol):
variants.append(splitted_variant)

for variant in variants:
Expand Down
2 changes: 2 additions & 0 deletions vcf_parser/utils/split_genotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ def split_genotype(genotype, gt_format, alternative_number, allele_symbol = '0')
new_genotype (str): A string that represents the new genotype
"""
logger = getLogger(__name__)
logger.info("Allele symbol {0}".format(allele_symbol))

splitted_genotype = genotype.split(':')
logger.debug("Parsing genotype {0}".format(splitted_genotype))
splitted_gt_format = gt_format.split(':')
Expand Down
8 changes: 5 additions & 3 deletions vcf_parser/utils/split_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from vcf_parser import Genotype
from vcf_parser.utils import (build_vep_string, split_genotype, build_info_string)

def split_variants(variant_dict, header_parser):
def split_variants(variant_dict, header_parser, allele_symbol='0'):
"""
Checks if there are multiple alternative alleles and splits the
variant.
Expand All @@ -26,7 +26,8 @@ def split_variants(variant_dict, header_parser):
variant: A variant dictionary with the split information for each
alternative
"""

logger = getLogger(__name__)
logger.info("Allele symbol {0}".format(allele_symbol))
alternatives = variant_dict['ALT'].split(',')
reference = variant_dict['REF']
number_of_values = 1
Expand Down Expand Up @@ -102,7 +103,8 @@ def split_variants(variant_dict, header_parser):
new_genotype = split_genotype(
variant_dict[individual],
variant['FORMAT'],
alternative_number
alternative_number,
allele_symbol
)

variant[individual] = new_genotype
Expand Down

0 comments on commit fea0e34

Please sign in to comment.