From ab4e8563b01fe856809252ba09ecd8158c0a8cea Mon Sep 17 00:00:00 2001 From: "Lataretu, Marie" Date: Wed, 11 Dec 2024 16:56:12 +0100 Subject: [PATCH] formatting and change error to warning - the minimal variant fraction threshold was limited to >0.5 -> changed to the error to a warning --- .gitignore | 9 ++- bin/adjust_gt.py | 153 +++++++++++++++++++++++++++++++---------------- 2 files changed, 107 insertions(+), 55 deletions(-) diff --git a/.gitignore b/.gitignore index ffae102..e5f2cf0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ -work/ +work*/ results*/ conda/ .nextflow/ -.nextflow.log* -nextflow-autodownload-databases/ \ No newline at end of file +*.nextflow.log* +nextflow-autodownload-databases/ +tests_*/ +tmp/ +settings.json \ No newline at end of file diff --git a/bin/adjust_gt.py b/bin/adjust_gt.py index 145ebf9..1600efd 100755 --- a/bin/adjust_gt.py +++ b/bin/adjust_gt.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -#author: Stephan Fuchs (Robert Koch Institute, MF-1, fuchss@rki.de) +# author: Stephan Fuchs (Robert Koch Institute, MF-1, fuchss@rki.de) VERSION = "0.0.9" import os @@ -9,19 +9,51 @@ import re import sys import gzip +import warnings + def parse_args(CMD=None): - parser = argparse.ArgumentParser(prog="rename_in_gff3.py", description="changes genotype in VCFs", ) - parser.add_argument('vcf', metavar="FILE", help="vcf file", type=str) - parser.add_argument('--ao', metavar="STR", help="tag for read count supporting the respective variant (default: AO)", type=str, default="AO") - parser.add_argument('--dp', metavar="STR", help="tag for total read count at the repsective position (default: DP)", type=str, default="DP") - parser.add_argument('--gt', metavar="STR", help="tag for genotype (default: GT)", type=str, default="GT") - parser.add_argument('-o', help="output file (will be overwritten!)", type=str, required=True) - parser.add_argument('--gz', help="bgzip compressed input", action="store_true") - parser.add_argument('--vf', metavar="FLOAT", help="minimal variant fraction to set a homogeneous genotype (default: 0.9)", type=float, default=0.9) - parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION) + parser = argparse.ArgumentParser( + prog="rename_in_gff3.py", + description="changes genotype in VCFs", + ) + parser.add_argument("vcf", metavar="FILE", help="vcf file", type=str) + parser.add_argument( + "--ao", + metavar="STR", + help="tag for read count supporting the respective variant (default: AO)", + type=str, + default="AO", + ) + parser.add_argument( + "--dp", + metavar="STR", + help="tag for total read count at the repsective position (default: DP)", + type=str, + default="DP", + ) + parser.add_argument( + "--gt", + metavar="STR", + help="tag for genotype (default: GT)", + type=str, + default="GT", + ) + parser.add_argument( + "-o", help="output file (will be overwritten!)", type=str, required=True + ) + parser.add_argument("--gz", help="bgzip compressed input", action="store_true") + parser.add_argument( + "--vf", + metavar="FLOAT", + help="minimal variant fraction to set a homogeneous genotype (default: 0.9)", + type=float, + default=0.9, + ) + parser.add_argument("--version", action="version", version="%(prog)s " + VERSION) return parser.parse_args(CMD) + # open file handles considering compression state def get_filehandle(in_fname, gz): if not gz: @@ -29,91 +61,108 @@ def get_filehandle(in_fname, gz): else: inhandle = gzip.open(in_fname, "rt") return inhandle - -def process(in_fname, out_fname, min_vf, ao_tag="AO", dp_tag="DP", gt_tag="GT", gz=False): - #sanity checks + + +def process( + in_fname, out_fname, min_vf, ao_tag="AO", dp_tag="DP", gt_tag="GT", gz=False +): + # sanity checks if min_vf <= 0.5: - sys.exit("error: min_vf has to be greater than 0.5") + warnings.warn( + f"[WARNING] Minimal variant fraction to set a homogeneous genotype (--vf) is below 0.5 ({min_vf}). Assuming you know what you are doing." + ) out_gz = out_fname.endswith(".gz") - intermediate = re.sub("\.gz$","", out_fname) + intermediate = re.sub("\.gz$", "", out_fname) - #regex generation + # regex generation ao_pattern = re.compile(r"(?:^|\t|;)" + re.escape(ao_tag) + "=([0-9,]+)(?:$|\t|;)") dp_pattern = re.compile(r"(?:^|\t|;)" + re.escape(dp_tag) + "=([0-9]+)(?:$|\t|;)") - gt_pattern = re.compile(r"(?:^|\t|;)" + re.escape(gt_tag) + "=([0-9]+/[0-9]+)(?:$|\t|;)") + gt_pattern = re.compile( + r"(?:^|\t|;)" + re.escape(gt_tag) + "=([0-9]+/[0-9]+)(?:$|\t|;)" + ) - with get_filehandle(in_fname, gz) as inhandle: - with open(intermediate, "w") as outhandle: + with get_filehandle(in_fname, gz) as inhandle: + with open(intermediate, "w") as outhandle: for l, line in enumerate(inhandle): - - #skip empty or comment lines + # skip empty or comment lines if len(line.strip()) == 0 or line.startswith("#"): outhandle.write(line) continue fields = line.split("\t") - - #find GT position + + # find GT position gt_pos = fields[8].split(":").index(gt_tag) - - #replacing GT info (considering line eventual breaks at end) + + # replacing GT info (considering line eventual breaks at end) cols = fields[9].split(":") # check line for homo/herterozygot - gt1, gt2 = cols[gt_pos].split('/') + gt1, gt2 = cols[gt_pos].split("/") if gt1 == gt2: # homozygot outhandle.write(line) continue - - #find ao and dp + + # find ao and dp ao = ao_pattern.findall(fields[7]) dp = dp_pattern.findall(fields[7]) - + if len(ao) > 1: - sys.exit("error: multiple occurences of " + ao_tag + " tag in line " + str(l+1)) + sys.exit( + "error: multiple occurences of " + + ao_tag + + " tag in line " + + str(l + 1) + ) if len(dp) > 1: - sys.exit("error: multiple occurences of " + dp_tag + " tag in line " + str(l+1)) - - #calc fractions and check threshold - fracs = [int(x)/int(dp[0]) for x in ao[0].split(",")] + sys.exit( + "error: multiple occurences of " + + dp_tag + + " tag in line " + + str(l + 1) + ) + + # calc fractions and check threshold + fracs = [int(x) / int(dp[0]) for x in ao[0].split(",")] m = max(fracs) - + if m < min_vf: outhandle.write(line) continue - - #generate new GT - gt = str(fracs.index(m) + 1) # REF == 0 -> ++1 + + # generate new GT + gt = str(fracs.index(m) + 1) # REF == 0 -> ++1 gt = gt + "/" + gt - - #find GT position + + # find GT position gt_pos = fields[8].split(":").index(gt_tag) - - #replacing GT info (considering line eventual breaks at end) - if gt_pos == len(cols)-1: + + # replacing GT info (considering line eventual breaks at end) + if gt_pos == len(cols) - 1: cols[gt_pos] += "\n" fields[9] = ":".join(cols) outhandle.write("\t".join(fields)) if out_gz: bgzip_outname(intermediate, out_fname) - + + def bgzip_outname(_file, outfile=None): if outfile is not None: with open(outfile, "wb") as out_fh: - with subprocess.Popen(["bgzip", _file, "-c"], - stdout=out_fh) as bg_proc: - ret = bg_proc.wait() - os.remove(_file) + with subprocess.Popen(["bgzip", _file, "-c"], stdout=out_fh) as bg_proc: + ret = bg_proc.wait() + os.remove(_file) else: - with subprocess.Popen(["bgzip", _file], - stderr=subprocess.PIPE) as bg_proc: + with subprocess.Popen(["bgzip", _file], stderr=subprocess.PIPE) as bg_proc: out, err = bg_proc.communicate() - ret = bg_proc.wait() - + ret = bg_proc.wait() + + def main(CMD=None): args = parse_args(CMD) process(args.vcf, args.o, args.vf, args.ao, args.dp, args.gt, args.gz) + if __name__ == "__main__": main()