From ab4e8563b01fe856809252ba09ecd8158c0a8cea Mon Sep 17 00:00:00 2001
From: "Lataretu, Marie" <LataretuM@rki.de>
Date: Wed, 11 Dec 2024 16:56:12 +0100
Subject: [PATCH] formatting and change error to warning

- the minimal variant fraction threshold was limited to >0.5 -> changed to the error to a warning
---
 .gitignore       |   9 ++-
 bin/adjust_gt.py | 153 +++++++++++++++++++++++++++++++----------------
 2 files changed, 107 insertions(+), 55 deletions(-)

diff --git a/.gitignore b/.gitignore
index ffae102..e5f2cf0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
-work/
+work*/
 results*/
 conda/
 .nextflow/
-.nextflow.log*
-nextflow-autodownload-databases/
\ No newline at end of file
+*.nextflow.log*
+nextflow-autodownload-databases/
+tests_*/
+tmp/
+settings.json
\ No newline at end of file
diff --git a/bin/adjust_gt.py b/bin/adjust_gt.py
index 145ebf9..1600efd 100755
--- a/bin/adjust_gt.py
+++ b/bin/adjust_gt.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-#author: Stephan Fuchs (Robert Koch Institute, MF-1, fuchss@rki.de)
+# author: Stephan Fuchs (Robert Koch Institute, MF-1, fuchss@rki.de)
 
 VERSION = "0.0.9"
 import os
@@ -9,19 +9,51 @@
 import re
 import sys
 import gzip
+import warnings
+
 
 def parse_args(CMD=None):
-    parser = argparse.ArgumentParser(prog="rename_in_gff3.py", description="changes genotype in VCFs", )
-    parser.add_argument('vcf', metavar="FILE", help="vcf file", type=str)
-    parser.add_argument('--ao', metavar="STR", help="tag for read count supporting the respective variant (default: AO)", type=str, default="AO")
-    parser.add_argument('--dp', metavar="STR", help="tag for total read count at the repsective position (default: DP)", type=str, default="DP")
-    parser.add_argument('--gt', metavar="STR", help="tag for genotype (default: GT)", type=str, default="GT")
-    parser.add_argument('-o', help="output file (will be overwritten!)", type=str, required=True)
-    parser.add_argument('--gz', help="bgzip compressed input", action="store_true")
-    parser.add_argument('--vf', metavar="FLOAT", help="minimal variant fraction to set a homogeneous genotype (default: 0.9)", type=float, default=0.9)
-    parser.add_argument('--version', action='version', version='%(prog)s ' + VERSION)
+    parser = argparse.ArgumentParser(
+        prog="rename_in_gff3.py",
+        description="changes genotype in VCFs",
+    )
+    parser.add_argument("vcf", metavar="FILE", help="vcf file", type=str)
+    parser.add_argument(
+        "--ao",
+        metavar="STR",
+        help="tag for read count supporting the respective variant (default: AO)",
+        type=str,
+        default="AO",
+    )
+    parser.add_argument(
+        "--dp",
+        metavar="STR",
+        help="tag for total read count at the repsective position (default: DP)",
+        type=str,
+        default="DP",
+    )
+    parser.add_argument(
+        "--gt",
+        metavar="STR",
+        help="tag for genotype (default: GT)",
+        type=str,
+        default="GT",
+    )
+    parser.add_argument(
+        "-o", help="output file (will be overwritten!)", type=str, required=True
+    )
+    parser.add_argument("--gz", help="bgzip compressed input", action="store_true")
+    parser.add_argument(
+        "--vf",
+        metavar="FLOAT",
+        help="minimal variant fraction to set a homogeneous genotype (default: 0.9)",
+        type=float,
+        default=0.9,
+    )
+    parser.add_argument("--version", action="version", version="%(prog)s " + VERSION)
     return parser.parse_args(CMD)
 
+
 # open file handles considering compression state
 def get_filehandle(in_fname, gz):
     if not gz:
@@ -29,91 +61,108 @@ def get_filehandle(in_fname, gz):
     else:
         inhandle = gzip.open(in_fname, "rt")
     return inhandle
-                
-def process(in_fname, out_fname, min_vf, ao_tag="AO", dp_tag="DP", gt_tag="GT", gz=False):
-    #sanity checks
+
+
+def process(
+    in_fname, out_fname, min_vf, ao_tag="AO", dp_tag="DP", gt_tag="GT", gz=False
+):
+    # sanity checks
     if min_vf <= 0.5:
-        sys.exit("error: min_vf has to be greater than 0.5")
+        warnings.warn(
+            f"[WARNING] Minimal variant fraction to set a homogeneous genotype (--vf) is below 0.5 ({min_vf}). Assuming you know what you are doing."
+        )
     out_gz = out_fname.endswith(".gz")
-    intermediate = re.sub("\.gz$","", out_fname)
+    intermediate = re.sub("\.gz$", "", out_fname)
 
-    #regex generation
+    # regex generation
     ao_pattern = re.compile(r"(?:^|\t|;)" + re.escape(ao_tag) + "=([0-9,]+)(?:$|\t|;)")
     dp_pattern = re.compile(r"(?:^|\t|;)" + re.escape(dp_tag) + "=([0-9]+)(?:$|\t|;)")
-    gt_pattern = re.compile(r"(?:^|\t|;)" + re.escape(gt_tag) + "=([0-9]+/[0-9]+)(?:$|\t|;)")
+    gt_pattern = re.compile(
+        r"(?:^|\t|;)" + re.escape(gt_tag) + "=([0-9]+/[0-9]+)(?:$|\t|;)"
+    )
 
-    with get_filehandle(in_fname, gz) as inhandle: 
-        with open(intermediate, "w") as outhandle: 
+    with get_filehandle(in_fname, gz) as inhandle:
+        with open(intermediate, "w") as outhandle:
             for l, line in enumerate(inhandle):
-       
-                #skip empty or comment lines
+                # skip empty or comment lines
                 if len(line.strip()) == 0 or line.startswith("#"):
                     outhandle.write(line)
                     continue
 
                 fields = line.split("\t")
-                
-                #find GT position
+
+                # find GT position
                 gt_pos = fields[8].split(":").index(gt_tag)
-                
-                #replacing GT info (considering line eventual breaks at end)
+
+                # replacing GT info (considering line eventual breaks at end)
                 cols = fields[9].split(":")
 
                 # check line for homo/herterozygot
-                gt1, gt2 = cols[gt_pos].split('/')
+                gt1, gt2 = cols[gt_pos].split("/")
                 if gt1 == gt2:
                     # homozygot
                     outhandle.write(line)
                     continue
-                
-                #find ao and dp
+
+                # find ao and dp
                 ao = ao_pattern.findall(fields[7])
                 dp = dp_pattern.findall(fields[7])
-                
+
                 if len(ao) > 1:
-                    sys.exit("error: multiple occurences of " + ao_tag + " tag in line " + str(l+1))
+                    sys.exit(
+                        "error: multiple occurences of "
+                        + ao_tag
+                        + " tag in line "
+                        + str(l + 1)
+                    )
                 if len(dp) > 1:
-                    sys.exit("error: multiple occurences of " + dp_tag + " tag in line " + str(l+1))
-                
-                #calc fractions and check threshold
-                fracs = [int(x)/int(dp[0]) for x in ao[0].split(",")]
+                    sys.exit(
+                        "error: multiple occurences of "
+                        + dp_tag
+                        + " tag in line "
+                        + str(l + 1)
+                    )
+
+                # calc fractions and check threshold
+                fracs = [int(x) / int(dp[0]) for x in ao[0].split(",")]
                 m = max(fracs)
-                
+
                 if m < min_vf:
                     outhandle.write(line)
                     continue
-                
-                #generate new GT
-                gt = str(fracs.index(m) + 1) # REF == 0 -> ++1
+
+                # generate new GT
+                gt = str(fracs.index(m) + 1)  # REF == 0 -> ++1
                 gt = gt + "/" + gt
-                
-                #find GT position
+
+                # find GT position
                 gt_pos = fields[8].split(":").index(gt_tag)
-                
-                #replacing GT info (considering line eventual breaks at end)
-                if gt_pos == len(cols)-1:
+
+                # replacing GT info (considering line eventual breaks at end)
+                if gt_pos == len(cols) - 1:
                     cols[gt_pos] += "\n"
                 fields[9] = ":".join(cols)
                 outhandle.write("\t".join(fields))
         if out_gz:
             bgzip_outname(intermediate, out_fname)
-        
+
+
 def bgzip_outname(_file, outfile=None):
     if outfile is not None:
         with open(outfile, "wb") as out_fh:
-            with subprocess.Popen(["bgzip", _file, "-c"], 
-                                   stdout=out_fh) as bg_proc:
-                ret = bg_proc.wait() 
-        os.remove(_file)  
+            with subprocess.Popen(["bgzip", _file, "-c"], stdout=out_fh) as bg_proc:
+                ret = bg_proc.wait()
+        os.remove(_file)
     else:
-        with subprocess.Popen(["bgzip", _file], 
-                               stderr=subprocess.PIPE) as bg_proc:
+        with subprocess.Popen(["bgzip", _file], stderr=subprocess.PIPE) as bg_proc:
             out, err = bg_proc.communicate()
-            ret = bg_proc.wait() 
-                
+            ret = bg_proc.wait()
+
+
 def main(CMD=None):
     args = parse_args(CMD)
     process(args.vcf, args.o, args.vf, args.ao, args.dp, args.gt, args.gz)
 
+
 if __name__ == "__main__":
     main()