From 3e4881bd49e3458ad37700e5e2b0c70ec6045d08 Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Wed, 5 Nov 2025 10:37:19 -0500 Subject: [PATCH 1/6] Handle edge case where both precision and recall are 0 --- minda/stats.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/minda/stats.py b/minda/stats.py index ffcede6..dc6905e 100644 --- a/minda/stats.py +++ b/minda/stats.py @@ -67,7 +67,11 @@ def _get_stats_df(tp_df, fn_df, fp_df, paired_df, base_df, caller_name, max_len, if tp+fn == 0: sys.exit(f"{caller_name} has no TP or FN records. Please double check input files.") recall = tp/(tp+fn) - f1 = (2*precision*recall)/(precision+recall) + # Handle edge case where both precision and recall are 0 + if precision + recall == 0: + f1 = 0.0 + else: + f1 = (2 * precision * recall) / (precision + recall) caller_len = len(paired_df) base_len = len(base_df) From 80cb5cc5fdb44ae721d837000367a079e5b5ed27 Mon Sep 17 00:00:00 2001 From: Asher Preska Steinberg <63111464+apsteinberg@users.noreply.github.com> Date: Tue, 6 Jan 2026 13:20:08 -0500 Subject: [PATCH 2/6] Incorporate breakend orientation + fix for compressed vcfs (#1) * updates to allow for strand info * incorporate strand information into alt field * add in CHR2 and END fields to info * in progress strand info * strand inference + tests for strand inference * fix to handle compressed vcfs --- minda/decompose.py | 11 ++--- minda/ensemble.py | 102 +++++++++++++++++++++++++++++------------ tests/__init__.py | 1 + tests/test_ensemble.py | 97 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 35 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_ensemble.py diff --git a/minda/decompose.py b/minda/decompose.py index ea952e7..a1740f6 100644 --- a/minda/decompose.py +++ b/minda/decompose.py @@ -65,10 +65,10 @@ def get_df(vcf): """ is_vcf_gz = _is_vcf_gz(vcf) if is_vcf_gz == False: - df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'}) + df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,3,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'}) else: - df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'}) - df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] + df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,3,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'}) + df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO'] return df @@ -81,8 +81,8 @@ def get_intersected_df(vcf, bed): bed_to_bt = BedTool(bed) vcf_to_bt = BedTool(vcf) intersect_obj = vcf_to_bt.intersect(bed_to_bt, u=True) - df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'}) - df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] + df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,3,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'}) + df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO'] return df @@ -359,7 +359,6 @@ def get_decomposed_dfs(caller_name, df, filter, min_size, prefixed, vaf, sample_ df['VAF'] = df.INFO.str.extract(r';VAF=([\d.]+)')[0].astype('float').to_list() if df.VAF.isnull().all() == True: sys.exit(f"No VAF values found in {caller_name} VCF. Run Minda without --vaf parameter or add VAF to INFO. 
") - # get indices of mate rows df = _get_alt_mate_index(df) diff --git a/minda/ensemble.py b/minda/ensemble.py index ce44c3e..c2ec462 100644 --- a/minda/ensemble.py +++ b/minda/ensemble.py @@ -1,10 +1,44 @@ import sys +from ast import parse from collections import Counter from datetime import datetime -import pandas as pd +import pandas as pd import numpy as np import re +import gzip +from minda.decompose import _is_vcf_gz +def parse_bnd_alt(alt_string): + ''' + Parse the BND alt string and return separators and region + adapted from svtools package + ''' + # NOTE The below is ugly but intended to match things like [2:222[ and capture the brackets + result = re.findall(r'([][])(.+?)([][])', alt_string) + assert result, "%s\n" % alt_string + sep1, _ , sep2 = result[0] + assert sep1 == sep2 + return sep1 + +def _infer_strands(svtype, alt): + """ + infer SV strands from ALT + """ + valid_svtypes = {"DEL", "INS", "DUP", "INV", "BND"} + assert svtype in valid_svtypes, f"invalid svtype: {svtype}. must be one of {valid_svtypes}" + + orientation1 = orientation2 = "+" + if svtype in ("DEL", "INS") or alt in ("", ""): + orientation2 = "-" + elif svtype == "DUP" or "DUP" in alt: + orientation1 = "-" + else: + sep = parse_bnd_alt(alt) + if alt.startswith(sep): + orientation1 = "-" + if sep == "[": + orientation2 = "-" + return orientation1+orientation2 def _add_columns(ensemble_df, vaf): # create a column of list of prefixed IDs for each locus group @@ -72,7 +106,7 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, # create stat dfs start_dfs_list = [] start_dfs = pd.concat(dfs_1).reset_index(drop=True) - start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN']].sort_values(['#CHROM', 'POS']) + start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN', 'REF', 'ALT']].sort_values(['#CHROM', 'POS']) start_dfs['diff_x'] = start_dfs.groupby('#CHROM').POS.diff().fillna(9999) diffs = start_dfs['diff_x'].to_list() @@ -98,17 +132,11 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, #ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'SVLEN','Minda_ID']) ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'Minda_ID']) - ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ - 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ - 'ID_y', ]] ensemble_df = ensemble_df.sort_values(['locus_group_x','#CHROM_y', 'POS_y']) ensemble_df ['diff_y'] = ensemble_df.groupby(['locus_group_x','#CHROM_y']).POS_y.diff().abs().fillna(9999) diffs = ensemble_df['diff_y'].to_list() caller_names = ensemble_df['Minda_ID'].apply(lambda x: x.rsplit('_', 1)[0]).tolist() ensemble_df['caller_names']= caller_names - ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ - 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ - 'ID_y','diff_y','caller_names' ]] # group end loci locus_callers = [] @@ -195,17 +223,15 @@ def _get_ensemble_call_column(support_df, conditions): #support_df['ensemble'] = mask support_df.insert(loc=12, column='ensemble', value=mask) return support_df - -def _replace_value(row): - if row['ALT'] == '': - return f"N]{row['#CHROM_y']}:{row['POS_y']}]" - else: - return row['ALT'] - + def _get_contigs(vcf_list): contig_dict = {} for vcf in vcf_list: - with open(vcf, 'r') as file: + is_vcf_gz = _is_vcf_gz(vcf) + open_func = gzip.open if is_vcf_gz else open + mode = 'rt' if is_vcf_gz else 'r' + + with open_func(vcf, mode) as file: for line in file: if 
not line.startswith("##"): break @@ -227,34 +253,43 @@ def _get_contigs(vcf_list): def _get_ensemble_vcf(vcf_list, support_df, out_dir, sample_name, args, vaf, version): vcf_df = support_df[support_df['ensemble'] == True].reset_index(drop=True).copy() vcf_df['ID'] = f'Minda_' + (vcf_df.index + 1).astype(str) - vcf_df['REF'] = "N" - vcf_df['ALT'] = ["<" + svtype +">" for svtype in vcf_df['SVTYPE']] - vcf_df['ALT'] = vcf_df.apply(_replace_value, axis=1) vcf_df['QUAL'] = "." vcf_df['FILTER'] = "PASS" if vaf != None: vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \ + ';CHR2=' + str(chr2) + ';END=' + str(end) + \ + ';STRANDS=' + strands + \ ';SUPP_VEC=' + ','.join(map(str, supp_vec)) + ';VAF=' + str(vaf) \ - for svlen, svtype, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'], vcf_df['VAF'])] - vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) + for svlen, svtype, chr2, end, strands, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['#CHROM_y'], vcf_df['POS_y'], vcf_df['STRANDS'], vcf_df['ID_list_y'], vcf_df['VAF'])] + vcf_df = (vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF_x', 'ALT_x', 'QUAL', 'FILTER','INFO']] + .rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS", "REF_x":"REF", "ALT_x": "ALT"})) else: - vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + ';SUPP_VEC=' + ','.join(map(str, supp_vec)) \ - for svlen, svtype, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'])] - vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) + vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \ + ';CHR2=' + str(chr2) + ';END=' + str(end) + \ + ';STRANDS=' + strands + \ + ';SUPP_VEC=' + ','.join(map(str, supp_vec)) \ + for svlen, svtype, chr2, end, strands, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['#CHROM_y'], vcf_df['POS_y'], vcf_df['STRANDS'], vcf_df['ID_list_y'])] + vcf_df = (vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF_x', 'ALT_x', 'QUAL', 'FILTER','INFO']] + .rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS", "REF_x": "REF", "ALT_x": "ALT"})) date = datetime.today().strftime('%Y-%m-%d') with open(f'{out_dir}/{sample_name}_minda_ensemble.vcf', 'w') as file: file.write(f'##fileformat=VCFv4.2\n##fileDate={date}\n##source=MindaV{version}\n') + command_str = " ".join(sys.argv) + file.write(f"##CommandLine= {command_str}\n") contig_dict = _get_contigs(vcf_list) for key, value in contig_dict.items(): file.write(f'##contig=\n') file.write('##ALT=\n##ALT=\n##ALT=\n##ALT=\n') file.write('##FILTER=\n') - file.write('##INFO=\n##INFO=\n##INFO=\n') + file.write('##INFO=\n' + '##INFO=\n' + '##INFO=\n' + '##INFO=\n' + '##INFO=\n' + '##INFO=\n') if vaf != None: file.write('##INFO=\n') - command_str = " ".join(sys.argv) - file.write(f"##cmd: {command_str}\n") vcf_df.to_csv(file, sep="\t", index=False) @@ -274,9 +309,18 @@ def get_support_df(vcf_list, decomposed_dfs_list, caller_names, tolerance, condi call_boolean = any(value.startswith(caller_name) for value in intersect_list) caller_column.append(call_boolean) ensemble_df[f'{caller_name}'] = caller_column + # add in a column for strands + strands = [] + for row in ensemble_df.itertuples(): + alt = row.ALT_x + svtype = row.SVTYPE + strands1 = _infer_strands(svtype, alt) + strands.append(strands1) + ensemble_df["STRANDS"] = strands - column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', 
\ - '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', \ + column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', + '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', + 'REF_x', 'REF_y', 'ALT_x', 'ALT_y', 'STRANDS', 'SVTYPE', 'SVLEN', 'VAF', 'Minda_ID_list_y'] + caller_names support_df = ensemble_df[column_names].rename(columns={"Minda_ID_list_y": "Minda_IDs"}).copy() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..050d85c --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests for minda package diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py new file mode 100644 index 0000000..56399dc --- /dev/null +++ b/tests/test_ensemble.py @@ -0,0 +1,97 @@ +import pytest +from minda.ensemble import _infer_strands, parse_bnd_alt + + +class TestParseBndAlt: + """Test BND ALT string parsing""" + + def test_forward_bracket(self): + """Test parsing forward bracket notation""" + assert parse_bnd_alt("N[chr2:100[") == "[" + + def test_reverse_bracket(self): + """Test parsing reverse bracket notation""" + assert parse_bnd_alt("N]chr2:100]") == "]" + + def test_bracket_at_start(self): + """Test parsing when bracket is at start""" + assert parse_bnd_alt("[chr2:100[N") == "[" + assert parse_bnd_alt("]chr2:100]N") == "]" + + def test_invalid_alt_raises_assertion(self): + """Test that invalid ALT strings raise assertion error""" + with pytest.raises(AssertionError): + parse_bnd_alt("") + with pytest.raises(AssertionError): + parse_bnd_alt("N") + + +class TestInferStrands: + """Test strand inference from ALT field""" + + # DEL tests + def test_del_simple_notation(self): + """DEL with simple notation should be +-""" + assert _infer_strands("DEL", "N") == "+-" + + def test_del_bnd_notation(self): + """DEL with BND notation should be +-""" + assert _infer_strands("DEL", "N[chr2:100[") == "+-" + # INS tests + def test_ins_simple_notation(self): + """INS with simple notation should be +-""" + assert _infer_strands("INS", "N") == "+-" + + def test_ins_bnd_notation(self): + """INS with BND notation should be +-""" + assert _infer_strands("INS", "N]chr2:100]") == "+-" + + # INV tests - various BND notations + def test_inv_reverse_reverse_bracket_before(self): + """INV: [chr2:100[N should give -- (reverse-reverse)""" + assert _infer_strands("INV", "[chr2:100[N") == "--" + + def test_inv_forward_forward_bracket_after(self): + """INV: N]chr2:100] should give ++ (forward-forward)""" + assert _infer_strands("INV", "N]chr2:100]") == "++" + + # DUP tests + def test_dup_simple_notation(self): + """DUP with simple notation should be -+""" + assert _infer_strands("DUP", "") == "-+" + + def test_dup_bnd_notation(self): + """DUP with BND notation should be -+""" + assert _infer_strands("DUP", "N[chr2:100[") == "-+" + + # BND tests - all four orientations + def test_bnd_forward_reverse(self): + """BND: N[chr2:100[ should give +- (forward-reverse)""" + assert _infer_strands("BND", "N[chr2:100[") == "+-" + + def test_bnd_forward_forward(self): + """BND: N]chr2:100] should give ++ (forward-forward)""" + assert _infer_strands("BND", "N]chr2:100]") == "++" + + def test_bnd_reverse_reverse(self): + """BND: [chr2:100[N should give -- (reverse-reverse)""" + assert _infer_strands("BND", "[chr2:100[N") == "--" + + def test_bnd_reverse_forward(self): + """BND: ]chr2:100]N should give -+ (reverse-forward)""" + assert _infer_strands("BND", "]chr2:100]N") == "-+" + + +class TestStrandInferenceValidation: + """Test input validation""" + + def test_invalid_svtype_raises_error(self): 
+ """Test that invalid SVTYPE raises AssertionError""" + with pytest.raises(AssertionError, match="invalid svtype"): + _infer_strands("INVALID", "N") + + with pytest.raises(AssertionError, match="invalid svtype"): + _infer_strands("del", "N") # lowercase + + with pytest.raises(AssertionError, match="invalid svtype"): + _infer_strands("", "N") From 3a09ad3e7eb91fe3ee4a138ef4ff63be0aa8069f Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Tue, 6 Jan 2026 15:29:35 -0500 Subject: [PATCH 3/6] Dockerfile for minda --- Dockerfile | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..26d82b5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +# Use Python base image +FROM python:3.10-slim + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y git build-essential python3-dev && \ + rm -rf /var/lib/apt/lists/* + +# Install uv +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +# Clone the Minda repository +RUN git clone https://github.com/shahcompbio/minda.git && \ + cd minda && git checkout 80cb5cc && \ + git rev-parse --short HEAD > /opt/minda_version.txt && \ + git rev-parse HEAD > /opt/minda_version_full.txt + +# Set working directory +WORKDIR /minda + +# Install Python dependencies using uv +RUN uv pip install --system pandas>=2.1.1 && \ + uv pip install --system numpy>=1.26.0 && \ + uv pip install --system pybedtools>=0.9.1 && \ + uv pip install --system intervaltree \ No newline at end of file From c57c7d1032348f3478cf49060ab3e93f8dd11f47 Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Wed, 7 Jan 2026 10:38:51 -0500 Subject: [PATCH 4/6] fix for Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 26d82b5..5d31039 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.10-slim # Install system dependencies RUN apt-get update && \ - apt-get install -y git build-essential python3-dev && \ + apt-get install -y git build-essential python3-dev bedtools && \ rm -rf /var/lib/apt/lists/* # Install uv From 8b6d81c8b21ba2c7e9acb8f923276abae6c743bd Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Wed, 7 Jan 2026 18:03:51 -0500 Subject: [PATCH 5/6] pull strand info from info field if present; return ".." 
if STRANDS cannot be inferred; updated tests --- minda/ensemble.py | 76 ++++++++++++------- tests/test_ensemble.py | 161 ++++++++++++++++++++++++++--------------- 2 files changed, 155 insertions(+), 82 deletions(-) diff --git a/minda/ensemble.py b/minda/ensemble.py index c2ec462..84f74e6 100644 --- a/minda/ensemble.py +++ b/minda/ensemble.py @@ -8,37 +8,58 @@ import gzip from minda.decompose import _is_vcf_gz -def parse_bnd_alt(alt_string): +def _get_strands_from_info(info): ''' - Parse the BND alt string and return separators and region - adapted from svtools package + Parse info to get strand info ''' + terms = info.split(";") + strands = None + for term in terms: + if term.startswith("STRANDS"): + strands_info = term + _, strands = strands_info.split("=") + break + return strands + +def _get_strands_from_alt(alt_string): + ''' + Parse alt_string to get strand info (adapted from svtools) + :param alt_string: ALT field from vcf + return strands + ''' + orientation1 = orientation2 = "+" # NOTE The below is ugly but intended to match things like [2:222[ and capture the brackets result = re.findall(r'([][])(.+?)([][])', alt_string) - assert result, "%s\n" % alt_string - sep1, _ , sep2 = result[0] - assert sep1 == sep2 - return sep1 + # if we couldn't parse return blank strands + if not result: + return ".." + else: + sep1, _, sep2 = result[0] + # handle different scenarios + if sep1 != sep2: + # if true we did not parse correctly + return ".." + if alt_string.startswith(sep1): + orientation1 = "-" + if sep1 == "[": + orientation2 = "-" + return orientation1+orientation2 -def _infer_strands(svtype, alt): +def _infer_strands(svtype, alt, info): """ infer SV strands from ALT """ - valid_svtypes = {"DEL", "INS", "DUP", "INV", "BND"} - assert svtype in valid_svtypes, f"invalid svtype: {svtype}. must be one of {valid_svtypes}" - - orientation1 = orientation2 = "+" + valid_strands = ["++", "--", "+-", "-+"] if svtype in ("DEL", "INS") or alt in ("", ""): - orientation2 = "-" - elif svtype == "DUP" or "DUP" in alt: - orientation1 = "-" + strands = "+-" + elif svtype == "DUP" or alt == "": + strands = "-+" else: - sep = parse_bnd_alt(alt) - if alt.startswith(sep): - orientation1 = "-" - if sep == "[": - orientation2 = "-" - return orientation1+orientation2 + strands = _get_strands_from_info(info) + # infer from alt field if we don't have valid strand info + if strands not in valid_strands: + strands = _get_strands_from_alt(alt) + return strands def _add_columns(ensemble_df, vaf): # create a column of list of prefixed IDs for each locus group @@ -98,11 +119,11 @@ def _add_columns(ensemble_df, vaf): def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, sample_name, args, multimatch): - + dfs_1 = [dfs_list[0] for dfs_list in decomposed_dfs_list] dfs_2 = [dfs_list[1] for dfs_list in decomposed_dfs_list] dfs_list = [dfs_1, dfs_2] - + # create stat dfs start_dfs_list = [] start_dfs = pd.concat(dfs_1).reset_index(drop=True) @@ -178,7 +199,7 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, ensemble_df['VAF'] = np.nan ensemble_df = ensemble_df.drop_duplicates(['locus_group_x', 'locus_group_y']).reset_index(drop=True) - + return ensemble_df @@ -309,12 +330,17 @@ def get_support_df(vcf_list, decomposed_dfs_list, caller_names, tolerance, condi call_boolean = any(value.startswith(caller_name) for value in intersect_list) caller_column.append(call_boolean) ensemble_df[f'{caller_name}'] = caller_column + # Remove artifact entries with NaN ALT_x values. 
These can occur when BND mate records + # are reindexed during decomposition, creating index collisions with info_df records. + # The actual BND events are still properly represented by their working mate records. + ensemble_df = ensemble_df[ensemble_df["ALT_x"].notna()].copy() # add in a column for strands strands = [] for row in ensemble_df.itertuples(): alt = row.ALT_x svtype = row.SVTYPE - strands1 = _infer_strands(svtype, alt) + info = row.INFO_x + strands1 = _infer_strands(svtype, alt, info) strands.append(strands1) ensemble_df["STRANDS"] = strands diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py index 56399dc..3a24a78 100644 --- a/tests/test_ensemble.py +++ b/tests/test_ensemble.py @@ -1,97 +1,144 @@ import pytest -from minda.ensemble import _infer_strands, parse_bnd_alt +from minda.ensemble import _infer_strands, _get_strands_from_alt, _get_strands_from_info -class TestParseBndAlt: - """Test BND ALT string parsing""" +class TestGetStrandsFromInfo: + """Test STRANDS parsing from INFO field""" + + def test_strands_in_info(self): + """Test parsing STRANDS from INFO field""" + assert _get_strands_from_info("SVTYPE=DEL;STRANDS=+-;SVLEN=100") == "+-" + assert _get_strands_from_info("SVTYPE=BND;STRANDS=++;END=500") == "++" + assert _get_strands_from_info("STRANDS=--") == "--" + assert _get_strands_from_info("SVLEN=200;STRANDS=-+;CHR2=chr2") == "-+" + + def test_strands_not_in_info(self): + """Test when STRANDS is not in INFO field""" + assert _get_strands_from_info("SVTYPE=DEL;SVLEN=100") is None + assert _get_strands_from_info("SVTYPE=INS") is None + assert _get_strands_from_info("") is None + + def test_strands_at_different_positions(self): + """Test STRANDS at beginning, middle, and end of INFO""" + assert _get_strands_from_info("STRANDS=+-;SVTYPE=DEL;SVLEN=100") == "+-" + assert _get_strands_from_info("SVTYPE=DEL;STRANDS=++;SVLEN=100") == "++" + assert _get_strands_from_info("SVTYPE=DEL;SVLEN=100;STRANDS=--") == "--" + + +class TestGetStrandsFromAlt: + """Test strand inference from BND ALT field""" def test_forward_bracket(self): - """Test parsing forward bracket notation""" - assert parse_bnd_alt("N[chr2:100[") == "[" + """Test parsing forward bracket notation: N[chr2:100[ gives +-""" + assert _get_strands_from_alt("N[chr2:100[") == "+-" def test_reverse_bracket(self): - """Test parsing reverse bracket notation""" - assert parse_bnd_alt("N]chr2:100]") == "]" + """Test parsing reverse bracket notation: N]chr2:100] gives ++""" + assert _get_strands_from_alt("N]chr2:100]") == "++" + + def test_bracket_at_start_forward(self): + """Test parsing when forward bracket is at start: [chr2:100[N gives --""" + assert _get_strands_from_alt("[chr2:100[N") == "--" - def test_bracket_at_start(self): - """Test parsing when bracket is at start""" - assert parse_bnd_alt("[chr2:100[N") == "[" - assert parse_bnd_alt("]chr2:100]N") == "]" + def test_bracket_at_start_reverse(self): + """Test parsing when reverse bracket is at start: ]chr2:100]N gives -+""" + assert _get_strands_from_alt("]chr2:100]N") == "-+" - def test_invalid_alt_raises_assertion(self): - """Test that invalid ALT strings raise assertion error""" - with pytest.raises(AssertionError): - parse_bnd_alt("") - with pytest.raises(AssertionError): - parse_bnd_alt("N") + def test_invalid_alt_returns_dots(self): + """Test that invalid ALT strings return '..'""" + assert _get_strands_from_alt("") == ".." + assert _get_strands_from_alt("N") == ".." + assert _get_strands_from_alt("") == ".." 
+ + def test_mismatched_brackets_returns_dots(self): + """Test that mismatched brackets return '..'""" + assert _get_strands_from_alt("N[chr2:100]") == ".." + assert _get_strands_from_alt("N]chr2:100[") == ".." class TestInferStrands: - """Test strand inference from ALT field""" + """Test strand inference with fallback logic""" # DEL tests def test_del_simple_notation(self): """DEL with simple notation should be +-""" - assert _infer_strands("DEL", "N") == "+-" + assert _infer_strands("DEL", "N", "SVTYPE=DEL;SVLEN=100") == "+-" + + def test_del_symbolic_alt(self): + """DEL with symbolic ALT should be +-""" + assert _infer_strands("DEL", "", "SVTYPE=DEL;SVLEN=100") == "+-" - def test_del_bnd_notation(self): - """DEL with BND notation should be +-""" - assert _infer_strands("DEL", "N[chr2:100[") == "+-" # INS tests def test_ins_simple_notation(self): """INS with simple notation should be +-""" - assert _infer_strands("INS", "N") == "+-" - - def test_ins_bnd_notation(self): - """INS with BND notation should be +-""" - assert _infer_strands("INS", "N]chr2:100]") == "+-" + assert _infer_strands("INS", "N", "SVTYPE=INS;SVLEN=100") == "+-" - # INV tests - various BND notations - def test_inv_reverse_reverse_bracket_before(self): - """INV: [chr2:100[N should give -- (reverse-reverse)""" - assert _infer_strands("INV", "[chr2:100[N") == "--" - - def test_inv_forward_forward_bracket_after(self): - """INV: N]chr2:100] should give ++ (forward-forward)""" - assert _infer_strands("INV", "N]chr2:100]") == "++" + def test_ins_symbolic_alt(self): + """INS with symbolic ALT should be +-""" + assert _infer_strands("INS", "", "SVTYPE=INS;SVLEN=100") == "+-" # DUP tests def test_dup_simple_notation(self): """DUP with simple notation should be -+""" - assert _infer_strands("DUP", "") == "-+" + assert _infer_strands("DUP", "", "SVTYPE=DUP;SVLEN=100") == "-+" - def test_dup_bnd_notation(self): - """DUP with BND notation should be -+""" - assert _infer_strands("DUP", "N[chr2:100[") == "-+" + def test_dup_alt_contains_dup(self): + """DUP in ALT string should be -+""" + assert _infer_strands("DUP", "N", "SVTYPE=DUP;SVLEN=100") == "-+" - # BND tests - all four orientations + # BND tests - all four orientations inferred from ALT def test_bnd_forward_reverse(self): """BND: N[chr2:100[ should give +- (forward-reverse)""" - assert _infer_strands("BND", "N[chr2:100[") == "+-" + assert _infer_strands("BND", "N[chr2:100[", "SVTYPE=BND;CHR2=chr2") == "+-" def test_bnd_forward_forward(self): """BND: N]chr2:100] should give ++ (forward-forward)""" - assert _infer_strands("BND", "N]chr2:100]") == "++" + assert _infer_strands("BND", "N]chr2:100]", "SVTYPE=BND;CHR2=chr2") == "++" def test_bnd_reverse_reverse(self): """BND: [chr2:100[N should give -- (reverse-reverse)""" - assert _infer_strands("BND", "[chr2:100[N") == "--" + assert _infer_strands("BND", "[chr2:100[N", "SVTYPE=BND;CHR2=chr2") == "--" def test_bnd_reverse_forward(self): """BND: ]chr2:100]N should give -+ (reverse-forward)""" - assert _infer_strands("BND", "]chr2:100]N") == "-+" - - -class TestStrandInferenceValidation: - """Test input validation""" - - def test_invalid_svtype_raises_error(self): - """Test that invalid SVTYPE raises AssertionError""" - with pytest.raises(AssertionError, match="invalid svtype"): - _infer_strands("INVALID", "N") + assert _infer_strands("BND", "]chr2:100]N", "SVTYPE=BND;CHR2=chr2") == "-+" - with pytest.raises(AssertionError, match="invalid svtype"): - _infer_strands("del", "N") # lowercase + # INV tests - various BND notations + def 
test_inv_reverse_reverse_bracket_before(self): + """INV: [chr2:100[N should give -- (reverse-reverse)""" + assert _infer_strands("INV", "[chr2:100[N", "SVTYPE=INV;SVLEN=100") == "--" - with pytest.raises(AssertionError, match="invalid svtype"): - _infer_strands("", "N") + def test_inv_forward_forward_bracket_after(self): + """INV: N]chr2:100] should give ++ (forward-forward)""" + assert _infer_strands("INV", "N]chr2:100]", "SVTYPE=INV;SVLEN=100") == "++" + + +class TestStrandInferenceFallback: + """Test fallback behavior when INFO field has valid STRANDS""" + + def test_info_field_takes_precedence(self): + """When INFO has valid STRANDS, use it for non-DEL/INS/DUP types""" + # For BND/INV, INFO field takes precedence over ALT parsing + # Even though BND ALT says +-, INFO says -- + assert _infer_strands("BND", "N[chr2:100[", "SVTYPE=BND;STRANDS=--") == "--" + # Even though INV ALT would give ++, INFO says +- + assert _infer_strands("INV", "N]chr2:100]", "SVTYPE=INV;STRANDS=+-") == "+-" + + def test_fallback_to_alt_when_info_invalid(self): + """When INFO has invalid STRANDS, fall back to ALT parsing""" + # INFO has invalid strand value, should parse ALT + assert _infer_strands("BND", "N]chr2:100]", "SVTYPE=BND;STRANDS=invalid") == "++" + assert _infer_strands("BND", "[chr2:100[N", "SVTYPE=BND;STRANDS=X") == "--" + + def test_fallback_to_alt_when_no_info_strands(self): + """When INFO has no STRANDS, fall back to ALT parsing""" + assert _infer_strands("BND", "N[chr2:100[", "SVTYPE=BND;SVLEN=100") == "+-" + assert _infer_strands("BND", "]chr2:100]N", "SVTYPE=BND") == "-+" + + def test_returns_dots_when_unparseable(self): + """When everything fails, return '..'""" + # Unknown SVTYPE, no INFO strands, unparseable ALT + assert _infer_strands("UNKNOWN", "", "SVTYPE=UNKNOWN") == ".." + assert _infer_strands("CTX", "N", "SVTYPE=CTX;SVLEN=100") == ".." + # Invalid ALT and no INFO strands + assert _infer_strands("BND", "malformed", "SVTYPE=BND") == ".." 
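
A minimal, self-contained sketch of the precedence this patch implements (an explicit STRANDS tag in INFO wins, BND bracket notation in ALT is the fallback, and ".." is returned when neither parses). It mirrors the patched _infer_strands / _get_strands_from_info / _get_strands_from_alt helpers but is not the module code itself; the name infer_strands_sketch and the example calls are illustrative only.

    import re

    def infer_strands_sketch(svtype, alt, info):
        # DEL/INS (including symbolic <DEL>/<INS> ALTs) are always +-; DUP is -+.
        if svtype in ("DEL", "INS") or alt in ("<DEL>", "<INS>"):
            return "+-"
        if svtype == "DUP" or alt == "<DUP>":
            return "-+"
        # 1) an explicit, valid STRANDS tag in INFO takes precedence
        for term in info.split(";"):
            if term.startswith("STRANDS="):
                strands = term.split("=", 1)[1]
                if strands in ("++", "--", "+-", "-+"):
                    return strands
        # 2) otherwise infer orientation from BND bracket notation in ALT
        result = re.findall(r'([][])(.+?)([][])', alt)
        if not result or result[0][0] != result[0][2]:
            return ".."          # unparseable or mismatched brackets
        sep = result[0][0]
        first = "-" if alt.startswith(sep) else "+"
        second = "-" if sep == "[" else "+"
        return first + second

    # e.g. infer_strands_sketch("BND", "N[chr2:100[", "SVTYPE=BND")  -> "+-"
    #      infer_strands_sketch("INV", "N]chr2:100]", "STRANDS=--")  -> "--"
    #      infer_strands_sketch("BND", "malformed",   "SVTYPE=BND")  -> ".."
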
From 366f0bbd7c9b5b2e993d8a426cf8d77213686ad2 Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Thu, 8 Jan 2026 13:33:23 -0500 Subject: [PATCH 6/6] updated Dockerfile --- Dockerfile | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5d31039..041505e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,25 +1,24 @@ # Use Python base image -FROM python:3.10-slim +FROM community.wave.seqera.io/library/uv:0.9.22--2326eebc25ee1806 # Install system dependencies RUN apt-get update && \ apt-get install -y git build-essential python3-dev bedtools && \ rm -rf /var/lib/apt/lists/* -# Install uv -COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv - # Clone the Minda repository -RUN git clone https://github.com/shahcompbio/minda.git && \ - cd minda && git checkout 80cb5cc && \ +RUN git clone https://github.com/shahcompbio/minda.git /opt/minda && \ + cd /opt/minda && git checkout 8b6d81c && \ git rev-parse --short HEAD > /opt/minda_version.txt && \ - git rev-parse HEAD > /opt/minda_version_full.txt + git rev-parse HEAD > /opt/minda_version_full.txt && \ + chmod +x minda.py && \ + ln -s /opt/minda/minda.py /usr/local/bin/minda # Set working directory -WORKDIR /minda +WORKDIR /opt/minda # Install Python dependencies using uv -RUN uv pip install --system pandas>=2.1.1 && \ - uv pip install --system numpy>=1.26.0 && \ - uv pip install --system pybedtools>=0.9.1 && \ - uv pip install --system intervaltree \ No newline at end of file +RUN uv pip install --system --break-system-packages pandas>=2.1.1 && \ + uv pip install --system --break-system-packages numpy>=1.26.0 && \ + uv pip install --system --break-system-packages pybedtools>=0.9.1 && \ + uv pip install --system --break-system-packages intervaltree \ No newline at end of file
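
After this series, the ensemble VCF written by _get_ensemble_vcf ({sample_name}_minda_ensemble.vcf) carries SVLEN, SVTYPE, CHR2, END, STRANDS and SUPP_VEC (plus VAF when --vaf is used) in its INFO column. The following is a hedged sketch of how a downstream consumer might read those tags back out; read_ensemble_records and the example path are hypothetical and not part of Minda.

    import gzip

    def read_ensemble_records(path):
        # Transparently handle plain or gzip-compressed VCFs.
        opener = gzip.open if path.endswith(".gz") else open
        with opener(path, "rt") as fh:
            for line in fh:
                if line.startswith("#"):
                    continue
                chrom, pos, vid, ref, alt, qual, flt, info = line.rstrip("\n").split("\t")[:8]
                # INFO is a ;-separated list of KEY=VALUE tags.
                tags = dict(t.split("=", 1) for t in info.split(";") if "=" in t)
                yield {
                    "chrom": chrom,
                    "pos": int(pos),
                    "id": vid,
                    "svtype": tags.get("SVTYPE"),
                    "chr2": tags.get("CHR2"),
                    "end": tags.get("END"),
                    "strands": tags.get("STRANDS"),
                    "supp_vec": (tags.get("SUPP_VEC") or "").split(","),
                }

    # Example (hypothetical file name):
    # for rec in read_ensemble_records("sample_minda_ensemble.vcf"):
    #     print(rec["chrom"], rec["pos"], rec["svtype"], rec["strands"])
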