From 3e4881bd49e3458ad37700e5e2b0c70ec6045d08 Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Wed, 5 Nov 2025 10:37:19 -0500 Subject: [PATCH 1/6] Handle edge case where both precision and recall are 0 --- minda/stats.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/minda/stats.py b/minda/stats.py index ffcede6..dc6905e 100644 --- a/minda/stats.py +++ b/minda/stats.py @@ -67,7 +67,11 @@ def _get_stats_df(tp_df, fn_df, fp_df, paired_df, base_df, caller_name, max_len, if tp+fn == 0: sys.exit(f"{caller_name} has no TP or FN records. Please double check input files.") recall = tp/(tp+fn) - f1 = (2*precision*recall)/(precision+recall) + # Handle edge case where both precision and recall are 0 + if precision + recall == 0: + f1 = 0.0 + else: + f1 = (2 * precision * recall) / (precision + recall) caller_len = len(paired_df) base_len = len(base_df) From 80cb5cc5fdb44ae721d837000367a079e5b5ed27 Mon Sep 17 00:00:00 2001 From: Asher Preska Steinberg <63111464+apsteinberg@users.noreply.github.com> Date: Tue, 6 Jan 2026 13:20:08 -0500 Subject: [PATCH 2/6] Incorporate breakend orientation + fix for compressed vcfs (#1) * updates to allow for strand info * incorporate strand information into alt field * add in CHR2 and END fields to info * in progress strand info * strand inference + tests for strand inference * fix to handle compressed vcfs --- minda/decompose.py | 11 ++--- minda/ensemble.py | 102 +++++++++++++++++++++++++++++------------ tests/__init__.py | 1 + tests/test_ensemble.py | 97 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 176 insertions(+), 35 deletions(-) create mode 100644 tests/__init__.py create mode 100644 tests/test_ensemble.py diff --git a/minda/decompose.py b/minda/decompose.py index ea952e7..a1740f6 100644 --- a/minda/decompose.py +++ b/minda/decompose.py @@ -65,10 +65,10 @@ def get_df(vcf): """ is_vcf_gz = _is_vcf_gz(vcf) if is_vcf_gz == False: - df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'}) + df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,3,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'}) else: - df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'}) - df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] + df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,3,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'}) + df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO'] return df @@ -81,8 +81,8 @@ def get_intersected_df(vcf, bed): bed_to_bt = BedTool(bed) vcf_to_bt = BedTool(vcf) intersect_obj = vcf_to_bt.intersect(bed_to_bt, u=True) - df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'}) - df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO'] + df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,3,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'}) + df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO'] return df @@ -359,7 +359,6 @@ def get_decomposed_dfs(caller_name, df, filter, min_size, prefixed, vaf, sample_ df['VAF'] = df.INFO.str.extract(r';VAF=([\d.]+)')[0].astype('float').to_list() if df.VAF.isnull().all() == True: sys.exit(f"No VAF values found in {caller_name} VCF. Run Minda without --vaf parameter or add VAF to INFO. 
") - # get indices of mate rows df = _get_alt_mate_index(df) diff --git a/minda/ensemble.py b/minda/ensemble.py index ce44c3e..c2ec462 100644 --- a/minda/ensemble.py +++ b/minda/ensemble.py @@ -1,10 +1,44 @@ import sys +from ast import parse from collections import Counter from datetime import datetime -import pandas as pd +import pandas as pd import numpy as np import re +import gzip +from minda.decompose import _is_vcf_gz +def parse_bnd_alt(alt_string): + ''' + Parse the BND alt string and return separators and region + adapted from svtools package + ''' + # NOTE The below is ugly but intended to match things like [2:222[ and capture the brackets + result = re.findall(r'([][])(.+?)([][])', alt_string) + assert result, "%s\n" % alt_string + sep1, _ , sep2 = result[0] + assert sep1 == sep2 + return sep1 + +def _infer_strands(svtype, alt): + """ + infer SV strands from ALT + """ + valid_svtypes = {"DEL", "INS", "DUP", "INV", "BND"} + assert svtype in valid_svtypes, f"invalid svtype: {svtype}. must be one of {valid_svtypes}" + + orientation1 = orientation2 = "+" + if svtype in ("DEL", "INS") or alt in ("", ""): + orientation2 = "-" + elif svtype == "DUP" or "DUP" in alt: + orientation1 = "-" + else: + sep = parse_bnd_alt(alt) + if alt.startswith(sep): + orientation1 = "-" + if sep == "[": + orientation2 = "-" + return orientation1+orientation2 def _add_columns(ensemble_df, vaf): # create a column of list of prefixed IDs for each locus group @@ -72,7 +106,7 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, # create stat dfs start_dfs_list = [] start_dfs = pd.concat(dfs_1).reset_index(drop=True) - start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN']].sort_values(['#CHROM', 'POS']) + start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN', 'REF', 'ALT']].sort_values(['#CHROM', 'POS']) start_dfs['diff_x'] = start_dfs.groupby('#CHROM').POS.diff().fillna(9999) diffs = start_dfs['diff_x'].to_list() @@ -98,17 +132,11 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, #ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'SVLEN','Minda_ID']) ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'Minda_ID']) - ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ - 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ - 'ID_y', ]] ensemble_df = ensemble_df.sort_values(['locus_group_x','#CHROM_y', 'POS_y']) ensemble_df ['diff_y'] = ensemble_df.groupby(['locus_group_x','#CHROM_y']).POS_y.diff().abs().fillna(9999) diffs = ensemble_df['diff_y'].to_list() caller_names = ensemble_df['Minda_ID'].apply(lambda x: x.rsplit('_', 1)[0]).tolist() ensemble_df['caller_names']= caller_names - ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\ - 'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\ - 'ID_y','diff_y','caller_names' ]] # group end loci locus_callers = [] @@ -195,17 +223,15 @@ def _get_ensemble_call_column(support_df, conditions): #support_df['ensemble'] = mask support_df.insert(loc=12, column='ensemble', value=mask) return support_df - -def _replace_value(row): - if row['ALT'] == '': - return f"N]{row['#CHROM_y']}:{row['POS_y']}]" - else: - return row['ALT'] - + def _get_contigs(vcf_list): contig_dict = {} for vcf in vcf_list: - with open(vcf, 'r') as file: + is_vcf_gz = _is_vcf_gz(vcf) + open_func = gzip.open if is_vcf_gz else open + mode = 'rt' if is_vcf_gz else 'r' + + with open_func(vcf, mode) as file: for line in file: if 
not line.startswith("##"): break @@ -227,34 +253,43 @@ def _get_contigs(vcf_list): def _get_ensemble_vcf(vcf_list, support_df, out_dir, sample_name, args, vaf, version): vcf_df = support_df[support_df['ensemble'] == True].reset_index(drop=True).copy() vcf_df['ID'] = f'Minda_' + (vcf_df.index + 1).astype(str) - vcf_df['REF'] = "N" - vcf_df['ALT'] = ["<" + svtype +">" for svtype in vcf_df['SVTYPE']] - vcf_df['ALT'] = vcf_df.apply(_replace_value, axis=1) vcf_df['QUAL'] = "." vcf_df['FILTER'] = "PASS" if vaf != None: vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \ + ';CHR2=' + str(chr2) + ';END=' + str(end) + \ + ';STRANDS=' + strands + \ ';SUPP_VEC=' + ','.join(map(str, supp_vec)) + ';VAF=' + str(vaf) \ - for svlen, svtype, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'], vcf_df['VAF'])] - vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) + for svlen, svtype, chr2, end, strands, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['#CHROM_y'], vcf_df['POS_y'], vcf_df['STRANDS'], vcf_df['ID_list_y'], vcf_df['VAF'])] + vcf_df = (vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF_x', 'ALT_x', 'QUAL', 'FILTER','INFO']] + .rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS", "REF_x":"REF", "ALT_x": "ALT"})) else: - vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + ';SUPP_VEC=' + ','.join(map(str, supp_vec)) \ - for svlen, svtype, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'])] - vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"}) + vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \ + ';CHR2=' + str(chr2) + ';END=' + str(end) + \ + ';STRANDS=' + strands + \ + ';SUPP_VEC=' + ','.join(map(str, supp_vec)) \ + for svlen, svtype, chr2, end, strands, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['#CHROM_y'], vcf_df['POS_y'], vcf_df['STRANDS'], vcf_df['ID_list_y'])] + vcf_df = (vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF_x', 'ALT_x', 'QUAL', 'FILTER','INFO']] + .rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS", "REF_x": "REF", "ALT_x": "ALT"})) date = datetime.today().strftime('%Y-%m-%d') with open(f'{out_dir}/{sample_name}_minda_ensemble.vcf', 'w') as file: file.write(f'##fileformat=VCFv4.2\n##fileDate={date}\n##source=MindaV{version}\n') + command_str = " ".join(sys.argv) + file.write(f"##CommandLine= {command_str}\n") contig_dict = _get_contigs(vcf_list) for key, value in contig_dict.items(): file.write(f'##contig=\n') file.write('##ALT=\n##ALT=\n##ALT=\n##ALT=\n') file.write('##FILTER=\n') - file.write('##INFO=\n##INFO=\n##INFO=\n') + file.write('##INFO=\n' + '##INFO=\n' + '##INFO=\n' + '##INFO=\n' + '##INFO=\n' + '##INFO=\n') if vaf != None: file.write('##INFO=\n') - command_str = " ".join(sys.argv) - file.write(f"##cmd: {command_str}\n") vcf_df.to_csv(file, sep="\t", index=False) @@ -274,9 +309,18 @@ def get_support_df(vcf_list, decomposed_dfs_list, caller_names, tolerance, condi call_boolean = any(value.startswith(caller_name) for value in intersect_list) caller_column.append(call_boolean) ensemble_df[f'{caller_name}'] = caller_column + # add in a column for strands + strands = [] + for row in ensemble_df.itertuples(): + alt = row.ALT_x + svtype = row.SVTYPE + strands1 = _infer_strands(svtype, alt) + strands.append(strands1) + ensemble_df["STRANDS"] = strands - column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', 
\ - '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', \ + column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', + '#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', + 'REF_x', 'REF_y', 'ALT_x', 'ALT_y', 'STRANDS', 'SVTYPE', 'SVLEN', 'VAF', 'Minda_ID_list_y'] + caller_names support_df = ensemble_df[column_names].rename(columns={"Minda_ID_list_y": "Minda_IDs"}).copy() diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..050d85c --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Tests for minda package diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py new file mode 100644 index 0000000..56399dc --- /dev/null +++ b/tests/test_ensemble.py @@ -0,0 +1,97 @@ +import pytest +from minda.ensemble import _infer_strands, parse_bnd_alt + + +class TestParseBndAlt: + """Test BND ALT string parsing""" + + def test_forward_bracket(self): + """Test parsing forward bracket notation""" + assert parse_bnd_alt("N[chr2:100[") == "[" + + def test_reverse_bracket(self): + """Test parsing reverse bracket notation""" + assert parse_bnd_alt("N]chr2:100]") == "]" + + def test_bracket_at_start(self): + """Test parsing when bracket is at start""" + assert parse_bnd_alt("[chr2:100[N") == "[" + assert parse_bnd_alt("]chr2:100]N") == "]" + + def test_invalid_alt_raises_assertion(self): + """Test that invalid ALT strings raise assertion error""" + with pytest.raises(AssertionError): + parse_bnd_alt("") + with pytest.raises(AssertionError): + parse_bnd_alt("N") + + +class TestInferStrands: + """Test strand inference from ALT field""" + + # DEL tests + def test_del_simple_notation(self): + """DEL with simple notation should be +-""" + assert _infer_strands("DEL", "N") == "+-" + + def test_del_bnd_notation(self): + """DEL with BND notation should be +-""" + assert _infer_strands("DEL", "N[chr2:100[") == "+-" + # INS tests + def test_ins_simple_notation(self): + """INS with simple notation should be +-""" + assert _infer_strands("INS", "N") == "+-" + + def test_ins_bnd_notation(self): + """INS with BND notation should be +-""" + assert _infer_strands("INS", "N]chr2:100]") == "+-" + + # INV tests - various BND notations + def test_inv_reverse_reverse_bracket_before(self): + """INV: [chr2:100[N should give -- (reverse-reverse)""" + assert _infer_strands("INV", "[chr2:100[N") == "--" + + def test_inv_forward_forward_bracket_after(self): + """INV: N]chr2:100] should give ++ (forward-forward)""" + assert _infer_strands("INV", "N]chr2:100]") == "++" + + # DUP tests + def test_dup_simple_notation(self): + """DUP with simple notation should be -+""" + assert _infer_strands("DUP", "") == "-+" + + def test_dup_bnd_notation(self): + """DUP with BND notation should be -+""" + assert _infer_strands("DUP", "N[chr2:100[") == "-+" + + # BND tests - all four orientations + def test_bnd_forward_reverse(self): + """BND: N[chr2:100[ should give +- (forward-reverse)""" + assert _infer_strands("BND", "N[chr2:100[") == "+-" + + def test_bnd_forward_forward(self): + """BND: N]chr2:100] should give ++ (forward-forward)""" + assert _infer_strands("BND", "N]chr2:100]") == "++" + + def test_bnd_reverse_reverse(self): + """BND: [chr2:100[N should give -- (reverse-reverse)""" + assert _infer_strands("BND", "[chr2:100[N") == "--" + + def test_bnd_reverse_forward(self): + """BND: ]chr2:100]N should give -+ (reverse-forward)""" + assert _infer_strands("BND", "]chr2:100]N") == "-+" + + +class TestStrandInferenceValidation: + """Test input validation""" + + def test_invalid_svtype_raises_error(self): 
+ """Test that invalid SVTYPE raises AssertionError""" + with pytest.raises(AssertionError, match="invalid svtype"): + _infer_strands("INVALID", "N") + + with pytest.raises(AssertionError, match="invalid svtype"): + _infer_strands("del", "N") # lowercase + + with pytest.raises(AssertionError, match="invalid svtype"): + _infer_strands("", "N") From 3a09ad3e7eb91fe3ee4a138ef4ff63be0aa8069f Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Tue, 6 Jan 2026 15:29:35 -0500 Subject: [PATCH 3/6] Dockerfile for minda --- Dockerfile | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..26d82b5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +# Use Python base image +FROM python:3.10-slim + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y git build-essential python3-dev && \ + rm -rf /var/lib/apt/lists/* + +# Install uv +COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv + +# Clone the Minda repository +RUN git clone https://github.com/shahcompbio/minda.git && \ + cd minda && git checkout 80cb5cc && \ + git rev-parse --short HEAD > /opt/minda_version.txt && \ + git rev-parse HEAD > /opt/minda_version_full.txt + +# Set working directory +WORKDIR /minda + +# Install Python dependencies using uv +RUN uv pip install --system pandas>=2.1.1 && \ + uv pip install --system numpy>=1.26.0 && \ + uv pip install --system pybedtools>=0.9.1 && \ + uv pip install --system intervaltree \ No newline at end of file From c57c7d1032348f3478cf49060ab3e93f8dd11f47 Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Wed, 7 Jan 2026 10:38:51 -0500 Subject: [PATCH 4/6] fix for Dockerfile --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 26d82b5..5d31039 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ FROM python:3.10-slim # Install system dependencies RUN apt-get update && \ - apt-get install -y git build-essential python3-dev && \ + apt-get install -y git build-essential python3-dev bedtools && \ rm -rf /var/lib/apt/lists/* # Install uv From 8b6d81c8b21ba2c7e9acb8f923276abae6c743bd Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Wed, 7 Jan 2026 18:03:51 -0500 Subject: [PATCH 5/6] pull strand info from info field if present; return ".." 
if STRANDS cannot be inferred; updated tests --- minda/ensemble.py | 76 ++++++++++++------- tests/test_ensemble.py | 161 ++++++++++++++++++++++++++--------------- 2 files changed, 155 insertions(+), 82 deletions(-) diff --git a/minda/ensemble.py b/minda/ensemble.py index c2ec462..84f74e6 100644 --- a/minda/ensemble.py +++ b/minda/ensemble.py @@ -8,37 +8,58 @@ import gzip from minda.decompose import _is_vcf_gz -def parse_bnd_alt(alt_string): +def _get_strands_from_info(info): ''' - Parse the BND alt string and return separators and region - adapted from svtools package + Parse info to get strand info ''' + terms = info.split(";") + strands = None + for term in terms: + if term.startswith("STRANDS"): + strands_info = term + _, strands = strands_info.split("=") + break + return strands + +def _get_strands_from_alt(alt_string): + ''' + Parse alt_string to get strand info (adapted from svtools) + :param alt_string: ALT field from vcf + return strands + ''' + orientation1 = orientation2 = "+" # NOTE The below is ugly but intended to match things like [2:222[ and capture the brackets result = re.findall(r'([][])(.+?)([][])', alt_string) - assert result, "%s\n" % alt_string - sep1, _ , sep2 = result[0] - assert sep1 == sep2 - return sep1 + # if we couldn't parse return blank strands + if not result: + return ".." + else: + sep1, _, sep2 = result[0] + # handle different scenarios + if sep1 != sep2: + # if true we did not parse correctly + return ".." + if alt_string.startswith(sep1): + orientation1 = "-" + if sep1 == "[": + orientation2 = "-" + return orientation1+orientation2 -def _infer_strands(svtype, alt): +def _infer_strands(svtype, alt, info): """ infer SV strands from ALT """ - valid_svtypes = {"DEL", "INS", "DUP", "INV", "BND"} - assert svtype in valid_svtypes, f"invalid svtype: {svtype}. must be one of {valid_svtypes}" - - orientation1 = orientation2 = "+" + valid_strands = ["++", "--", "+-", "-+"] if svtype in ("DEL", "INS") or alt in ("", ""): - orientation2 = "-" - elif svtype == "DUP" or "DUP" in alt: - orientation1 = "-" + strands = "+-" + elif svtype == "DUP" or alt == "": + strands = "-+" else: - sep = parse_bnd_alt(alt) - if alt.startswith(sep): - orientation1 = "-" - if sep == "[": - orientation2 = "-" - return orientation1+orientation2 + strands = _get_strands_from_info(info) + # infer from alt field if we don't have valid strand info + if strands not in valid_strands: + strands = _get_strands_from_alt(alt) + return strands def _add_columns(ensemble_df, vaf): # create a column of list of prefixed IDs for each locus group @@ -98,11 +119,11 @@ def _add_columns(ensemble_df, vaf): def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, sample_name, args, multimatch): - + dfs_1 = [dfs_list[0] for dfs_list in decomposed_dfs_list] dfs_2 = [dfs_list[1] for dfs_list in decomposed_dfs_list] dfs_list = [dfs_1, dfs_2] - + # create stat dfs start_dfs_list = [] start_dfs = pd.concat(dfs_1).reset_index(drop=True) @@ -178,7 +199,7 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, ensemble_df['VAF'] = np.nan ensemble_df = ensemble_df.drop_duplicates(['locus_group_x', 'locus_group_y']).reset_index(drop=True) - + return ensemble_df @@ -309,12 +330,17 @@ def get_support_df(vcf_list, decomposed_dfs_list, caller_names, tolerance, condi call_boolean = any(value.startswith(caller_name) for value in intersect_list) caller_column.append(call_boolean) ensemble_df[f'{caller_name}'] = caller_column + # Remove artifact entries with NaN ALT_x values. 
These can occur when BND mate records + # are reindexed during decomposition, creating index collisions with info_df records. + # The actual BND events are still properly represented by their working mate records. + ensemble_df = ensemble_df[ensemble_df["ALT_x"].notna()].copy() # add in a column for strands strands = [] for row in ensemble_df.itertuples(): alt = row.ALT_x svtype = row.SVTYPE - strands1 = _infer_strands(svtype, alt) + info = row.INFO_x + strands1 = _infer_strands(svtype, alt, info) strands.append(strands1) ensemble_df["STRANDS"] = strands diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py index 56399dc..3a24a78 100644 --- a/tests/test_ensemble.py +++ b/tests/test_ensemble.py @@ -1,97 +1,144 @@ import pytest -from minda.ensemble import _infer_strands, parse_bnd_alt +from minda.ensemble import _infer_strands, _get_strands_from_alt, _get_strands_from_info -class TestParseBndAlt: - """Test BND ALT string parsing""" +class TestGetStrandsFromInfo: + """Test STRANDS parsing from INFO field""" + + def test_strands_in_info(self): + """Test parsing STRANDS from INFO field""" + assert _get_strands_from_info("SVTYPE=DEL;STRANDS=+-;SVLEN=100") == "+-" + assert _get_strands_from_info("SVTYPE=BND;STRANDS=++;END=500") == "++" + assert _get_strands_from_info("STRANDS=--") == "--" + assert _get_strands_from_info("SVLEN=200;STRANDS=-+;CHR2=chr2") == "-+" + + def test_strands_not_in_info(self): + """Test when STRANDS is not in INFO field""" + assert _get_strands_from_info("SVTYPE=DEL;SVLEN=100") is None + assert _get_strands_from_info("SVTYPE=INS") is None + assert _get_strands_from_info("") is None + + def test_strands_at_different_positions(self): + """Test STRANDS at beginning, middle, and end of INFO""" + assert _get_strands_from_info("STRANDS=+-;SVTYPE=DEL;SVLEN=100") == "+-" + assert _get_strands_from_info("SVTYPE=DEL;STRANDS=++;SVLEN=100") == "++" + assert _get_strands_from_info("SVTYPE=DEL;SVLEN=100;STRANDS=--") == "--" + + +class TestGetStrandsFromAlt: + """Test strand inference from BND ALT field""" def test_forward_bracket(self): - """Test parsing forward bracket notation""" - assert parse_bnd_alt("N[chr2:100[") == "[" + """Test parsing forward bracket notation: N[chr2:100[ gives +-""" + assert _get_strands_from_alt("N[chr2:100[") == "+-" def test_reverse_bracket(self): - """Test parsing reverse bracket notation""" - assert parse_bnd_alt("N]chr2:100]") == "]" + """Test parsing reverse bracket notation: N]chr2:100] gives ++""" + assert _get_strands_from_alt("N]chr2:100]") == "++" + + def test_bracket_at_start_forward(self): + """Test parsing when forward bracket is at start: [chr2:100[N gives --""" + assert _get_strands_from_alt("[chr2:100[N") == "--" - def test_bracket_at_start(self): - """Test parsing when bracket is at start""" - assert parse_bnd_alt("[chr2:100[N") == "[" - assert parse_bnd_alt("]chr2:100]N") == "]" + def test_bracket_at_start_reverse(self): + """Test parsing when reverse bracket is at start: ]chr2:100]N gives -+""" + assert _get_strands_from_alt("]chr2:100]N") == "-+" - def test_invalid_alt_raises_assertion(self): - """Test that invalid ALT strings raise assertion error""" - with pytest.raises(AssertionError): - parse_bnd_alt("") - with pytest.raises(AssertionError): - parse_bnd_alt("N") + def test_invalid_alt_returns_dots(self): + """Test that invalid ALT strings return '..'""" + assert _get_strands_from_alt("") == ".." + assert _get_strands_from_alt("N") == ".." + assert _get_strands_from_alt("") == ".." 
+ + def test_mismatched_brackets_returns_dots(self): + """Test that mismatched brackets return '..'""" + assert _get_strands_from_alt("N[chr2:100]") == ".." + assert _get_strands_from_alt("N]chr2:100[") == ".." class TestInferStrands: - """Test strand inference from ALT field""" + """Test strand inference with fallback logic""" # DEL tests def test_del_simple_notation(self): """DEL with simple notation should be +-""" - assert _infer_strands("DEL", "N") == "+-" + assert _infer_strands("DEL", "N", "SVTYPE=DEL;SVLEN=100") == "+-" + + def test_del_symbolic_alt(self): + """DEL with symbolic ALT should be +-""" + assert _infer_strands("DEL", "", "SVTYPE=DEL;SVLEN=100") == "+-" - def test_del_bnd_notation(self): - """DEL with BND notation should be +-""" - assert _infer_strands("DEL", "N[chr2:100[") == "+-" # INS tests def test_ins_simple_notation(self): """INS with simple notation should be +-""" - assert _infer_strands("INS", "N") == "+-" - - def test_ins_bnd_notation(self): - """INS with BND notation should be +-""" - assert _infer_strands("INS", "N]chr2:100]") == "+-" + assert _infer_strands("INS", "N", "SVTYPE=INS;SVLEN=100") == "+-" - # INV tests - various BND notations - def test_inv_reverse_reverse_bracket_before(self): - """INV: [chr2:100[N should give -- (reverse-reverse)""" - assert _infer_strands("INV", "[chr2:100[N") == "--" - - def test_inv_forward_forward_bracket_after(self): - """INV: N]chr2:100] should give ++ (forward-forward)""" - assert _infer_strands("INV", "N]chr2:100]") == "++" + def test_ins_symbolic_alt(self): + """INS with symbolic ALT should be +-""" + assert _infer_strands("INS", "", "SVTYPE=INS;SVLEN=100") == "+-" # DUP tests def test_dup_simple_notation(self): """DUP with simple notation should be -+""" - assert _infer_strands("DUP", "") == "-+" + assert _infer_strands("DUP", "", "SVTYPE=DUP;SVLEN=100") == "-+" - def test_dup_bnd_notation(self): - """DUP with BND notation should be -+""" - assert _infer_strands("DUP", "N[chr2:100[") == "-+" + def test_dup_alt_contains_dup(self): + """DUP in ALT string should be -+""" + assert _infer_strands("DUP", "N", "SVTYPE=DUP;SVLEN=100") == "-+" - # BND tests - all four orientations + # BND tests - all four orientations inferred from ALT def test_bnd_forward_reverse(self): """BND: N[chr2:100[ should give +- (forward-reverse)""" - assert _infer_strands("BND", "N[chr2:100[") == "+-" + assert _infer_strands("BND", "N[chr2:100[", "SVTYPE=BND;CHR2=chr2") == "+-" def test_bnd_forward_forward(self): """BND: N]chr2:100] should give ++ (forward-forward)""" - assert _infer_strands("BND", "N]chr2:100]") == "++" + assert _infer_strands("BND", "N]chr2:100]", "SVTYPE=BND;CHR2=chr2") == "++" def test_bnd_reverse_reverse(self): """BND: [chr2:100[N should give -- (reverse-reverse)""" - assert _infer_strands("BND", "[chr2:100[N") == "--" + assert _infer_strands("BND", "[chr2:100[N", "SVTYPE=BND;CHR2=chr2") == "--" def test_bnd_reverse_forward(self): """BND: ]chr2:100]N should give -+ (reverse-forward)""" - assert _infer_strands("BND", "]chr2:100]N") == "-+" - - -class TestStrandInferenceValidation: - """Test input validation""" - - def test_invalid_svtype_raises_error(self): - """Test that invalid SVTYPE raises AssertionError""" - with pytest.raises(AssertionError, match="invalid svtype"): - _infer_strands("INVALID", "N") + assert _infer_strands("BND", "]chr2:100]N", "SVTYPE=BND;CHR2=chr2") == "-+" - with pytest.raises(AssertionError, match="invalid svtype"): - _infer_strands("del", "N") # lowercase + # INV tests - various BND notations + def 
test_inv_reverse_reverse_bracket_before(self): + """INV: [chr2:100[N should give -- (reverse-reverse)""" + assert _infer_strands("INV", "[chr2:100[N", "SVTYPE=INV;SVLEN=100") == "--" - with pytest.raises(AssertionError, match="invalid svtype"): - _infer_strands("", "N") + def test_inv_forward_forward_bracket_after(self): + """INV: N]chr2:100] should give ++ (forward-forward)""" + assert _infer_strands("INV", "N]chr2:100]", "SVTYPE=INV;SVLEN=100") == "++" + + +class TestStrandInferenceFallback: + """Test fallback behavior when INFO field has valid STRANDS""" + + def test_info_field_takes_precedence(self): + """When INFO has valid STRANDS, use it for non-DEL/INS/DUP types""" + # For BND/INV, INFO field takes precedence over ALT parsing + # Even though BND ALT says +-, INFO says -- + assert _infer_strands("BND", "N[chr2:100[", "SVTYPE=BND;STRANDS=--") == "--" + # Even though INV ALT would give ++, INFO says +- + assert _infer_strands("INV", "N]chr2:100]", "SVTYPE=INV;STRANDS=+-") == "+-" + + def test_fallback_to_alt_when_info_invalid(self): + """When INFO has invalid STRANDS, fall back to ALT parsing""" + # INFO has invalid strand value, should parse ALT + assert _infer_strands("BND", "N]chr2:100]", "SVTYPE=BND;STRANDS=invalid") == "++" + assert _infer_strands("BND", "[chr2:100[N", "SVTYPE=BND;STRANDS=X") == "--" + + def test_fallback_to_alt_when_no_info_strands(self): + """When INFO has no STRANDS, fall back to ALT parsing""" + assert _infer_strands("BND", "N[chr2:100[", "SVTYPE=BND;SVLEN=100") == "+-" + assert _infer_strands("BND", "]chr2:100]N", "SVTYPE=BND") == "-+" + + def test_returns_dots_when_unparseable(self): + """When everything fails, return '..'""" + # Unknown SVTYPE, no INFO strands, unparseable ALT + assert _infer_strands("UNKNOWN", "", "SVTYPE=UNKNOWN") == ".." + assert _infer_strands("CTX", "N", "SVTYPE=CTX;SVLEN=100") == ".." + # Invalid ALT and no INFO strands + assert _infer_strands("BND", "malformed", "SVTYPE=BND") == ".." 
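
A minimal, self-contained sketch of the precedence this patch implements (an explicit STRANDS tag in INFO wins, BND bracket notation in ALT is the fallback, and ".." is returned when neither parses). It mirrors the patched _infer_strands / _get_strands_from_info / _get_strands_from_alt helpers but is not the module code itself; the name infer_strands_sketch and the example calls are illustrative only.

    import re

    def infer_strands_sketch(svtype, alt, info):
        # DEL/INS (including symbolic <DEL>/<INS> ALTs) are always +-; DUP is -+.
        if svtype in ("DEL", "INS") or alt in ("<DEL>", "<INS>"):
            return "+-"
        if svtype == "DUP" or alt == "<DUP>":
            return "-+"
        # 1) an explicit, valid STRANDS tag in INFO takes precedence
        for term in info.split(";"):
            if term.startswith("STRANDS="):
                strands = term.split("=", 1)[1]
                if strands in ("++", "--", "+-", "-+"):
                    return strands
        # 2) otherwise infer orientation from BND bracket notation in ALT
        result = re.findall(r'([][])(.+?)([][])', alt)
        if not result or result[0][0] != result[0][2]:
            return ".."          # unparseable or mismatched brackets
        sep = result[0][0]
        first = "-" if alt.startswith(sep) else "+"
        second = "-" if sep == "[" else "+"
        return first + second

    # e.g. infer_strands_sketch("BND", "N[chr2:100[", "SVTYPE=BND")  -> "+-"
    #      infer_strands_sketch("INV", "N]chr2:100]", "STRANDS=--")  -> "--"
    #      infer_strands_sketch("BND", "malformed",   "SVTYPE=BND")  -> ".."
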
From 366f0bbd7c9b5b2e993d8a426cf8d77213686ad2 Mon Sep 17 00:00:00 2001 From: apsteinberg Date: Thu, 8 Jan 2026 13:33:23 -0500 Subject: [PATCH 6/6] updated Dockerfile --- Dockerfile | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5d31039..041505e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,25 +1,24 @@ # Use Python base image -FROM python:3.10-slim +FROM community.wave.seqera.io/library/uv:0.9.22--2326eebc25ee1806 # Install system dependencies RUN apt-get update && \ apt-get install -y git build-essential python3-dev bedtools && \ rm -rf /var/lib/apt/lists/* -# Install uv -COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv - # Clone the Minda repository -RUN git clone https://github.com/shahcompbio/minda.git && \ - cd minda && git checkout 80cb5cc && \ +RUN git clone https://github.com/shahcompbio/minda.git /opt/minda && \ + cd /opt/minda && git checkout 8b6d81c && \ git rev-parse --short HEAD > /opt/minda_version.txt && \ - git rev-parse HEAD > /opt/minda_version_full.txt + git rev-parse HEAD > /opt/minda_version_full.txt && \ + chmod +x minda.py && \ + ln -s /opt/minda/minda.py /usr/local/bin/minda # Set working directory -WORKDIR /minda +WORKDIR /opt/minda # Install Python dependencies using uv -RUN uv pip install --system pandas>=2.1.1 && \ - uv pip install --system numpy>=1.26.0 && \ - uv pip install --system pybedtools>=0.9.1 && \ - uv pip install --system intervaltree \ No newline at end of file +RUN uv pip install --system --break-system-packages pandas>=2.1.1 && \ + uv pip install --system --break-system-packages numpy>=1.26.0 && \ + uv pip install --system --break-system-packages pybedtools>=0.9.1 && \ + uv pip install --system --break-system-packages intervaltree \ No newline at end of file
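
After this series, the ensemble VCF written by _get_ensemble_vcf ({sample_name}_minda_ensemble.vcf) carries SVLEN, SVTYPE, CHR2, END, STRANDS and SUPP_VEC (plus VAF when --vaf is used) in its INFO column. The following is a hedged sketch of how a downstream consumer might read those tags back out; read_ensemble_records and the example path are hypothetical and not part of Minda.

    import gzip

    def read_ensemble_records(path):
        # Transparently handle plain or gzip-compressed VCFs.
        opener = gzip.open if path.endswith(".gz") else open
        with opener(path, "rt") as fh:
            for line in fh:
                if line.startswith("#"):
                    continue
                chrom, pos, vid, ref, alt, qual, flt, info = line.rstrip("\n").split("\t")[:8]
                # INFO is a ;-separated list of KEY=VALUE tags.
                tags = dict(t.split("=", 1) for t in info.split(";") if "=" in t)
                yield {
                    "chrom": chrom,
                    "pos": int(pos),
                    "id": vid,
                    "svtype": tags.get("SVTYPE"),
                    "chr2": tags.get("CHR2"),
                    "end": tags.get("END"),
                    "strands": tags.get("STRANDS"),
                    "supp_vec": (tags.get("SUPP_VEC") or "").split(","),
                }

    # Example (hypothetical file name):
    # for rec in read_ensemble_records("sample_minda_ensemble.vcf"):
    #     print(rec["chrom"], rec["pos"], rec["svtype"], rec["strands"])
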