24 changes: 24 additions & 0 deletions Dockerfile
@@ -0,0 +1,24 @@
# Use Python base image
FROM community.wave.seqera.io/library/uv:0.9.22--2326eebc25ee1806

# Install system dependencies
RUN apt-get update && \
apt-get install -y git build-essential python3-dev bedtools && \
rm -rf /var/lib/apt/lists/*

# Clone the Minda repository
RUN git clone https://github.com/shahcompbio/minda.git /opt/minda && \
cd /opt/minda && git checkout 8b6d81c && \
git rev-parse --short HEAD > /opt/minda_version.txt && \
git rev-parse HEAD > /opt/minda_version_full.txt && \
chmod +x minda.py && \
ln -s /opt/minda/minda.py /usr/local/bin/minda

# Set working directory
WORKDIR /opt/minda

# Install Python dependencies using uv
RUN uv pip install --system --break-system-packages "pandas>=2.1.1" && \
uv pip install --system --break-system-packages "numpy>=1.26.0" && \
uv pip install --system --break-system-packages "pybedtools>=0.9.1" && \
uv pip install --system --break-system-packages intervaltree
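Note: the version specifiers are quoted above because an unquoted pandas>=2.1.1 in a shell RUN line is parsed as an output redirection, silently dropping the constraint. The image pins Minda to commit 8b6d81c and records both hash forms under /opt for provenance. A minimal smoke test of the build, assuming Docker is available locally and a hypothetical image tag minda:dev:

import subprocess

# Build the image from this Dockerfile, then confirm the pinned commit.
subprocess.run(["docker", "build", "-t", "minda:dev", "."], check=True)
out = subprocess.run(
    ["docker", "run", "--rm", "minda:dev", "cat", "/opt/minda_version.txt"],
    check=True, capture_output=True, text=True,
)
assert out.stdout.strip().startswith("8b6d81c")  # short hash written at build time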
11 changes: 5 additions & 6 deletions minda/decompose.py
@@ -65,10 +65,10 @@ def get_df(vcf):
"""
is_vcf_gz = _is_vcf_gz(vcf)
if is_vcf_gz == False:
df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'})
df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,3,4,6,7], header=None, dtype={'#CHROM': 'str', 'POS':'Int64'})
else:
df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'})
df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO']
df = pd.read_csv(vcf, comment='#', sep='\t', usecols=[0,1,2,3,4,6,7], header=None, compression='gzip', dtype={'#CHROM': 'str', 'POS':'Int64'})
df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO']

return df
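The usecols change pulls REF (0-based column 3 of the VCF body) into the dataframe while still skipping QUAL (column 5). A quick illustration with a one-line VCF body, assuming only pandas:

import io
import pandas as pd

vcf_body = "chr1\t100\tsv1\tN\t<DEL>\t30\tPASS\tSVTYPE=DEL;SVLEN=-50\n"
df = pd.read_csv(io.StringIO(vcf_body), sep='\t', usecols=[0,1,2,3,4,6,7], header=None)
df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO']
print(df.loc[0, 'REF'])  # 'N' -- REF now flows through to the ensemble VCF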

@@ -81,8 +81,8 @@ def get_intersected_df(vcf, bed):
bed_to_bt = BedTool(bed)
vcf_to_bt = BedTool(vcf)
intersect_obj = vcf_to_bt.intersect(bed_to_bt, u=True)
df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'})
df.columns = ['#CHROM', 'POS', 'ID', 'ALT', 'FILTER', 'INFO']
df = BedTool.to_dataframe(intersect_obj, header=None, usecols=[0,1,2,3,4,6,7], dtype={'#CHROM': 'str', 'POS':'int'})
df.columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'FILTER', 'INFO']
return df


@@ -359,7 +359,6 @@ def get_decomposed_dfs(caller_name, df, filter, min_size, prefixed, vaf, sample_
df['VAF'] = df.INFO.str.extract(r';VAF=([\d.]+)')[0].astype('float').to_list()
if df.VAF.isnull().all() == True:
sys.exit(f"No VAF values found in {caller_name} VCF. Run Minda without --vaf parameter or add VAF to INFO. ")

# get indices of mate rows
df = _get_alt_mate_index(df)

134 changes: 102 additions & 32 deletions minda/ensemble.py
@@ -1,10 +1,65 @@
import sys
from ast import parse
from collections import Counter
from datetime import datetime
import pandas as pd
import numpy as np
import re
import gzip
from minda.decompose import _is_vcf_gz

def _get_strands_from_info(info):
'''
Parse info to get strand info
'''
terms = info.split(";")
strands = None
for term in terms:
if term.startswith("STRANDS"):
strands_info = term
_, strands = strands_info.split("=")
break
return strands
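A usage sketch of the INFO parsing above, assuming a caller that writes an explicit STRANDS tag:

assert _get_strands_from_info("SVTYPE=BND;STRANDS=+-;SUPP=3") == "+-"
assert _get_strands_from_info("SVTYPE=DEL;SVLEN=-50") is None  # no tag -> fall through to ALT parsing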

def _get_strands_from_alt(alt_string):
'''
Parse alt_string to get strand info (adapted from svtools)
:param alt_string: ALT field from vcf
:return: strands
'''
orientation1 = orientation2 = "+"
# NOTE The below is ugly but intended to match things like [2:222[ and capture the brackets
result = re.findall(r'([][])(.+?)([][])', alt_string)
# if we couldn't parse return blank strands
if not result:
return ".."
else:
sep1, _, sep2 = result[0]
# handle different scenarios
if sep1 != sep2:
# if true we did not parse correctly
return ".."
if alt_string.startswith(sep1):
orientation1 = "-"
if sep1 == "[":
orientation2 = "-"
return orientation1+orientation2
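The bracket grammar maps to breakpoint orientations following VCF BND notation; a usage sketch covering all four cases plus the fallback:

assert _get_strands_from_alt("N[chr2:222[") == "+-"   # t[p[
assert _get_strands_from_alt("N]chr2:222]") == "++"   # t]p]
assert _get_strands_from_alt("]chr2:222]N") == "-+"   # ]p]t
assert _get_strands_from_alt("[chr2:222[N") == "--"   # [p[t
assert _get_strands_from_alt("<INS>") == ".."         # unparseable -> blank strands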

def _infer_strands(svtype, alt, info):
"""
Infer SV strands from SVTYPE, INFO, or ALT
"""
valid_strands = ["++", "--", "+-", "-+"]
if svtype in ("DEL", "INS") or alt in ("<DEL>", "<INS>"):
strands = "+-"
elif svtype == "DUP" or alt == "<DUP>":
strands = "-+"
else:
strands = _get_strands_from_info(info)
# infer from alt field if we don't have valid strand info
if strands not in valid_strands:
strands = _get_strands_from_alt(alt)
return strands
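Taken together, the precedence is: fixed strands for DEL/INS/DUP types, then an explicit STRANDS INFO tag, then the ALT brackets. For example:

assert _infer_strands("DEL", "<DEL>", "SVTYPE=DEL") == "+-"
assert _infer_strands("INV", "<INV>", "SVTYPE=INV;STRANDS=--") == "--"   # from INFO
assert _infer_strands("BND", "N]chr2:321681]", "SVTYPE=BND") == "++"     # from ALT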

def _add_columns(ensemble_df, vaf):
# create a column of list of prefixed IDs for each locus group
@@ -64,15 +64,15 @@ def _add_columns(ensemble_df, vaf):


def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir, sample_name, args, multimatch):

dfs_1 = [dfs_list[0] for dfs_list in decomposed_dfs_list]
dfs_2 = [dfs_list[1] for dfs_list in decomposed_dfs_list]
dfs_list = [dfs_1, dfs_2]

# create stat dfs
start_dfs_list = []
start_dfs = pd.concat(dfs_1).reset_index(drop=True)
start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN']].sort_values(['#CHROM', 'POS'])
start_dfs = start_dfs[['#CHROM', 'POS', 'ID', 'Minda_ID', 'INFO', 'SVTYPE', 'SVLEN', 'REF', 'ALT']].sort_values(['#CHROM', 'POS'])

start_dfs['diff_x'] = start_dfs.groupby('#CHROM').POS.diff().fillna(9999)
diffs = start_dfs['diff_x'].to_list()
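The diff/fillna idiom above feeds Minda's locus grouping: per-chromosome position gaps larger than the tolerance start a new group. A self-contained sketch of the idea (the actual group assignment happens in lines elided from this hunk):

import pandas as pd

pos = pd.DataFrame({'#CHROM': ['chr1'] * 4, 'POS': [100, 105, 5000, 5003]})
pos['diff'] = pos.groupby('#CHROM').POS.diff().fillna(9999)
tolerance = 50
pos['locus_group'] = (pos['diff'] > tolerance).cumsum()  # [1, 1, 2, 2]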
@@ -98,17 +98,11 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir,

#ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'SVLEN','Minda_ID'])
ensemble_df = start_dfs.merge(end_dfs, on=['SVTYPE', 'Minda_ID'])
ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\
'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\
'ID_y', ]]
ensemble_df = ensemble_df.sort_values(['locus_group_x','#CHROM_y', 'POS_y'])
ensemble_df ['diff_y'] = ensemble_df.groupby(['locus_group_x','#CHROM_y']).POS_y.diff().abs().fillna(9999)
diffs = ensemble_df['diff_y'].to_list()
caller_names = ensemble_df['Minda_ID'].apply(lambda x: x.rsplit('_', 1)[0]).tolist()
ensemble_df['caller_names']= caller_names
ensemble_df[['#CHROM_x', 'POS_x', 'ID_x', 'Minda_ID', 'SVTYPE', 'SVLEN',\
'diff_x', 'locus_group_x', 'median', '#CHROM_y', 'POS_y',\
'ID_y','diff_y','caller_names' ]]

# group end loci
locus_callers = []
@@ -150,7 +199,7 @@ def _get_ensemble_df(decomposed_dfs_list, caller_names, tolerance, vaf, out_dir,
ensemble_df['VAF'] = np.nan

ensemble_df = ensemble_df.drop_duplicates(['locus_group_x', 'locus_group_y']).reset_index(drop=True)

return ensemble_df


@@ -195,17 +244,15 @@ def _get_ensemble_call_column(support_df, conditions):
#support_df['ensemble'] = mask
support_df.insert(loc=12, column='ensemble', value=mask)
return support_df

def _replace_value(row):
if row['ALT'] == '<BND>':
return f"N]{row['#CHROM_y']}:{row['POS_y']}]"
else:
return row['ALT']


def _get_contigs(vcf_list):
contig_dict = {}
for vcf in vcf_list:
with open(vcf, 'r') as file:
is_vcf_gz = _is_vcf_gz(vcf)
open_func = gzip.open if is_vcf_gz else open
mode = 'rt' if is_vcf_gz else 'r'

with open_func(vcf, mode) as file:
for line in file:
if not line.startswith("##"):
break
@@ -227,34 +274,43 @@ def _get_contigs(vcf_list):
def _get_ensemble_vcf(vcf_list, support_df, out_dir, sample_name, args, vaf, version):
vcf_df = support_df[support_df['ensemble'] == True].reset_index(drop=True).copy()
vcf_df['ID'] = f'Minda_' + (vcf_df.index + 1).astype(str)
vcf_df['REF'] = "N"
vcf_df['ALT'] = ["<" + svtype +">" for svtype in vcf_df['SVTYPE']]
vcf_df['ALT'] = vcf_df.apply(_replace_value, axis=1)
vcf_df['QUAL'] = "."
vcf_df['FILTER'] = "PASS"

if vaf != None:
vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \
';CHR2=' + str(chr2) + ';END=' + str(end) + \
';STRANDS=' + strands + \
';SUPP_VEC=' + ','.join(map(str, supp_vec)) + ';VAF=' + str(vaf) \
for svlen, svtype, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'], vcf_df['VAF'])]
vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"})
for svlen, svtype, chr2, end, strands, supp_vec, vaf in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['#CHROM_y'], vcf_df['POS_y'], vcf_df['STRANDS'], vcf_df['ID_list_y'], vcf_df['VAF'])]
vcf_df = (vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF_x', 'ALT_x', 'QUAL', 'FILTER','INFO']]
.rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS", "REF_x":"REF", "ALT_x": "ALT"}))
else:
vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + ';SUPP_VEC=' + ','.join(map(str, supp_vec)) \
for svlen, svtype, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['ID_list_y'])]
vcf_df = vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER','INFO']].rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS"})
vcf_df['INFO'] = ['SVLEN=' + str(svlen) + ';SVTYPE=' + svtype + \
';CHR2=' + str(chr2) + ';END=' + str(end) + \
';STRANDS=' + strands + \
';SUPP_VEC=' + ','.join(map(str, supp_vec)) \
for svlen, svtype, chr2, end, strands, supp_vec in zip(vcf_df['SVLEN'],vcf_df['SVTYPE'], vcf_df['#CHROM_y'], vcf_df['POS_y'], vcf_df['STRANDS'], vcf_df['ID_list_y'])]
vcf_df = (vcf_df[['#CHROM_x', 'POS_x', 'ID', 'REF_x', 'ALT_x', 'QUAL', 'FILTER','INFO']]
.rename(columns={'#CHROM_x':"#CHROM", "POS_x":"POS", "REF_x": "REF", "ALT_x": "ALT"}))
date = datetime.today().strftime('%Y-%m-%d')
with open(f'{out_dir}/{sample_name}_minda_ensemble.vcf', 'w') as file:
file.write(f'##fileformat=VCFv4.2\n##fileDate={date}\n##source=MindaV{version}\n')
command_str = " ".join(sys.argv)
file.write(f"##CommandLine= {command_str}\n")
contig_dict = _get_contigs(vcf_list)
for key, value in contig_dict.items():
file.write(f'##contig=<ID={key},length={value}>\n')
file.write('##ALT=<ID=DEL,Description="Deletion">\n##ALT=<ID=INS,Description="Insertion">\n##ALT=<ID=DUP,Description="Duplication">\n##ALT=<ID=INV,Description="Inversion">\n')
file.write('##FILTER=<ID=PASS,Description="Default">\n')
file.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of the structural variant">\n##INFO=<ID=SUPP_VEC,Number=.,Type=String,Description="IDs of support records">\n')
file.write('##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">\n'
'##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Length of the structural variant">\n'
'##INFO=<ID=CHR2,Number=1,Type=String,Description="Chromosome for END coordinate">\n'
'##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the structural variant">\n'
'##INFO=<ID=STRANDS,Number=1,Type=String,Description="Breakpoint strandedness (first_to_second)">\n'
'##INFO=<ID=SUPP_VEC,Number=.,Type=String,Description="IDs of support records">\n')
if vaf != None:
file.write('##INFO=<ID=VAF,Number=1,Type=Float,Description="Variant allele frequency">\n')
command_str = " ".join(sys.argv)
file.write(f"##cmd: {command_str}\n")
vcf_df.to_csv(file, sep="\t", index=False)
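With the new CHR2/END/STRANDS tags, each ensemble record's INFO field carries the mate coordinate and orientation alongside the support vector. A sketch of one assembled INFO string, using hypothetical caller IDs and illustrative values:

svlen, svtype, chr2, end, strands = 0, "BND", "chr2", 500, "++"
supp_vec, vaf = ["caller1_12", "caller2_7"], 0.42
info = ('SVLEN=' + str(svlen) + ';SVTYPE=' + svtype +
        ';CHR2=' + str(chr2) + ';END=' + str(end) +
        ';STRANDS=' + strands +
        ';SUPP_VEC=' + ','.join(map(str, supp_vec)) + ';VAF=' + str(vaf))
print(info)  # SVLEN=0;SVTYPE=BND;CHR2=chr2;END=500;STRANDS=++;SUPP_VEC=caller1_12,caller2_7;VAF=0.42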


@@ -274,9 +330,23 @@ def get_support_df(vcf_list, decomposed_dfs_list, caller_names, tolerance, condi
call_boolean = any(value.startswith(caller_name) for value in intersect_list)
caller_column.append(call_boolean)
ensemble_df[f'{caller_name}'] = caller_column
# Remove artifact entries with NaN ALT_x values. These can occur when BND mate records
# are reindexed during decomposition, creating index collisions with info_df records.
# The actual BND events are still properly represented by their working mate records.
ensemble_df = ensemble_df[ensemble_df["ALT_x"].notna()].copy()
# add in a column for strands
strands = []
for row in ensemble_df.itertuples():
alt = row.ALT_x
svtype = row.SVTYPE
info = row.INFO_x
strands1 = _infer_strands(svtype, alt, info)
strands.append(strands1)
ensemble_df["STRANDS"] = strands

column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x', \
'#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y', \
column_names = ['#CHROM_x', 'POS_x', 'locus_group_x', 'ID_list_x',
'#CHROM_y', 'POS_y', 'locus_group_y', 'ID_list_y',
'REF_x', 'REF_y', 'ALT_x', 'ALT_y', 'STRANDS',
'SVTYPE', 'SVLEN', 'VAF', 'Minda_ID_list_y'] + caller_names

support_df = ensemble_df[column_names].rename(columns={"Minda_ID_list_y": "Minda_IDs"}).copy()
6 changes: 5 additions & 1 deletion minda/stats.py
@@ -67,7 +67,11 @@ def _get_stats_df(tp_df, fn_df, fp_df, paired_df, base_df, caller_name, max_len,
if tp+fn == 0:
sys.exit(f"{caller_name} has no TP or FN records. Please double check input files.")
recall = tp/(tp+fn)
f1 = (2*precision*recall)/(precision+recall)
# Handle edge case where both precision and recall are 0
if precision + recall == 0:
f1 = 0.0
else:
f1 = (2 * precision * recall) / (precision + recall)

caller_len = len(paired_df)
base_len = len(base_df)
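The guard makes F1 well-defined when a caller has zero precision and zero recall (no true positives at all). A standalone sketch of the guarded computation:

def f1_score(precision: float, recall: float) -> float:
    # Harmonic mean of precision and recall; defined as 0.0 when both are 0
    # so the division never sees a zero denominator.
    if precision + recall == 0:
        return 0.0
    return (2 * precision * recall) / (precision + recall)

assert f1_score(0.0, 0.0) == 0.0
assert f1_score(0.5, 0.5) == 0.5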
1 change: 1 addition & 0 deletions tests/__init__.py
@@ -0,0 +1 @@
# Tests for minda package