Skip to content

Commit

Permalink
ENH/BUG: mm2 parameter str; use shell=True in aln
Browse files Browse the repository at this point in the history
Closes #23 -- the user can now specify the exact preset themselves.

Closes #17 -- piping is now handled by the shell, rather than Python,
which should remove the Python-specific piping memory problems. We
could add explicit -o at every step if desired, but I think it should
be fine.

Partial work on #21: it would still be nice to add an "early warning
system" for massive FASTA files.

Parital work on #29: users can now add these extra parameters
themselves if desired. It might be worth adding new default
parameters, but that's a less urgent question.
  • Loading branch information
fedarko committed Sep 11, 2022
1 parent 2be3aaf commit b22305b
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 43 deletions.
31 changes: 29 additions & 2 deletions strainflye/_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,23 @@ def strainflye():
"will not be considered in this filter."
),
)
@click.option(
"-p",
"--minimap2-params",
required=False,
default=config.DEFAULT_MM2_PARAMS,
show_default=True,
type=click.STRING,
help=(
"Additional parameters to pass to minimap2, besides the contig and "
"read information. Depending on the size of your dataset and the "
"amount of memory your system has, you may want to adjust the -I "
"parameter; see minimap2's documentation for details. Please note "
"that we do not perform any validation on this string before passing "
"it to minimap2 (so if you are allowing users to run strainFlye "
"through a web server, be careful about shell injection)."
),
)
@click.option(
"-o",
"--output-dir",
Expand Down Expand Up @@ -110,7 +127,9 @@ def strainflye():
# Regarding the \b marker, see https://stackoverflow.com/a/53302580 -- this is
# apparently needed to get the formatting to look the way I want (otherwise all
# of the four steps are smooshed into a paragraph)
def align(reads, contigs, graph, output_dir, rm_tmp_bam, verbose):
def align(
reads, contigs, graph, minimap2_params, output_dir, rm_tmp_bam, verbose
):
"""Align reads to contigs, and filter the resulting alignment.
Files of reads should be in the FASTA or FASTQ formats; GZIP'd files
Expand Down Expand Up @@ -142,6 +161,7 @@ def align(reads, contigs, graph, output_dir, rm_tmp_bam, verbose):
("file(s) of reads", reads_info),
("contig file", contigs),
("graph file", graph),
("minimap2 parameters", minimap2_params),
),
(("directory", output_dir),),
extra_info=(
Expand All @@ -150,7 +170,14 @@ def align(reads, contigs, graph, output_dir, rm_tmp_bam, verbose):
),
)
align_utils.run(
reads, contigs, graph, output_dir, rm_tmp_bam, fancylog, verbose
reads,
contigs,
graph,
minimap2_params,
output_dir,
rm_tmp_bam,
fancylog,
verbose,
)
fancylog("Done.")

Expand Down
68 changes: 27 additions & 41 deletions strainflye/align_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -724,7 +724,16 @@ def check_and_update_alignment(
fancylog("Done filtering out partially-mapped reads.", prefix="")


def run(reads, contigs, graph, output_dir, rm_tmp_bam, fancylog, verbose):
def run(
reads,
contigs,
graph,
minimap2_params,
output_dir,
rm_tmp_bam,
fancylog,
verbose,
):
"""Runs the entire alignment-and-filtering process.
Parameters
Expand All @@ -741,6 +750,9 @@ def run(reads, contigs, graph, output_dir, rm_tmp_bam, fancylog, verbose):
is None, this just won't be used there. See filter_pm_reads()' docs for
more information.
minimap2_params: str
Parameters to pass to minimap2.
output_dir: str
Output directory path. Will be created if it doesn't already exist.
Expand Down Expand Up @@ -805,54 +817,28 @@ def run(reads, contigs, graph, output_dir, rm_tmp_bam, fancylog, verbose):
# unsorted BAM file from "samtools view". So we use piping.
# SAMtools stuff based on:
# https://davetang.org/wiki/tiki-index.php?page=SAMTools#Converting_SAM_directly_to_a_sorted_BAM_file # noqa
# Python stuff based on https://stackoverflow.com/a/4846923 and
# https://stackoverflow.com/a/9655939

# As discussed at https://stackoverflow.com/q/6880090, some of the piping
# utilities Python has struggle when the amount of data being passed is
# massive. So -- for the sake of simplicity -- we just use shell=True. This
# carries with it some things that should be considered when running this
# e.g. on a server (see
# https://docs.python.org/3/library/subprocess.html#security-considerations),
# but for local usage it should be fine. (This is also the case for "smooth
# assemble" when it calls LJA.)

# There's probably a way to print stuff after each individual command in
# the chain finishes, but I don't think that sorta granularity is super
# necessary right now tbh

threesteps = "minimap2 --> samtools view --> samtools sort"
fancylog(f"Running {threesteps}...")

# NOTE: the -ax asm20 preset is what we use in the paper, but later
# versions of minimap2 have added in "-ax map-hifi" which is probs a better
# option in most cases. Shouldn't make too much of a difference; for
# simplicity's sake we just stick with asm20 here, but we could definitely
# change this (or add the option to configure it) if desired
#
# TODO TODO TODO the I8g thing is gonna cause problems for other computers
# -- see https://github.com/lh3/minimap2/blob/master/FAQ.md, #3.
# The way to handle this is make minimap2 CLI params configurable by the
# user (they can pass a string with params I guess?)
minimap2_run = subprocess.Popen(
[
"minimap2",
"-ax",
"asm20",
"--secondary=no",
"-I8g",
"--MD",
contigs,
*reads,
],
stdout=subprocess.PIPE,
check=True,
)
sam_to_bam_run = subprocess.Popen(
["samtools", "view", "-b", "-"],
stdin=minimap2_run.stdout,
stdout=subprocess.PIPE,
check=True,
)
minimap2_run.stdout.close()
bam_to_sorted_bam_run = subprocess.Popen(
["samtools", "sort", "-", "-o", first_output_bam],
stdin=sam_to_bam_run.stdout,
check=True,
cmd = (
f"minimap2 {minimap2_params} {contigs} {' '.join(reads)} | samtools "
f"view -b - | samtools sort - -o {first_output_bam}"
)
sam_to_bam_run.stdout.close()
bam_to_sorted_bam_run.communicate()
fancylog(f"Exact command we're running: {cmd}", prefix="")
subprocess.run(cmd, shell=True, check=True)

fancylog(f"Done running {threesteps}.", prefix="")

Expand Down
3 changes: 3 additions & 0 deletions strainflye/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
# Prefix of diversity index columns in diversity index TSV files
DI_PREF = "DivIdx"

# Default parameters for minimap2 for "strainFlye align"
DEFAULT_MM2_PARAMS = "-ax asm20 --secondary=no -I 8g --MD"

# Default parameters for LJA for "strainFlye smooth assemble"
DEFAULT_LJA_PARAMS = "--simpleec --Cov-threshold 10"

Expand Down

0 comments on commit b22305b

Please sign in to comment.