Skip to content

Commit

Permalink
Merge pull request #90 from pdimens/align_aggregate
Browse files Browse the repository at this point in the history
replace minimap with strobealign
  • Loading branch information
pdimens authored Jun 21, 2024
2 parents 1b6a2f1 + 3992eac commit 01c9cbe
Show file tree
Hide file tree
Showing 7 changed files with 466 additions and 34 deletions.
6 changes: 3 additions & 3 deletions .github/filters.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,11 @@ ema: &ema
- 'src/harpy/scripts/countBX.py'
- 'src/harpy/bin/makewindows.py'
- 'test/fastq/**'
minimap: &minimap
strobealign: &strobealign
- *common
- *container
- 'src/harpy/align.py'
- 'src/harpy/rules/align-minimap.smk'
- 'src/harpy/rules/align-strobealign.smk'
- 'src/harpy/reports/AlignStats.Rmd'
- 'src/harpy/reports/BxCount.Rmd'
- 'src/harpy/scripts/bxStats.py'
Expand Down Expand Up @@ -150,7 +150,7 @@ modules:
- *qc
- *bwa
- *ema
- *minimap
- *strobealign
- *mpileup
- *freebayes
- *impute
Expand Down
12 changes: 6 additions & 6 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
qc: ${{ steps.filter.outputs.qc }}
bwa: ${{ steps.filter.outputs.bwa }}
ema: ${{ steps.filter.outputs.ema }}
minimap: ${{ steps.filter.outputs.minimap }}
strobealign: ${{ steps.filter.outputs.strobealign }}
mpileup: ${{ steps.filter.outputs.mpileup }}
freebayes: ${{ steps.filter.outputs.freebayes }}
leviathan: ${{ steps.filter.outputs.leviathan }}
Expand Down Expand Up @@ -309,10 +309,10 @@ jobs:
shell: micromamba-shell {0}
run: harpy align ema --ema-bins 150 -g test/genome/genome.fasta.gz --snakemake "--show-failed-logs" -x "-d" test/fastq

minimap:
strobe:
needs: [changes, pkgbuild]
if: ${{ needs.changes.outputs.minimap == 'true' && needs.pkgbuild.result == 'success' }}
name: align minimap
if: ${{ needs.changes.outputs.strobealign == 'true' && needs.pkgbuild.result == 'success' }}
name: align strobe
runs-on: ubuntu-latest
steps:
- name: Checkout
Expand Down Expand Up @@ -343,9 +343,9 @@ jobs:
with:
name: deps-image
path: .snakemake/singularity
- name: test minimap
- name: test strobealign
shell: micromamba-shell {0}
run: harpy align minimap -g test/genome/genome.fasta.gz --snakemake "--show-failed-logs" -x "--seed 13" test/fastq
run: harpy align strobe -r 125 -g test/genome/genome.fasta.gz --snakemake "--show-failed-logs" test/fastq

mpileup:
needs: [changes, pkgbuild]
Expand Down
130 changes: 108 additions & 22 deletions src/harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,13 @@ def align():
"""
Align sample sequences to a reference genome
The three available aligners all retain the linked-read barcode information in the
resulting output, however `EMA` is the only aligner to use the barcode information
to facilitate the aligning process and can be prohibitively slow. The `minimap2`
aligner is the fastest of the three and is comparable in accuracy to `bwa` for
The available aligners all retain the linked-read barcode information in the
resulting output, however `ema` is the only aligner to use the barcode information
to facilitate the aligning process and can be prohibitively slow. The `strobe`
aligner is the fastest option and is comparable in accuracy (or better) to `bwa` for
sequences >100bp.
**Aligners**
- `bwa`: uses BWA MEM to align reads (fast)
- `ema`: uses the BX barcode-aware EMA aligner (very slow)
- `minimap`: uses minimap2 to align reads (ultra fast)
Provide an additional subcommand `bwa`, `ema`, or `minimap` to get more information on using
Provide an additional subcommand `bwa`, `ema`, or `strobe` to get more information on using
those aligners.
"""

Expand Down Expand Up @@ -62,6 +57,16 @@ def align():
"name": "Other Options",
"options": ["--output-dir", "--threads", "--depth-window", "--skipreports", "--conda", "--snakemake", "--quiet", "--help"],
},
],
"harpy align strobe": [
{
"name": "Parameters",
"options": ["--genome", "--read-length", "--quality-filter", "--molecule-distance", "--extra-params"],
},
{
"name": "Other Options",
"options": ["--output-dir", "--threads", "--depth-window", "--skipreports", "--conda", "--snakemake", "--quiet", "--help"],
},
]
}

Expand All @@ -82,7 +87,7 @@ def align():
@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
"""
Align sequences to genome using BWA MEM
Align sequences to genome using `BWA MEM`
Provide the input fastq files and/or directories at the end of the command as individual
files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
Expand Down Expand Up @@ -158,7 +163,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality
@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, ema_bins, skipreports, extra_params, quality_filter, snakemake, quiet, hpc, conda, print_only):
"""
Align sequences to a genome using EMA
Align sequences to genome using `EMA`
Provide the input fastq files and/or directories at the end of the
command as individual files/folders, using shell wildcards
Expand Down Expand Up @@ -233,37 +238,116 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
_module = subprocess.run(command.split())
sys.exit(_module.returncode)

#@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
#@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
#@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
#@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
#@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
#@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
#@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
#@click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
#@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory')
#@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes')
#@click.option('--hpc', type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission')
#@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container')
#@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
#@click.option('--print-only', is_flag = True, hidden = True, default = False, help = 'Print the generated snakemake command and exit')
#@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
#def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
# """
# Align sequences to genome using `Minimap2`
#
# Provide the input fastq files and/or directories at the end of the command as individual
# files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
#
# Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does
# not use barcodes when mapping. Harpy will post-processes the alignments using the
# specified `--molecule-distance` to assign alignments to unique molecules.
# """
# output_dir = output_dir.rstrip("/")
# workflowdir = f"{output_dir}/workflow"
# sdm = "conda" if conda else "conda apptainer"
# command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . '
# command += f"--snakefile {workflowdir}/align-minimap.smk "
# command += f"--configfile {workflowdir}/config.yaml "
# if hpc:
# command += f"--workflow-profile {hpc} "
# if quiet:
# command += "--quiet all "
# if snakemake is not None:
# command += snakemake
# if print_only:
# click.echo(command)
# sys.exit(0)
#
# os.makedirs(f"{workflowdir}/", exist_ok= True)
# fqlist, sample_count = parse_fastq_inputs(inputs)
# validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
# fetch_rule(workflowdir, "align-minimap.smk")
# fetch_script(workflowdir, "assignMI.py")
# fetch_script(workflowdir, "bxStats.py")
# fetch_report(workflowdir, "AlignStats.Rmd")
#
# with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
# config.write("workflow: align minimap\n")
# config.write(f"genomefile: {genome}\n")
# config.write(f"seq_directory: {workflowdir}/input\n")
# config.write(f"output_directory: {output_dir}\n")
# config.write(f"quality: {quality_filter}\n")
# config.write(f"molecule_distance: {molecule_distance}\n")
# config.write(f"depth_windowsize: {depth_window}\n")
# config.write(f"skipreports: {skipreports}\n")
# if extra_params is not None:
# config.write(f"extra: {extra_params}\n")
# config.write(f"workflow_call: {command}\n")
# config.write("inputs:\n")
# config.write(f" genome: {Path(genome).resolve()}\n")
# config.write(" fastq:\n")
# for i in fqlist:
# config.write(f" - {i}\n")
#
# print_onstart(
# f"Samples: {sample_count}\nOutput Directory: {output_dir}",
# "align minimap"
# )
# generate_conda_deps()
# _module = subprocess.run(command.split())
# sys.exit(_module.returncode)

@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
@click.option('-r', '--read-length', default = "125", show_default = True, type = click.Choice(["50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
@click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory')
@click.option('-o', '--output-dir', type = str, default = "Align/strobealign", show_default=True, help = 'Name of output directory')
@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes')
@click.option('--hpc', type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission')
@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container')
@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
@click.option('--print-only', is_flag = True, hidden = True, default = False, help = 'Print the generated snakemake command and exit')
@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
"""
Align sequences to genome using Minimap2
Align sequences to genome using `strobealign`
Provide the input fastq files and/or directories at the end of the command as individual
files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does
strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does
not use barcodes when mapping. Harpy will post-processes the alignments using the
specified `--molecule-distance` to assign alignments to unique molecules.
specified `--molecule-distance` to assign alignments to unique molecules. The `--read-length` is
an *approximate* parameter and should be one of [`50`, `75`, `100`, `125`, `150`, `250`, `400`].
If your input is post-qc sequences, then you should expect the read lengths to be <150.
"""
output_dir = output_dir.rstrip("/")
workflowdir = f"{output_dir}/workflow"
sdm = "conda" if conda else "conda apptainer"
command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . '
command += f"--snakefile {workflowdir}/align-minimap.smk "
command += f"--snakefile {workflowdir}/align-strobealign.smk "
command += f"--configfile {workflowdir}/config.yaml "
if hpc:
command += f"--workflow-profile {hpc} "
Expand All @@ -278,18 +362,19 @@ def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, qua
os.makedirs(f"{workflowdir}/", exist_ok= True)
fqlist, sample_count = parse_fastq_inputs(inputs)
validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
fetch_rule(workflowdir, "align-minimap.smk")
fetch_rule(workflowdir, "align-strobealign.smk")
fetch_script(workflowdir, "assignMI.py")
fetch_script(workflowdir, "bxStats.py")
fetch_report(workflowdir, "AlignStats.Rmd")

with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
config.write("workflow: align minimap\n")
config.write("workflow: align strobe\n")
config.write(f"genomefile: {genome}\n")
config.write(f"seq_directory: {workflowdir}/input\n")
config.write(f"output_directory: {output_dir}\n")
config.write(f"quality: {quality_filter}\n")
config.write(f"molecule_distance: {molecule_distance}\n")
config.write(f"average_read_length: {read_length}\n")
config.write(f"depth_windowsize: {depth_window}\n")
config.write(f"skipreports: {skipreports}\n")
if extra_params is not None:
Expand All @@ -303,12 +388,13 @@ def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, qua

print_onstart(
f"Samples: {sample_count}\nOutput Directory: {output_dir}",
"align minimap"
"align strobe"
)
generate_conda_deps()
_module = subprocess.run(command.split())
sys.exit(_module.returncode)

align.add_command(bwa)
align.add_command(ema)
align.add_command(minimap)
#align.add_command(minimap)
align.add_command(strobe)
2 changes: 1 addition & 1 deletion src/harpy/conda_deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def generate_conda_deps():
condachannels = ["bioconda","conda-forge","defaults"]
environ = {
"qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc", "bioconda::pysam=0.22"],
"align": ["bioconda::bwa", "bioconda::ema","conda-forge::icu","conda-forge::libzlib", "bioconda::minimap2", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"],
"align": ["bioconda::bwa", "bioconda::ema","bioconda::strobealign", "conda-forge::icu","conda-forge::libzlib", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"],
"snp": ["bioconda::bcftools=1.20", "bioconda::freebayes=1.3.6"],
"sv": ["bioconda::leviathan", "bioconda::naibr-plus"],
"phase" : ["bioconda::hapcut2", "bioconda::whatshap"],
Expand Down
Loading

0 comments on commit 01c9cbe

Please sign in to comment.