Merge pull request #90 from pdimens/align_aggregate

replace minimap with strobealign
pdimens · Jun 21, 2024 · 01c9cbe · 01c9cbe
2 parents 1b6a2f1 + 3992eac
commit 01c9cbe
Show file tree

Hide file tree

Showing 7 changed files with 466 additions and 34 deletions.
diff --git a/.github/filters.yml b/.github/filters.yml
@@ -59,11 +59,11 @@ ema: &ema
   - 'src/harpy/scripts/countBX.py'
   - 'src/harpy/bin/makewindows.py'
   - 'test/fastq/**'
-minimap: &minimap
+strobealign: &strobealign
   - *common
   - *container
   - 'src/harpy/align.py'
-  - 'src/harpy/rules/align-minimap.smk'
+  - 'src/harpy/rules/align-strobealign.smk'
   - 'src/harpy/reports/AlignStats.Rmd'
   - 'src/harpy/reports/BxCount.Rmd'
   - 'src/harpy/scripts/bxStats.py'
@@ -150,7 +150,7 @@ modules:
   - *qc
   - *bwa
   - *ema
-  - *minimap
+  - *strobealign
   - *mpileup
   - *freebayes
   - *impute

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -31,7 +31,7 @@ jobs:
       qc: ${{ steps.filter.outputs.qc }}
       bwa: ${{ steps.filter.outputs.bwa }}
       ema: ${{ steps.filter.outputs.ema }}
-      minimap: ${{ steps.filter.outputs.minimap }}
+      strobealign: ${{ steps.filter.outputs.strobealign }}
       mpileup: ${{ steps.filter.outputs.mpileup }}
       freebayes: ${{ steps.filter.outputs.freebayes }}
       leviathan: ${{ steps.filter.outputs.leviathan }}
@@ -309,10 +309,10 @@ jobs:
         shell: micromamba-shell {0}
         run: harpy align ema --ema-bins 150 -g test/genome/genome.fasta.gz --snakemake "--show-failed-logs" -x "-d" test/fastq
 
-  minimap:
+  strobe:
     needs: [changes, pkgbuild]
-    if: ${{ needs.changes.outputs.minimap == 'true' && needs.pkgbuild.result == 'success' }}
-    name: align minimap
+    if: ${{ needs.changes.outputs.strobealign == 'true' && needs.pkgbuild.result == 'success' }}
+    name: align strobe
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
@@ -343,9 +343,9 @@ jobs:
         with:
           name: deps-image
           path: .snakemake/singularity
-      - name: test minimap
+      - name: test strobealign
         shell: micromamba-shell {0}
-        run: harpy align minimap -g test/genome/genome.fasta.gz --snakemake "--show-failed-logs" -x "--seed 13" test/fastq
+        run: harpy align strobe -r 125 -g test/genome/genome.fasta.gz --snakemake "--show-failed-logs" test/fastq
 
   mpileup:
     needs: [changes, pkgbuild]

diff --git a/src/harpy/align.py b/src/harpy/align.py
@@ -17,18 +17,13 @@ def align():
     """
     Align sample sequences to a reference genome
 
-    The three available aligners all retain the linked-read barcode information in the
-    resulting output, however `EMA` is the only aligner to use the barcode information
-    to facilitate the aligning process and can be prohibitively slow. The `minimap2`
-    aligner is the fastest of the three and is comparable in accuracy to `bwa` for
+    The available aligners all retain the linked-read barcode information in the
+    resulting output, however `ema` is the only aligner to use the barcode information
+    to facilitate the aligning process and can be prohibitively slow. The `strobe`
+    aligner is the fastest option and is comparable in accuracy (or better) to `bwa` for
     sequences >100bp.
 
-    **Aligners**
-    - `bwa`: uses BWA MEM to align reads (fast)
-    - `ema`: uses the BX barcode-aware EMA aligner (very slow)
-    - `minimap`: uses minimap2 to align reads (ultra fast)
-
-    Provide an additional subcommand `bwa`, `ema`, or `minimap` to get more information on using
+    Provide an additional subcommand `bwa`, `ema`, or `strobe` to get more information on using
     those aligners.
     """
 
@@ -62,6 +57,16 @@ def align():
             "name": "Other Options",
             "options": ["--output-dir", "--threads", "--depth-window", "--skipreports", "--conda", "--snakemake", "--quiet", "--help"],
         },
+    ],
+    "harpy align strobe": [
+        {
+            "name": "Parameters",
+            "options": ["--genome", "--read-length", "--quality-filter", "--molecule-distance", "--extra-params"],
+        },
+        {
+            "name": "Other Options",
+            "options": ["--output-dir", "--threads", "--depth-window", "--skipreports", "--conda", "--snakemake", "--quiet", "--help"],
+        },
     ]
 }
 
@@ -82,7 +87,7 @@ def align():
 @click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
 def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
     """
-    Align sequences to genome using BWA MEM
+    Align sequences to genome using `BWA MEM`
  
     Provide the input fastq files and/or directories at the end of the command as individual
     files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
@@ -158,7 +163,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality
 @click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
 def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, ema_bins, skipreports, extra_params, quality_filter, snakemake, quiet, hpc, conda, print_only):
     """
-    Align sequences to a genome using EMA
+    Align sequences to genome using `EMA`
 
     Provide the input fastq files and/or directories at the end of the
     command as individual files/folders, using shell wildcards
@@ -233,37 +238,116 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
     _module = subprocess.run(command.split())
     sys.exit(_module.returncode)
 
+#@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
+#@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
+#@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
+#@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
+#@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
+#@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
+#@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
+#@click.option('-q', '--quiet',  is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
+#@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory')
+#@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes')
+#@click.option('--hpc',  type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission')
+#@click.option('--conda',  is_flag = True, default = False, help = 'Use conda/mamba instead of container')
+#@click.option('--skipreports',  is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
+#@click.option('--print-only',  is_flag = True, hidden = True, default = False, help = 'Print the generated snakemake command and exit')
+#@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
+#def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
+#    """
+#    Align sequences to genome using `Minimap2`
+# 
+#    Provide the input fastq files and/or directories at the end of the command as individual
+#    files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
+#    
+#    Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does 
+#    not use barcodes when mapping. Harpy will post-processes the alignments using the
+#    specified `--molecule-distance` to assign alignments to unique molecules. 
+#    """
+#    output_dir = output_dir.rstrip("/")
+#    workflowdir = f"{output_dir}/workflow"
+#    sdm = "conda" if conda else "conda apptainer"
+#    command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . '
+#    command += f"--snakefile {workflowdir}/align-minimap.smk "
+#    command += f"--configfile {workflowdir}/config.yaml "
+#    if hpc:
+#        command += f"--workflow-profile {hpc} "
+#    if quiet:
+#        command += "--quiet all "
+#    if snakemake is not None:
+#        command +=  snakemake
+#    if print_only:
+#        click.echo(command)
+#        sys.exit(0)
+#
+#    os.makedirs(f"{workflowdir}/", exist_ok= True)
+#    fqlist, sample_count = parse_fastq_inputs(inputs)
+#    validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
+#    fetch_rule(workflowdir, "align-minimap.smk")
+#    fetch_script(workflowdir, "assignMI.py")
+#    fetch_script(workflowdir, "bxStats.py")
+#    fetch_report(workflowdir, "AlignStats.Rmd")
+#
+#    with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
+#        config.write("workflow: align minimap\n")
+#        config.write(f"genomefile: {genome}\n")
+#        config.write(f"seq_directory: {workflowdir}/input\n")
+#        config.write(f"output_directory: {output_dir}\n")
+#        config.write(f"quality: {quality_filter}\n")
+#        config.write(f"molecule_distance: {molecule_distance}\n")
+#        config.write(f"depth_windowsize: {depth_window}\n")
+#        config.write(f"skipreports: {skipreports}\n")
+#        if extra_params is not None:
+#            config.write(f"extra: {extra_params}\n")
+#        config.write(f"workflow_call: {command}\n")
+#        config.write("inputs:\n")
+#        config.write(f"  genome: {Path(genome).resolve()}\n")
+#        config.write("  fastq:\n")
+#        for i in fqlist:
+#            config.write(f"    - {i}\n")
+#
+#    print_onstart(
+#        f"Samples: {sample_count}\nOutput Directory: {output_dir}",
+#        "align minimap"
+#    )
+#    generate_conda_deps()
+#    _module = subprocess.run(command.split())
+#    sys.exit(_module.returncode)
+
 @click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
 @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
 @click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
 @click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
+@click.option('-r', '--read-length', default = "125", show_default = True, type = click.Choice(["50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
 @click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
 @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
 @click.option('-q', '--quiet',  is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
-@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory')
+@click.option('-o', '--output-dir', type = str, default = "Align/strobealign", show_default=True, help = 'Name of output directory')
 @click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes')
 @click.option('--hpc',  type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission')
 @click.option('--conda',  is_flag = True, default = False, help = 'Use conda/mamba instead of container')
 @click.option('--skipreports',  is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
 @click.option('--print-only',  is_flag = True, hidden = True, default = False, help = 'Print the generated snakemake command and exit')
 @click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
-def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
+def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
     """
-    Align sequences to genome using Minimap2
+    Align sequences to genome using `strobealign`
  
     Provide the input fastq files and/or directories at the end of the command as individual
     files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
     
-    Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does 
+    strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does 
     not use barcodes when mapping. Harpy will post-processes the alignments using the
-    specified `--molecule-distance` to assign alignments to unique molecules. 
+    specified `--molecule-distance` to assign alignments to unique molecules. The `--read-length` is
+    an *approximate* parameter and should be one of [`50`, `75`, `100`, `125`, `150`, `250`, `400`].
+    If your input is post-qc sequences, then you should expect the read lengths to be <150.
     """
     output_dir = output_dir.rstrip("/")
     workflowdir = f"{output_dir}/workflow"
     sdm = "conda" if conda else "conda apptainer"
     command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . '
-    command += f"--snakefile {workflowdir}/align-minimap.smk "
+    command += f"--snakefile {workflowdir}/align-strobealign.smk "
     command += f"--configfile {workflowdir}/config.yaml "
     if hpc:
         command += f"--workflow-profile {hpc} "
@@ -278,18 +362,19 @@ def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, qua
     os.makedirs(f"{workflowdir}/", exist_ok= True)
     fqlist, sample_count = parse_fastq_inputs(inputs)
     validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
-    fetch_rule(workflowdir, "align-minimap.smk")
+    fetch_rule(workflowdir, "align-strobealign.smk")
     fetch_script(workflowdir, "assignMI.py")
     fetch_script(workflowdir, "bxStats.py")
     fetch_report(workflowdir, "AlignStats.Rmd")
 
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
-        config.write("workflow: align minimap\n")
+        config.write("workflow: align strobe\n")
         config.write(f"genomefile: {genome}\n")
         config.write(f"seq_directory: {workflowdir}/input\n")
         config.write(f"output_directory: {output_dir}\n")
         config.write(f"quality: {quality_filter}\n")
         config.write(f"molecule_distance: {molecule_distance}\n")
+        config.write(f"average_read_length: {read_length}\n")
         config.write(f"depth_windowsize: {depth_window}\n")
         config.write(f"skipreports: {skipreports}\n")
         if extra_params is not None:
@@ -303,12 +388,13 @@ def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, qua
 
     print_onstart(
         f"Samples: {sample_count}\nOutput Directory: {output_dir}",
-        "align minimap"
+        "align strobe"
     )
     generate_conda_deps()
     _module = subprocess.run(command.split())
     sys.exit(_module.returncode)
 
 align.add_command(bwa)
 align.add_command(ema)
-align.add_command(minimap)
+#align.add_command(minimap)
+align.add_command(strobe)
diff --git a/src/harpy/conda_deps.py b/src/harpy/conda_deps.py
@@ -7,7 +7,7 @@ def generate_conda_deps():
     condachannels = ["bioconda","conda-forge","defaults"]
     environ = {
         "qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc", "bioconda::pysam=0.22"],
-        "align": ["bioconda::bwa", "bioconda::ema","conda-forge::icu","conda-forge::libzlib", "bioconda::minimap2", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"],
+        "align": ["bioconda::bwa", "bioconda::ema","bioconda::strobealign", "conda-forge::icu","conda-forge::libzlib", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"],
         "snp": ["bioconda::bcftools=1.20", "bioconda::freebayes=1.3.6"],
         "sv": ["bioconda::leviathan", "bioconda::naibr-plus"],
         "phase" : ["bioconda::hapcut2", "bioconda::whatshap"],