Merge pull request #92 from pdimens/align_aggregate

Add aggregate report for bx stats
pdimens · Jun 28, 2024 · 0cf2243 · 0cf2243
2 parents 01c9cbe + ac458c3
commit 0cf2243
Show file tree

Hide file tree

Showing 26 changed files with 820 additions and 703 deletions.
diff --git a/src/harpy/reports/EmaCount.Rmd → .deprecated/EmaCount.Rmd b/src/harpy/reports/EmaCount.Rmd → .deprecated/EmaCount.Rmd
@@ -1,12 +1,15 @@
 ---
 title: "EMA Count Barcode Summary"
-date: "`r format(Sys.time(), '%d %B, %Y')`"
+date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`"
 output:
   flexdashboard::flex_dashboard:
     theme: lumen
     orientation: rows
     vertical_layout: scroll
-    favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png"
+    logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png
+    favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png"
+    navbar:
+      - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right }
 ---
 
 ```{r echo = FALSE, message = FALSE}
@@ -132,7 +135,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "nor
   hc_caption(text = "As determined by ema count, given as counts") |>
   hc_tooltip(crosshairs = TRUE) |>
   hc_exporting(enabled = T, filename = "ema.bxcount",
-    buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
+    buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
   )
 ```
 ### proportional valid barcodes
@@ -145,7 +148,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "per
   hc_caption(text = "As determined by ema count, given as percents") |>
   hc_tooltip(crosshairs = TRUE) |>
   hc_exporting(enabled = T, filename = "ema.bxpercent",
-    buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
+    buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
 )
 ```
 

diff --git a/src/harpy/snakefiles/align-minimap.smk → .deprecated/align-minimap.smk b/src/harpy/snakefiles/align-minimap.smk → .deprecated/align-minimap.smk
@@ -52,7 +52,7 @@ d = dict(zip(samplenames, samplenames))
 def get_fq(wildcards):
     # returns a list of fastq files for read 1 based on *wildcards.sample* e.g.
     r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE)
-    return list(filter(r.match, fqlist))[:2]
+    return sorted(list(filter(r.match, fqlist))[:2])
 
 rule genome_setup:
     input:
@@ -316,7 +316,7 @@ rule samtools_reports:
         "Summarizing samtools stats and flagstat"
     shell:
         """
-        multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null
+        multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null
         """
 
 rule log_workflow:

diff --git a/src/harpy/align.py b/src/harpy/align.py
@@ -119,6 +119,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality
     fetch_script(workflowdir, "assignMI.py")
     fetch_script(workflowdir, "bxStats.py")
     fetch_report(workflowdir, "AlignStats.Rmd")
+    fetch_report(workflowdir, "AlignBxStats.Rmd")
 
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
         config.write("workflow: align bwa\n")
@@ -208,8 +209,8 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
     validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
     fetch_rule(workflowdir, "align-ema.smk")
     fetch_script(workflowdir, "bxStats.py")
-    for i in ["EmaCount", "AlignStats"]:
-        fetch_report(workflowdir, f"{i}.Rmd")
+    fetch_report(workflowdir, "AlignStats.Rmd")
+    fetch_report(workflowdir, "AlignBxStats.Rmd")
 
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
         config.write("workflow: align ema\n")
@@ -238,87 +239,11 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
     _module = subprocess.run(command.split())
     sys.exit(_module.returncode)
 
-#@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
-#@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
-#@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
-#@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
-#@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
-#@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
-#@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
-#@click.option('-q', '--quiet',  is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
-#@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory')
-#@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes')
-#@click.option('--hpc',  type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission')
-#@click.option('--conda',  is_flag = True, default = False, help = 'Use conda/mamba instead of container')
-#@click.option('--skipreports',  is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
-#@click.option('--print-only',  is_flag = True, hidden = True, default = False, help = 'Print the generated snakemake command and exit')
-#@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
-#def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
-#    """
-#    Align sequences to genome using `Minimap2`
-# 
-#    Provide the input fastq files and/or directories at the end of the command as individual
-#    files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
-#    
-#    Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does 
-#    not use barcodes when mapping. Harpy will post-processes the alignments using the
-#    specified `--molecule-distance` to assign alignments to unique molecules. 
-#    """
-#    output_dir = output_dir.rstrip("/")
-#    workflowdir = f"{output_dir}/workflow"
-#    sdm = "conda" if conda else "conda apptainer"
-#    command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . '
-#    command += f"--snakefile {workflowdir}/align-minimap.smk "
-#    command += f"--configfile {workflowdir}/config.yaml "
-#    if hpc:
-#        command += f"--workflow-profile {hpc} "
-#    if quiet:
-#        command += "--quiet all "
-#    if snakemake is not None:
-#        command +=  snakemake
-#    if print_only:
-#        click.echo(command)
-#        sys.exit(0)
-#
-#    os.makedirs(f"{workflowdir}/", exist_ok= True)
-#    fqlist, sample_count = parse_fastq_inputs(inputs)
-#    validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
-#    fetch_rule(workflowdir, "align-minimap.smk")
-#    fetch_script(workflowdir, "assignMI.py")
-#    fetch_script(workflowdir, "bxStats.py")
-#    fetch_report(workflowdir, "AlignStats.Rmd")
-#
-#    with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
-#        config.write("workflow: align minimap\n")
-#        config.write(f"genomefile: {genome}\n")
-#        config.write(f"seq_directory: {workflowdir}/input\n")
-#        config.write(f"output_directory: {output_dir}\n")
-#        config.write(f"quality: {quality_filter}\n")
-#        config.write(f"molecule_distance: {molecule_distance}\n")
-#        config.write(f"depth_windowsize: {depth_window}\n")
-#        config.write(f"skipreports: {skipreports}\n")
-#        if extra_params is not None:
-#            config.write(f"extra: {extra_params}\n")
-#        config.write(f"workflow_call: {command}\n")
-#        config.write("inputs:\n")
-#        config.write(f"  genome: {Path(genome).resolve()}\n")
-#        config.write("  fastq:\n")
-#        for i in fqlist:
-#            config.write(f"    - {i}\n")
-#
-#    print_onstart(
-#        f"Samples: {sample_count}\nOutput Directory: {output_dir}",
-#        "align minimap"
-#    )
-#    generate_conda_deps()
-#    _module = subprocess.run(command.split())
-#    sys.exit(_module.returncode)
-
 @click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
 @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
 @click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
 @click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
-@click.option('-r', '--read-length', default = "125", show_default = True, type = click.Choice(["50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
+@click.option('-r', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
 @click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
 @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
 @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
@@ -337,11 +262,12 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra
     Provide the input fastq files and/or directories at the end of the command as individual
     files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
     
-    strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does 
-    not use barcodes when mapping. Harpy will post-processes the alignments using the
+    strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp and does 
+    not use barcodes when mapping, so Harpy will post-processes the alignments using the
     specified `--molecule-distance` to assign alignments to unique molecules. The `--read-length` is
-    an *approximate* parameter and should be one of [`50`, `75`, `100`, `125`, `150`, `250`, `400`].
-    If your input is post-qc sequences, then you should expect the read lengths to be <150.
+    an *approximate* parameter and should be one of [`auto`, `50`, `75`, `100`, `125`, `150`, `250`, `400`].
+    The alignment process will be faster and take up less disk/RAM if you specify an `-r` value that isn't
+    `auto`. If your input has adapters removed, then you should expect the read lengths to be <150.
     """
     output_dir = output_dir.rstrip("/")
     workflowdir = f"{output_dir}/workflow"
@@ -366,6 +292,7 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra
     fetch_script(workflowdir, "assignMI.py")
     fetch_script(workflowdir, "bxStats.py")
     fetch_report(workflowdir, "AlignStats.Rmd")
+    fetch_report(workflowdir, "AlignBxStats.Rmd")
 
     with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
         config.write("workflow: align strobe\n")
@@ -396,5 +323,4 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra
 
 align.add_command(bwa)
 align.add_command(ema)
-#align.add_command(minimap)
 align.add_command(strobe)
diff --git a/src/harpy/conda_deps.py b/src/harpy/conda_deps.py
@@ -6,7 +6,7 @@ def generate_conda_deps():
     """Create the YAML files of the workflow conda dependencies"""
     condachannels = ["bioconda","conda-forge","defaults"]
     environ = {
-        "qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc", "bioconda::pysam=0.22"],
+        "qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc=1.22", "bioconda::pysam=0.22"],
         "align": ["bioconda::bwa", "bioconda::ema","bioconda::strobealign", "conda-forge::icu","conda-forge::libzlib", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"],
         "snp": ["bioconda::bcftools=1.20", "bioconda::freebayes=1.3.6"],
         "sv": ["bioconda::leviathan", "bioconda::naibr-plus"],

diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py
@@ -5,7 +5,6 @@
 import sys
 import subprocess
 from pathlib import Path
-from collections import Counter
 from rich.markdown import Markdown
 import rich_click as click
 from .printfunctions import print_error, print_solution_with_culprits
@@ -117,40 +116,36 @@ def parse_alignment_inputs(inputs):
         sys.exit(1)
     return bam_infiles, len(uniqs)
 
-#def get_samples_from_fastq(directory):
-#    """Identify the sample names from a directory containing FASTQ files"""
-#    full_flist = [i for i in glob.iglob(f"{directory}/*") if not os.path.isdir(i)]
-#    r = re.compile(r".*\.f(?:ast)?q(?:\.gz)?$", flags=re.IGNORECASE)
-#    full_fqlist = list(filter(r.match, full_flist))
-#    fqlist = [os.path.basename(i) for i in full_fqlist]
-#    bn_r = r"[\.\_][RF](?:[12])?(?:\_00[1-9])*\.f(?:ast)?q(?:\.gz)?$"
-#    if len(fqlist) == 0:
-#        print_error(f"No fastq files with acceptable names found in [bold]{directory}[/bold]")
-#        print_solution("Check that the file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]\nRead the documentation for details: https://pdimens.github.io/harpy/haplotagdata/#naming-conventions")
-#        sys.exit(1)
-#
-#    return set([re.sub(bn_r, "", i, flags = re.IGNORECASE) for i in fqlist])
-
-
 def biallelic_contigs(vcf, workdir):
-    """Identify which contigs have at least 2 biallelic SNPs"""
+    """Identify which contigs have at least 2 biallelic SNPs and write them to workdir/vcf.biallelic"""
     vbn = os.path.basename(vcf)
-    if os.path.exists(f"{workdir}/{vbn}.biallelic"):
-        with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f:
-            contigs = [line.rstrip() for line in f]
-        click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr)
-    else:
-        click.echo("Identifying which contigs have at least 2 biallelic SNPs", file = sys.stderr)
-        os.makedirs(f"{workdir}/", exist_ok = True)
-        biallelic = subprocess.Popen(f"bcftools view -M2 -v snps {vcf} -Ob".split(), stdout = subprocess.PIPE)
-        contigs = subprocess.run("""bcftools query -f '%CHROM\\n'""".split(), stdin = biallelic.stdout, stdout = subprocess.PIPE, check = False).stdout.decode().splitlines()
-        counts = Counter(contigs)
-        contigs = [i.replace("\'", "") for i in counts if counts[i] > 1]
-        with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f:
-            _ = [f.write(f"{i}\n") for i in contigs]
+    os.makedirs(f"{workdir}/", exist_ok = True)
+    valid = []
+    vcfheader = subprocess.check_output(['bcftools', 'view', '-h', vcf]).decode().split('\n')
+    header_contigs = [i.split(",")[0].replace("##contig=<ID=","") for i in vcfheader if i.startswith("##contig=")]
+    if vcf.lower().endswith("bcf") and not os.path.exists(f"{vcf}.csi"):
+        subprocess.run(f"bcftools index {vcf}".split())
+    if vcf.lower().endswith("vcf.gz") and not os.path.exists(f"{vcf}.csi"):
+        subprocess.run(f"bcftools index --tbi {vcf}".split())
 
-    if len(contigs) == 0:
-        print_error("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.")
+    for contig in header_contigs:
+        # Use bcftools to count the number of biallelic SNPs in the contig
+        viewcmd = subprocess.Popen(['bcftools', 'view', '-r', contig, '-v', 'snps', '-m2', '-M2', '-c', '2', vcf], stdout=subprocess.PIPE)
+        snpcount = 0
+        while True:
+            # Read the next line of output
+            line = viewcmd.stdout.readline().decode()
+            if not line:
+                break
+            snpcount += 1
+            # If there are at least 2 biallellic snps, terminate the process
+            if snpcount >= 2:
+                valid.append(contig)
+                viewcmd.terminate()
+                break
+    if not valid:
+        click.echo("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.")
         sys.exit(1)
-    else:
-        return f"{workdir}/{vbn}.biallelic"
+    with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f:
+        f.write("\n".join(valid))
+    return f"{workdir}/{vbn}.biallelic", len(valid)
diff --git a/src/harpy/impute.py b/src/harpy/impute.py
@@ -46,10 +46,9 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para
     individual files/folders, using shell wildcards (e.g. `data/drosophila*.bam`), or both.
     
     Requires a parameter file, use **harpy stitchparams** to generate one and adjust it for your study.
-    The `--vcf-samples` flag toggles to phase only the samples present in your input `--vcf` file rather than all
+    The `--vcf-samples` option considers only the samples present in your input `--vcf` file rather than all
     the samples identified in `INPUT`. If providing additional STITCH arguments, they must be in quotes and 
-    in R language style. The extra parameters will remain constant across different models.
-    Use single-quotes (string literals) if supplying an argument that requires quotes. For example:
+    in R language style. Use single-quotes (string literals) if supplying an argument that requires quotes. For example:
     
     ```
     harpy ... -x 'switchModelIteration = 39, splitReadIterations = NA, reference_populations = c("CEU","GBR")'...
@@ -72,12 +71,12 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para
         sys.exit(0)
 
     os.makedirs(f"{workflowdir}/", exist_ok= True)
-    bamlist, n = parse_alignment_inputs(inputs)
     validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"])
-    samplenames = vcf_samplematch(vcf, bamlist, vcf_samples)
-    validate_bam_RG(bamlist)
     check_impute_params(parameters)
-    biallelic = biallelic_contigs(vcf, f"{workflowdir}")
+    bamlist, n = parse_alignment_inputs(inputs)
+    validate_bam_RG(bamlist)
+    samplenames = vcf_samplematch(vcf, bamlist, vcf_samples)
+    biallelic, n_biallelic = biallelic_contigs(vcf, f"{workflowdir}")
 
     fetch_rule(workflowdir, "impute.smk")
     fetch_script(workflowdir, "stitch_impute.R")
@@ -99,9 +98,9 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para
         config.write("  alignments:\n")
         for i in bamlist:
             config.write(f"    - {i}\n")
-
+    
     print_onstart(
-        f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/",
+        f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nContigs with ≥2 Biallelic SNPs: {n_biallelic}\nOutput Directory: {output_dir}/",
         "impute"
     )
     generate_conda_deps()