diff --git a/src/harpy/reports/EmaCount.Rmd b/.deprecated/EmaCount.Rmd similarity index 91% rename from src/harpy/reports/EmaCount.Rmd rename to .deprecated/EmaCount.Rmd index 87d64d91b..598d3db52 100644 --- a/src/harpy/reports/EmaCount.Rmd +++ b/.deprecated/EmaCount.Rmd @@ -1,12 +1,15 @@ --- title: "EMA Count Barcode Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo = FALSE, message = FALSE} @@ -132,7 +135,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "nor hc_caption(text = "As determined by ema count, given as counts") |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "ema.bxcount", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` ### proportional valid barcodes @@ -145,7 +148,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "per hc_caption(text = "As determined by ema count, given as percents") |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "ema.bxpercent", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/snakefiles/align-minimap.smk b/.deprecated/align-minimap.smk similarity index 96% rename from src/harpy/snakefiles/align-minimap.smk rename to .deprecated/align-minimap.smk index 5fb40c9c8..055a3dc63 100644 --- a/src/harpy/snakefiles/align-minimap.smk +++ b/.deprecated/align-minimap.smk @@ -52,7 +52,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: @@ -316,7 +316,7 @@ rule samtools_reports: "Summarizing samtools stats and flagstat" shell: """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null + multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null """ rule log_workflow: diff --git a/src/harpy/align.py b/src/harpy/align.py index 08acc4a7c..a0e03f29c 100644 --- a/src/harpy/align.py +++ b/src/harpy/align.py @@ -119,6 +119,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality fetch_script(workflowdir, "assignMI.py") fetch_script(workflowdir, "bxStats.py") fetch_report(workflowdir, "AlignStats.Rmd") + fetch_report(workflowdir, "AlignBxStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align bwa\n") @@ -208,8 +209,8 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"]) fetch_rule(workflowdir, "align-ema.smk") fetch_script(workflowdir, "bxStats.py") - for i in ["EmaCount", "AlignStats"]: - fetch_report(workflowdir, f"{i}.Rmd") + fetch_report(workflowdir, "AlignStats.Rmd") + fetch_report(workflowdir, "AlignBxStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align ema\n") @@ -238,87 +239,11 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, _module = subprocess.run(command.split()) sys.exit(_module.returncode) -#@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/") -#@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping') -#@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules') -#@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') -#@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') -#@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') -#@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -#@click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -#@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory') -#@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -#@click.option('--hpc', type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission') -#@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container') -#@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -#@click.option('--print-only', is_flag = True, hidden = True, default = False, help = 'Print the generated snakemake command and exit') -#@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1) -#def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only): -# """ -# Align sequences to genome using `Minimap2` -# -# Provide the input fastq files and/or directories at the end of the command as individual -# files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both. -# -# Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does -# not use barcodes when mapping. Harpy will post-processes the alignments using the -# specified `--molecule-distance` to assign alignments to unique molecules. -# """ -# output_dir = output_dir.rstrip("/") -# workflowdir = f"{output_dir}/workflow" -# sdm = "conda" if conda else "conda apptainer" -# command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . ' -# command += f"--snakefile {workflowdir}/align-minimap.smk " -# command += f"--configfile {workflowdir}/config.yaml " -# if hpc: -# command += f"--workflow-profile {hpc} " -# if quiet: -# command += "--quiet all " -# if snakemake is not None: -# command += snakemake -# if print_only: -# click.echo(command) -# sys.exit(0) -# -# os.makedirs(f"{workflowdir}/", exist_ok= True) -# fqlist, sample_count = parse_fastq_inputs(inputs) -# validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"]) -# fetch_rule(workflowdir, "align-minimap.smk") -# fetch_script(workflowdir, "assignMI.py") -# fetch_script(workflowdir, "bxStats.py") -# fetch_report(workflowdir, "AlignStats.Rmd") -# -# with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: -# config.write("workflow: align minimap\n") -# config.write(f"genomefile: {genome}\n") -# config.write(f"seq_directory: {workflowdir}/input\n") -# config.write(f"output_directory: {output_dir}\n") -# config.write(f"quality: {quality_filter}\n") -# config.write(f"molecule_distance: {molecule_distance}\n") -# config.write(f"depth_windowsize: {depth_window}\n") -# config.write(f"skipreports: {skipreports}\n") -# if extra_params is not None: -# config.write(f"extra: {extra_params}\n") -# config.write(f"workflow_call: {command}\n") -# config.write("inputs:\n") -# config.write(f" genome: {Path(genome).resolve()}\n") -# config.write(" fastq:\n") -# for i in fqlist: -# config.write(f" - {i}\n") -# -# print_onstart( -# f"Samples: {sample_count}\nOutput Directory: {output_dir}", -# "align minimap" -# ) -# generate_conda_deps() -# _module = subprocess.run(command.split()) -# sys.exit(_module.returncode) - @click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/") @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping') @click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules') @click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') -@click.option('-r', '--read-length', default = "125", show_default = True, type = click.Choice(["50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index') +@click.option('-r', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index') @click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') @@ -337,11 +262,12 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra Provide the input fastq files and/or directories at the end of the command as individual files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both. - strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does - not use barcodes when mapping. Harpy will post-processes the alignments using the + strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp and does + not use barcodes when mapping, so Harpy will post-processes the alignments using the specified `--molecule-distance` to assign alignments to unique molecules. The `--read-length` is - an *approximate* parameter and should be one of [`50`, `75`, `100`, `125`, `150`, `250`, `400`]. - If your input is post-qc sequences, then you should expect the read lengths to be <150. + an *approximate* parameter and should be one of [`auto`, `50`, `75`, `100`, `125`, `150`, `250`, `400`]. + The alignment process will be faster and take up less disk/RAM if you specify an `-r` value that isn't + `auto`. If your input has adapters removed, then you should expect the read lengths to be <150. """ output_dir = output_dir.rstrip("/") workflowdir = f"{output_dir}/workflow" @@ -366,6 +292,7 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra fetch_script(workflowdir, "assignMI.py") fetch_script(workflowdir, "bxStats.py") fetch_report(workflowdir, "AlignStats.Rmd") + fetch_report(workflowdir, "AlignBxStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align strobe\n") @@ -396,5 +323,4 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra align.add_command(bwa) align.add_command(ema) -#align.add_command(minimap) align.add_command(strobe) diff --git a/src/harpy/conda_deps.py b/src/harpy/conda_deps.py index 9a3332a30..6132b5696 100644 --- a/src/harpy/conda_deps.py +++ b/src/harpy/conda_deps.py @@ -6,7 +6,7 @@ def generate_conda_deps(): """Create the YAML files of the workflow conda dependencies""" condachannels = ["bioconda","conda-forge","defaults"] environ = { - "qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc", "bioconda::pysam=0.22"], + "qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc=1.22", "bioconda::pysam=0.22"], "align": ["bioconda::bwa", "bioconda::ema","bioconda::strobealign", "conda-forge::icu","conda-forge::libzlib", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"], "snp": ["bioconda::bcftools=1.20", "bioconda::freebayes=1.3.6"], "sv": ["bioconda::leviathan", "bioconda::naibr-plus"], diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py index a601f3d4a..72ac61bbd 100644 --- a/src/harpy/fileparsers.py +++ b/src/harpy/fileparsers.py @@ -5,7 +5,6 @@ import sys import subprocess from pathlib import Path -from collections import Counter from rich.markdown import Markdown import rich_click as click from .printfunctions import print_error, print_solution_with_culprits @@ -117,40 +116,36 @@ def parse_alignment_inputs(inputs): sys.exit(1) return bam_infiles, len(uniqs) -#def get_samples_from_fastq(directory): -# """Identify the sample names from a directory containing FASTQ files""" -# full_flist = [i for i in glob.iglob(f"{directory}/*") if not os.path.isdir(i)] -# r = re.compile(r".*\.f(?:ast)?q(?:\.gz)?$", flags=re.IGNORECASE) -# full_fqlist = list(filter(r.match, full_flist)) -# fqlist = [os.path.basename(i) for i in full_fqlist] -# bn_r = r"[\.\_][RF](?:[12])?(?:\_00[1-9])*\.f(?:ast)?q(?:\.gz)?$" -# if len(fqlist) == 0: -# print_error(f"No fastq files with acceptable names found in [bold]{directory}[/bold]") -# print_solution("Check that the file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]\nRead the documentation for details: https://pdimens.github.io/harpy/haplotagdata/#naming-conventions") -# sys.exit(1) -# -# return set([re.sub(bn_r, "", i, flags = re.IGNORECASE) for i in fqlist]) - - def biallelic_contigs(vcf, workdir): - """Identify which contigs have at least 2 biallelic SNPs""" + """Identify which contigs have at least 2 biallelic SNPs and write them to workdir/vcf.biallelic""" vbn = os.path.basename(vcf) - if os.path.exists(f"{workdir}/{vbn}.biallelic"): - with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: - contigs = [line.rstrip() for line in f] - click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr) - else: - click.echo("Identifying which contigs have at least 2 biallelic SNPs", file = sys.stderr) - os.makedirs(f"{workdir}/", exist_ok = True) - biallelic = subprocess.Popen(f"bcftools view -M2 -v snps {vcf} -Ob".split(), stdout = subprocess.PIPE) - contigs = subprocess.run("""bcftools query -f '%CHROM\\n'""".split(), stdin = biallelic.stdout, stdout = subprocess.PIPE, check = False).stdout.decode().splitlines() - counts = Counter(contigs) - contigs = [i.replace("\'", "") for i in counts if counts[i] > 1] - with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f: - _ = [f.write(f"{i}\n") for i in contigs] + os.makedirs(f"{workdir}/", exist_ok = True) + valid = [] + vcfheader = subprocess.check_output(['bcftools', 'view', '-h', vcf]).decode().split('\n') + header_contigs = [i.split(",")[0].replace("##contig== 2: + valid.append(contig) + viewcmd.terminate() + break + if not valid: + click.echo("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.") sys.exit(1) - else: - return f"{workdir}/{vbn}.biallelic" \ No newline at end of file + with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f: + f.write("\n".join(valid)) + return f"{workdir}/{vbn}.biallelic", len(valid) \ No newline at end of file diff --git a/src/harpy/impute.py b/src/harpy/impute.py index 2dbed6b83..6ee7d4269 100644 --- a/src/harpy/impute.py +++ b/src/harpy/impute.py @@ -46,10 +46,9 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para individual files/folders, using shell wildcards (e.g. `data/drosophila*.bam`), or both. Requires a parameter file, use **harpy stitchparams** to generate one and adjust it for your study. - The `--vcf-samples` flag toggles to phase only the samples present in your input `--vcf` file rather than all + The `--vcf-samples` option considers only the samples present in your input `--vcf` file rather than all the samples identified in `INPUT`. If providing additional STITCH arguments, they must be in quotes and - in R language style. The extra parameters will remain constant across different models. - Use single-quotes (string literals) if supplying an argument that requires quotes. For example: + in R language style. Use single-quotes (string literals) if supplying an argument that requires quotes. For example: ``` harpy ... -x 'switchModelIteration = 39, splitReadIterations = NA, reference_populations = c("CEU","GBR")'... @@ -72,12 +71,12 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para sys.exit(0) os.makedirs(f"{workflowdir}/", exist_ok= True) - bamlist, n = parse_alignment_inputs(inputs) validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) - samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) - validate_bam_RG(bamlist) check_impute_params(parameters) - biallelic = biallelic_contigs(vcf, f"{workflowdir}") + bamlist, n = parse_alignment_inputs(inputs) + validate_bam_RG(bamlist) + samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) + biallelic, n_biallelic = biallelic_contigs(vcf, f"{workflowdir}") fetch_rule(workflowdir, "impute.smk") fetch_script(workflowdir, "stitch_impute.R") @@ -99,9 +98,9 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para config.write(" alignments:\n") for i in bamlist: config.write(f" - {i}\n") - + print_onstart( - f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/", + f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nContigs with ≥2 Biallelic SNPs: {n_biallelic}\nOutput Directory: {output_dir}/", "impute" ) generate_conda_deps() diff --git a/src/harpy/reports/AlignBxStats.Rmd b/src/harpy/reports/AlignBxStats.Rmd new file mode 100644 index 000000000..c68645a5f --- /dev/null +++ b/src/harpy/reports/AlignBxStats.Rmd @@ -0,0 +1,266 @@ +--- +title: "Harpy Alignment Barcode Summary" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" +output: + flexdashboard::flex_dashboard: + theme: lumen + orientation: rows + vertical_layout: scroll + horizontal_layout: fill + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } +--- + +```{r imports, echo = F, results = F, message = F} +using<-function(...) { + libs<-unlist(list(...)) + req<-unlist(lapply(libs,require,character.only=TRUE)) + need<-libs[req==FALSE] + if(length(need)>0){ + install.packages(need, repos = "https://cloud.r-project.org/") + lapply(need,require,character.only=TRUE) + } +} + +using("flexdashboard","dplyr","tidyr", "highcharter","DT") +``` + +```{r nxx_and_process_funs, echo = F, results = F, message = F} +NX <- function(lengths, X=50){ + lengths <- as.numeric(sort(lengths, decreasing=TRUE)) + index <- which(cumsum(lengths) / sum(lengths) >= X/100)[1] + return(lengths[index]) +} + +std_error <- function(x){ + sd(x)/sqrt(length((x))) +} + +process_input <- function(infile){ + bamfile <- gsub(".bxstats.gz", ".bam", infile) + samplename <- gsub(".bxstats.gz", "", basename(infile)) + tb <- read.table(infile, header = T, sep = "\t") %>% select(-4, -5) + tb$valid <- tb$molecule + tb[tb$valid != "invalidBX", "valid"] <- "validBX" + tb$valid <- gsub("BX", " BX", tb$valid) + # isolate non-singletons b/c molecules with 1 read pair aren't linked reads + multiplex_df <- filter(tb, valid == "valid BX", reads > 2) + singletons <- sum(tb$reads <= 2 & tb$valid == "valid BX") + tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+") + tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer() + tot_mol <- sum(tb$valid == "valid BX") + tot_valid_reads <- sum(tb[tb$valid == "valid BX", "reads"]) + avg_reads_per_mol <- round(mean(multiplex_df$reads),1) + sem_reads_per_mol <- round(std_error(multiplex_df$reads), 2) + tot_invalid_reads <- sum(tb[tb$valid == "invalid BX", "reads"]) + avg_mol_cov <- round(mean(multiplex_df$coverage_inserts), 2) + sem_mol_cov <- round(std_error(multiplex_df$coverage_inserts), 4) + avg_mol_cov_bp <- round(mean(multiplex_df$coverage_bp), 2) + sem_mol_cov_bp<- round(std_error(multiplex_df$coverage_bp), 4) + n50 <- NX(multiplex_df$length_inferred, 50) + n75 <- NX(multiplex_df$length_inferred, 75) + n90 <- NX(multiplex_df$length_inferred, 90) + + outrow <- data.frame( + sample = samplename, + totalreads = tot_valid_reads + tot_invalid_reads, + totaluniquemol = tot_mol, + singletons = singletons, + multiplex = nrow(multiplex_df), + totaluniquebx = tot_uniq_bx, + molecule2bxratio = round(tot_mol / tot_uniq_bx,2), + totalvalidreads = tot_valid_reads, + totalinvalidreads = tot_invalid_reads, + totalvalidpercent = round(tot_valid_reads/(tot_valid_reads + tot_invalid_reads) * 100,2), + averagereadspermol = avg_reads_per_mol, + sereadspermol = sem_reads_per_mol, + averagemolcov = avg_mol_cov, + semolcov = sem_mol_cov, + averagemolcovbp = avg_mol_cov_bp, + semolcovbp = sem_mol_cov_bp, + N50 = n50, + N75 = n75, + N90 = n90 + ) + return(outrow) +} +``` + +```{r setup_df, echo = F, results = F, message = F} +infiles <- unlist(snakemake@input, recursive = FALSE) +#infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") + +aggregate_df <- Reduce(rbind, Map(process_input, infiles)) + +if(nrow(aggregate_df) == 0){ + print("All input files were empty") + knittr::knittr_exit() +} +``` + +# General +## General Barcode Information from Alignments +### Desc {.no-title} +

General Barcode Information from Alignments

+This report aggregates the barcode-specific information from the alignments +that were created using `harpy align`. Detailed information for any one sample +can be found in that sample's individual report. The table below is an aggregation +of data for each sample based on their `*.bxstats.gz` file. + +- `avg` refers to the average (arithmetic mean) +- `SEM` refers to the Standard Error of the mean +- `molecules` are the unique DNA molecules as inferred from linked-read barcodes + - `singletons` are molecules composed of one single-end or paired-end sequence, effectively not a linked read + - `multiplex` refer to molecules composed of more than two single-end or paired-end sequences +- `barcodes` are the linked-read barcodes associated with DNA sequences and are synonymous with `bx` +- `valid` refers to a proper haplotag barcode (e.g. `A01C34B92D51`) +- `invalid` refers to an invalidated haplotag barcode, where there is a `00` in any of the `ACBD` positions (e.g. `A21C00B32D57`) +- `NX` are the N-statistics (explained in more detail below) + +## Sampletable +### Per-Sample Information {.no-title} +```{r} +DT::datatable( + aggregate_df, + rownames = F, + extensions = 'Buttons', + options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE), + caption = 'Per-Sample Alignment Information Pertaining to Barcodes', + fillContainer = T, + colnames = c("sample", "alignments", "unique molecules", "singletons", "multiplex", "unique barcodes", "molecule:bx ratio", "valid bx alignments", "invalid bx alignments", "% valid bx", "avg reads/molecule", "SEM reads/molecule", "avg molecule coverage %","SEM molecule coverage %", "N50", "N75","N90") +) +``` + +```{r sampleplot_height, echo=FALSE, message=FALSE, warning=FALSE} +n_samples <- nrow(aggregate_df) +plotheight <- 150 + (15 * n_samples) +figheight <- 0.6 + (0.2 * n_samples) +``` + +## NX plots desc +### NX desc {.no-title} +

NX Information

+The **NX** metric (e.g. **N50**) is the length of the shortest molecule in the group of longest molecules that together +represent at least **X%** of the total molecules by length. For example, `N50` would be the shortest molecule in the +group of longest molecules that together represent **50%** of the total molecules by length (sort of like a median). +Below is the distributions of three common NX metrics (N50, N75, N90) across all samples. + +## NXX plots actual +### NX plots {.no-title} +```{r nxxplot, echo = F, warning = F, message = F} +highchart() |> + hc_chart(type = "area", animation = F) |> + hc_add_series(data = density(aggregate_df$N50), name = "N50", type = "areaspline") |> + hc_add_series(data = density(aggregate_df$N75), name = "N75", type = "areaspline") |> + hc_add_series(data = density(aggregate_df$N90), name = "N90", color = "#d3805f", type = "areaspline") |> + hc_tooltip(enabled = FALSE) |> + hc_caption(text = "Values derived using non-singleton molecules") |> + hc_title(text = "NX Stats Across Samples") |> + hc_xAxis(title = list(text = "NX value"), min = 0) |> + hc_yAxis(title = list(text = "density")) |> + hc_exporting(enabled = T, filename = "NX.stats", + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` + +## Distribution of valid bx alignments +### dist description {.no-title} +

Distribution of alignments with valid barcodes

+Below is a distribution of what percent of total alignments each sample +had valid haplotag barcodes (`AXXCXXBXXDXX` where `XX` is not `00`). + +## valid bx plot +### distribution plot {.no-title} +```{r perc_valid_dist, echo = F, warning = F, message = F} +hs <- hist( + aggregate_df$totalvalidpercent, + breaks = min(aggregate_df$totalvalidpercent):max(aggregate_df$totalvalidpercent), + plot = F +) +hs$counts <- round(hs$counts / sum(hs$counts) * 100, 4) +hs <- data.frame(val = hs$breaks[-1], freq = hs$counts) + +hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Percent Alignments", marker = list(enabled = FALSE), animation = F) |> + hc_title(text = "Percent of Alignments with Valid BX tags") |> + hc_xAxis(min = 0, max = 100, title = list(text = "% alignment with valid BX tag")) |> + hc_yAxis(max = 100, title = list(text = "% samples")) |> + hc_tooltip(crosshairs = TRUE) |> + hc_exporting(enabled = T, filename = "percent.valid.dist", + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` + +# Per-Sample +## Per-Sample Metrics +### Per-sample desc {.no-title} +

Per-Sample Metrics

+Below is a series of plots showing per-sample metrics. The meaning of percentages and error bars are provided in the bottom-left captions of the plots. + +## Per-sample plots +### percent valid {.no-title} +```{r perc_valid_per_sample, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} +hchart(aggregate_df, "xrange", animation = F, groupPadding = 0.0001, + hcaes(x = 0, x2 = totalreads, y = rev(0:(n_samples-1)), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> + hc_xAxis(title = list(text = "Total Alignments")) |> + hc_yAxis(title = FALSE, gridLineWidth = 0, categories = rev(aggregate_df$sample)) |> + hc_caption(text = "Percentage represents the percent of alignments with a valid BX barcode") |> + hc_tooltip(enabled = FALSE) |> + hc_colors(color = "#95d8a7") |> + hc_title(text = "Percent Alignments with Valid BX Barcode") |> + hc_exporting(enabled = T, filename = "BX.valid.alignments", + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` + +### multiplex percent {.no-title} +```{r perc_multiplex, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} +hchart(aggregate_df, "xrange", animation = F, groupPadding = 0.0001, + hcaes(x = 0, x2 = multiplex + singletons, y = rev(0:(n_samples-1)), partialFill = round(multiplex / (multiplex + singletons), 4)), dataLabels = list(enabled = TRUE)) |> + hc_xAxis(title = list(text = "Total Alignments with Valid Barcodes")) |> + hc_yAxis(title = FALSE, gridLineWidth = 0, categories = rev(aggregate_df$sample)) |> + hc_caption(text = "Percentage represents the percent of molecules composed of more than 2 single-end or paired-end sequences") |> + hc_tooltip(enabled = FALSE) |> + hc_colors(color = "#8dc6f5") |> + hc_title(text = "Percent Non-Singleton Molecules") |> + hc_exporting(enabled = T, filename = "BX.multiplex", + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` + +### reads per molecule{.no-title} +```{r reads_per_mol_sample, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} +err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagereadspermol, low = aggregate_df$averagereadspermol - aggregate_df$sereadspermol, high = aggregate_df$averagereadspermol + aggregate_df$sereadspermol) +highchart() |> + hc_chart(inverted=TRUE, animation = F, pointPadding = 0.0001, groupPadding = 0.0001) |> + hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagereadspermol), marker = list(radius = 8), color = "#6d73c2", zIndex = 1) |> + hc_add_series(data = err_df, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#8186c7", whiskerColor = "#8186c7") |> + hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_title(text = "Average Reads Per Molecule") |> + hc_caption(text = "Error bars show the standard error of the mean of non-singleton molecules") |> + hc_tooltip(crosshairs = TRUE, pointFormat= '{point.y}') |> + hc_exporting(enabled = T, filename = "avg.reads.per.molecule", + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` + +### average molecule coverage{.no-title} +```{r avg_mol_cov, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} +err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagemolcov, low = aggregate_df$averagemolcov - aggregate_df$semolcov, high = aggregate_df$averagemolcov + aggregate_df$semolcov) +err_df_bp <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagemolcovbp, low = aggregate_df$averagemolcovbp - aggregate_df$semolcovbp, high = aggregate_df$averagemolcovbp + aggregate_df$semolcovbp) + +highchart() |> + hc_chart(inverted=TRUE, animation = F, pointPadding = 0.0001, groupPadding = 0.0001) |> + hc_add_series(data = aggregate_df, type = "scatter", name = "Inferred Fragments", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolcov), marker = list(radius = 8), color = "#df77b5", zIndex = 1) |> + hc_add_series(data = err_df, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#ef94ca", whiskerColor = "#ef94ca") |> + hc_add_series(data = aggregate_df, type = "scatter", name = "Aligned Base Pairs", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolcovbp), marker = list(radius = 8), color = "#924596", zIndex = 1) |> + hc_add_series(data = err_df_bp, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#916094", whiskerColor = "#916094") |> + hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_title(text = "Average Molecule Percent Coverage") |> + hc_caption(text = "Error bars show the standard error of the mean percent coverage of non-singleton molecules") |> + hc_tooltip(crosshairs = TRUE, pointFormat= '{point.y}') |> + hc_exporting(enabled = T, filename = "avg.molecule.coverage", + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` \ No newline at end of file diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 22d020ec3..4a7b4bae8 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -1,13 +1,16 @@ --- -title: "Harpy Alignment Report" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +title: "Harpy Alignment Summary" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- # Barcode Stats @@ -32,8 +35,12 @@ infile <- snakemake@input[[1]] bamfile <- gsub(".bxstats.gz", ".bam", infile) samplename <- gsub(".bxstats.gz", "", basename(infile)) tb <- read.table(infile, header = T, sep = "\t") %>% select(-start, -end) +if(nrow(tb) == 0){ + print(paste("Input data file",infle, "is empty")) + knittr::knittr_exit() +} tb$valid <- tb$molecule -tb[!(tb$valid %in% c("noBX", "invalidBX")), "valid"] <- "validBX" +tb[tb$valid != "invalidBX", "valid"] <- "validBX" tb$valid <- gsub("BX", " BX", tb$valid) ``` @@ -43,18 +50,16 @@ nBX <- group_by(valids, contig) %>% summarize(nBX = length(molecule)) avgBX <- round(mean(nBX$nBX), digits = 2) - -totuniqBX <- length(unique(valids$molecule)) +totuniqBX <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+") +totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer() tots <- tb %>% group_by(valid) %>% summarize(total = length(molecule)) %>% as.data.frame() -for(i in c("no BX", "invalid BX", "valid BX")){ - if (!(i %in% tots$valid)){ - tots <- rbind(tots, data.frame("valid" = i, total = 0)) - } -} + +tot_invalid <- sum(tots$valid == "invalid BX") +tot_valid <- sum(tots$valid == "valid BX") ``` ```{r nxx, echo = F, results = F, message = F} @@ -79,12 +84,12 @@ valueBox(scales::comma(length(unique(tb$contig))), caption = "Contigs") ### validBX ```{r} -valueBox(scales::comma(tots[tots$valid == "valid BX", 2]), caption = "Valid Barcodes", color = "success", icon = "fa-vial-circle-check") +valueBox(scales::comma(tot_valid), caption = "Valid Barcodes", color = "success", icon = "fa-vial-circle-check") ``` ### invalidBX ```{r} -valueBox(scales::comma(tots[tots$valid == "invalid BX", 2]), caption = "Invalid Barcodes", color = "warning", icon = "fa-x") +valueBox(scales::comma(tot_invalid), caption = "Invalid Barcodes", color = "warning", icon = "fa-x") ``` ### glob-avg @@ -138,7 +143,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".readsper"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -155,7 +160,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#75b89e", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".basesper"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -186,7 +191,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#b3519d", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".mollen"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -227,7 +232,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#e59765", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".molcov"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -248,7 +253,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#dbe465", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".molcov"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -307,6 +312,10 @@ knitr::kable( covfile <- snakemake@input[[2]] #covfile <- "~/test.cov.gz" tb <- read.table(covfile, header = F) +if(nrow(tb) == 0){ + print(paste("Input data file",covfile, "is empty")) + knittr::knittr_exit() +} colnames(tb) <- c("contig", "position", "depth") q99 <- quantile(tb$depth, 0.99) Mode <- function(x) { @@ -402,7 +411,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#9393d2", name = "% hc_yAxis(title = list(text = "% molecules")) |> hc_title(text = "Distribution of Alignment Depths") |> hc_exporting(enabled = T, filename = paste0(samplename, ".cov"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/BcftoolsStats.Rmd b/src/harpy/reports/BcftoolsStats.Rmd index f51e06f23..252aa8d23 100644 --- a/src/harpy/reports/BcftoolsStats.Rmd +++ b/src/harpy/reports/BcftoolsStats.Rmd @@ -1,13 +1,16 @@ --- title: "BCFtools Stats Report" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} @@ -157,7 +160,7 @@ if(do_qual){ formatter = JS("function () {return 'Quality ' + this.x + '
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "snp.quality", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } else { @@ -181,7 +184,7 @@ if(do_qual){ formatter = JS("function () {return 'Quality ' + this.x + '
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "indel.quality", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } else { @@ -235,7 +238,7 @@ if(sum(.dpL) == 0){ formatter = JS("function () {return 'depth: ' + this.x + '
Percent: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "genotype.depth", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } ``` @@ -287,7 +290,7 @@ if(sum(.iddL) == 0){ formatter = JS("function () {return '' + this.series.name + '
Length: ' + Math.abs(this.x) + '
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "indel.distribution", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } ``` @@ -324,7 +327,7 @@ highchart() |> pointFormat= '{point.category}
Count: {point.y:.0f}' ) |> hc_exporting(enabled = T, filename = "samples.variantstats", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/BxCount.Rmd b/src/harpy/reports/BxCount.Rmd index 0886d489d..07d81b720 100644 --- a/src/harpy/reports/BxCount.Rmd +++ b/src/harpy/reports/BxCount.Rmd @@ -1,12 +1,15 @@ --- title: "Haplotag QC Bardcode Summary" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo = FALSE, message = FALSE} @@ -224,7 +227,7 @@ hchart(inv_hist, "areaspline", hcaes(x = Bin, y = Count, group = Segment), stack formatter = JS("function () {return '' + this.series.name + '
' + this.x + '% invalid
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "indel.quality", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -274,7 +277,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "nor hc_yAxis(title = list(text = "count")) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "bxcount", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` ### proportional valid barcodes @@ -287,7 +290,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "per hc_yAxis(title = list(text = "percent")) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "bxpercent", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -306,6 +309,6 @@ highchart() |> formatter = JS("function () {return '' + this.x + '
' + this.series.name + '
Percent: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "invalid.segments", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` \ No newline at end of file diff --git a/src/harpy/reports/HapCut2.Rmd b/src/harpy/reports/HapCut2.Rmd index 1f58cf8ba..a44cd9c5c 100644 --- a/src/harpy/reports/HapCut2.Rmd +++ b/src/harpy/reports/HapCut2.Rmd @@ -1,13 +1,16 @@ --- -title: "Haplotype Phasing Report" -date: "`r format(Sys.time(), '%d %B, %Y')`" +title: "Haplotype Phasing Summary" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo= FALSE, message = FALSE, warning = FALSE} @@ -145,7 +148,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#7eb495", name = "co hc_yAxis(title = list(text = "count")) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "haplotype.hist", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/Impute.Rmd b/src/harpy/reports/Impute.Rmd index dda770aab..3da983aaa 100644 --- a/src/harpy/reports/Impute.Rmd +++ b/src/harpy/reports/Impute.Rmd @@ -1,12 +1,15 @@ --- title: "Imputation Assessment" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- # General Stats @@ -124,7 +127,7 @@ hchart(h, "areaspline", hcaes(x = info, y = freq), name = "% sites", marker = li formatter = JS("function () {return 'INFO_SCORE ' + this.x + '
' + this.y + '% of sites';}") ) |> hc_exporting(enabled = T, filename = "impute.infoscores", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -253,7 +256,7 @@ hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stack hc_yAxis(title = list(text = "count")) |> hc_tooltip(crosshairs = TRUE, animation = F) |> hc_exporting(enabled = T, filename = "impute.samples", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/Leviathan.Rmd b/src/harpy/reports/Leviathan.Rmd index 7f6c2245f..b70ca4600 100644 --- a/src/harpy/reports/Leviathan.Rmd +++ b/src/harpy/reports/Leviathan.Rmd @@ -1,12 +1,15 @@ --- -title: "Leviathan Variant Calling Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +title: "Leviathan Structural Variant Calling Summary" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} diff --git a/src/harpy/reports/LeviathanPop.Rmd b/src/harpy/reports/LeviathanPop.Rmd index 559d3f611..07ac4d976 100644 --- a/src/harpy/reports/LeviathanPop.Rmd +++ b/src/harpy/reports/LeviathanPop.Rmd @@ -1,12 +1,15 @@ --- -title: "Leviathan Structural Variant Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +title: "Leviathan Structural Variant Calling Summary" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} diff --git a/src/harpy/reports/Naibr.Rmd b/src/harpy/reports/Naibr.Rmd index 5b5b1a3df..60e6af5d2 100644 --- a/src/harpy/reports/Naibr.Rmd +++ b/src/harpy/reports/Naibr.Rmd @@ -1,13 +1,16 @@ --- -title: "NAIBR Structural Variant Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +title: "NAIBR Structural Variant Calling Summary" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo = FALSE, warnings = FALSE, message = FALSE} diff --git a/src/harpy/reports/NaibrPop.Rmd b/src/harpy/reports/NaibrPop.Rmd index d8530a261..01f613523 100644 --- a/src/harpy/reports/NaibrPop.Rmd +++ b/src/harpy/reports/NaibrPop.Rmd @@ -1,13 +1,16 @@ --- -title: "NAIBR Structural Variant Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +title: "NAIBR Structural Variant Calling Summary" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/PreflightBam.Rmd b/src/harpy/reports/PreflightBam.Rmd index 4af31228a..446a7512e 100644 --- a/src/harpy/reports/PreflightBam.Rmd +++ b/src/harpy/reports/PreflightBam.Rmd @@ -1,12 +1,15 @@ --- title: "BAM Haplotag Format Check" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} @@ -27,6 +30,7 @@ data <- read.table(infile, header = T) attention_df <- data %>% select(-1,-3) %>% rowwise() %>% summarise(count = sum(c(nameMismatch, noBX, badBX))) attention <- sum(attention_df$count > 0) ``` +# File Checks ## filetop ### fltop {.no-title}

Preflight checks for BAM files

@@ -37,75 +41,24 @@ the files to identify formatting conformity and issues that may require your att ## General Information {data-height=100} ### nfiles ```{r} -valueBox(scales::comma(nrow(data)), caption = "Files", color = "info") -``` - -### fileformat -```{r} -valueBox("BAM", caption = "File Format", color = "info") +valueBox(scales::comma(nrow(data)), caption = "Alignment Files", icon = "fa-file", color = "#C0C0C0") ``` ### no MI tag ```{r} -noMItag <- sum(data$noMI > 0) -valueBox(scales::comma(noMItag), caption = "No MI: tag", color = ifelse(noMItag > 0, "warning", "success")) +noMItag <- sum(data$noMI ==data$alignments) +valueBox(scales::comma(noMItag), caption = "No MI: tag", icon = ifelse(noMItag > 0, "fa-exclamation", "fa-check"), color = ifelse(noMItag > 0, "warning", "success")) ``` ### bxnotlast ```{r} bxnotlast <- sum(data$bxNotLast > 0) -valueBox(scales::comma(bxnotlast), caption = "BX:Z: tag not last", color = ifelse(bxnotlast > 0, "warning", "success")) +valueBox(scales::comma(bxnotlast), caption = "BX:Z: tag not last", icon = ifelse(bxnotlast > 0, "fa-exclamation", "fa-check"), color = ifelse(bxnotlast > 0, "warning", "success")) ``` ### issues ```{r} -valueBox(scales::comma(attention), caption = "Files With Issues", color = ifelse(attention > 0, "warning", "success")) -``` - -## interpret -### interpcol {.no-title} -

Interpreting the Data

- -The `harpy preflight bam -d ...` created a `Preflight` folder within -the supplied directory and within it you will find this report, along with the -`filecheck.bam.tsv` file summing the results that are included in this report. -This file contains a tab-delimited table with the columns: `file`, `nameMismatch`, `alignments`, `noBX`, and `badBX`. These columns are defined by: - -- `file`: the name of the BAM file -- `nameMismatch`: the sample name of the file inferred from name of the file (i.e. Harpy assumes `sample1.bam` to be `sample1`) does not match the `@RG ID` tag in the alignment file. - - severity: critical and will likely cause downstream issues and errors -- `alignments`: the total number of alignment records in the file -- `noMI`: alignment records lack `MI:` tag (`MI:Z:` or `MI:i:`) - - severity: critical and will prevent HapCut2 phasing - - the `MI` tag is the "molecular identifier", meaning it's the unique name of a molecule from which are series of alignments share a barcode. - - `EMA` adds these to alignments, `BWA` does not, but `harpy align` will add them when using `BWA` -- `noBX`: the number of alignments that do not have a `BX:Z:` tag in the record - - severity: minimal and likely won't cause issues unless no alignments have `BX:Z:` tags - - if you expect all of your alignments should have `BX:Z:` tags, then further investigation is necessary -- `badBX`: the haplotag barcode in the `BX:Z:` tag does not adhere to the proper `AxxCxxBxxDxx` format - - severity: critical and will cause downstream issues and errors -- `bxNotLast`: the `BX:Z:` tag in the alignment record is not the last tag - - severity: conditional and only relevant for LEVIATHAN variant calling - - can be ignored if not intending to call structural variants with LEVIATHAN - - if intending to use LEVIATHAN, the `BX:Z:` tag must be the last tag in the alignment records - - can be fixed by editing the BAM file, or fixing FASTQ files prior to alignment - - `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record - - -### Proper format {.no-title} -

Proper BAM file format

- -A proper BAM file will contain a `BX:Z:` tag with an alignment that features a -properly-formatted haplotag barcode `AxxCxxBxxDxx`. If this barcode is not in that -format, then it's likely the input FASTQ used for read mapping is the source of the -issue. You can check those FASTQ files for errors with `harpy preflight fastq ...`. - -Below is an example of a proper alignment record for a file named `sample1.bam`. -Note the tag `RG:Z:sample1`, indicating this alignment is associated with `sample1` and -matches the file name. Also note the correctly formatted haplotag barcode `BX:Z:A19C01B86D78` -and the presence of a `MI:` tag. -``` -A00814:267:HTMH3DRXX:2:1132:26268:10316 113 contig1 6312 60 4S47M1D86M = 6312 0 TAACCCTAACCCTAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACCCTATTCCTATTCCTAACCCTAACCCTAAAAACCAACTGCACCCCGGTGCACTTTTGGCTATTACCCCCTATCATACCCTTGTCC FFFF,FFFF:FFFF:FF:FFF:FFFF,,FFFFFFFFFFF:FFFFF:FFFFFFFFFFFFF:FFFF,FFFFFFFFF:FF:F:FFFFF,F:,FFFFFFFFFF:FFF,FFFFFF:FFFFFF:FFFF:FFFFFFFFFF:FFF NM:i:2 MD:Z:23C23^C86 MC:Z:4S47M1D86M AS:i:121 XS:i:86 RG:Z:sample1 MI:i:4040669 RX:Z:GAAACGATGTTGC+CCTAAGCCGAATC QX:Z:FFFFFFFFFFFFF+FFFFF:FFFFFFF BX:Z:A19C01B86D78 +valueBox(scales::comma(attention), caption = "Files With Issues", icon = ifelse(attention > 0, "fa-exclamation", "fa-check"), color = ifelse(attention > 0, "warning", "success")) ``` ## viz desc @@ -137,9 +90,6 @@ knitr::kable( ) ``` -You may click on the plot to expand it in your browser window. Clicking it again will exit -out of the zoomed view. - ## datarow ### plot {.no-title} ```{r status_df} @@ -159,18 +109,18 @@ rownames(m) <- gsub(".bam", "", data$file) ```{r readsper, echo = FALSE, message = FALSE, warning = FALSE, out.width = '100%'} hchart(m) |> - hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2) |> - hc_title(text = "BAM file preflight QC") |> + hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2, min = 0, max = 1) |> + hc_title(text = "Alignment File Checks") |> hc_xAxis(labels = list(style = list(fontSize = '16px')) )|> hc_yAxis(labels = list(style = list(fontSize = '16px')) )|> - hc_plotOptions(heatmap = list(borderWidth = 1, borderColor = "white", animated = F)) |> + hc_plotOptions(heatmap = list(animation = FALSE, borderWidth = 1, borderColor = "white", states = list(inactive = list(animation = FALSE), normal = list(animation = FALSE)))) |> hc_legend(enabled = F) |> hc_tooltip( animation = FALSE, - formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS') + '';}") + formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS');}") ) |> hc_exporting(enabled = T, filename = "bam.preflight", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -185,3 +135,72 @@ DT::datatable( fillContainer = T ) ``` + + +# Details +## interpret +### interpcol {.no-title} +

Interpreting the Data

+ +The `harpy preflight bam ...` command created a `filecheck.bam.tsv` file in the specified +output directory that summarizes the results that are included in this report. +This file contains a tab-delimited table with the columns: `file`, `nameMismatch`, `alignments`, `noBX`, and `badBX`. These columns are defined by: + +#### file +The name of the BAM file + +#### nameMismatch +The sample name of the file inferred from name of the file (i.e. Harpy assumes `sample1.bam` to be `sample1`) does not match the `@RG ID` tag in the alignment file. + +- severity: critical and will likely cause downstream issues and errors + +#### alignments +The total number of alignment records in the file + +#### noMI +Alignment records lack `MI:` tag (`MI:Z:` or `MI:i:`) + +- severity: critical and will prevent HapCut2 phasing +- the `MI` tag is the "molecular identifier", meaning it's the unique name of a molecule from which are series of alignments share a barcode. + - `EMA` adds these to alignments, `BWA` does not, but `harpy align` will add them when using `BWA` + +#### noBX +The number of alignments that do not have a `BX:Z:` tag in the record + +- severity: minimal and likely won't cause issues unless no alignments have `BX:Z:` tags +- if you expect all of your alignments should have `BX:Z:` tags, then further investigation is necessary + +#### badBX +The haplotag barcode in the `BX:Z:` tag does not adhere to the proper `AxxCxxBxxDxx` format + +- severity: critical and will cause downstream issues and errors + +#### bxNotLast +The `BX:Z:` tag in the alignment record is not the last tag + +- severity: conditional and only relevant for LEVIATHAN variant calling +- can be ignored if not intending to call structural variants with LEVIATHAN +- if intending to use LEVIATHAN, the `BX:Z:` tag must be the last tag in the alignment records + - can be fixed by editing the BAM file, or fixing FASTQ files prior to alignment + - `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record + + +### Proper format {.no-title} +

Proper BAM file format

+ +#### BX Tag +A proper linked-read alignment file will contain a `BX:Z:` tag with an alignment record +that features a properly-formatted haplotag barcode `AxxCxxBxxDxx`. If this barcode is not in that +format, then it's likely the input FASTQ used for read mapping is the source of the +issue. You can check those FASTQ files for errors with `harpy preflight fastq ...`. + +#### Example Alignment Record +Below is an example of a proper alignment record for a file named `sample1.bam`. +Note the tag `RG:Z:sample1`, indicating this alignment is associated with `sample1` and +matches the file name. Also note the correctly formatted haplotag barcode `BX:Z:A19C01B86D78` +and the presence of a `MI:` tag. To reduce horizontal scrolling, the alignment sequence and Phred +scores have been replaced with `SEQUENCE` and `QUALITY`, respectively. + +``` +A00814:267:HTMH3DRXX:2:1132:26268:10316 113 contig1 6312 60 4S47M1D86M = 6312 0 SEQUENCE QUALITY NM:i:2 MD:Z:23C23^C86 MC:Z:4S47M1D86M AS:i:121 XS:i:86 RG:Z:sample1 MI:i:4040669 RX:Z:GAAACGATGTTGC+CCTAAGCCGAATC QX:Z:FFFFFFFFFFFFF+FFFFF:FFFFFFF BX:Z:A19C01B86D78 +``` \ No newline at end of file diff --git a/src/harpy/reports/PreflightFastq.Rmd b/src/harpy/reports/PreflightFastq.Rmd index 12b7adbcf..ab2c97a07 100644 --- a/src/harpy/reports/PreflightFastq.Rmd +++ b/src/harpy/reports/PreflightFastq.Rmd @@ -1,12 +1,15 @@ --- title: "FASTQ Haplotag Format Check" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} @@ -21,9 +24,9 @@ using<-function(...) { } using("flexdashboard","dplyr","tidyr","DT","highcharter","scales") -#infile <- "~/Documents/harpy/test/fastq/Preflight/filecheck.fastq.tsv" +#infile <- "~/Documents/harpy/Preflight/fastq/filecheck.fastq.tsv" infile <- snakemake@input[[1]] -#dirname <- "filler text" + data <- read.table(infile, header = T) attention_df <- data %>% mutate(allmissing = (reads == noBX)) %>% @@ -32,72 +35,29 @@ attention_df <- data %>% summarise(count = sum(c(badBX,badSamSpec, allmissing))) attention <- sum(attention_df$count > 0) ``` - +# File Checks ## filetop ### fltop {.no-title}

Preflight checks for FASTQ files

This report reflects the FASTQ files provided. Harpy has processed -the files to identify formatting issues that may require your attention. +the files to identify [formatting issues](#details) that may require your attention. ## General Information {data-height=100} ### nfiles ```{r} -valueBox(scales::comma(nrow(data)), caption = "Files", color = "info") -``` -### fileformat -```{r} -valueBox("FASTQ", caption = "File Format", color = "info") +valueBox(scales::comma(nrow(data)), caption = "FASTQ Files", icon = "fa-file", color = "#C0C0C0") ``` + ### issues ```{r} -valueBox(scales::comma(attention), caption = "Files With Issues", color = ifelse(attention > 0, "warning", "success")) +valueBox(scales::comma(attention), caption = "Files With Issues", icon = ifelse(attention > 0, "fa-exclamation", "fa-check"), color = ifelse(attention > 0, "warning", "success")) ``` ### BX not last ```{r} bxnotlast <- sum(data$bxNotLast > 0) -valueBox(bxnotlast, caption = "BX:Z: tag not last", color = ifelse(bxnotlast > 0, "#feda75", "success")) -``` - -## interpret -### interpcol {.no-title} -

Interpreting the Data

- -The `harpy preflight fastq -d ...` created a `Preflight` folder within -the supplied directory and within it you will find this report, along with the -`filecheck.fastq.tsv` file summing the results that are included in this report. -This file contains a tab-delimited table with the columns: `file`, `reads`, `noBX`, `badBX`, and `badSamSpec`. These columns are defined by: - -- `file`: the name of the FASTQ file -- `reads`: the total number of reads in the file -- `noBX`: the number of reads that do not have a `BX:Z:` tag in the read header - - severity: minimal and likely won't cause issues unless no reads have `BX:Z:` tags - - if you expect all of your reads should have `BX:Z:` tags, then further investigation is necessary -- `badBX`: the haplotag barcode in the `BX:Z:` comment does not adhere to the proper `AxxCxxBxxDxx` format - - severity: critical and will cause downstream issues and errors -- `badSamSpec`: the comments in the read header after the read ID do not conform to the `TAG:TYPE:VALUE` [SAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf) - - severity: critical and will cause downstream issues and errors -- `bxNotLast`: the `BX:Z:` tag in the FASTQ header is not the last comment - - severity: conditional and only relevant for LEVIATHAN variant calling - - can be ignored if not intending to call structural variants with LEVIATHAN - - if intending to use LEVIATHAN, the `BX:Z:` tag must be the last comment in a read header - - `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record - -### Proper header {.no-title} -

Proper Read Headers

- -An example of a proper FASTQ read header is like the one below, where all comments -following the initial read ID (`@A00814...`) have the proper [SAM spec](https://samtools.github.io/hts-specs/SAMv1.pdf) -`TAG:TYPE:VALUE` format and among them is a `BX:Z:` tag followed by a `AxxCxxBxxDxx` -formatted haplotag barcode. The comments must be **TAB separated** because the `:Z:` tag type allows a -whitespace character for its value. The read header below contains the read ID `@A00814...`, -the `BX:Z:` haplotag barcode tag, and two more comments `RX:Z:` and `QX:Z:` that both -adhere to the SAM specification. If using LEVIATHAN to call structural variants, the `BX:Z:` tag -must be the last comment in the read header. - -``` -@A00814:267:HTMH3DRXX:2:1101:4580:1000 BX:Z:A02C01B11D46 RX:Z:GAAACGACCAACA+CGAACACGTTAGC QX:Z:F,FFFFFFFFFFF+FF,FFF:FFFFFF +valueBox(bxnotlast, caption = "BX:Z: tag not last", icon = ifelse(attention > 0, "fa-exclamation", "fa-check"), color = ifelse(bxnotlast > 0, "#feda75", "success")) ``` ## viz desc @@ -127,9 +87,6 @@ knitr::kable( ) ``` -If the heatmap appears to be completely green, then all of your samples have passed all the preflight checks. -If any sample fails at least one check, fails will appear in yellow and passes will appear in blue. - ## datarow ### plot {.no-title} ```{r status_df} @@ -146,21 +103,20 @@ data2 <- data %>% rownames(m) <- data$file ``` - -```{r readsper, echo = FALSE, message = FALSE, warning = FALSE, out.width = '100%'} +```{r readsper, echo = FALSE, message = FALSE, warning = FALSE} hchart(m) |> - hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2) |> - hc_title(text = "FASTQ file preflight QC") |> + hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2, min = 0, max = 1) |> + hc_title(text = "FASTQ File Checks") |> hc_xAxis(labels = list(style = list(fontSize = '16px')) )|> hc_yAxis(labels = list(style = list(fontSize = '16px')) )|> - hc_plotOptions(heatmap = list(borderWidth = 1, borderColor = "white", animated = F)) |> + hc_plotOptions(heatmap = list(animation = FALSE, borderWidth = 1, borderColor = "white", states = list(inactive = list(animation = FALSE), normal = list(animation = FALSE)))) |> hc_legend(enabled = F) |> hc_tooltip( animation = FALSE, - formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS') + '';}") + formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS');}") ) |> hc_exporting(enabled = T, filename = "fastq.preflight", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -175,3 +131,65 @@ DT::datatable( fillContainer = T ) ``` + +# Details +## interpret +### interpcol {.no-title} +

Interpreting the Data

+ +The `harpy preflight fastq ...` command created a `filecheck.fastq.tsv` file in the specified +output directory that summarizes the results that are included in this report. +This file contains a tab-delimited table with the columns: `file`, `reads`, `noBX`, `badBX`, and `badSamSpec`. These columns are defined by: + +#### file +The name of the FASTQ file + +#### reads +The total number of reads in the file + +#### noBX +The number of reads that do not have a `BX:Z:` tag in the read header + +- severity: minimal and likely won't cause issues unless no reads have `BX:Z:` tags +- if you expect all of your reads should have `BX:Z:` tags, then further investigation is necessary + +#### badBX +The haplotag barcode in the `BX:Z:` comment does not adhere to the proper `AxxCxxBxxDxx` format + +- severity: critical and will cause downstream issues and errors + +#### badSamSpec +The comments in the read header after the read ID do not conform to the `TAG:TYPE:VALUE` [SAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf) + +- severity: critical and will cause downstream issues and errors + +#### bxNotLast +The `BX:Z:` tag in the FASTQ header is not the last comment + +- severity: conditional and only relevant for LEVIATHAN variant calling +- can be ignored if not intending to call structural variants with LEVIATHAN +- if intending to use LEVIATHAN, the `BX:Z:` tag must be the last comment in a read header +- `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record + +### Proper header {.no-title} +

Proper Read Headers

+ +#### SAM Specification +An example of a proper FASTQ read header is like the one below, where all comments +following the initial read ID (`@A00814...`) have the proper [SAM spec](https://samtools.github.io/hts-specs/SAMv1.pdf) +`TAG:TYPE:VALUE` format and among them is a `BX:Z:` tag followed by a `AxxCxxBxxDxx` +formatted haplotag barcode. + +#### Comments +The comments must be **TAB separated** because the `:Z:` tag type allows a +whitespace character for its value. + +#### Example Header +The read header below contains the read ID `@A00814...`, +the `BX:Z:` haplotag barcode tag, and two more comments `RX:Z:` and `QX:Z:` that both +adhere to the SAM specification. If using LEVIATHAN to call structural variants, the `BX:Z:` tag +must be the last comment in the read header. + +``` +@A00814:267:HTMH3DRXX:2:1101:4580:1000 BX:Z:A02C01B11D46 RX:Z:GAAACGACCAACA+CGAACACGTTAGC QX:Z:F,FFFFFFFFFFF+FF,FFF:FFFFFF +``` \ No newline at end of file diff --git a/src/harpy/reports/StitchCollate.Rmd b/src/harpy/reports/StitchCollate.Rmd index 6963b5316..f3707b19e 100644 --- a/src/harpy/reports/StitchCollate.Rmd +++ b/src/harpy/reports/StitchCollate.Rmd @@ -1,12 +1,15 @@ --- title: "STITCH Imputation Details" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png + favicon: "https://github.com/pdimens/harpy/blob/docs/static/report_favicon.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r setup, include=FALSE} diff --git a/src/harpy/scripts/bxStats.py b/src/harpy/scripts/bxStats.py index 01f923731..fae907bf8 100755 --- a/src/harpy/scripts/bxStats.py +++ b/src/harpy/scripts/bxStats.py @@ -1,5 +1,4 @@ import re -import sys import gzip import pysam @@ -7,31 +6,33 @@ outfile = gzip.open(snakemake.output[0], "wb", 6) outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tinsert_len\tcoverage_bp\tcoverage_inserts\n") -d = dict() -chromlast = None +d = {} +all_bx = set() +chrom_last = None -def writestats(x, contig): - for mi in x: - x[mi]["inferred"] = x[mi]["end"] - x[mi]["start"] +def writestats(x, writechrom): + """write to file the bx stats dictionary as a table""" + for _mi in x: + x[_mi]["inferred"] = x[_mi]["end"] - x[_mi]["start"] try: - x[mi]["covered_bp"] = min(x[mi]["bp"] / x[mi]["inferred"], 1.0) + x[_mi]["covered_bp"] = round(min(x[_mi]["bp"] / x[_mi]["inferred"], 1.0),4) except: - x[mi]["covered_bp"] = 0 + x[_mi]["covered_bp"] = 0 try: - x[mi]["covered_inserts"] = min(x[mi]["insert_len"] / x[mi]["inferred"], 1.0) + x[_mi]["covered_inserts"] = round(min(x[_mi]["insert_len"] / x[_mi]["inferred"], 1.0), 4) except: - x[mi]["covered_inferred"] = 0 - outtext = f"{chromlast}\t{mi}\t" + "\t".join([str(x[mi][i]) for i in ["n", "start","end", "inferred", "bp", "insert_len", "covered_bp", "covered_inserts"]]) + x[_mi]["covered_inferred"] = 0 + outtext = f"{writechrom}\t{_mi}\t" + "\t".join([str(x[_mi][i]) for i in ["n", "start","end", "inferred", "bp", "insert_len", "covered_bp", "covered_inserts"]]) outfile.write(outtext.encode() + b"\n") for read in alnfile.fetch(): chrom = read.reference_name # check if the current chromosome is different from the previous one # if so, print the dict to file and empty it (a consideration for RAM usage) - if chromlast and chrom != chromlast: - writestats(d, chromlast) - d = dict() - chromlast = chrom + if chrom_last and chrom != chrom_last: + writestats(d, chrom_last) + d = {} + chrom_last = chrom # skip duplicates, unmapped, and secondary alignments if read.is_duplicate or read.is_unmapped or read.is_secondary: continue @@ -46,10 +47,11 @@ def writestats(x, contig): try: mi = read.get_tag("MI") + bx = read.get_tag("BX") # do a regex search to find X00 pattern in the BX - if re.search("[ABCD]0{2,4}", read.get_tag("BX")): + if re.search("[ABCD]0{2,4}", bx): # if found, invalid - if "invalidBX" not in d.keys(): + if "invalidBX" not in d: d["invalidBX"] = { "start": 0, "end": 0, @@ -61,6 +63,8 @@ def writestats(x, contig): d["invalidBX"]["bp"] += bp d["invalidBX"]["n"] += 1 continue + # add valid bx to set of all unique barcodes + all_bx.add(bx) except: # There is no bx/MI tag continue @@ -73,8 +77,10 @@ def writestats(x, contig): if read.is_paired: # by using max(), will either add 0 or positive TLEN to avoid double-counting isize = max(0, read.template_length) + #isize = min(bp, abs(read.template_length)) # only count the bp of the first read of paired end reads - bp = bp if read.is_read1 else 0 + #bp = bp if read.is_read1 else 0 + bp = min(abs(read.template_length), read.infer_query_length()) elif read.is_supplementary: # if it's a supplementary alignment, just use the alignment length isize = bp @@ -82,7 +88,7 @@ def writestats(x, contig): # if it's unpaired, use the TLEN or query length, whichever is bigger isize = max(abs(read.template_length), read.infer_query_length()) # create bx entry if it's not present - if mi not in d.keys(): + if mi not in d: d[mi] = { "start": pos_start, "end": pos_end, @@ -99,5 +105,7 @@ def writestats(x, contig): d[mi]["end"] = max(pos_end, d[mi]["end"]) # print the last entry -writestats(d, chrom) -outfile.close() \ No newline at end of file +writestats(d, chrom_last) +# write comment on the last line with the total number of unique BX barcodes +outfile.write(f"#total unique barcodes: {len(all_bx)}\n".encode()) +outfile.close() diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index 6c51ccdb9..3c3617a98 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -52,7 +52,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: @@ -116,111 +116,62 @@ rule align: genome = f"Genome/{bn}", genome_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - pipe(outdir + "/samples/{sample}/{sample}.raw.sam") + temp(outdir + "/samples/{sample}/{sample}.bwa.sam") log: outdir + "/logs/{sample}.bwa.log" params: samps = lambda wc: d[wc.get("sample")], + quality = config["alignment_quality"], extra = extra benchmark: ".Benchmark/Mapping/bwa/align.{sample}.txt" threads: - min(10, workflow.cores) - 2 + min(10, workflow.cores) conda: f"{envdir}/align.yaml" message: "Aligning sequences: {wildcards.sample}" shell: """ - bwa mem -C -v 2 -t {threads} {params.extra} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.fastq} > {output} 2> {log} + bwa mem -C -v 2 -t {threads} {params.extra} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.fastq} 2> {log} | + samtools view -h -F 4 -q {params.quality} > {output} """ - -rule quality_filter: - input: - outdir + "/samples/{sample}/{sample}.raw.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.bwa.sam") - params: - quality = config["alignment_quality"] - conda: - f"{envdir}/align.yaml" - message: - "Quality filtering alignments: {wildcards.sample}" - shell: - "samtools view -h -F 4 -q {params.quality} {input} > {output}" - -rule collate: - input: - outdir + "/samples/{sample}/{sample}.bwa.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.collate.bam") - conda: - f"{envdir}/align.yaml" - message: - "Collating alignments: {wildcards.sample}" - shell: - "samtools collate -o {output} {input} 2> /dev/null" - -rule fix_mates: - input: - outdir + "/samples/{sample}/{sample}.collate.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.fixmate.bam") - conda: - f"{envdir}/align.yaml" - message: - "Fixing mates in alignments: {wildcards.sample}" - shell: - "samtools fixmate -m {input} {output} 2> /dev/null" -rule sort_alignments: +rule mark_duplicates: input: - sam = outdir + "/samples/{sample}/{sample}.fixmate.bam", - genome = f"Genome/{bn}", - genome_samidx = f"Genome/{bn_idx}", - genome_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") + sam = outdir + "/samples/{sample}/{sample}.bwa.sam", + genome = f"Genome/{bn}", + faidx = f"Genome/{bn_idx}" output: - bam = temp(outdir + "/samples/{sample}/{sample}.sort.bam"), - bai = temp(outdir + "/samples/{sample}/{sample}.sort.bam.bai") + temp(outdir + "/samples/{sample}/{sample}.markdup.bam") log: - outdir + "/logs/{sample}.bwa.sort.log" + outdir + "/logs/{sample}.markdup.log" params: tmpdir = lambda wc: outdir + "/." + d[wc.sample] resources: mem_mb = 2000 - conda: - f"{envdir}/align.yaml" + container: + None + threads: + 2 message: - "Sorting alignments: {wildcards.sample}" + "Marking duplicates: {wildcards.sample}" shell: """ - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} + samtools collate -O -u {input.sam} | + samtools fixmate -m -u - - | + samtools sort -T {params.tmpdir} -u --reference {input.genome} -l 0 -m {resources.mem_mb}M - | + samtools markdup -@ {threads} -S --barcode-tag BX -f {log} - {output} rm -rf {params.tmpdir} """ -rule mark_duplicates: - input: - outdir + "/samples/{sample}/{sample}.sort.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.markdup.bam") - log: - outdir + "/logs/{sample}.markdup.log" - threads: - 2 - conda: - f"{envdir}/align.yaml" - message: - "Marking duplicates in alignments alignment: {wildcards.sample}" - shell: - "samtools markdup -@ {threads} -S --barcode-tag BX -f {log} {input} {output} 2> /dev/null" - -rule index_markdups: +rule markdups_index: input: outdir + "/samples/{sample}/{sample}.markdup.bam" output: temp(outdir + "/samples/{sample}/{sample}.markdup.bam.bai") - conda: - f"{envdir}/align.yaml" + container: + None message: "Indexing duplicate-marked alignments: {wildcards.sample}" shell: @@ -242,7 +193,7 @@ rule assign_molecules: script: "scripts/assignMI.py" -rule alignment_bxstats: +rule bxstats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -257,7 +208,7 @@ rule alignment_bxstats: script: "scripts/bxStats.py" -rule alignment_coverage: +rule coverage: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -272,7 +223,7 @@ rule alignment_coverage: shell: "samtools depth -a {input.bam} | depthWindows.py {params} | gzip > {output}" -rule reports: +rule report_persample: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -287,15 +238,15 @@ rule reports: script: "report/AlignStats.Rmd" -rule alignment_report: +rule stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" output: stats = temp(outdir + "/reports/data/samtools_stats/{sample}.stats"), flagstat = temp(outdir + "/reports/data/samtools_flagstat/{sample}.flagstat") - conda: - f"{envdir}/align.yaml" + container: + None message: "Calculating alignment stats: {wildcards.sample}" shell: @@ -304,28 +255,42 @@ rule alignment_report: samtools flagstat {input.bam} > {output.flagstat} """ -rule samtools_reports: +rule report_samtools: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) output: outdir + "/reports/bwa.stats.html" params: - outdir + outdir = f"{outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"Basic Alignment Statistics\"", + comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: - """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" + +rule report_bx: + input: + collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) + output: + outdir + "/reports/barcodes.summary.html" + conda: + f"{envdir}/r.yaml" + message: + "Summarizing all barcode information from alignments" + script: + "report/AlignBxStats.Rmd" rule log_workflow: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam", "bam.bai"]), reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - agg_report = outdir + "/reports/bwa.stats.html" if not skipreports else [] + agg_report = outdir + "/reports/bwa.stats.html" if not skipreports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] params: quality = config["alignment_quality"], extra = extra diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index 77d5d0167..ee410dd5c 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -57,7 +57,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: @@ -115,32 +115,16 @@ rule genome_bwa_index: shell: "bwa index {input} 2> {log}" -rule interleave: +rule ema_count: input: - fastq = get_fq - output: - pipe(outdir + "/.interleave/{sample}.interleave.fq") - container: - None - message: - "Interleaving input fastq files: {wildcards.sample}" - shell: - "seqtk mergepe {input} > {output}" - -use rule interleave as interleave2 with: - output: - outdir + "/.interleave/{sample}.interleave2.fq" - -rule beadtag_count: - input: - outdir + "/.interleave/{sample}.interleave.fq" + get_fq output: - counts = temp(outdir + "/bxcount/{sample}.ema-ncnt"), + counts = temp(outdir + "/ema_count/{sample}.ema-ncnt"), logs = temp(outdir + "/logs/count/{sample}.count") params: - prefix = lambda wc: outdir + "/bxcount/" + wc.get("sample"), + prefix = lambda wc: outdir + "/ema_count/" + wc.get("sample"), beadtech = "-p" if platform == "haplotag" else f"-w {whitelist}", - logdir = f"{outdir}/logs/count/" + logdir = f"{outdir}/logs/ema_count/" message: "Counting barcode frequency: {wildcards.sample}" conda: @@ -148,32 +132,21 @@ rule beadtag_count: shell: """ mkdir -p {params.prefix} {params.logdir} - ema count {params.beadtech} -o {params.prefix} < {input} 2> {output.logs} + seqtk mergepe {input} | + ema count {params.beadtech} -o {params.prefix} 2> {output.logs} """ -rule beadtag_summary: +rule ema_preprocess: input: - countlog = collect(outdir + "/logs/count/{sample}.count", sample = samplenames) - output: - outdir + "/reports/reads.bxcounts.html" - conda: - f"{envdir}/r.yaml" - message: - "Creating sample barcode validation report" - script: - "report/EmaCount.Rmd" - -rule preprocess: - input: - reads = outdir + "/.interleave/{sample}.interleave2.fq", - emacounts = outdir + "/bxcount/{sample}.ema-ncnt" + reads = get_fq, + emacounts = outdir + "/ema_count/{sample}.ema-ncnt" output: - bins = temp(collect(outdir + "/preproc/{{sample}}/ema-bin-{bin}", bin = binrange)), - unbarcoded = temp(outdir + "/preproc/{sample}/ema-nobc") + bins = temp(collect(outdir + "/ema_preproc/{{sample}}/ema-bin-{bin}", bin = binrange)), + unbarcoded = temp(outdir + "/ema_preproc/{sample}/ema-nobc") log: - outdir + "/logs/preproc/{sample}.preproc.log" + outdir + "/logs/ema_preproc/{sample}.preproc.log" params: - outdir = lambda wc: outdir + "/preproc/" + wc.get("sample"), + outdir = lambda wc: outdir + "/ema_preproc/" + wc.get("sample"), bxtype = "-p" if platform == "haplotag" else f"-w {whitelist}", bins = nbins threads: @@ -184,172 +157,103 @@ rule preprocess: "Preprocessing for EMA mapping: {wildcards.sample}" shell: """ - ema preproc {params.bxtype} -n {params.bins} -t {threads} -o {params.outdir} {input.emacounts} < {input.reads} 2>&1 | + seqtk mergepe {input.reads} | + ema preproc {params.bxtype} -n {params.bins} -t {threads} -o {params.outdir} {input.emacounts} 2>&1 | cat - > {log} """ -rule align: +rule ema_align: input: - readbin = collect(outdir + "/preproc/{{sample}}/ema-bin-{bin}", bin = binrange), + readbin = collect(outdir + "/ema_preproc/{{sample}}/ema-bin-{bin}", bin = binrange), genome = f"Genome/{bn}", + geno_faidx = f"Genome/{bn_idx}", geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - pipe(outdir + "/align/{sample}.bc.raw.sam"), + aln = temp(outdir + "/ema_align/{sample}.bc.bam"), + idx = temp(outdir + "/ema_align/{sample}.bc.bam.bai") log: - outdir + "/logs/{sample}.ema.align.log", + ema = outdir + "/logs/align/{sample}.ema.align.log", + sort = outdir + "/logs/align/{sample}.ema.sort.log", + resources: + mem_mb = 500 params: bxtype = f"-p {platform}", + tmpdir = lambda wc: outdir + "/." + d[wc.sample], + quality = config["quality"], extra = extra threads: - min(10, workflow.cores) - 2 + min(10, workflow.cores) conda: f"{envdir}/align.yaml" message: "Aligning barcoded sequences: {wildcards.sample}" shell: """ - ema align -t {threads} {params.extra} -d {params.bxtype} -r {input.genome} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" -x {input.readbin} > {output} 2> {log} - """ - -rule sort_raw_ema: - input: - sam = outdir + "/align/{sample}.bc.raw.sam", - genome = f"Genome/{bn}", - geno_faidx = f"Genome/{bn_idx}", - geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") - output: - aln = temp(outdir + "/align/{sample}.bc.bam"), - idx = temp(outdir + "/align/{sample}.bc.bam.bai") - log: - outdir + "/logs/{sample}.ema.sort.log" - params: - quality = config["quality"], - tmpdir = lambda wc: outdir + "/align/." + d[wc.sample], - extra = extra - resources: - mem_mb = 2000 - threads: - 2 - container: - None - message: - "Sorting and quality filtering alignments: {wildcards.sample}" - shell: - """ - samtools view -h -F 4 -q {params.quality} {input.sam} | - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam --write-index -m {resources.mem_mb}M -o {output.aln}##idx##{output.idx} - 2> {log} + ema align -t {threads} {params.extra} -d {params.bxtype} -r {input.genome} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" -x {input.readbin} 2> {log.ema} | + samtools view -h -F 4 -q {params.quality} | + samtools sort -T {params.tmpdir} --reference {input.genome} -O bam --write-index -m {resources.mem_mb}M -o {output.aln}##idx##{output.idx} - 2> {log.sort} rm -rf {params.tmpdir} """ -rule align_nobarcode: +rule bwa_align: input: - reads = outdir + "/preproc/{sample}/ema-nobc", + reads = outdir + "/ema_preproc/{sample}/ema-nobc", genome = f"Genome/{bn}", geno_faidx = f"Genome/{bn_idx}", geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - pipe(outdir + "/align/{sample}.nobc.raw.sam") + temp(outdir + "/bwa_align/{sample}.bwa.nobc.sam") log: - outdir + "/logs/{sample}.bwa.align.log" + outdir + "/logs/align/{sample}.bwa.align.log" + params: + quality = config["quality"] benchmark: ".Benchmark/Mapping/ema/bwaAlign.{sample}.txt" threads: - min(10, workflow.cores) - 2 + min(10, workflow.cores) conda: f"{envdir}/align.yaml" message: "Aligning unbarcoded sequences: {wildcards.sample}" shell: """ - bwa mem -t {threads} -v2 -C -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.reads} > {output} 2> {log} + bwa mem -t {threads} -v2 -C -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.reads} 2> {log} | + samtools view -h -F 4 -q {params.quality} > {output} """ -rule quality_filter: - input: - outdir + "/align/{sample}.nobc.raw.sam" - output: - temp(outdir + "/align/{sample}.bwa.nobc.sam") - params: - quality = config["quality"] - container: - None - message: - "Quality filtering alignments: {wildcards.sample}" - shell: - "samtools view -h -F 4 -q {params.quality} {input} > {output}" - -rule collate: +rule bwa_markdups: input: - outdir + "/align/{sample}.bwa.nobc.sam" + sam = outdir + "/bwa_align/{sample}.bwa.nobc.sam", + genome = f"Genome/{bn}", + faidx = f"Genome/{bn_idx}" output: - temp(outdir + "/align/{sample}.collate.nobc.bam") - container: - None - message: - "Collating alignments: {wildcards.sample}" - shell: - "samtools collate -o {output} {input} 2> /dev/null" - -rule fix_mates: - input: - outdir + "/align/{sample}.collate.nobc.bam" - output: - temp(outdir + "/align/{sample}.fixmate.nobc.bam") - container: - None - message: - "Fixing mates in alignments: {wildcards.sample}" - shell: - "samtools fixmate -m {input} {output} 2> /dev/null" - -rule sort_nobc_alignments: - input: - sam = outdir + "/align/{sample}.fixmate.nobc.bam", - genome = f"Genome/{bn}", - genome_samidx = f"Genome/{bn_idx}", - genome_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") - output: - bam = temp(outdir + "/align/{sample}.sort.nobc.bam"), - bai = temp(outdir + "/align/{sample}.sort.nobc.bam.bai") + temp(outdir + "/bwa_align/{sample}.markdup.nobc.bam") log: - outdir + "/logs/{sample}.bwa.sort.log" + outdir + "/logs/align/{sample}.markdup.log" params: - quality = config["quality"], tmpdir = lambda wc: outdir + "/." + d[wc.sample] resources: - mem_mb = 2000 + mem_mb = 500 container: None + threads: + 2 message: - "Sorting alignments: {wildcards.sample}" + "Marking duplicates: {wildcards.sample}" shell: """ - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} + samtools collate -O -u {input.sam} | + samtools fixmate -m -u - - | + samtools sort -T {params.tmpdir} -u --reference {input.genome} -l 0 -m {resources.mem_mb}M - | + samtools markdup -@ {threads} -S --barcode-tag BX -f {log} - {output} rm -rf {params.tmpdir} """ -rule mark_duplicates: +rule bwa_markdups_index: input: - bam = outdir + "/align/{sample}.sort.nobc.bam", - bai = outdir + "/align/{sample}.sort.nobc.bam.bai" + outdir + "/bwa_align/{sample}.markdup.nobc.bam" output: - temp(outdir + "/align/{sample}.markdup.nobc.bam") - log: - outdir + "/logs/{sample}.markdup.log" - threads: - 2 - container: - None - message: - "Marking duplicates in alignments alignment: {wildcards.sample}" - shell: - "samtools markdup -@ {threads} -S -f {log} {input.bam} {output} 2> /dev/null" - -rule index_markdups: - input: - outdir + "/align/{sample}.markdup.nobc.bam" - output: - temp(outdir + "/align/{sample}.markdup.nobc.bam.bai") + temp(outdir + "/bwa_align/{sample}.markdup.nobc.bam.bai") container: None message: @@ -359,40 +263,29 @@ rule index_markdups: rule concatenate_alignments: input: - aln_bc = outdir + "/align/{sample}.bc.bam", - idx_bc = outdir + "/align/{sample}.bc.bam.bai", - aln_nobc = outdir + "/align/{sample}.markdup.nobc.bam", - idx_nobc = outdir + "/align/{sample}.markdup.nobc.bam.bai" + aln_bc = outdir + "/ema_align/{sample}.bc.bam", + idx_bc = outdir + "/ema_align/{sample}.bc.bam.bai", + aln_nobc = outdir + "/bwa_align/{sample}.markdup.nobc.bam", + idx_nobc = outdir + "/bwa_align/{sample}.markdup.nobc.bam.bai", + genome = f"Genome/{bn}" output: - bam = temp(outdir + "/align/{sample}.concat.unsort.bam") - threads: - 2 - container: - None - message: - "Concatenating barcoded and unbarcoded alignments: {wildcards.sample}" - shell: - "samtools cat -@ {threads} {input.aln_bc} {input.aln_nobc} > {output.bam}" - -rule sort_concatenated: - input: - bam = outdir + "/align/{sample}.concat.unsort.bam", - genome = f"Genome/{bn}" - output: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" threads: 2 resources: - mem_mb = 2000 + mem_mb = 500 container: None message: - "Sorting merged barcoded alignments: {wildcards.sample}" + "Concatenating barcoded and unbarcoded alignments: {wildcards.sample}" shell: - "samtools sort -@ {threads} -O bam --reference {input.genome} -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> /dev/null" + """ + samtools cat -@ 1 {input.aln_bc} {input.aln_nobc} | + samtools sort -@ 1 -O bam --reference {input.genome} -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} - + """ -rule alignment_coverage: +rule coverage: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -420,7 +313,7 @@ rule bx_stats: script: "scripts/bxStats.py" -rule alignment_report: +rule report_persample: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -433,7 +326,7 @@ rule alignment_report: script: "report/AlignStats.Rmd" -rule general_stats: +rule stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -450,29 +343,42 @@ rule general_stats: samtools flagstat {input.bam} > {output.flagstat} """ -rule collate_samtools_stats: +rule report_samtools: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]), output: outdir + "/reports/ema.stats.html" params: - outdir + outdir = f"{outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"Basic Alignment Statistics\"", + comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: - """ - multiqc {outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" + +rule report_bx: + input: + collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) + output: + outdir + "/reports/barcodes.summary.html" + conda: + f"{envdir}/r.yaml" + message: + "Summarizing all barcode information from alignments" + script: + "report/AlignBxStats.Rmd" rule log_workflow: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = [ "bam", "bam.bai"] ), cov_report = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - bx_counts = f"{outdir}/reports/reads.bxcounts.html" if not skipreports else [], - agg_report = f"{outdir}/reports/ema.stats.html" if not skipreports else [] + agg_report = f"{outdir}/reports/ema.stats.html" if not skipreports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] params: beadtech = "-p" if platform == "haplotag" else f"-w {whitelist}" message: diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index e04699be6..6a5bbc331 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -18,6 +18,7 @@ skipreports = config["skipreports"] windowsize = config["depth_windowsize"] molecule_distance = config["molecule_distance"] readlen = config["average_read_length"] +autolen = isinstance(readlen, str) wildcard_constraints: sample = "[a-zA-Z0-9._-]+" @@ -53,7 +54,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: @@ -111,106 +112,59 @@ rule align: input: fastq = get_fq, genome = f"Genome/{bn}", - genome_index = f"Genome/{bn}.r{readlen}.sti" + genome_index = f"Genome/{bn}.r{readlen}.sti" if not autolen else [] output: - pipe(outdir + "/samples/{sample}/{sample}.raw.sam") + temp(outdir + "/samples/{sample}/{sample}.strobe.sam") log: outdir + "/logs/{sample}.strobealign.log" params: samps = lambda wc: d[wc.get("sample")], - readlen = readlen, + readlen = "" if autolen else f"--use-index -r {readlen}", + quality = config["quality"], extra = extra benchmark: ".Benchmark/Mapping/strobealign/align.{sample}.txt" threads: - min(10, workflow.cores) - 2 + min(10, workflow.cores) conda: f"{envdir}/align.yaml" message: "Aligning sequences: {wildcards.sample}" shell: - "strobealign --use-index -r {params.readlen} -t {threads} -U -C --rg=SM:{wildcards.sample} {params.extra} {input.genome} {input.fastq} > {output} 2> {log}" - -rule quality_filter: - input: - outdir + "/samples/{sample}/{sample}.raw.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.mm2.sam") - params: - quality = config["quality"] - container: - None - message: - "Quality filtering alignments: {wildcards.sample}" - shell: - "samtools view -h -F 4 -q {params.quality} {input} > {output}" - -rule collate: - input: - outdir + "/samples/{sample}/{sample}.mm2.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.collate.bam") - container: - None - message: - "Collating alignments: {wildcards.sample}" - shell: - "samtools collate -o {output} {input} 2> /dev/null" - -rule fix_mates: - input: - outdir + "/samples/{sample}/{sample}.collate.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.fixmate.bam") - container: - None - message: - "Fixing mates in alignments: {wildcards.sample}" - shell: - "samtools fixmate -m {input} {output} 2> /dev/null" + """ + strobealign {params.readlen} -t {threads} -U -C --rg-id={wildcards.sample} --rg=SM:{wildcards.sample} {params.extra} {input.genome} {input.fastq} 2> {log} | + samtools view -h -F 4 -q {params.quality} > {output} + """ -rule sort_alignments: +rule markduplicates: input: - sam = outdir + "/samples/{sample}/{sample}.fixmate.bam", - genome = f"Genome/{bn}", - genome_samidx = f"Genome/{bn}.fai" + sam = outdir + "/samples/{sample}/{sample}.strobe.sam", + genome = f"Genome/{bn}", + faidx = f"Genome/{bn}.fai" output: - bam = temp(outdir + "/samples/{sample}/{sample}.sort.bam"), - bai = temp(outdir + "/samples/{sample}/{sample}.sort.bam.bai") + temp(outdir + "/samples/{sample}/{sample}.markdup.bam") log: - outdir + "/logs/{sample}.strobealign.sort.log" + outdir + "/logs/{sample}.markdup.log" params: - quality = config["quality"], tmpdir = lambda wc: outdir + "/." + d[wc.sample] resources: mem_mb = 2000 + threads: + 2 container: None message: - "Sorting alignments: {wildcards.sample}" + "Marking duplicates: {wildcards.sample}" shell: """ - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} + samtools collate -O -u {input.sam} | + samtools fixmate -m -u - - | + samtools sort -T {params.tmpdir} -u --reference {input.genome} -l 0 -m {resources.mem_mb}M - | + samtools markdup -@ {threads} -S --barcode-tag BX -f {log} - {output} rm -rf {params.tmpdir} """ -rule mark_duplicates: - input: - outdir + "/samples/{sample}/{sample}.sort.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.markdup.bam") - log: - outdir + "/logs/{sample}.markdup.log" - threads: - 2 - container: - None - message: - "Marking duplicates in alignments alignment: {wildcards.sample}" - shell: - "samtools markdup -@ {threads} -S --barcode-tag BX -f {log} {input} {output} 2> /dev/null" - -rule index_markdups: +rule markdups_index: input: outdir + "/samples/{sample}/{sample}.markdup.bam" output: @@ -238,7 +192,7 @@ rule assign_molecules: script: "scripts/assignMI.py" -rule alignment_bxstats: +rule bxstats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -253,7 +207,7 @@ rule alignment_bxstats: script: "scripts/bxStats.py" -rule alignment_coverage: +rule coverage: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -268,7 +222,7 @@ rule alignment_coverage: shell: "samtools depth -a {input.bam} | depthWindows.py {params} | gzip > {output}" -rule alignment_report: +rule report_persample: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -283,7 +237,7 @@ rule alignment_report: script: "report/AlignStats.Rmd" -rule general_alignment_stats: +rule stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -300,28 +254,42 @@ rule general_alignment_stats: samtools flagstat {input.bam} > {output.flagstat} """ -rule samtools_reports: +rule report_samtools: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) output: outdir + "/reports/strobealign.stats.html" params: - outdir + outdir = f"{outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"Basic Alignment Statistics\"", + comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: - """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" + +rule report_bx: + input: + collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) + output: + outdir + "/reports/barcodes.summary.html" + conda: + f"{envdir}/r.yaml" + message: + "Summarizing all barcode information from alignments" + script: + "report/AlignBxStats.Rmd" rule log_workflow: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam","bam.bai"]), samtools = outdir + "/reports/strobealign.stats.html" if not skipreports else [] , - bx_reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [] + reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] params: readlen = readlen, quality = config["quality"], @@ -332,10 +300,14 @@ rule log_workflow: with open(outdir + "/workflow/align.strobealign.summary", "w") as f: _ = f.write("The harpy align strobealign workflow ran using these parameters:\n\n") _ = f.write(f"The provided genome: {bn}\n") - _ = f.write("The genome index was created using:\n") - _ = f.write(" strobealign --create-index -r {params.readlen} genome\n") - _ = f.write("Sequencing were aligned with strobealign using:\n") - _ = f.write(f" strobealign --use-index -U -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") + if autolen: + _ = f.write("Sequencing were aligned with strobealign using:\n") + _ = f.write(f" strobealign -U -C --rg-id=SAMPLE --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") + else: + _ = f.write("The genome index was created using:\n") + _ = f.write(f" strobealign --create-index -r {params.readlen} genome\n") + _ = f.write("Sequencing were aligned with strobealign using:\n") + _ = f.write(f" strobealign --use-index -U -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") _ = f.write(f" samtools view -h -F 4 -q {params.quality} |\n") _ = f.write("Duplicates in the alignments were marked following:\n") _ = f.write(" samtools collate \n") diff --git a/src/harpy/snakefiles/demultiplex.gen1.smk b/src/harpy/snakefiles/demultiplex.gen1.smk index 378bd9093..1850437ee 100644 --- a/src/harpy/snakefiles/demultiplex.gen1.smk +++ b/src/harpy/snakefiles/demultiplex.gen1.smk @@ -184,15 +184,16 @@ rule qc_report: output: outdir + "/reports/demultiplex.QC.html" params: - outdir + "/logs/" + logdir = outdir + "/logs/", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"QC for Demultiplexed Samples\"", + comment = "--comment \"This report aggregates the QC results created by falco.\"" conda: f"{envdir}/qc.yaml" message: "Creating final demultiplexing QC report" shell: - """ - multiqc {params} --force --quiet --title "QC for Demultiplexed Samples" --comment "This report aggregates the QC results created by falco." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" rule log_workflow: default_target: True diff --git a/src/harpy/snakefiles/impute.smk b/src/harpy/snakefiles/impute.smk index 31b2ab326..1a531300a 100644 --- a/src/harpy/snakefiles/impute.smk +++ b/src/harpy/snakefiles/impute.smk @@ -135,7 +135,7 @@ rule impute: threads: workflow.cores message: - "Imputing {wildcards.part}:\nmodel: {wildcards.model}, useBX: {wildcards}, k: {wildcards.k}, bxLimit: {wildcards.bxlimit}, s: {wildcards.s}, nGen: {wildcards.ngen}" + "Imputing {wildcards.part}:\nmodel: {wildcards.model}, useBX: {wildcards.usebx}, k: {wildcards.k}, bxLimit: {wildcards.bxlimit}, s: {wildcards.s}, nGen: {wildcards.ngen}" script: "scripts/stitch_impute.R" diff --git a/src/harpy/snakefiles/qc.smk b/src/harpy/snakefiles/qc.smk index ede7c6881..3621ebee9 100644 --- a/src/harpy/snakefiles/qc.smk +++ b/src/harpy/snakefiles/qc.smk @@ -114,15 +114,18 @@ rule create_report: log: outdir + "/logs/mutliqc.log" params: - outdir + logdir = f"{outdir}/reports/data/fastp/", + module = "-m fastp", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"QC Summary\"", + comment = "--comment \"This report aggregates trimming and quality control metrics reported by fastp.\"" + conda: f"{envdir}/qc.yaml" message: "Aggregating fastp reports" shell: - """ - multiqc {params}/reports/data/fastp/ -m fastp --force --filename {output} --quiet --title "QC Summary" --comment "This report aggregates trimming and quality control metrics reported by fastp" --no-data-dir 2> {log} - """ + "multiqc {params} --filename {output} 2> {log}" rule log_workflow: default_target: True