From 7e5450d21e7c4a655a7d36d7ea9d8d6be489707f Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 21 Jun 2024 21:04:35 -0400 Subject: [PATCH 01/41] better/faster biallelic identification --- src/harpy/fileparsers.py | 35 ++++++++--------------------------- src/harpy/impute.py | 10 +++++----- 2 files changed, 13 insertions(+), 32 deletions(-) diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py index a601f3d4a..f5453d438 100644 --- a/src/harpy/fileparsers.py +++ b/src/harpy/fileparsers.py @@ -5,7 +5,6 @@ import sys import subprocess from pathlib import Path -from collections import Counter from rich.markdown import Markdown import rich_click as click from .printfunctions import print_error, print_solution_with_culprits @@ -117,21 +116,6 @@ def parse_alignment_inputs(inputs): sys.exit(1) return bam_infiles, len(uniqs) -#def get_samples_from_fastq(directory): -# """Identify the sample names from a directory containing FASTQ files""" -# full_flist = [i for i in glob.iglob(f"{directory}/*") if not os.path.isdir(i)] -# r = re.compile(r".*\.f(?:ast)?q(?:\.gz)?$", flags=re.IGNORECASE) -# full_fqlist = list(filter(r.match, full_flist)) -# fqlist = [os.path.basename(i) for i in full_fqlist] -# bn_r = r"[\.\_][RF](?:[12])?(?:\_00[1-9])*\.f(?:ast)?q(?:\.gz)?$" -# if len(fqlist) == 0: -# print_error(f"No fastq files with acceptable names found in [bold]{directory}[/bold]") -# print_solution("Check that the file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]\nRead the documentation for details: https://pdimens.github.io/harpy/haplotagdata/#naming-conventions") -# sys.exit(1) -# -# return set([re.sub(bn_r, "", i, flags = re.IGNORECASE) for i in fqlist]) - - def biallelic_contigs(vcf, workdir): """Identify which contigs have at least 2 biallelic SNPs""" vbn = os.path.basename(vcf) @@ -142,15 +126,12 @@ def biallelic_contigs(vcf, workdir): else: click.echo("Identifying which contigs have at least 2 biallelic SNPs", file = sys.stderr) os.makedirs(f"{workdir}/", exist_ok = True) - biallelic = subprocess.Popen(f"bcftools view -M2 -v snps {vcf} -Ob".split(), stdout = subprocess.PIPE) - contigs = subprocess.run("""bcftools query -f '%CHROM\\n'""".split(), stdin = biallelic.stdout, stdout = subprocess.PIPE, check = False).stdout.decode().splitlines() - counts = Counter(contigs) - contigs = [i.replace("\'", "") for i in counts if counts[i] > 1] + biallelic = subprocess.Popen(f"bcftools view -m2 -M2 -v snps {vcf}".split(), stdout = subprocess.PIPE) + contigs = subprocess.Popen("""bcftools query -f '%CHROM\\n'""".split(), stdin = biallelic.stdout, stdout = subprocess.PIPE) + valid = subprocess.run('awk \'{a[$1]++} END{for (i in a) if (a[i] >= 2) print i}\'', shell = True, stdin = contigs.stdout, capture_output=True, text=True).stdout.replace("\'", "") + if not valid: + click.echo("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.") + sys.exit(1) with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f: - _ = [f.write(f"{i}\n") for i in contigs] - - if len(contigs) == 0: - print_error("No contigs with at least 2 biallelic SNPs identified. 
Cannot continue with imputation.") - sys.exit(1) - else: - return f"{workdir}/{vbn}.biallelic" \ No newline at end of file + f.write(valid) + return f"{workdir}/{vbn}.biallelic" diff --git a/src/harpy/impute.py b/src/harpy/impute.py index 2dbed6b83..a6d4c8e97 100644 --- a/src/harpy/impute.py +++ b/src/harpy/impute.py @@ -77,6 +77,11 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) validate_bam_RG(bamlist) check_impute_params(parameters) + # the biallelic check might take a minute, so print the start text first + print_onstart( + f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/", + "impute" + ) biallelic = biallelic_contigs(vcf, f"{workflowdir}") fetch_rule(workflowdir, "impute.smk") @@ -99,11 +104,6 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para config.write(" alignments:\n") for i in bamlist: config.write(f" - {i}\n") - - print_onstart( - f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/", - "impute" - ) generate_conda_deps() _module = subprocess.run(command.split()) sys.exit(_module.returncode) From 1f5b01cff9b8bd58e678a4af0d30bdcc8a421373 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 21 Jun 2024 21:50:00 -0400 Subject: [PATCH 02/41] biallelic contig finding >400X faster --- src/harpy/fileparsers.py | 34 ++++++++++++++++++++++++++++++++++ src/harpy/impute.py | 15 +++++++-------- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py index f5453d438..1867cc56c 100644 --- a/src/harpy/fileparsers.py +++ b/src/harpy/fileparsers.py @@ -135,3 +135,37 @@ def biallelic_contigs(vcf, workdir): with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f: f.write(valid) return f"{workdir}/{vbn}.biallelic" + +def biallelic_contigs(vcf, workdir): + vbn = os.path.basename(vcf) + if os.path.exists(f"{workdir}/{vbn}.biallelic"): + with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: + contigs = [line.rstrip() for line in f] + click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr) + else: + click.echo("Identifying which contigs have at least 2 biallelic SNPs", file = sys.stderr) + os.makedirs(f"{workdir}/", exist_ok = True) + valid = [] + index_rows = subprocess.check_output(['bcftools', 'index', '-s', vcf]).decode().split('\n') + for row in index_rows: + contig = row.split("\t")[0] + # Use bcftools to count the number of biallelic SNPs in the contig + viewcmd = subprocess.Popen(['bcftools', 'view', '-r', contig, '-v', 'snps', '-m2', '-M2', '-c', '2', vcf], stdout=subprocess.PIPE) + snpcount = 0 + while True: + # Read the next line of output + line = viewcmd.stdout.readline().decode() + if not line: + break + snpcount += 1 + # If there are at least 2 biallellic snps, terminate the process + if snpcount >= 2: + valid.append(contig) + viewcmd.terminate() + break + if not valid: + click.echo("No contigs with at least 2 biallelic SNPs identified. 
Cannot continue with imputation.") + sys.exit(1) + with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f: + f.write("\n".join(valid)) + return f"{workdir}/{vbn}.biallelic" \ No newline at end of file diff --git a/src/harpy/impute.py b/src/harpy/impute.py index a6d4c8e97..75f669433 100644 --- a/src/harpy/impute.py +++ b/src/harpy/impute.py @@ -46,10 +46,9 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para individual files/folders, using shell wildcards (e.g. `data/drosophila*.bam`), or both. Requires a parameter file, use **harpy stitchparams** to generate one and adjust it for your study. - The `--vcf-samples` flag toggles to phase only the samples present in your input `--vcf` file rather than all + The `--vcf-samples` option considers only the samples present in your input `--vcf` file rather than all the samples identified in `INPUT`. If providing additional STITCH arguments, they must be in quotes and - in R language style. The extra parameters will remain constant across different models. - Use single-quotes (string literals) if supplying an argument that requires quotes. For example: + in R language style. Use single-quotes (string literals) if supplying an argument that requires quotes. For example: ``` harpy ... -x 'switchModelIteration = 39, splitReadIterations = NA, reference_populations = c("CEU","GBR")'... @@ -77,11 +76,6 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) validate_bam_RG(bamlist) check_impute_params(parameters) - # the biallelic check might take a minute, so print the start text first - print_onstart( - f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/", - "impute" - ) biallelic = biallelic_contigs(vcf, f"{workflowdir}") fetch_rule(workflowdir, "impute.smk") @@ -104,6 +98,11 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para config.write(" alignments:\n") for i in bamlist: config.write(f" - {i}\n") + + print_onstart( + f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/", + "impute" + ) generate_conda_deps() _module = subprocess.run(command.split()) sys.exit(_module.returncode) From 03d5cf4c6617c8f9afef8305101d6fd6e59f4272 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 21 Jun 2024 21:50:55 -0400 Subject: [PATCH 03/41] fix and docstring --- src/harpy/fileparsers.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py index 1867cc56c..59b3460fe 100644 --- a/src/harpy/fileparsers.py +++ b/src/harpy/fileparsers.py @@ -117,26 +117,7 @@ def parse_alignment_inputs(inputs): return bam_infiles, len(uniqs) def biallelic_contigs(vcf, workdir): - """Identify which contigs have at least 2 biallelic SNPs""" - vbn = os.path.basename(vcf) - if os.path.exists(f"{workdir}/{vbn}.biallelic"): - with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: - contigs = [line.rstrip() for line in f] - click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr) - else: - click.echo("Identifying which contigs have at least 2 biallelic SNPs", file = sys.stderr) - os.makedirs(f"{workdir}/", exist_ok = True) - biallelic = subprocess.Popen(f"bcftools view -m2 -M2 -v snps {vcf}".split(), stdout = subprocess.PIPE) - contigs = 
subprocess.Popen("""bcftools query -f '%CHROM\\n'""".split(), stdin = biallelic.stdout, stdout = subprocess.PIPE) - valid = subprocess.run('awk \'{a[$1]++} END{for (i in a) if (a[i] >= 2) print i}\'', shell = True, stdin = contigs.stdout, capture_output=True, text=True).stdout.replace("\'", "") - if not valid: - click.echo("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.") - sys.exit(1) - with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f: - f.write(valid) - return f"{workdir}/{vbn}.biallelic" - -def biallelic_contigs(vcf, workdir): + """Identify which contigs have at least 2 biallelic SNPs and write them to workdir/vcf.biallelic""" vbn = os.path.basename(vcf) if os.path.exists(f"{workdir}/{vbn}.biallelic"): with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: From 6808db9e323b43fdbe3641a62273bb2160fd3032 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 12:18:54 -0400 Subject: [PATCH 04/41] fix printing --- src/harpy/snakefiles/impute.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harpy/snakefiles/impute.smk b/src/harpy/snakefiles/impute.smk index 31b2ab326..1a531300a 100644 --- a/src/harpy/snakefiles/impute.smk +++ b/src/harpy/snakefiles/impute.smk @@ -135,7 +135,7 @@ rule impute: threads: workflow.cores message: - "Imputing {wildcards.part}:\nmodel: {wildcards.model}, useBX: {wildcards}, k: {wildcards.k}, bxLimit: {wildcards.bxlimit}, s: {wildcards.s}, nGen: {wildcards.ngen}" + "Imputing {wildcards.part}:\nmodel: {wildcards.model}, useBX: {wildcards.usebx}, k: {wildcards.k}, bxLimit: {wildcards.bxlimit}, s: {wildcards.s}, nGen: {wildcards.ngen}" script: "scripts/stitch_impute.R" From 20afbb9212b21a5f09e4da1efc86f91aa6da4d73 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 12:24:51 -0400 Subject: [PATCH 05/41] force biallelic to always rewrite --- src/harpy/fileparsers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py index 59b3460fe..ef56f7db9 100644 --- a/src/harpy/fileparsers.py +++ b/src/harpy/fileparsers.py @@ -119,13 +119,13 @@ def parse_alignment_inputs(inputs): def biallelic_contigs(vcf, workdir): """Identify which contigs have at least 2 biallelic SNPs and write them to workdir/vcf.biallelic""" vbn = os.path.basename(vcf) - if os.path.exists(f"{workdir}/{vbn}.biallelic"): - with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: - contigs = [line.rstrip() for line in f] - click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr) - else: - click.echo("Identifying which contigs have at least 2 biallelic SNPs", file = sys.stderr) - os.makedirs(f"{workdir}/", exist_ok = True) + #if os.path.exists(f"{workdir}/{vbn}.biallelic"): + # with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: + # contigs = [line.rstrip() for line in f] + # click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr) + #else: + click.echo("\nIdentifying which contigs have at least 2 biallelic SNPs", file = sys.stderr) + os.makedirs(f"{workdir}/", exist_ok = True) valid = [] index_rows = subprocess.check_output(['bcftools', 'index', '-s', vcf]).decode().split('\n') for row in index_rows: From c994523e740785e7f518ffa8e86be81153d08e57 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 14:38:47 -0400 Subject: [PATCH 06/41] no version check --- 
src/harpy/snakefiles/align-bwa.smk | 2 +- src/harpy/snakefiles/align-ema.smk | 2 +- src/harpy/snakefiles/align-minimap.smk | 2 +- src/harpy/snakefiles/align-strobealign.smk | 9 +++++---- src/harpy/snakefiles/demultiplex.gen1.smk | 2 +- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index 6c51ccdb9..1ca375c1d 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -317,7 +317,7 @@ rule samtools_reports: "Summarizing samtools stats and flagstat" shell: """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null + multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null """ rule log_workflow: diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index 77d5d0167..68857bc4b 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -463,7 +463,7 @@ rule collate_samtools_stats: "Summarizing samtools stats and flagstat" shell: """ - multiqc {outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null + multiqc {outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null """ rule log_workflow: diff --git a/src/harpy/snakefiles/align-minimap.smk b/src/harpy/snakefiles/align-minimap.smk index 5fb40c9c8..46fb46b1e 100644 --- a/src/harpy/snakefiles/align-minimap.smk +++ b/src/harpy/snakefiles/align-minimap.smk @@ -316,7 +316,7 @@ rule samtools_reports: "Summarizing samtools stats and flagstat" shell: """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null + multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." 
--no-data-dir --filename {output} 2> /dev/null """ rule log_workflow: diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index e04699be6..d9c441c33 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -18,6 +18,7 @@ skipreports = config["skipreports"] windowsize = config["depth_windowsize"] molecule_distance = config["molecule_distance"] readlen = config["average_read_length"] +autolen = isinstance(readlen, str) wildcard_constraints: sample = "[a-zA-Z0-9._-]+" @@ -111,14 +112,14 @@ rule align: input: fastq = get_fq, genome = f"Genome/{bn}", - genome_index = f"Genome/{bn}.r{readlen}.sti" + genome_index = f"Genome/{bn}.r{readlen}.sti" if not autolen else [] output: pipe(outdir + "/samples/{sample}/{sample}.raw.sam") log: outdir + "/logs/{sample}.strobealign.log" params: samps = lambda wc: d[wc.get("sample")], - readlen = readlen, + readlen = "" if autolen else f"--use-index -r {readlen}", extra = extra benchmark: ".Benchmark/Mapping/strobealign/align.{sample}.txt" @@ -129,7 +130,7 @@ rule align: message: "Aligning sequences: {wildcards.sample}" shell: - "strobealign --use-index -r {params.readlen} -t {threads} -U -C --rg=SM:{wildcards.sample} {params.extra} {input.genome} {input.fastq} > {output} 2> {log}" + "strobealign {params.readlen} -t {threads} -U -C --rg-id={wildcards.sample} --rg=SM:{wildcards.sample} {params.extra} {input.genome} {input.fastq} > {output} 2> {log}" rule quality_filter: input: @@ -313,7 +314,7 @@ rule samtools_reports: "Summarizing samtools stats and flagstat" shell: """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null + multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null """ rule log_workflow: diff --git a/src/harpy/snakefiles/demultiplex.gen1.smk b/src/harpy/snakefiles/demultiplex.gen1.smk index 378bd9093..2b859556b 100644 --- a/src/harpy/snakefiles/demultiplex.gen1.smk +++ b/src/harpy/snakefiles/demultiplex.gen1.smk @@ -191,7 +191,7 @@ rule qc_report: "Creating final demultiplexing QC report" shell: """ - multiqc {params} --force --quiet --title "QC for Demultiplexed Samples" --comment "This report aggregates the QC results created by falco." --no-data-dir --filename {output} 2> /dev/null + multiqc {params} --no-version-check --force --quiet --title "QC for Demultiplexed Samples" --comment "This report aggregates the QC results created by falco." 
--no-data-dir --filename {output} 2> /dev/null """ rule log_workflow: From 53bd87cd71df262305b87dd87126cf887942053b Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 14:46:10 -0400 Subject: [PATCH 07/41] update summary --- src/harpy/align.py | 11 ++++++----- src/harpy/snakefiles/align-strobealign.smk | 12 ++++++++---- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/harpy/align.py b/src/harpy/align.py index 08acc4a7c..33e6735cd 100644 --- a/src/harpy/align.py +++ b/src/harpy/align.py @@ -318,7 +318,7 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping') @click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules') @click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') -@click.option('-r', '--read-length', default = "125", show_default = True, type = click.Choice(["50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index') +@click.option('-r', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index') @click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') @click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') @click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') @@ -337,11 +337,12 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra Provide the input fastq files and/or directories at the end of the command as individual files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both. - strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does - not use barcodes when mapping. Harpy will post-processes the alignments using the + strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp and does + not use barcodes when mapping, so Harpy will post-processes the alignments using the specified `--molecule-distance` to assign alignments to unique molecules. The `--read-length` is - an *approximate* parameter and should be one of [`50`, `75`, `100`, `125`, `150`, `250`, `400`]. - If your input is post-qc sequences, then you should expect the read lengths to be <150. + an *approximate* parameter and should be one of [`auto`, `50`, `75`, `100`, `125`, `150`, `250`, `400`]. + The alignment process will be faster and take up less disk/RAM if you specify an `-r` value that isn't + `auto`. If your input has adapters removed, then you should expect the read lengths to be <150. 
""" output_dir = output_dir.rstrip("/") workflowdir = f"{output_dir}/workflow" diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index d9c441c33..e69510f93 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -333,10 +333,14 @@ rule log_workflow: with open(outdir + "/workflow/align.strobealign.summary", "w") as f: _ = f.write("The harpy align strobealign workflow ran using these parameters:\n\n") _ = f.write(f"The provided genome: {bn}\n") - _ = f.write("The genome index was created using:\n") - _ = f.write(" strobealign --create-index -r {params.readlen} genome\n") - _ = f.write("Sequencing were aligned with strobealign using:\n") - _ = f.write(f" strobealign --use-index -U -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") + if autolen: + _ = f.write("Sequencing were aligned with strobealign using:\n") + _ = f.write(f" strobealign -U -C --rg-id=SAMPLE --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") + else: + _ = f.write("The genome index was created using:\n") + _ = f.write(f" strobealign --create-index -r {params.readlen} genome\n") + _ = f.write("Sequencing were aligned with strobealign using:\n") + _ = f.write(f" strobealign --use-index -U -C --rg=SM:SAMPLE {params.extra} genome reads.F.fq reads.R.fq |\n") _ = f.write(f" samtools view -h -F 4 -q {params.quality} |\n") _ = f.write("Duplicates in the alignments were marked following:\n") _ = f.write(" samtools collate \n") From 442448b4d72f8333902c86a3b4acc769f33e22ac Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 15:48:06 -0400 Subject: [PATCH 08/41] linting improvements, output total unique bx as comment at bottom --- src/harpy/scripts/bxStats.py | 50 +++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/src/harpy/scripts/bxStats.py b/src/harpy/scripts/bxStats.py index 01f923731..205d16a6b 100755 --- a/src/harpy/scripts/bxStats.py +++ b/src/harpy/scripts/bxStats.py @@ -1,5 +1,4 @@ import re -import sys import gzip import pysam @@ -7,31 +6,33 @@ outfile = gzip.open(snakemake.output[0], "wb", 6) outfile.write(b"contig\tmolecule\treads\tstart\tend\tlength_inferred\taligned_bp\tinsert_len\tcoverage_bp\tcoverage_inserts\n") -d = dict() -chromlast = None +d = {} +all_bx = set() +chrom_last = None -def writestats(x, contig): - for mi in x: - x[mi]["inferred"] = x[mi]["end"] - x[mi]["start"] +def writestats(x, writechrom): + """write to file the bx stats dictionary as a table""" + for _mi in x: + x[_mi]["inferred"] = x[_mi]["end"] - x[_mi]["start"] try: - x[mi]["covered_bp"] = min(x[mi]["bp"] / x[mi]["inferred"], 1.0) + x[_mi]["covered_bp"] = min(x[_mi]["bp"] / x[_mi]["inferred"], 1.0) except: - x[mi]["covered_bp"] = 0 + x[_mi]["covered_bp"] = 0 try: - x[mi]["covered_inserts"] = min(x[mi]["insert_len"] / x[mi]["inferred"], 1.0) + x[_mi]["covered_inserts"] = min(x[_mi]["insert_len"] / x[_mi]["inferred"], 1.0) except: - x[mi]["covered_inferred"] = 0 - outtext = f"{chromlast}\t{mi}\t" + "\t".join([str(x[mi][i]) for i in ["n", "start","end", "inferred", "bp", "insert_len", "covered_bp", "covered_inserts"]]) + x[_mi]["covered_inferred"] = 0 + outtext = f"{writechrom}\t{mi}\t" + "\t".join([str(x[_mi][i]) for i in ["n", "start","end", "inferred", "bp", "insert_len", "covered_bp", "covered_inserts"]]) outfile.write(outtext.encode() + b"\n") for read in alnfile.fetch(): chrom = read.reference_name # check if the current chromosome is different from the 
previous one # if so, print the dict to file and empty it (a consideration for RAM usage) - if chromlast and chrom != chromlast: - writestats(d, chromlast) - d = dict() - chromlast = chrom + if chrom_last and chrom != chrom_last: + writestats(d, chrom_last) + d = {} + chrom_last = chrom # skip duplicates, unmapped, and secondary alignments if read.is_duplicate or read.is_unmapped or read.is_secondary: continue @@ -46,10 +47,11 @@ def writestats(x, contig): try: mi = read.get_tag("MI") + bx = read.get_tag("BX") # do a regex search to find X00 pattern in the BX - if re.search("[ABCD]0{2,4}", read.get_tag("BX")): + if re.search("[ABCD]0{2,4}", bx): # if found, invalid - if "invalidBX" not in d.keys(): + if "invalidBX" not in d: d["invalidBX"] = { "start": 0, "end": 0, @@ -61,6 +63,8 @@ def writestats(x, contig): d["invalidBX"]["bp"] += bp d["invalidBX"]["n"] += 1 continue + # add valid bx to set of all unique barcodes + all_bx.add(bx) except: # There is no bx/MI tag continue @@ -73,8 +77,10 @@ def writestats(x, contig): if read.is_paired: # by using max(), will either add 0 or positive TLEN to avoid double-counting isize = max(0, read.template_length) + #isize = min(bp, abs(read.template_length)) # only count the bp of the first read of paired end reads - bp = bp if read.is_read1 else 0 + #bp = bp if read.is_read1 else 0 + bp = min(abs(read.template_length, bp), read.infer_query_length()) elif read.is_supplementary: # if it's a supplementary alignment, just use the alignment length isize = bp @@ -82,7 +88,7 @@ def writestats(x, contig): # if it's unpaired, use the TLEN or query length, whichever is bigger isize = max(abs(read.template_length), read.infer_query_length()) # create bx entry if it's not present - if mi not in d.keys(): + if mi not in d: d[mi] = { "start": pos_start, "end": pos_end, @@ -99,5 +105,7 @@ def writestats(x, contig): d[mi]["end"] = max(pos_end, d[mi]["end"]) # print the last entry -writestats(d, chrom) -outfile.close() \ No newline at end of file +writestats(d, chrom_last) +# write comment on the last line with the total number of unique BX barcodes +outfile.write(f"#total unique barcodes: {len(all_bx)}") +outfile.close() From 2994e055b5688695bf4a601cf38ae0882a29d5c7 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 15:58:31 -0400 Subject: [PATCH 09/41] fix total bx calc --- src/harpy/reports/AlignStats.Rmd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 22d020ec3..9ed74a127 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -33,7 +33,7 @@ bamfile <- gsub(".bxstats.gz", ".bam", infile) samplename <- gsub(".bxstats.gz", "", basename(infile)) tb <- read.table(infile, header = T, sep = "\t") %>% select(-start, -end) tb$valid <- tb$molecule -tb[!(tb$valid %in% c("noBX", "invalidBX")), "valid"] <- "validBX" +tb[tb$valid != "invalidBX", "valid"] <- "validBX" tb$valid <- gsub("BX", " BX", tb$valid) ``` @@ -44,13 +44,14 @@ nBX <- group_by(valids, contig) %>% avgBX <- round(mean(nBX$nBX), digits = 2) -totuniqBX <- length(unique(valids$molecule)) +totuniqBX <- system(paste("zcat",snakemake@input[[1]]), "| tail -1") +totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer() tots <- tb %>% group_by(valid) %>% summarize(total = length(molecule)) %>% as.data.frame() -for(i in c("no BX", "invalid BX", "valid BX")){ +for(i in c("invalid BX", "valid BX")){ if (!(i %in% tots$valid)){ tots <- rbind(tots, data.frame("valid" = 
i, total = 0)) } From d3c7ca1c35e08418de2bc4196142e7e8caa6fb07 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 16:04:00 -0400 Subject: [PATCH 10/41] simpler, reliable valid/invalid counts --- src/harpy/reports/AlignStats.Rmd | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 9ed74a127..4da93f040 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -51,11 +51,9 @@ tots <- tb %>% group_by(valid) %>% summarize(total = length(molecule)) %>% as.data.frame() -for(i in c("invalid BX", "valid BX")){ - if (!(i %in% tots$valid)){ - tots <- rbind(tots, data.frame("valid" = i, total = 0)) - } -} + +tot_invalid <- sum(tots$valid == "invalid BX") +tot_valid <- sum(tots$valid == "valid BX") ``` ```{r nxx, echo = F, results = F, message = F} @@ -80,12 +78,12 @@ valueBox(scales::comma(length(unique(tb$contig))), caption = "Contigs") ### validBX ```{r} -valueBox(scales::comma(tots[tots$valid == "valid BX", 2]), caption = "Valid Barcodes", color = "success", icon = "fa-vial-circle-check") +valueBox(scales::comma(tot_valid), caption = "Valid Barcodes", color = "success", icon = "fa-vial-circle-check") ``` ### invalidBX ```{r} -valueBox(scales::comma(tots[tots$valid == "invalid BX", 2]), caption = "Invalid Barcodes", color = "warning", icon = "fa-x") +valueBox(scales::comma(tot_invalid), caption = "Invalid Barcodes", color = "warning", icon = "fa-x") ``` ### glob-avg From 109a5a619a09b86f749e2e8388750abe7379aec1 Mon Sep 17 00:00:00 2001 From: pdimens Date: Mon, 24 Jun 2024 16:13:50 -0400 Subject: [PATCH 11/41] begin groundwork for bx aggregate report --- src/harpy/reports/BxStatsAggregate.Rmd | 52 ++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 src/harpy/reports/BxStatsAggregate.Rmd diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd new file mode 100644 index 000000000..e966ed332 --- /dev/null +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -0,0 +1,52 @@ +--- +title: "Harpy Alignment Barcode Summary" +date: "`r format(Sys.time(), '%m-%d-%y %X')`" +output: + flexdashboard::flex_dashboard: + theme: lumen + orientation: rows + vertical_layout: scroll + horizontal_layout: fill + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" +--- + +```{r echo = F, results = F, message = F} +using<-function(...) 
{ + libs<-unlist(list(...)) + req<-unlist(lapply(libs,require,character.only=TRUE)) + need<-libs[req==FALSE] + if(length(need)>0){ + install.packages(need, repos = "https://cloud.r-project.org/") + lapply(need,require,character.only=TRUE) + } +} + +using("flexdashboard","dplyr","highcharter","DT") +``` + +```{r bxper, echo = F, results = F, message = F} +valids <- filter(tb, valid == "valid BX") +nBX <- group_by(valids, contig) %>% + summarize(nBX = length(molecule)) + +avgBX <- round(mean(nBX$nBX), digits = 2) + +totuniqBX <- system(paste("zcat",snakemake@input[[1]]), "| tail -1") +totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer() + +tots <- tb %>% + group_by(valid) %>% + summarize(total = length(molecule)) %>% + as.data.frame() + +tot_invalid <- sum(tots$valid == "invalid BX") +tot_valid <- sum(tots$valid == "valid BX") +``` + +```{r nxx, echo = F, results = F, message = F} +NX <- function(lengths, X=50){ + lengths <- as.numeric(sort(lengths, decreasing=TRUE)) + index <- which(cumsum(lengths) / sum(lengths) >= X/100)[1] + return(lengths[index]) +} +``` \ No newline at end of file From 8f23aecd9fd3a05bb0fbdea4241d663534b442be Mon Sep 17 00:00:00 2001 From: pdimens Date: Tue, 25 Jun 2024 11:50:46 -0400 Subject: [PATCH 12/41] swap checking order --- src/harpy/impute.py | 6 +- src/harpy/reports/BxStatsAggregate.Rmd | 93 ++++++++++++++++++++------ 2 files changed, 76 insertions(+), 23 deletions(-) diff --git a/src/harpy/impute.py b/src/harpy/impute.py index 75f669433..bbf25c4c4 100644 --- a/src/harpy/impute.py +++ b/src/harpy/impute.py @@ -71,11 +71,11 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para sys.exit(0) os.makedirs(f"{workflowdir}/", exist_ok= True) - bamlist, n = parse_alignment_inputs(inputs) validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"]) - samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) - validate_bam_RG(bamlist) check_impute_params(parameters) + bamlist, n = parse_alignment_inputs(inputs) + validate_bam_RG(bamlist) + samplenames = vcf_samplematch(vcf, bamlist, vcf_samples) biallelic = biallelic_contigs(vcf, f"{workflowdir}") fetch_rule(workflowdir, "impute.smk") diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index e966ed332..5632375d5 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -10,7 +10,7 @@ output: favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" --- -```{r echo = F, results = F, message = F} +```{r imports, echo = F, results = F, message = F} using<-function(...) { libs<-unlist(list(...)) req<-unlist(lapply(libs,require,character.only=TRUE)) @@ -24,29 +24,82 @@ using<-function(...) 
{ using("flexdashboard","dplyr","highcharter","DT") ``` -```{r bxper, echo = F, results = F, message = F} -valids <- filter(tb, valid == "valid BX") -nBX <- group_by(valids, contig) %>% - summarize(nBX = length(molecule)) +```{r nxx_and_process_funs, echo = F, results = F, message = F} +NX <- function(lengths, X=50){ + lengths <- as.numeric(sort(lengths, decreasing=TRUE)) + index <- which(cumsum(lengths) / sum(lengths) >= X/100)[1] + return(lengths[index]) +} -avgBX <- round(mean(nBX$nBX), digits = 2) +process_input <- function(infile){ + bamfile <- gsub(".bxstats.gz", ".bam", infile) + samplename <- gsub(".bxstats.gz", "", basename(infile)) + tb <- read.table(infile, header = T, sep = "\t") %>% select(1,2,3,6) + tb$valid <- tb$molecule + tb[tb$valid != "invalidBX", "valid"] <- "validBX" + tb$valid <- gsub("BX", " BX", tb$valid) -totuniqBX <- system(paste("zcat",snakemake@input[[1]]), "| tail -1") -totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer() + nMol <- group_by(tb[tb$valid == "valid BX",], contig) %>% + summarize(nMol = length(molecule)) + avg_mol <- round(mean(nMol$nMol), digits = 2) -tots <- tb %>% - group_by(valid) %>% - summarize(total = length(molecule)) %>% - as.data.frame() + tot_uniq_bx <- system(paste("zcat",infile, "| tail -1"), intern = T) + tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx) |> as.integer() -tot_invalid <- sum(tots$valid == "invalid BX") -tot_valid <- sum(tots$valid == "valid BX") + tot_mol <- sum(tb$valid == "valid BX") + tot_valid_reads <- sum(tb[tb$valid == "valid BX", "reads"]) + tot_invalid_reads <- sum(tb[tb$valid == "invalid BX", "reads"]) + n50 <- NX(tb$length_inferred, 50) + n75 <- NX(tb$length_inferred, 75) + n90 <- NX(tb$length_inferred, 90) + + outrow <- data.frame( + sample = samplename, + totaluniquemol = tot_mol, + totaluniquebx = tot_uniq_bx, + molecule2bxratio = round(tot_mol / tot_uniq_bx,2), + totalvalidreads = tot_valid_reads, + totalinvalidreads = tot_invalid_reads, + totavalidpercent = round(tot_valid_reads/(tot_valid_reads + tot_invalid_reads) * 100,2), + averagemolpercont = avg_mol, + averagereadspermol = round(tot_valid_reads/tot_mol,1), + N50 = n50, + N75 = n75, + N90 = n90 + ) + return(outrow) +} ``` -```{r nxx, echo = F, results = F, message = F} -NX <- function(lengths, X=50){ - lengths <- as.numeric(sort(lengths, decreasing=TRUE)) - index <- which(cumsum(lengths) / sum(lengths) >= X/100)[1] - return(lengths[index]) +```{r setup_df, echo = F, results = F, message = F} +#infiles <- snakemake@input[[1]] +infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") + +aggregate_df <- data.frame( + sample = character(), + totaluniquemol = integer(), + totaluniquebx = integer(), + molecule2bxratio = numeric(), + totalvalidreads = integer(), + totalinvalidreads = integer(), + totalvalidpercent = numeric(), + averagemolpercont = integer(), + averagereadspermol = numeric(), + N50 = integer(), + N75 = integer(), + N90 = integer() +) +``` + +```{r setup_df, echo = F, results = F, message = F} +for(i in infiles){ + aggregate_df <- rbind(aggregate_df, process_input(i)) } -``` \ No newline at end of file +``` + +#TODO highcharter areaspline of NXX calcs + +#TODO static? 
areaspline of total valid percent across all samples + +#TODO average molecules per contig plot WITH standard deviation per sample +#TODO will require adding sd() calculation above \ No newline at end of file From a0a0e6af5b3d015efc528db327dec18ea1831c43 Mon Sep 17 00:00:00 2001 From: pdimens Date: Tue, 25 Jun 2024 20:19:58 -0400 Subject: [PATCH 13/41] add plots --- src/harpy/reports/BxStatsAggregate.Rmd | 106 ++++++++++++++++++++++--- 1 file changed, 94 insertions(+), 12 deletions(-) diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index 5632375d5..41a7ae6a1 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -21,7 +21,7 @@ using<-function(...) { } } -using("flexdashboard","dplyr","highcharter","DT") +using("flexdashboard","dplyr","tidyr", "highcharter","DT") ``` ```{r nxx_and_process_funs, echo = F, results = F, message = F} @@ -31,6 +31,10 @@ NX <- function(lengths, X=50){ return(lengths[index]) } +std_error <- function(x){ + sd(x)/sqrt(length((x))) +} + process_input <- function(infile){ bamfile <- gsub(".bxstats.gz", ".bam", infile) samplename <- gsub(".bxstats.gz", "", basename(infile)) @@ -39,15 +43,18 @@ process_input <- function(infile){ tb[tb$valid != "invalidBX", "valid"] <- "validBX" tb$valid <- gsub("BX", " BX", tb$valid) - nMol <- group_by(tb[tb$valid == "valid BX",], contig) %>% + nMolpercont <- group_by(tb[tb$valid == "valid BX",], contig) %>% summarize(nMol = length(molecule)) - avg_mol <- round(mean(nMol$nMol), digits = 2) - + + avg_mol <- round(mean(nMolpercont$nMol), digits = 2) + sem_mol_per_cont <- round(std_error(nMolpercont$nMol), digits = 3) tot_uniq_bx <- system(paste("zcat",infile, "| tail -1"), intern = T) tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx) |> as.integer() tot_mol <- sum(tb$valid == "valid BX") tot_valid_reads <- sum(tb[tb$valid == "valid BX", "reads"]) + avg_reads_per_mol <- round(tot_valid_reads/tot_mol,1) + sem_reads_per_mol <- round(std_error(tb[tb$valid == "valid BX", "reads"]), 1) tot_invalid_reads <- sum(tb[tb$valid == "invalid BX", "reads"]) n50 <- NX(tb$length_inferred, 50) n75 <- NX(tb$length_inferred, 75) @@ -60,9 +67,11 @@ process_input <- function(infile){ molecule2bxratio = round(tot_mol / tot_uniq_bx,2), totalvalidreads = tot_valid_reads, totalinvalidreads = tot_invalid_reads, - totavalidpercent = round(tot_valid_reads/(tot_valid_reads + tot_invalid_reads) * 100,2), + totalvalidpercent = round(tot_valid_reads/(tot_valid_reads + tot_invalid_reads) * 100,2), averagemolpercont = avg_mol, - averagereadspermol = round(tot_valid_reads/tot_mol,1), + semolpercont = sem_mol_per_cont, + averagereadspermol = avg_reads_per_mol, + sereadspermol = sem_reads_per_mol, N50 = n50, N75 = n75, N90 = n90 @@ -84,22 +93,95 @@ aggregate_df <- data.frame( totalinvalidreads = integer(), totalvalidpercent = numeric(), averagemolpercont = integer(), + semolpercont = numeric(), averagereadspermol = numeric(), + sereadspermol = numeric(), N50 = integer(), N75 = integer(), N90 = integer() ) -``` -```{r setup_df, echo = F, results = F, message = F} for(i in infiles){ aggregate_df <- rbind(aggregate_df, process_input(i)) } +aggregate_df$totalreads <- (aggregate_df$totalvalidreads + aggregate_df$totalinvalidreads) ``` -#TODO highcharter areaspline of NXX calcs +```{r nxxplot, echo = F, results = F, message = F} +highchart() |> + hc_chart(type = "area") |> + hc_add_series(data = density(aggregate_df$N50), name = "N50", type = "areaspline") |> + 
hc_add_series(data = density(aggregate_df$N75), name = "N75", type = "areaspline") |> + hc_add_series(data = density(aggregate_df$N90), name = "N90", color = "#d3805f", type = "areaspline") |> + hc_tooltip(enabled = FALSE) |> + hc_title(text = "NX Stats Across Samples") |> + hc_xAxis(title = list(text = "NX value"), min = 0) |> + hc_yAxis(title = list(text = "density")) |> + hc_exporting(enabled = T, filename = "NX.stats", + buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` -#TODO static? areaspline of total valid percent across all samples +```{r perc_valid_dist, echo = F, results = F, message = F} +hs <- hist( + aggregate_df$totalvalidpercent, + breaks = min(aggregate_df$totalvalidpercent):max(aggregate_df$totalvalidpercent), + plot = F +) +hs$counts <- round(hs$counts / sum(hs$counts) * 100, 4) +hs <- data.frame(val = hs$breaks[-1], freq = hs$counts) -#TODO average molecules per contig plot WITH standard deviation per sample -#TODO will require adding sd() calculation above \ No newline at end of file +hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Percent Alignments", marker = list(enabled = FALSE)) |> + hc_title(text = "Percent of Alignments with Valid BX tags") |> + hc_xAxis(min = 0, max = 100, title = list(text = "percent alignment with valid BX tag")) |> + hc_yAxis(max = 100, title = list(text = "% samples")) |> + hc_tooltip(crosshairs = TRUE) |> + hc_exporting(enabled = T, filename = "percent.valid.dist", + buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` + +```{r perc_valid_per_sample, echo = F, results = F, message = F} +hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, + hcaes(x = 0, x2 = totalreads, y = 0:(nrow(aggregate_df)-1), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> + hc_xAxis(title = list(text = "Total Alignments")) |> + hc_yAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_tooltip(enabled = FALSE) |> + hc_colors(color = "#95d8a7") |> + hc_title(text = "Percent Alignments with Valid BX Barcode") |> + hc_exporting(enabled = T, filename = "BX.valid.alignments", + buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` +#TODO custom tooltip fixes: y = samplename, x = y_value +```{r mol_per_contig, echo = F, results = F, message = F} +# Create a scatter plot with horizontal error bars +err_df <- data.frame(x = 0:(nrow(aggregate_df)-1), y = aggregate_df$averagemolpercont, low = aggregate_df$averagemolpercont - aggregate_df$semolpercont, high = aggregate_df$averagemolpercont + aggregate_df$semolpercont) +highchart() |> + hc_chart(inverted=TRUE) |> + hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolpercont), marker = list(radius = 8), color = "#c26d9b", zIndex = 1) |> + hc_add_series(data = err_df, type = "errorbar", name = "standard deviation", linkedTo = ":previous", zIndex = 0, stemColor = "#c26d9b", whiskerColor = "#c98fae") |> + hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_title(text = "Average Molecules Per Contig") |> + hc_caption(text = "Error bars show the standard error of the mean") |> + hc_tooltip(crosshairs = TRUE) |> + hc_exporting(enabled = 
T, filename = "avg.molecules.per.contig", + buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` + +#TODO average reads per molecule with SD +```{r reads_per_, echo = F, results = F, message = F} +err_df <- data.frame(x = 0:(nrow(aggregate_df)-1), y = aggregate_df$averagereadspermol, low = aggregate_df$averagereadspermol - aggregate_df$sereadspermol, high = aggregate_df$averagereadspermol + aggregate_df$sereadspermol) +highchart() |> + hc_chart(inverted=TRUE) |> + hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagereadspermol), marker = list(radius = 8), color = "#6d73c2", zIndex = 1) |> + hc_add_series(data = err_df, type = "errorbar", name = "standard deviation", linkedTo = ":previous", zIndex = 0, stemColor = "#8186c7", whiskerColor = "#8186c7") |> + hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_title(text = "Average Reads Per Molecule") |> + hc_caption(text = "Error bars show the standard error of the mean") |> + hc_tooltip(crosshairs = TRUE) |> + hc_exporting(enabled = T, filename = "avg.reads.per.molecule", + buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) +``` \ No newline at end of file From 004aad15ac68db7cf4217fa7991db2c1a0310853 Mon Sep 17 00:00:00 2001 From: pdimens Date: Wed, 26 Jun 2024 10:42:08 -0400 Subject: [PATCH 14/41] improve --- src/harpy/fileparsers.py | 16 ++++++---------- src/harpy/impute.py | 4 ++-- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py index ef56f7db9..969ea0777 100644 --- a/src/harpy/fileparsers.py +++ b/src/harpy/fileparsers.py @@ -119,17 +119,13 @@ def parse_alignment_inputs(inputs): def biallelic_contigs(vcf, workdir): """Identify which contigs have at least 2 biallelic SNPs and write them to workdir/vcf.biallelic""" vbn = os.path.basename(vcf) - #if os.path.exists(f"{workdir}/{vbn}.biallelic"): - # with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f: - # contigs = [line.rstrip() for line in f] - # click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr) - #else: - click.echo("\nIdentifying which contigs have at least 2 biallelic SNPs", file = sys.stderr) os.makedirs(f"{workdir}/", exist_ok = True) valid = [] - index_rows = subprocess.check_output(['bcftools', 'index', '-s', vcf]).decode().split('\n') - for row in index_rows: - contig = row.split("\t")[0] + vcfheader = subprocess.check_output(['bcftools', 'view', '-h', vcf]).decode().split('\n') + header_contigs = [i.split(",")[0].replace("##contig= Date: Wed, 26 Jun 2024 10:46:39 -0400 Subject: [PATCH 15/41] add conditional for vcf.gz --- src/harpy/fileparsers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/harpy/fileparsers.py b/src/harpy/fileparsers.py index 969ea0777..72ac61bbd 100644 --- a/src/harpy/fileparsers.py +++ b/src/harpy/fileparsers.py @@ -125,6 +125,9 @@ def biallelic_contigs(vcf, workdir): header_contigs = [i.split(",")[0].replace("##contig= Date: Wed, 26 Jun 2024 12:18:48 -0400 Subject: [PATCH 16/41] rm fullscreen --- src/harpy/reports/AlignStats.Rmd | 12 ++++---- src/harpy/reports/BcftoolsStats.Rmd | 10 +++---- src/harpy/reports/BxCount.Rmd | 8 +++--- src/harpy/reports/BxStatsAggregate.Rmd | 39 +++++++++++++++----------- 
src/harpy/reports/EmaCount.Rmd | 4 +-- src/harpy/reports/HapCut2.Rmd | 2 +- src/harpy/reports/Impute.Rmd | 4 +-- src/harpy/reports/PreflightBam.Rmd | 2 +- src/harpy/reports/PreflightFastq.Rmd | 2 +- 9 files changed, 44 insertions(+), 39 deletions(-) diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 4da93f040..6d7f66d6f 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -137,7 +137,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".readsper"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -154,7 +154,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#75b89e", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".basesper"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -185,7 +185,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#b3519d", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".mollen"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -226,7 +226,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#e59765", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".molcov"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -247,7 +247,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#dbe465", name = "% hc_caption(text = paste0("Total unique molecules: ", totuniqBX)) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = paste0(samplename, ".molcov"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -401,7 +401,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#9393d2", name = "% hc_yAxis(title = list(text = "% molecules")) |> hc_title(text = "Distribution of Alignment Depths") |> hc_exporting(enabled = T, filename = paste0(samplename, ".cov"), - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = 
list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/BcftoolsStats.Rmd b/src/harpy/reports/BcftoolsStats.Rmd index f51e06f23..e82fda556 100644 --- a/src/harpy/reports/BcftoolsStats.Rmd +++ b/src/harpy/reports/BcftoolsStats.Rmd @@ -157,7 +157,7 @@ if(do_qual){ formatter = JS("function () {return 'Quality ' + this.x + '
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "snp.quality", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } else { @@ -181,7 +181,7 @@ if(do_qual){ formatter = JS("function () {return 'Quality ' + this.x + '
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "indel.quality", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } else { @@ -235,7 +235,7 @@ if(sum(.dpL) == 0){ formatter = JS("function () {return 'depth: ' + this.x + '
Percent: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "genotype.depth", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } ``` @@ -287,7 +287,7 @@ if(sum(.iddL) == 0){ formatter = JS("function () {return '' + this.series.name + '
Length: ' + Math.abs(this.x) + '<br>
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "indel.distribution", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) } ``` @@ -324,7 +324,7 @@ highchart() |> pointFormat= '{point.category}
Count: {point.y:.0f}' ) |> hc_exporting(enabled = T, filename = "samples.variantstats", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/BxCount.Rmd b/src/harpy/reports/BxCount.Rmd index 0886d489d..20fb2604f 100644 --- a/src/harpy/reports/BxCount.Rmd +++ b/src/harpy/reports/BxCount.Rmd @@ -224,7 +224,7 @@ hchart(inv_hist, "areaspline", hcaes(x = Bin, y = Count, group = Segment), stack formatter = JS("function () {return '' + this.series.name + '
' + this.x + '% invalid<br>
Count: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "indel.quality", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -274,7 +274,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "nor hc_yAxis(title = list(text = "count")) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "bxcount", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` ### proportional valid barcodes @@ -287,7 +287,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "per hc_yAxis(title = list(text = "percent")) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "bxpercent", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -306,6 +306,6 @@ highchart() |> formatter = JS("function () {return '' + this.x + '
' + this.series.name + '
Percent: ' + this.y + '';}") ) |> hc_exporting(enabled = T, filename = "invalid.segments", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` \ No newline at end of file diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index 41a7ae6a1..f2391b01b 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -107,6 +107,12 @@ for(i in infiles){ aggregate_df$totalreads <- (aggregate_df$totalvalidreads + aggregate_df$totalinvalidreads) ``` +```{r sampleplot_height, echo=FALSE, message=FALSE, warning=FALSE} +n_samples <- nrow(aggregate_df) +plotheight <- 150 + (15 * n_samples) +figheight <- 1 + (0.2 * n_samples) +``` + ```{r nxxplot, echo = F, results = F, message = F} highchart() |> hc_chart(type = "area") |> @@ -118,7 +124,7 @@ highchart() |> hc_xAxis(title = list(text = "NX value"), min = 0) |> hc_yAxis(title = list(text = "density")) |> hc_exporting(enabled = T, filename = "NX.stats", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -133,55 +139,54 @@ hs <- data.frame(val = hs$breaks[-1], freq = hs$counts) hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Percent Alignments", marker = list(enabled = FALSE)) |> hc_title(text = "Percent of Alignments with Valid BX tags") |> - hc_xAxis(min = 0, max = 100, title = list(text = "percent alignment with valid BX tag")) |> + hc_xAxis(min = 0, max = 100, title = list(text = "% alignment with valid BX tag")) |> hc_yAxis(max = 100, title = list(text = "% samples")) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "percent.valid.dist", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` -```{r perc_valid_per_sample, echo = F, results = F, message = F} +```{r perc_valid_per_sample, echo = F, results = F, message = F, fig.height=figheight, out.width="100%"} hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, - hcaes(x = 0, x2 = totalreads, y = 0:(nrow(aggregate_df)-1), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> + hcaes(x = 0, x2 = totalreads, y = 0:(n_samples-1), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> hc_xAxis(title = list(text = "Total Alignments")) |> hc_yAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_caption(text = "Percentage represents the percent of alignments with a valid BX barcode") |> hc_tooltip(enabled = FALSE) |> hc_colors(color = "#95d8a7") |> hc_title(text = "Percent Alignments with Valid BX Barcode") |> hc_exporting(enabled = T, filename = "BX.valid.alignments", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` -#TODO custom tooltip 
fixes: y = samplename, x = y_value ```{r mol_per_contig, echo = F, results = F, message = F} # Create a scatter plot with horizontal error bars -err_df <- data.frame(x = 0:(nrow(aggregate_df)-1), y = aggregate_df$averagemolpercont, low = aggregate_df$averagemolpercont - aggregate_df$semolpercont, high = aggregate_df$averagemolpercont + aggregate_df$semolpercont) +err_df <- data.frame(x = 0:n_samples-1), y = aggregate_df$averagemolpercont, low = aggregate_df$averagemolpercont - aggregate_df$semolpercont, high = aggregate_df$averagemolpercont + aggregate_df$semolpercont) highchart() |> - hc_chart(inverted=TRUE) |> + hc_chart(inverted=TRUE, pointPadding = 0.1, groupPadding = 0.0001) |> hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolpercont), marker = list(radius = 8), color = "#c26d9b", zIndex = 1) |> hc_add_series(data = err_df, type = "errorbar", name = "standard deviation", linkedTo = ":previous", zIndex = 0, stemColor = "#c26d9b", whiskerColor = "#c98fae") |> hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> hc_title(text = "Average Molecules Per Contig") |> hc_caption(text = "Error bars show the standard error of the mean") |> - hc_tooltip(crosshairs = TRUE) |> + hc_tooltip(crosshairs = TRUE, pointFormat= '{point.y}') |> hc_exporting(enabled = T, filename = "avg.molecules.per.contig", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` -#TODO average reads per molecule with SD -```{r reads_per_, echo = F, results = F, message = F} -err_df <- data.frame(x = 0:(nrow(aggregate_df)-1), y = aggregate_df$averagereadspermol, low = aggregate_df$averagereadspermol - aggregate_df$sereadspermol, high = aggregate_df$averagereadspermol + aggregate_df$sereadspermol) +```{r reads_per_mol_sample, echo = F, results = F, message = F, fig.height=figheight, out.width="100%"} +err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagereadspermol, low = aggregate_df$averagereadspermol - aggregate_df$sereadspermol, high = aggregate_df$averagereadspermol + aggregate_df$sereadspermol) highchart() |> - hc_chart(inverted=TRUE) |> + hc_chart(inverted=TRUE, pointPadding = 0.0001, groupPadding = 0.0001) |> hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagereadspermol), marker = list(radius = 8), color = "#6d73c2", zIndex = 1) |> hc_add_series(data = err_df, type = "errorbar", name = "standard deviation", linkedTo = ":previous", zIndex = 0, stemColor = "#8186c7", whiskerColor = "#8186c7") |> hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> hc_title(text = "Average Reads Per Molecule") |> hc_caption(text = "Error bars show the standard error of the mean") |> - hc_tooltip(crosshairs = TRUE) |> + hc_tooltip(crosshairs = TRUE, pointFormat= '{point.y}') |> hc_exporting(enabled = T, filename = "avg.reads.per.molecule", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` \ No newline at end of file diff --git a/src/harpy/reports/EmaCount.Rmd b/src/harpy/reports/EmaCount.Rmd index 87d64d91b..dff61634c 
100644 --- a/src/harpy/reports/EmaCount.Rmd +++ b/src/harpy/reports/EmaCount.Rmd @@ -132,7 +132,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "nor hc_caption(text = "As determined by ema count, given as counts") |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "ema.bxcount", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` ### proportional valid barcodes @@ -145,7 +145,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "per hc_caption(text = "As determined by ema count, given as percents") |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "ema.bxpercent", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/HapCut2.Rmd b/src/harpy/reports/HapCut2.Rmd index 1f58cf8ba..9c780a5ee 100644 --- a/src/harpy/reports/HapCut2.Rmd +++ b/src/harpy/reports/HapCut2.Rmd @@ -145,7 +145,7 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#7eb495", name = "co hc_yAxis(title = list(text = "count")) |> hc_tooltip(crosshairs = TRUE) |> hc_exporting(enabled = T, filename = "haplotype.hist", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/Impute.Rmd b/src/harpy/reports/Impute.Rmd index dda770aab..f7271e179 100644 --- a/src/harpy/reports/Impute.Rmd +++ b/src/harpy/reports/Impute.Rmd @@ -124,7 +124,7 @@ hchart(h, "areaspline", hcaes(x = info, y = freq), name = "% sites", marker = li formatter = JS("function () {return 'INFO_SCORE ' + this.x + '
' + this.y + '% of sites';}") ) |> hc_exporting(enabled = T, filename = "impute.infoscores", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -253,7 +253,7 @@ hchart(gcts_long, "bar", hcaes(x = sample, y = Count, group = Conversion), stack hc_yAxis(title = list(text = "count")) |> hc_tooltip(crosshairs = TRUE, animation = F) |> hc_exporting(enabled = T, filename = "impute.samples", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/PreflightBam.Rmd b/src/harpy/reports/PreflightBam.Rmd index 4af31228a..3970ebe7b 100644 --- a/src/harpy/reports/PreflightBam.Rmd +++ b/src/harpy/reports/PreflightBam.Rmd @@ -170,7 +170,7 @@ hchart(m) |> formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS') + '';}") ) |> hc_exporting(enabled = T, filename = "bam.preflight", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` diff --git a/src/harpy/reports/PreflightFastq.Rmd b/src/harpy/reports/PreflightFastq.Rmd index 12b7adbcf..ba3f3aaa6 100644 --- a/src/harpy/reports/PreflightFastq.Rmd +++ b/src/harpy/reports/PreflightFastq.Rmd @@ -160,7 +160,7 @@ hchart(m) |> formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS') + '';}") ) |> hc_exporting(enabled = T, filename = "fastq.preflight", - buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` From cb81c5fcda7ad778a75708a61245a50447072a08 Mon Sep 17 00:00:00 2001 From: pdimens Date: Wed, 26 Jun 2024 12:37:49 -0400 Subject: [PATCH 17/41] add table --- src/harpy/reports/BxStatsAggregate.Rmd | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index f2391b01b..d050cf760 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -62,6 +62,7 @@ process_input <- function(infile){ outrow <- data.frame( sample = samplename, + totalreads = tot_valid_reads + tot_invalid_reads, totaluniquemol = tot_mol, totaluniquebx = tot_uniq_bx, molecule2bxratio = round(tot_mol / tot_uniq_bx,2), @@ -86,6 +87,7 @@ infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") aggregate_df <- data.frame( sample = character(), + totalreads = integer(), totaluniquemol = integer(), totaluniquebx = integer(), molecule2bxratio = numeric(), @@ -104,7 +106,25 @@ aggregate_df <- data.frame( for(i in infiles){ aggregate_df <- rbind(aggregate_df, process_input(i)) } -aggregate_df$totalreads <- (aggregate_df$totalvalidreads + aggregate_df$totalinvalidreads) +``` + +## General Barcode Information from Alignments +### Desc {.no-title} +

General Barcode Information from Alignments

+This report aggregates the barcode-specific information from the alignments +that were created using `harpy align`. Detailed information for any one sample +can be found in that sample's individual report. The table below is an aggregation +of data for each sample based on their `*.bxstats.gz` file. + +```{r} +DT::datatable( + aggregate_df, + rownames = F, + extensions = 'Buttons', + options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE), + fillContainer = T, + colnames = c("sample", "alignments", "unique molecules", "unique barcodes", "molecule:bx ratio", "valid bx alignments", "invalid bx alignments", "% valid bx", "avg molecules/contig", "SEM molecules/contig", "avg reads/molecule", "SEM reads/molecule", "N50", "N75","N90") +) ``` ```{r sampleplot_height, echo=FALSE, message=FALSE, warning=FALSE} From cc1da55487643d02dbd4ee82e372f28cee78dd95 Mon Sep 17 00:00:00 2001 From: pdimens Date: Wed, 26 Jun 2024 13:59:26 -0400 Subject: [PATCH 18/41] start adding text --- src/harpy/reports/BxStatsAggregate.Rmd | 40 ++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index d050cf760..c302d3ba7 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -115,7 +115,16 @@ This report aggregates the barcode-specific information from the alignments that were created using `harpy align`. Detailed information for any one sample can be found in that sample's individual report. The table below is an aggregation of data for each sample based on their `*.bxstats.gz` file. - +- `avg` refers to the average (arithmetic mean) +- `SEM` refers to the Standard Error of the mean +- `molecules` are the unique DNA molecules as inferred from linked-read barcodes +- `barcodes` are the linked-read barcodes associated with DNA sequences and are synonymous with `bx` +- `valid` refers to a proper haplotag barcode (e.g. `A01C34B92D51`) +- `invalid` refers to an invalidated haplotag barcode, where there is a `00` in any of the `ACBD` positions (e.g. `A21C00B32D57`) +- `NX` are the N-statistics (explained in more detail below) + +## Sampletable +### Per-Sample Information ```{r} DT::datatable( aggregate_df, @@ -133,6 +142,19 @@ plotheight <- 150 + (15 * n_samples) figheight <- 1 + (0.2 * n_samples) ``` +## NX plots +### NX desc {.no-title} +#### N50 +The **N50** is the length of the shortest molecule in the group of longest molecules that together + represent at least 50% of the total molecules by length. It can be thought of as a kind of median. +#### N75 +The **N75** is the length of the shortest molecule in the group of longest molecules that together +represent at least 75% of the total molecules by length. +#### N90 +The **N90** is the length of the shortest molecule in the group of longest molecules that together +represent at least 90% of the total molecules by length. 
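As a rough illustration only (the helpers below are hypothetical and not part of harpy or of this patch; the report itself derives these values in R from the bxstats tables), the barcode-validity rule and the NX definition above could be restated in Python along these lines:

```python
import re

def is_valid_haplotag(bx: str) -> bool:
    """AxxCxxBxxDxx is valid only if no segment is 00 (hypothetical helper)."""
    if not re.fullmatch(r"A\d{2}C\d{2}B\d{2}D\d{2}", bx):
        return False
    return "00" not in (bx[1:3], bx[4:6], bx[7:9], bx[10:12])

def nx(lengths, x=50):
    """NX: the shortest molecule in the smallest set of longest molecules
    covering at least x% of the summed molecule length."""
    threshold = sum(lengths) * x / 100
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if running >= threshold:
            return length
    return 0

print(is_valid_haplotag("A01C34B92D51"))  # True
print(is_valid_haplotag("A21C00B32D57"))  # False: 00 in the C segment
molecules = [150_000, 120_000, 90_000, 40_000, 10_000]
print(nx(molecules, 50), nx(molecules, 75), nx(molecules, 90))  # 120000 90000 40000
```

With these example lengths, the longest molecules covering half the summed length end at 120 kbp, so N50 = 120,000, while N75 and N90 drop to 90,000 and 40,000.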
+ +### NX plots {.no-title} ```{r nxxplot, echo = F, results = F, message = F} highchart() |> hc_chart(type = "area") |> @@ -148,6 +170,12 @@ highchart() |> ) ``` +## Distribution of valid bx alignments +### dist description {.no-title} + + +## valid bx plot +### distribution plot {.no-title} ```{r perc_valid_dist, echo = F, results = F, message = F} hs <- hist( aggregate_df$totalvalidpercent, @@ -167,6 +195,12 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Pe ) ``` +## Per-Sample Metrics +### Per-sample desc {.no-title} +Below is a series of plots that shows metrics per-sample. + +## Per-sample plots +### percent valid {.no-title} ```{r perc_valid_per_sample, echo = F, results = F, message = F, fig.height=figheight, out.width="100%"} hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, hcaes(x = 0, x2 = totalreads, y = 0:(n_samples-1), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> @@ -180,6 +214,8 @@ hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` + +### molecules per contig {.no-title} ```{r mol_per_contig, echo = F, results = F, message = F} # Create a scatter plot with horizontal error bars err_df <- data.frame(x = 0:n_samples-1), y = aggregate_df$averagemolpercont, low = aggregate_df$averagemolpercont - aggregate_df$semolpercont, high = aggregate_df$averagemolpercont + aggregate_df$semolpercont) @@ -195,7 +231,7 @@ highchart() |> buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` - +### reads per molecule{.no-title} ```{r reads_per_mol_sample, echo = F, results = F, message = F, fig.height=figheight, out.width="100%"} err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagereadspermol, low = aggregate_df$averagereadspermol - aggregate_df$sereadspermol, high = aggregate_df$averagereadspermol + aggregate_df$sereadspermol) highchart() |> From f4e687cbf202ce165c41c84b9e5cc904909778e2 Mon Sep 17 00:00:00 2001 From: pdimens Date: Wed, 26 Jun 2024 15:28:04 -0400 Subject: [PATCH 19/41] formatting --- src/harpy/reports/BxStatsAggregate.Rmd | 52 ++++++++++++++------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index c302d3ba7..b86f1198f 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -82,8 +82,8 @@ process_input <- function(infile){ ``` ```{r setup_df, echo = F, results = F, message = F} -#infiles <- snakemake@input[[1]] -infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") +infiles <- snakemake@input[[1]] +#infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") aggregate_df <- data.frame( sample = character(), @@ -107,7 +107,7 @@ for(i in infiles){ aggregate_df <- rbind(aggregate_df, process_input(i)) } ``` - +# General ## General Barcode Information from Alignments ### Desc {.no-title}

General Barcode Information from Alignments

@@ -115,6 +115,7 @@ This report aggregates the barcode-specific information from the alignments that were created using `harpy align`. Detailed information for any one sample can be found in that sample's individual report. The table below is an aggregation of data for each sample based on their `*.bxstats.gz` file. + - `avg` refers to the average (arithmetic mean) - `SEM` refers to the Standard Error of the mean - `molecules` are the unique DNA molecules as inferred from linked-read barcodes @@ -124,13 +125,14 @@ of data for each sample based on their `*.bxstats.gz` file. - `NX` are the N-statistics (explained in more detail below) ## Sampletable -### Per-Sample Information +### Per-Sample Information {.no-title} ```{r} DT::datatable( aggregate_df, rownames = F, extensions = 'Buttons', options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE), + caption = 'Per-Sample Alignment Information Pertaining to Barcodes', fillContainer = T, colnames = c("sample", "alignments", "unique molecules", "unique barcodes", "molecule:bx ratio", "valid bx alignments", "invalid bx alignments", "% valid bx", "avg molecules/contig", "SEM molecules/contig", "avg reads/molecule", "SEM reads/molecule", "N50", "N75","N90") ) @@ -139,23 +141,20 @@ DT::datatable( ```{r sampleplot_height, echo=FALSE, message=FALSE, warning=FALSE} n_samples <- nrow(aggregate_df) plotheight <- 150 + (15 * n_samples) -figheight <- 1 + (0.2 * n_samples) +figheight <- 0.6 + (0.2 * n_samples) ``` -## NX plots +## NX plots desc ### NX desc {.no-title} -#### N50 -The **N50** is the length of the shortest molecule in the group of longest molecules that together - represent at least 50% of the total molecules by length. It can be thought of as a kind of median. -#### N75 -The **N75** is the length of the shortest molecule in the group of longest molecules that together -represent at least 75% of the total molecules by length. -#### N90 -The **N90** is the length of the shortest molecule in the group of longest molecules that together -represent at least 90% of the total molecules by length. +

NX Information

+The **NX** metric (e.g. **N50**) is the length of the shortest molecule in the group of longest molecules that together +represent at least **X%** of the total molecules by length. For example, `N50` would be the shortest molecule in the +group of longest molecules that together represent **50%** of the total molecules by length. Below is the distribution of +three common NX metrics (N50, N75, N90) across all samples. +## NXX plots actual ### NX plots {.no-title} -```{r nxxplot, echo = F, results = F, message = F} +```{r nxxplot, echo = F, warning = F, message = F} highchart() |> hc_chart(type = "area") |> hc_add_series(data = density(aggregate_df$N50), name = "N50", type = "areaspline") |> @@ -172,11 +171,13 @@ highchart() |> ## Distribution of valid bx alignments ### dist description {.no-title} - +

Distribution of alignments with valid barcodes

+Below is a distribution of what percent of total alignments each sample +had valid haplotag barcodes (`AXXCXXBXXDXX` where `XX` is not `00`). ## valid bx plot ### distribution plot {.no-title} -```{r perc_valid_dist, echo = F, results = F, message = F} +```{r perc_valid_dist, echo = F, warning = F, message = F} hs <- hist( aggregate_df$totalvalidpercent, breaks = min(aggregate_df$totalvalidpercent):max(aggregate_df$totalvalidpercent), @@ -195,17 +196,19 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Pe ) ``` +# Per-Sample ## Per-Sample Metrics ### Per-sample desc {.no-title} +

Per-Sample Metrics

Below is a series of plots that shows metrics per-sample. ## Per-sample plots ### percent valid {.no-title} -```{r perc_valid_per_sample, echo = F, results = F, message = F, fig.height=figheight, out.width="100%"} +```{r perc_valid_per_sample, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, - hcaes(x = 0, x2 = totalreads, y = 0:(n_samples-1), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> + hcaes(x = 0, x2 = totalreads, y = rev(0:(n_samples-1)), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> hc_xAxis(title = list(text = "Total Alignments")) |> - hc_yAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_yAxis(title = FALSE, gridLineWidth = 0, categories = rev(aggregate_df$sample)) |> hc_caption(text = "Percentage represents the percent of alignments with a valid BX barcode") |> hc_tooltip(enabled = FALSE) |> hc_colors(color = "#95d8a7") |> @@ -216,9 +219,9 @@ hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, ``` ### molecules per contig {.no-title} -```{r mol_per_contig, echo = F, results = F, message = F} +```{r mol_per_contig, echo = F, warning = F, message = F} # Create a scatter plot with horizontal error bars -err_df <- data.frame(x = 0:n_samples-1), y = aggregate_df$averagemolpercont, low = aggregate_df$averagemolpercont - aggregate_df$semolpercont, high = aggregate_df$averagemolpercont + aggregate_df$semolpercont) +err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagemolpercont, low = aggregate_df$averagemolpercont - aggregate_df$semolpercont, high = aggregate_df$averagemolpercont + aggregate_df$semolpercont) highchart() |> hc_chart(inverted=TRUE, pointPadding = 0.1, groupPadding = 0.0001) |> hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolpercont), marker = list(radius = 8), color = "#c26d9b", zIndex = 1) |> @@ -231,8 +234,9 @@ highchart() |> buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` + ### reads per molecule{.no-title} -```{r reads_per_mol_sample, echo = F, results = F, message = F, fig.height=figheight, out.width="100%"} +```{r reads_per_mol_sample, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagereadspermol, low = aggregate_df$averagereadspermol - aggregate_df$sereadspermol, high = aggregate_df$averagereadspermol + aggregate_df$sereadspermol) highchart() |> hc_chart(inverted=TRUE, pointPadding = 0.0001, groupPadding = 0.0001) |> From 8bf908a712421c184237a077e99673e30fd46349 Mon Sep 17 00:00:00 2001 From: pdimens Date: Wed, 26 Jun 2024 15:48:08 -0400 Subject: [PATCH 20/41] add link to docs, change datetime formate --- src/harpy/reports/AlignStats.Rmd | 5 +- src/harpy/reports/BcftoolsStats.Rmd | 5 +- src/harpy/reports/BxCount.Rmd | 5 +- src/harpy/reports/BxStatsAggregate.Rmd | 9 +- src/harpy/reports/BxStatsAggregate.html | 7726 +++++++++++++++++++++++ src/harpy/reports/EmaCount.Rmd | 5 +- src/harpy/reports/HapCut2.Rmd | 5 +- src/harpy/reports/Impute.Rmd | 5 +- src/harpy/reports/Leviathan.Rmd | 5 +- src/harpy/reports/LeviathanPop.Rmd | 5 +- src/harpy/reports/Naibr.Rmd | 5 +- src/harpy/reports/NaibrPop.Rmd | 5 +- src/harpy/reports/PreflightBam.Rmd | 5 +- src/harpy/reports/PreflightFastq.Rmd | 5 +- 
src/harpy/reports/StitchCollate.Rmd | 5 +- 15 files changed, 7784 insertions(+), 16 deletions(-) create mode 100644 src/harpy/reports/BxStatsAggregate.html diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 6d7f66d6f..ec60d4334 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -1,13 +1,16 @@ --- title: "Harpy Alignment Report" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- # Barcode Stats diff --git a/src/harpy/reports/BcftoolsStats.Rmd b/src/harpy/reports/BcftoolsStats.Rmd index e82fda556..1d4c13ee9 100644 --- a/src/harpy/reports/BcftoolsStats.Rmd +++ b/src/harpy/reports/BcftoolsStats.Rmd @@ -1,13 +1,16 @@ --- title: "BCFtools Stats Report" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} diff --git a/src/harpy/reports/BxCount.Rmd b/src/harpy/reports/BxCount.Rmd index 20fb2604f..8b94064b3 100644 --- a/src/harpy/reports/BxCount.Rmd +++ b/src/harpy/reports/BxCount.Rmd @@ -1,12 +1,15 @@ --- title: "Haplotag QC Bardcode Summary" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo = FALSE, message = FALSE} diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index b86f1198f..babad8bc6 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -1,13 +1,16 @@ --- title: "Harpy Alignment Barcode Summary" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r imports, echo = F, results = F, message = F} @@ -82,8 +85,8 @@ process_input <- function(infile){ ``` ```{r setup_df, echo = F, results = F, message = F} -infiles <- snakemake@input[[1]] -#infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") +#infiles <- 
snakemake@input[[1]] +infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") aggregate_df <- data.frame( sample = character(), diff --git a/src/harpy/reports/BxStatsAggregate.html b/src/harpy/reports/BxStatsAggregate.html new file mode 100644 index 000000000..0905a1792 --- /dev/null +++ b/src/harpy/reports/BxStatsAggregate.html @@ -0,0 +1,7726 @@ + + + + + + + + + + +Harpy Alignment Barcode Summary + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ [... 7,726 lines of rendered HTML omitted: the generated "Harpy Alignment Barcode Summary" flexdashboard (src/harpy/reports/BxStatsAggregate.html) added by this patch; its text mirrors the BxStatsAggregate.Rmd sections above ...]
+ + + + + + + diff --git a/src/harpy/reports/EmaCount.Rmd b/src/harpy/reports/EmaCount.Rmd index dff61634c..d624e7731 100644 --- a/src/harpy/reports/EmaCount.Rmd +++ b/src/harpy/reports/EmaCount.Rmd @@ -1,12 +1,15 @@ --- title: "EMA Count Barcode Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo = FALSE, message = FALSE} diff --git a/src/harpy/reports/HapCut2.Rmd b/src/harpy/reports/HapCut2.Rmd index 9c780a5ee..dab10cb9d 100644 --- a/src/harpy/reports/HapCut2.Rmd +++ b/src/harpy/reports/HapCut2.Rmd @@ -1,13 +1,16 @@ --- title: "Haplotype Phasing Report" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo= FALSE, message = FALSE, warning = FALSE} diff --git a/src/harpy/reports/Impute.Rmd b/src/harpy/reports/Impute.Rmd index f7271e179..dfec6638f 100644 --- a/src/harpy/reports/Impute.Rmd +++ b/src/harpy/reports/Impute.Rmd @@ -1,12 +1,15 @@ --- title: "Imputation Assessment" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- # General Stats diff --git a/src/harpy/reports/Leviathan.Rmd b/src/harpy/reports/Leviathan.Rmd index 7f6c2245f..db6e960f0 100644 --- a/src/harpy/reports/Leviathan.Rmd +++ b/src/harpy/reports/Leviathan.Rmd @@ -1,12 +1,15 @@ --- title: "Leviathan Variant Calling Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} diff --git a/src/harpy/reports/LeviathanPop.Rmd b/src/harpy/reports/LeviathanPop.Rmd index 559d3f611..a5c66d2f0 100644 --- a/src/harpy/reports/LeviathanPop.Rmd +++ b/src/harpy/reports/LeviathanPop.Rmd @@ -1,12 +1,15 @@ --- title: "Leviathan Structural Variant Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: 
https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} diff --git a/src/harpy/reports/Naibr.Rmd b/src/harpy/reports/Naibr.Rmd index 5b5b1a3df..94c66a1a9 100644 --- a/src/harpy/reports/Naibr.Rmd +++ b/src/harpy/reports/Naibr.Rmd @@ -1,13 +1,16 @@ --- title: "NAIBR Structural Variant Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r echo = FALSE, warnings = FALSE, message = FALSE} diff --git a/src/harpy/reports/NaibrPop.Rmd b/src/harpy/reports/NaibrPop.Rmd index d8530a261..e96a57a65 100644 --- a/src/harpy/reports/NaibrPop.Rmd +++ b/src/harpy/reports/NaibrPop.Rmd @@ -1,13 +1,16 @@ --- title: "NAIBR Structural Variant Summary" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll horizontal_layout: fill + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/PreflightBam.Rmd b/src/harpy/reports/PreflightBam.Rmd index 3970ebe7b..0164ad397 100644 --- a/src/harpy/reports/PreflightBam.Rmd +++ b/src/harpy/reports/PreflightBam.Rmd @@ -1,12 +1,15 @@ --- title: "BAM Haplotag Format Check" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} diff --git a/src/harpy/reports/PreflightFastq.Rmd b/src/harpy/reports/PreflightFastq.Rmd index ba3f3aaa6..a1024fecb 100644 --- a/src/harpy/reports/PreflightFastq.Rmd +++ b/src/harpy/reports/PreflightFastq.Rmd @@ -1,12 +1,15 @@ --- title: "FASTQ Haplotag Format Check" -date: "`r format(Sys.time(), '%m-%d-%y %X')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r load environment, echo=FALSE, message=FALSE, warning=FALSE} diff --git a/src/harpy/reports/StitchCollate.Rmd 
b/src/harpy/reports/StitchCollate.Rmd index 6963b5316..8ce7003d0 100644 --- a/src/harpy/reports/StitchCollate.Rmd +++ b/src/harpy/reports/StitchCollate.Rmd @@ -1,12 +1,15 @@ --- title: "STITCH Imputation Details" -date: "`r format(Sys.time(), '%d %B, %Y')`" +date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: theme: lumen orientation: rows vertical_layout: scroll + logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + navbar: + - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- ```{r setup, include=FALSE} From aee85cdcf816a7c2019ac0c99995f16909b67308 Mon Sep 17 00:00:00 2001 From: pdimens Date: Wed, 26 Jun 2024 16:21:53 -0400 Subject: [PATCH 21/41] updates --- src/harpy/reports/AlignStats.Rmd | 4 ++-- src/harpy/reports/BcftoolsStats.Rmd | 2 +- src/harpy/reports/BxCount.Rmd | 2 +- src/harpy/reports/BxStatsAggregate.Rmd | 2 +- src/harpy/reports/EmaCount.Rmd | 2 +- src/harpy/reports/HapCut2.Rmd | 4 ++-- src/harpy/reports/Impute.Rmd | 2 +- src/harpy/reports/Leviathan.Rmd | 4 ++-- src/harpy/reports/LeviathanPop.Rmd | 4 ++-- src/harpy/reports/Naibr.Rmd | 4 ++-- src/harpy/reports/NaibrPop.Rmd | 4 ++-- src/harpy/reports/PreflightBam.Rmd | 2 +- src/harpy/reports/PreflightFastq.Rmd | 2 +- src/harpy/reports/StitchCollate.Rmd | 2 +- 14 files changed, 20 insertions(+), 20 deletions(-) diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index ec60d4334..73c0a481e 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -1,5 +1,5 @@ --- -title: "Harpy Alignment Report" +title: "Harpy Alignment Summary" date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: @@ -8,7 +8,7 @@ output: vertical_layout: scroll horizontal_layout: fill logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/BcftoolsStats.Rmd b/src/harpy/reports/BcftoolsStats.Rmd index 1d4c13ee9..252aa8d23 100644 --- a/src/harpy/reports/BcftoolsStats.Rmd +++ b/src/harpy/reports/BcftoolsStats.Rmd @@ -8,7 +8,7 @@ output: vertical_layout: scroll horizontal_layout: fill logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/BxCount.Rmd b/src/harpy/reports/BxCount.Rmd index 8b94064b3..07d81b720 100644 --- a/src/harpy/reports/BxCount.Rmd +++ b/src/harpy/reports/BxCount.Rmd @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", 
align: right } --- diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxStatsAggregate.Rmd index babad8bc6..d9da88273 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxStatsAggregate.Rmd @@ -8,7 +8,7 @@ output: vertical_layout: scroll horizontal_layout: fill logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/EmaCount.Rmd b/src/harpy/reports/EmaCount.Rmd index d624e7731..598d3db52 100644 --- a/src/harpy/reports/EmaCount.Rmd +++ b/src/harpy/reports/EmaCount.Rmd @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/HapCut2.Rmd b/src/harpy/reports/HapCut2.Rmd index dab10cb9d..a44cd9c5c 100644 --- a/src/harpy/reports/HapCut2.Rmd +++ b/src/harpy/reports/HapCut2.Rmd @@ -1,5 +1,5 @@ --- -title: "Haplotype Phasing Report" +title: "Haplotype Phasing Summary" date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: @@ -8,7 +8,7 @@ output: vertical_layout: scroll horizontal_layout: fill logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/Impute.Rmd b/src/harpy/reports/Impute.Rmd index dfec6638f..3da983aaa 100644 --- a/src/harpy/reports/Impute.Rmd +++ b/src/harpy/reports/Impute.Rmd @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/Leviathan.Rmd b/src/harpy/reports/Leviathan.Rmd index db6e960f0..b70ca4600 100644 --- a/src/harpy/reports/Leviathan.Rmd +++ b/src/harpy/reports/Leviathan.Rmd @@ -1,5 +1,5 @@ --- -title: "Leviathan Variant Calling Summary" +title: "Leviathan Structural Variant Calling Summary" date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git 
a/src/harpy/reports/LeviathanPop.Rmd b/src/harpy/reports/LeviathanPop.Rmd index a5c66d2f0..07ac4d976 100644 --- a/src/harpy/reports/LeviathanPop.Rmd +++ b/src/harpy/reports/LeviathanPop.Rmd @@ -1,5 +1,5 @@ --- -title: "Leviathan Structural Variant Summary" +title: "Leviathan Structural Variant Calling Summary" date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/Naibr.Rmd b/src/harpy/reports/Naibr.Rmd index 94c66a1a9..60e6af5d2 100644 --- a/src/harpy/reports/Naibr.Rmd +++ b/src/harpy/reports/Naibr.Rmd @@ -1,5 +1,5 @@ --- -title: "NAIBR Structural Variant Summary" +title: "NAIBR Structural Variant Calling Summary" date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: @@ -8,7 +8,7 @@ output: vertical_layout: scroll horizontal_layout: fill logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/NaibrPop.Rmd b/src/harpy/reports/NaibrPop.Rmd index e96a57a65..01f613523 100644 --- a/src/harpy/reports/NaibrPop.Rmd +++ b/src/harpy/reports/NaibrPop.Rmd @@ -1,5 +1,5 @@ --- -title: "NAIBR Structural Variant Summary" +title: "NAIBR Structural Variant Calling Summary" date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`" output: flexdashboard::flex_dashboard: @@ -8,7 +8,7 @@ output: vertical_layout: scroll horizontal_layout: fill logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/PreflightBam.Rmd b/src/harpy/reports/PreflightBam.Rmd index 0164ad397..96186a919 100644 --- a/src/harpy/reports/PreflightBam.Rmd +++ b/src/harpy/reports/PreflightBam.Rmd @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/PreflightFastq.Rmd b/src/harpy/reports/PreflightFastq.Rmd index a1024fecb..518766680 100644 --- a/src/harpy/reports/PreflightFastq.Rmd +++ b/src/harpy/reports/PreflightFastq.Rmd @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: 
"https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- diff --git a/src/harpy/reports/StitchCollate.Rmd b/src/harpy/reports/StitchCollate.Rmd index 8ce7003d0..f3707b19e 100644 --- a/src/harpy/reports/StitchCollate.Rmd +++ b/src/harpy/reports/StitchCollate.Rmd @@ -7,7 +7,7 @@ output: orientation: rows vertical_layout: scroll logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png - favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png" + favicon: "https://github.com/pdimens/harpy/blob/docs/static/report_favicon.png" navbar: - { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right } --- From ff2dbead47ec72c2e3fdd75da90fb1b695869e66 Mon Sep 17 00:00:00 2001 From: pdimens Date: Wed, 26 Jun 2024 16:44:56 -0400 Subject: [PATCH 22/41] workingon fixes --- src/harpy/align.py | 1 + src/harpy/reports/AlignStats.Rmd | 2 +- ...{BxStatsAggregate.Rmd => BxAlignStats.Rmd} | 2 +- src/harpy/reports/BxStatsAggregate.html | 7726 ----------------- src/harpy/scripts/bxStats.py | 4 +- src/harpy/snakefiles/align-bwa.smk | 12 + 6 files changed, 17 insertions(+), 7730 deletions(-) rename src/harpy/reports/{BxStatsAggregate.Rmd => BxAlignStats.Rmd} (99%) delete mode 100644 src/harpy/reports/BxStatsAggregate.html diff --git a/src/harpy/align.py b/src/harpy/align.py index 33e6735cd..a70dd1894 100644 --- a/src/harpy/align.py +++ b/src/harpy/align.py @@ -119,6 +119,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality fetch_script(workflowdir, "assignMI.py") fetch_script(workflowdir, "bxStats.py") fetch_report(workflowdir, "AlignStats.Rmd") + fetch_report(workflowdir, "BxAlignStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align bwa\n") diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 73c0a481e..45e00e930 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -47,7 +47,7 @@ nBX <- group_by(valids, contig) %>% avgBX <- round(mean(nBX$nBX), digits = 2) -totuniqBX <- system(paste("zcat",snakemake@input[[1]]), "| tail -1") +totuniqBX <- system2(paste("zcat",snakemake@input[[1]], "| tail -1"), stdout = TRUE) totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer() tots <- tb %>% diff --git a/src/harpy/reports/BxStatsAggregate.Rmd b/src/harpy/reports/BxAlignStats.Rmd similarity index 99% rename from src/harpy/reports/BxStatsAggregate.Rmd rename to src/harpy/reports/BxAlignStats.Rmd index d9da88273..ebb00362e 100644 --- a/src/harpy/reports/BxStatsAggregate.Rmd +++ b/src/harpy/reports/BxAlignStats.Rmd @@ -51,7 +51,7 @@ process_input <- function(infile){ avg_mol <- round(mean(nMolpercont$nMol), digits = 2) sem_mol_per_cont <- round(std_error(nMolpercont$nMol), digits = 3) - tot_uniq_bx <- system(paste("zcat",infile, "| tail -1"), intern = T) + tot_uniq_bx <- system2(paste("zcat",infile, "| tail -1"), stdout = TRUE) tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx) |> as.integer() tot_mol <- sum(tb$valid == "valid BX") diff --git a/src/harpy/reports/BxStatsAggregate.html b/src/harpy/reports/BxStatsAggregate.html deleted file mode 100644 index 0905a1792..000000000 --- a/src/harpy/reports/BxStatsAggregate.html +++ /dev/null @@ 
-1,7726 +0,0 @@
- [... 7,726 lines of rendered HTML omitted: removal of the generated BxStatsAggregate.html report that was committed in an earlier patch ...]
- - - - - - - diff --git a/src/harpy/scripts/bxStats.py b/src/harpy/scripts/bxStats.py index 205d16a6b..5cbd26dd4 100755 --- a/src/harpy/scripts/bxStats.py +++ b/src/harpy/scripts/bxStats.py @@ -80,7 +80,7 @@ def writestats(x, writechrom): #isize = min(bp, abs(read.template_length)) # only count the bp of the first read of paired end reads #bp = bp if read.is_read1 else 0 - bp = min(abs(read.template_length, bp), read.infer_query_length()) + bp = min(abs(read.template_length), read.infer_query_length()) elif read.is_supplementary: # if it's a supplementary alignment, just use the alignment length isize = bp @@ -107,5 +107,5 @@ def writestats(x, writechrom): # print the last entry writestats(d, chrom_last) # write comment on the last line with the total number of unique BX barcodes -outfile.write(f"#total unique barcodes: {len(all_bx)}") +outfile.write(f"#total unique barcodes: {len(all_bx)}".encode()) outfile.close() diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index 1ca375c1d..322b706b1 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -320,6 +320,18 @@ rule samtools_reports: multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null """ +rule bx_aggregate_report: + input: + collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) + output: + outdir + "/reports/barcodes.summary.html" + conda: + f"{envdir}/r.yaml" + message: + "Summarizing all barcode information from alignments" + script: + "report/BxAlignStats.Rmd" + rule log_workflow: default_target: True input: From 46f09eefc28169a3064d84af847c765e4ee66ba3 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 10:00:10 -0400 Subject: [PATCH 23/41] officially deprecate minimap --- .deprecated/align-minimap.smk | 346 ++++++++++++++++++++++++++++++++++ src/harpy/align.py | 77 -------- 2 files changed, 346 insertions(+), 77 deletions(-) create mode 100644 .deprecated/align-minimap.smk diff --git a/.deprecated/align-minimap.smk b/.deprecated/align-minimap.smk new file mode 100644 index 000000000..46fb46b1e --- /dev/null +++ b/.deprecated/align-minimap.smk @@ -0,0 +1,346 @@ +containerized: "docker://pdimens/harpy:latest" + +import os +import re +import sys +from rich.panel import Panel +from rich import print as rprint + +outdir = config["output_directory"] +envdir = os.getcwd() + "/.harpy_envs" +genomefile = config["inputs"]["genome"] +fqlist = config["inputs"]["fastq"] +extra = config.get("extra", "") +bn = os.path.basename(genomefile) +genome_zip = True if bn.lower().endswith(".gz") else False +bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" +skipreports = config["skipreports"] +windowsize = config["depth_windowsize"] +molecule_distance = config["molecule_distance"] + +wildcard_constraints: + sample = "[a-zA-Z0-9._-]+" + +onerror: + print("") + rprint( + Panel( + f"The workflow has terminated due to an error. See the log file below for more details.", + title = "[bold]harpy align minimap", + title_align = "left", + border_style = "red" + ), + file = sys.stderr + ) + +onsuccess: + print("") + rprint( + Panel( + f"The workflow has finished successfully! 
Find the results in [bold]{outdir}[/bold]", + title = "[bold]harpy align minimap", + title_align = "left", + border_style = "green" + ), + file = sys.stderr + ) + +bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" +samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} +d = dict(zip(samplenames, samplenames)) + +def get_fq(wildcards): + # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. + r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) + return list(filter(r.match, fqlist))[:2] + +rule genome_setup: + input: + genomefile + output: + f"Genome/{bn}" + container: + None + message: + "copying {input} to Genome/" + shell: + """ + if (file {input} | grep -q compressed ) ;then + # is regular gzipped, needs to be BGzipped + zcat {input} | bgzip -c > {output} + else + cp -f {input} {output} + fi + """ + +rule genome_faidx: + input: + f"Genome/{bn}" + output: + fai = f"Genome/{bn}.fai", + gzi = f"Genome/{bn}.gzi" if genome_zip else [] + log: + f"Genome/{bn}.faidx.log" + params: + genome_zip + container: + None + message: + "Indexing {input}" + shell: + """ + if [ "{params}" = "True" ]; then + samtools faidx --gzi-idx {output.gzi} --fai-idx {output.fai} {input} 2> {log} + else + samtools faidx --fai-idx {output.fai} {input} 2> {log} + fi + """ + +rule genome_index: + input: + f"Genome/{bn}" + output: + temp(f"Genome/{bn}.mmi") + log: + f"Genome/{bn}.mmi.log" + conda: + f"{envdir}/align.yaml" + message: + "Indexing {input}" + shell: + "minimap2 -x sr -d {output} {input} 2> {log}" + +rule align: + input: + fastq = get_fq, + genome = f"Genome/{bn}.mmi" + output: + pipe(outdir + "/samples/{sample}/{sample}.raw.sam") + log: + outdir + "/logs/{sample}.minimap.log" + params: + samps = lambda wc: d[wc.get("sample")], + extra = extra + benchmark: + ".Benchmark/Mapping/minimap/align.{sample}.txt" + threads: + min(10, workflow.cores) - 2 + conda: + f"{envdir}/align.yaml" + message: + "Aligning sequences: {wildcards.sample}" + shell: + """ + minimap2 -ax sr -t {threads} -y --sam-hit-only -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.fastq} > {output} 2> {log} + """ + +rule quality_filter: + input: + outdir + "/samples/{sample}/{sample}.raw.sam" + output: + temp(outdir + "/samples/{sample}/{sample}.mm2.sam") + params: + quality = config["quality"] + container: + None + message: + "Quality filtering alignments: {wildcards.sample}" + shell: + "samtools view -h -F 4 -q {params.quality} {input} > {output}" + +rule collate: + input: + outdir + "/samples/{sample}/{sample}.mm2.sam" + output: + temp(outdir + "/samples/{sample}/{sample}.collate.bam") + container: + None + message: + "Collating alignments: {wildcards.sample}" + shell: + "samtools collate -o {output} {input} 2> /dev/null" + +rule fix_mates: + input: + outdir + "/samples/{sample}/{sample}.collate.bam" + output: + temp(outdir + "/samples/{sample}/{sample}.fixmate.bam") + container: + None + message: + "Fixing mates in alignments: {wildcards.sample}" + shell: + "samtools fixmate -m {input} {output} 2> /dev/null" + +rule sort_alignments: + input: + sam = outdir + "/samples/{sample}/{sample}.fixmate.bam", + genome = f"Genome/{bn}", + genome_samidx = f"Genome/{bn_idx}" + output: + bam = temp(outdir + "/samples/{sample}/{sample}.sort.bam"), + bai = temp(outdir + "/samples/{sample}/{sample}.sort.bam.bai") + log: + outdir + "/logs/{sample}.minimap.sort.log" + params: + quality = config["quality"], + 
tmpdir = lambda wc: outdir + "/." + d[wc.sample] + resources: + mem_mb = 2000 + container: + None + message: + "Sorting alignments: {wildcards.sample}" + shell: + """ + samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} + rm -rf {params.tmpdir} + """ + +rule mark_duplicates: + input: + outdir + "/samples/{sample}/{sample}.sort.bam" + output: + temp(outdir + "/samples/{sample}/{sample}.markdup.bam") + log: + outdir + "/logs/{sample}.markdup.log" + threads: + 2 + container: + None + message: + "Marking duplicates in alignments alignment: {wildcards.sample}" + shell: + "samtools markdup -@ {threads} -S --barcode-tag BX -f {log} {input} {output} 2> /dev/null" + +rule index_markdups: + input: + outdir + "/samples/{sample}/{sample}.markdup.bam" + output: + temp(outdir + "/samples/{sample}/{sample}.markdup.bam.bai") + container: + None + message: + "Indexing duplicate-marked alignments: {wildcards.sample}" + shell: + "samtools index {input}" + +rule assign_molecules: + input: + bam = outdir + "/samples/{sample}/{sample}.markdup.bam", + bai = outdir + "/samples/{sample}/{sample}.markdup.bam.bai" + output: + bam = outdir + "/{sample}.bam", + bai = outdir + "/{sample}.bam.bai" + params: + molecule_distance + conda: + f"{envdir}/qc.yaml" + message: + "Assigning barcodes to molecules: {wildcards.sample}" + script: + "scripts/assignMI.py" + +rule alignment_bxstats: + input: + bam = outdir + "/{sample}.bam", + bai = outdir + "/{sample}.bam.bai" + output: + outdir + "/reports/data/bxstats/{sample}.bxstats.gz" + params: + sample = lambda wc: d[wc.sample] + conda: + f"{envdir}/qc.yaml" + message: + "Calculating barcode alignment statistics: {wildcards.sample}" + script: + "scripts/bxStats.py" + +rule alignment_coverage: + input: + bam = outdir + "/{sample}.bam", + bai = outdir + "/{sample}.bam.bai" + output: + outdir + "/reports/data/coverage/{sample}.cov.gz" + params: + windowsize + container: + None + message: + "Calculating genomic coverage: {wildcards.sample}" + shell: + "samtools depth -a {input.bam} | depthWindows.py {params} | gzip > {output}" + +rule alignment_report: + input: + outdir + "/reports/data/bxstats/{sample}.bxstats.gz", + outdir + "/reports/data/coverage/{sample}.cov.gz" + output: + outdir + "/reports/{sample}.html" + params: + molecule_distance + conda: + f"{envdir}/r.yaml" + message: + "Summarizing barcoded alignments: {wildcards.sample}" + script: + "report/AlignStats.Rmd" + +rule general_alignment_stats: + input: + bam = outdir + "/{sample}.bam", + bai = outdir + "/{sample}.bam.bai" + output: + stats = temp(outdir + "/reports/data/samtools_stats/{sample}.stats"), + flagstat = temp(outdir + "/reports/data/samtools_flagstat/{sample}.flagstat") + container: + None + message: + "Calculating alignment stats: {wildcards.sample}" + shell: + """ + samtools stats -d {input.bam} > {output.stats} + samtools flagstat {input.bam} > {output.flagstat} + """ + +rule samtools_reports: + input: + collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) + output: + outdir + "/reports/minimap.stats.html" + params: + outdir + conda: + f"{envdir}/qc.yaml" + message: + "Summarizing samtools stats and flagstat" + shell: + """ + multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and 
samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null + """ + +rule log_workflow: + default_target: True + input: + bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam","bam.bai"]), + samtools = outdir + "/reports/minimap.stats.html" if not skipreports else [] , + bx_reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [] + params: + quality = config["quality"], + extra = extra + message: + "Summarizing the workflow: {output}" + run: + with open(outdir + "/workflow/align.minimap.summary", "w") as f: + _ = f.write("The harpy align minimap workflow ran using these parameters:\n\n") + _ = f.write(f"The provided genome: {bn}\n") + _ = f.write("Sequencing were aligned with Minimap2 using:\n") + _ = f.write(f" minimap2 -y {params.extra} --sam-hit-only -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome.mmi forward_reads reverse_reads |\n") + _ = f.write(f" samtools view -h -F 4 -q {params.quality} |\n") + _ = f.write("Duplicates in the alignments were marked following:\n") + _ = f.write(" samtools collate \n") + _ = f.write(" samtools fixmate\n") + _ = f.write(" samtools sort -m 2000M\n") + _ = f.write(" samtools markdup -S --barcode-tag BX\n") + _ = f.write("\nThe Snakemake workflow was called via command line:\n") + _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file diff --git a/src/harpy/align.py b/src/harpy/align.py index a70dd1894..1d618a700 100644 --- a/src/harpy/align.py +++ b/src/harpy/align.py @@ -239,82 +239,6 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, _module = subprocess.run(command.split()) sys.exit(_module.returncode) -#@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/") -#@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping') -#@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules') -#@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering') -#@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats') -#@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes') -#@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use') -#@click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running') -#@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory') -#@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes') -#@click.option('--hpc', type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission') -#@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container') -#@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports') -#@click.option('--print-only', is_flag = 
True, hidden = True, default = False, help = 'Print the generated snakemake command and exit') -#@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1) -#def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only): -# """ -# Align sequences to genome using `Minimap2` -# -# Provide the input fastq files and/or directories at the end of the command as individual -# files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both. -# -# Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does -# not use barcodes when mapping. Harpy will post-processes the alignments using the -# specified `--molecule-distance` to assign alignments to unique molecules. -# """ -# output_dir = output_dir.rstrip("/") -# workflowdir = f"{output_dir}/workflow" -# sdm = "conda" if conda else "conda apptainer" -# command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . ' -# command += f"--snakefile {workflowdir}/align-minimap.smk " -# command += f"--configfile {workflowdir}/config.yaml " -# if hpc: -# command += f"--workflow-profile {hpc} " -# if quiet: -# command += "--quiet all " -# if snakemake is not None: -# command += snakemake -# if print_only: -# click.echo(command) -# sys.exit(0) -# -# os.makedirs(f"{workflowdir}/", exist_ok= True) -# fqlist, sample_count = parse_fastq_inputs(inputs) -# validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"]) -# fetch_rule(workflowdir, "align-minimap.smk") -# fetch_script(workflowdir, "assignMI.py") -# fetch_script(workflowdir, "bxStats.py") -# fetch_report(workflowdir, "AlignStats.Rmd") -# -# with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: -# config.write("workflow: align minimap\n") -# config.write(f"genomefile: {genome}\n") -# config.write(f"seq_directory: {workflowdir}/input\n") -# config.write(f"output_directory: {output_dir}\n") -# config.write(f"quality: {quality_filter}\n") -# config.write(f"molecule_distance: {molecule_distance}\n") -# config.write(f"depth_windowsize: {depth_window}\n") -# config.write(f"skipreports: {skipreports}\n") -# if extra_params is not None: -# config.write(f"extra: {extra_params}\n") -# config.write(f"workflow_call: {command}\n") -# config.write("inputs:\n") -# config.write(f" genome: {Path(genome).resolve()}\n") -# config.write(" fastq:\n") -# for i in fqlist: -# config.write(f" - {i}\n") -# -# print_onstart( -# f"Samples: {sample_count}\nOutput Directory: {output_dir}", -# "align minimap" -# ) -# generate_conda_deps() -# _module = subprocess.run(command.split()) -# sys.exit(_module.returncode) - @click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/") @click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping') @click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules') @@ -398,5 +322,4 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra align.add_command(bwa) align.add_command(ema) -#align.add_command(minimap) align.add_command(strobe) From 5ab514644e0db0968596ffee7ce02490ad9a0663 Mon Sep 17 
00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 10:00:19 -0400 Subject: [PATCH 24/41] add bx aggregation report --- src/harpy/reports/AlignStats.Rmd | 3 +- src/harpy/reports/BxAlignStats.Rmd | 4 +- src/harpy/snakefiles/align-bwa.smk | 5 +- src/harpy/snakefiles/align-ema.smk | 17 +- src/harpy/snakefiles/align-minimap.smk | 346 --------------------- src/harpy/snakefiles/align-strobealign.smk | 15 +- 6 files changed, 35 insertions(+), 355 deletions(-) delete mode 100644 src/harpy/snakefiles/align-minimap.smk diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 45e00e930..83f479cfa 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -46,8 +46,7 @@ nBX <- group_by(valids, contig) %>% summarize(nBX = length(molecule)) avgBX <- round(mean(nBX$nBX), digits = 2) - -totuniqBX <- system2(paste("zcat",snakemake@input[[1]], "| tail -1"), stdout = TRUE) +totuniqBX <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+") totuniqBX <- gsub("#total unique barcodes: ", "", totuniqBX) |> as.integer() tots <- tb %>% diff --git a/src/harpy/reports/BxAlignStats.Rmd b/src/harpy/reports/BxAlignStats.Rmd index ebb00362e..6b697a49a 100644 --- a/src/harpy/reports/BxAlignStats.Rmd +++ b/src/harpy/reports/BxAlignStats.Rmd @@ -51,9 +51,9 @@ process_input <- function(infile){ avg_mol <- round(mean(nMolpercont$nMol), digits = 2) sem_mol_per_cont <- round(std_error(nMolpercont$nMol), digits = 3) - tot_uniq_bx <- system2(paste("zcat",infile, "| tail -1"), stdout = TRUE) - tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx) |> as.integer() + tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+") + tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer() tot_mol <- sum(tb$valid == "valid BX") tot_valid_reads <- sum(tb[tb$valid == "valid BX", "reads"]) avg_reads_per_mol <- round(tot_valid_reads/tot_mol,1) diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index 322b706b1..76dc7333a 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -320,7 +320,7 @@ rule samtools_reports: multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." 
--no-data-dir --filename {output} 2> /dev/null """ -rule bx_aggregate_report: +rule bx_report: input: collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) output: @@ -337,7 +337,8 @@ rule log_workflow: input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam", "bam.bai"]), reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - agg_report = outdir + "/reports/bwa.stats.html" if not skipreports else [] + agg_report = outdir + "/reports/bwa.stats.html" if not skipreports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if not skipreports else [] params: quality = config["alignment_quality"], extra = extra diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index 68857bc4b..a8de41be2 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -450,7 +450,7 @@ rule general_stats: samtools flagstat {input.bam} > {output.flagstat} """ -rule collate_samtools_stats: +rule samtools_reports: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]), output: @@ -466,13 +466,26 @@ rule collate_samtools_stats: multiqc {outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null """ +rule bx_report: + input: + collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) + output: + outdir + "/reports/barcodes.summary.html" + conda: + f"{envdir}/r.yaml" + message: + "Summarizing all barcode information from alignments" + script: + "report/BxAlignStats.Rmd" + rule log_workflow: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = [ "bam", "bam.bai"] ), cov_report = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], bx_counts = f"{outdir}/reports/reads.bxcounts.html" if not skipreports else [], - agg_report = f"{outdir}/reports/ema.stats.html" if not skipreports else [] + agg_report = f"{outdir}/reports/ema.stats.html" if not skipreports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if not skipreports else [] params: beadtech = "-p" if platform == "haplotag" else f"-w {whitelist}" message: diff --git a/src/harpy/snakefiles/align-minimap.smk b/src/harpy/snakefiles/align-minimap.smk deleted file mode 100644 index 46fb46b1e..000000000 --- a/src/harpy/snakefiles/align-minimap.smk +++ /dev/null @@ -1,346 +0,0 @@ -containerized: "docker://pdimens/harpy:latest" - -import os -import re -import sys -from rich.panel import Panel -from rich import print as rprint - -outdir = config["output_directory"] -envdir = os.getcwd() + "/.harpy_envs" -genomefile = config["inputs"]["genome"] -fqlist = config["inputs"]["fastq"] -extra = config.get("extra", "") -bn = os.path.basename(genomefile) -genome_zip = True if bn.lower().endswith(".gz") else False -bn_idx = f"{bn}.gzi" if genome_zip else f"{bn}.fai" -skipreports = config["skipreports"] -windowsize = config["depth_windowsize"] -molecule_distance = config["molecule_distance"] - -wildcard_constraints: - sample = "[a-zA-Z0-9._-]+" - -onerror: - print("") - rprint( - Panel( - f"The workflow has terminated due to an 
error. See the log file below for more details.", - title = "[bold]harpy align minimap", - title_align = "left", - border_style = "red" - ), - file = sys.stderr - ) - -onsuccess: - print("") - rprint( - Panel( - f"The workflow has finished successfully! Find the results in [bold]{outdir}[/bold]", - title = "[bold]harpy align minimap", - title_align = "left", - border_style = "green" - ), - file = sys.stderr - ) - -bn_r = r"([_\.][12]|[_\.][FR]|[_\.]R[12](?:\_00[0-9])*)?\.((fastq|fq)(\.gz)?)$" -samplenames = {re.sub(bn_r, "", os.path.basename(i), flags = re.IGNORECASE) for i in fqlist} -d = dict(zip(samplenames, samplenames)) - -def get_fq(wildcards): - # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. - r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] - -rule genome_setup: - input: - genomefile - output: - f"Genome/{bn}" - container: - None - message: - "copying {input} to Genome/" - shell: - """ - if (file {input} | grep -q compressed ) ;then - # is regular gzipped, needs to be BGzipped - zcat {input} | bgzip -c > {output} - else - cp -f {input} {output} - fi - """ - -rule genome_faidx: - input: - f"Genome/{bn}" - output: - fai = f"Genome/{bn}.fai", - gzi = f"Genome/{bn}.gzi" if genome_zip else [] - log: - f"Genome/{bn}.faidx.log" - params: - genome_zip - container: - None - message: - "Indexing {input}" - shell: - """ - if [ "{params}" = "True" ]; then - samtools faidx --gzi-idx {output.gzi} --fai-idx {output.fai} {input} 2> {log} - else - samtools faidx --fai-idx {output.fai} {input} 2> {log} - fi - """ - -rule genome_index: - input: - f"Genome/{bn}" - output: - temp(f"Genome/{bn}.mmi") - log: - f"Genome/{bn}.mmi.log" - conda: - f"{envdir}/align.yaml" - message: - "Indexing {input}" - shell: - "minimap2 -x sr -d {output} {input} 2> {log}" - -rule align: - input: - fastq = get_fq, - genome = f"Genome/{bn}.mmi" - output: - pipe(outdir + "/samples/{sample}/{sample}.raw.sam") - log: - outdir + "/logs/{sample}.minimap.log" - params: - samps = lambda wc: d[wc.get("sample")], - extra = extra - benchmark: - ".Benchmark/Mapping/minimap/align.{sample}.txt" - threads: - min(10, workflow.cores) - 2 - conda: - f"{envdir}/align.yaml" - message: - "Aligning sequences: {wildcards.sample}" - shell: - """ - minimap2 -ax sr -t {threads} -y --sam-hit-only -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.fastq} > {output} 2> {log} - """ - -rule quality_filter: - input: - outdir + "/samples/{sample}/{sample}.raw.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.mm2.sam") - params: - quality = config["quality"] - container: - None - message: - "Quality filtering alignments: {wildcards.sample}" - shell: - "samtools view -h -F 4 -q {params.quality} {input} > {output}" - -rule collate: - input: - outdir + "/samples/{sample}/{sample}.mm2.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.collate.bam") - container: - None - message: - "Collating alignments: {wildcards.sample}" - shell: - "samtools collate -o {output} {input} 2> /dev/null" - -rule fix_mates: - input: - outdir + "/samples/{sample}/{sample}.collate.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.fixmate.bam") - container: - None - message: - "Fixing mates in alignments: {wildcards.sample}" - shell: - "samtools fixmate -m {input} {output} 2> /dev/null" - -rule sort_alignments: - input: - sam = outdir + "/samples/{sample}/{sample}.fixmate.bam", - genome = f"Genome/{bn}", - genome_samidx = 
f"Genome/{bn_idx}" - output: - bam = temp(outdir + "/samples/{sample}/{sample}.sort.bam"), - bai = temp(outdir + "/samples/{sample}/{sample}.sort.bam.bai") - log: - outdir + "/logs/{sample}.minimap.sort.log" - params: - quality = config["quality"], - tmpdir = lambda wc: outdir + "/." + d[wc.sample] - resources: - mem_mb = 2000 - container: - None - message: - "Sorting alignments: {wildcards.sample}" - shell: - """ - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} - rm -rf {params.tmpdir} - """ - -rule mark_duplicates: - input: - outdir + "/samples/{sample}/{sample}.sort.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.markdup.bam") - log: - outdir + "/logs/{sample}.markdup.log" - threads: - 2 - container: - None - message: - "Marking duplicates in alignments alignment: {wildcards.sample}" - shell: - "samtools markdup -@ {threads} -S --barcode-tag BX -f {log} {input} {output} 2> /dev/null" - -rule index_markdups: - input: - outdir + "/samples/{sample}/{sample}.markdup.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.markdup.bam.bai") - container: - None - message: - "Indexing duplicate-marked alignments: {wildcards.sample}" - shell: - "samtools index {input}" - -rule assign_molecules: - input: - bam = outdir + "/samples/{sample}/{sample}.markdup.bam", - bai = outdir + "/samples/{sample}/{sample}.markdup.bam.bai" - output: - bam = outdir + "/{sample}.bam", - bai = outdir + "/{sample}.bam.bai" - params: - molecule_distance - conda: - f"{envdir}/qc.yaml" - message: - "Assigning barcodes to molecules: {wildcards.sample}" - script: - "scripts/assignMI.py" - -rule alignment_bxstats: - input: - bam = outdir + "/{sample}.bam", - bai = outdir + "/{sample}.bam.bai" - output: - outdir + "/reports/data/bxstats/{sample}.bxstats.gz" - params: - sample = lambda wc: d[wc.sample] - conda: - f"{envdir}/qc.yaml" - message: - "Calculating barcode alignment statistics: {wildcards.sample}" - script: - "scripts/bxStats.py" - -rule alignment_coverage: - input: - bam = outdir + "/{sample}.bam", - bai = outdir + "/{sample}.bam.bai" - output: - outdir + "/reports/data/coverage/{sample}.cov.gz" - params: - windowsize - container: - None - message: - "Calculating genomic coverage: {wildcards.sample}" - shell: - "samtools depth -a {input.bam} | depthWindows.py {params} | gzip > {output}" - -rule alignment_report: - input: - outdir + "/reports/data/bxstats/{sample}.bxstats.gz", - outdir + "/reports/data/coverage/{sample}.cov.gz" - output: - outdir + "/reports/{sample}.html" - params: - molecule_distance - conda: - f"{envdir}/r.yaml" - message: - "Summarizing barcoded alignments: {wildcards.sample}" - script: - "report/AlignStats.Rmd" - -rule general_alignment_stats: - input: - bam = outdir + "/{sample}.bam", - bai = outdir + "/{sample}.bam.bai" - output: - stats = temp(outdir + "/reports/data/samtools_stats/{sample}.stats"), - flagstat = temp(outdir + "/reports/data/samtools_flagstat/{sample}.flagstat") - container: - None - message: - "Calculating alignment stats: {wildcards.sample}" - shell: - """ - samtools stats -d {input.bam} > {output.stats} - samtools flagstat {input.bam} > {output.flagstat} - """ - -rule samtools_reports: - input: - collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) - output: - outdir + "/reports/minimap.stats.html" - params: - outdir - conda: - f"{envdir}/qc.yaml" - message: - "Summarizing samtools 
stats and flagstat" - shell: - """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null - """ - -rule log_workflow: - default_target: True - input: - bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam","bam.bai"]), - samtools = outdir + "/reports/minimap.stats.html" if not skipreports else [] , - bx_reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [] - params: - quality = config["quality"], - extra = extra - message: - "Summarizing the workflow: {output}" - run: - with open(outdir + "/workflow/align.minimap.summary", "w") as f: - _ = f.write("The harpy align minimap workflow ran using these parameters:\n\n") - _ = f.write(f"The provided genome: {bn}\n") - _ = f.write("Sequencing were aligned with Minimap2 using:\n") - _ = f.write(f" minimap2 -y {params.extra} --sam-hit-only -R \"@RG\\tID:SAMPLE\\tSM:SAMPLE\" genome.mmi forward_reads reverse_reads |\n") - _ = f.write(f" samtools view -h -F 4 -q {params.quality} |\n") - _ = f.write("Duplicates in the alignments were marked following:\n") - _ = f.write(" samtools collate \n") - _ = f.write(" samtools fixmate\n") - _ = f.write(" samtools sort -m 2000M\n") - _ = f.write(" samtools markdup -S --barcode-tag BX\n") - _ = f.write("\nThe Snakemake workflow was called via command line:\n") - _ = f.write(" " + str(config["workflow_call"]) + "\n") \ No newline at end of file diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index e69510f93..b7ab7700f 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -317,12 +317,25 @@ rule samtools_reports: multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." 
--no-data-dir --filename {output} 2> /dev/null """ +rule bx_report: + input: + collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) + output: + outdir + "/reports/barcodes.summary.html" + conda: + f"{envdir}/r.yaml" + message: + "Summarizing all barcode information from alignments" + script: + "report/BxAlignStats.Rmd" + rule log_workflow: default_target: True input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam","bam.bai"]), samtools = outdir + "/reports/strobealign.stats.html" if not skipreports else [] , - bx_reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [] + reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], + bx_report = outdir + "/reports/barcodes.summary.html" if not skipreports else [] params: readlen = readlen, quality = config["quality"], From 082d56d78a118018e82be8af835c27fc6f1a92f8 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 14:50:37 -0400 Subject: [PATCH 25/41] deprecate --- {src/harpy/reports => .deprecated}/EmaCount.Rmd | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename {src/harpy/reports => .deprecated}/EmaCount.Rmd (100%) diff --git a/src/harpy/reports/EmaCount.Rmd b/.deprecated/EmaCount.Rmd similarity index 100% rename from src/harpy/reports/EmaCount.Rmd rename to .deprecated/EmaCount.Rmd From 797abb0c131686d594291381327c2c2ab7e23ba1 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 15:08:02 -0400 Subject: [PATCH 26/41] add conditional --- src/harpy/reports/AlignStats.Rmd | 8 +++ src/harpy/reports/BxAlignStats.Rmd | 103 ++++++++++++++++++----------- 2 files changed, 71 insertions(+), 40 deletions(-) diff --git a/src/harpy/reports/AlignStats.Rmd b/src/harpy/reports/AlignStats.Rmd index 83f479cfa..4a7b4bae8 100644 --- a/src/harpy/reports/AlignStats.Rmd +++ b/src/harpy/reports/AlignStats.Rmd @@ -35,6 +35,10 @@ infile <- snakemake@input[[1]] bamfile <- gsub(".bxstats.gz", ".bam", infile) samplename <- gsub(".bxstats.gz", "", basename(infile)) tb <- read.table(infile, header = T, sep = "\t") %>% select(-start, -end) +if(nrow(tb) == 0){ + print(paste("Input data file",infle, "is empty")) + knittr::knittr_exit() +} tb$valid <- tb$molecule tb[tb$valid != "invalidBX", "valid"] <- "validBX" tb$valid <- gsub("BX", " BX", tb$valid) @@ -308,6 +312,10 @@ knitr::kable( covfile <- snakemake@input[[2]] #covfile <- "~/test.cov.gz" tb <- read.table(covfile, header = F) +if(nrow(tb) == 0){ + print(paste("Input data file",covfile, "is empty")) + knittr::knittr_exit() +} colnames(tb) <- c("contig", "position", "depth") q99 <- quantile(tb$depth, 0.99) Mode <- function(x) { diff --git a/src/harpy/reports/BxAlignStats.Rmd b/src/harpy/reports/BxAlignStats.Rmd index 6b697a49a..ced3e747b 100644 --- a/src/harpy/reports/BxAlignStats.Rmd +++ b/src/harpy/reports/BxAlignStats.Rmd @@ -41,41 +41,41 @@ std_error <- function(x){ process_input <- function(infile){ bamfile <- gsub(".bxstats.gz", ".bam", infile) samplename <- gsub(".bxstats.gz", "", basename(infile)) - tb <- read.table(infile, header = T, sep = "\t") %>% select(1,2,3,6) + tb <- read.table(infile, header = T, sep = "\t") %>% select(-4, -5) tb$valid <- tb$molecule tb[tb$valid != "invalidBX", "valid"] <- "validBX" tb$valid <- gsub("BX", " BX", tb$valid) - - nMolpercont <- group_by(tb[tb$valid == "valid BX",], contig) %>% - summarize(nMol = length(molecule)) - - avg_mol <- round(mean(nMolpercont$nMol), digits = 2) - sem_mol_per_cont <- 
round(std_error(nMolpercont$nMol), digits = 3) - + # isolate non-singletons b/c molecules with 1 read pair aren't linked reads + multiplex_df <- filter(tb, valid == "valid BX", reads > 2) + singletons <- sum(tb$reads <= 2 & tb$valid == "valid BX") tot_uniq_bx <- read.table(infile, header = F, sep = "\n", as.is = T, skip = nrow(tb) + 1, comment.char = "+") tot_uniq_bx <- gsub("#total unique barcodes: ", "", tot_uniq_bx$V1[1]) |> as.integer() tot_mol <- sum(tb$valid == "valid BX") tot_valid_reads <- sum(tb[tb$valid == "valid BX", "reads"]) - avg_reads_per_mol <- round(tot_valid_reads/tot_mol,1) - sem_reads_per_mol <- round(std_error(tb[tb$valid == "valid BX", "reads"]), 1) + avg_reads_per_mol <- round(mean(multiplex_df$reads),1) + sem_reads_per_mol <- round(std_error(multiplex_df$reads), 2) tot_invalid_reads <- sum(tb[tb$valid == "invalid BX", "reads"]) - n50 <- NX(tb$length_inferred, 50) - n75 <- NX(tb$length_inferred, 75) - n90 <- NX(tb$length_inferred, 90) + avg_mol_cov <- round(mean(multiplex_df$coverage_inserts), 2) + sem_mol_cov <- round(std_error(multiplex_df$coverage_inserts), 4) + n50 <- NX(multiplex_df$length_inferred, 50) + n75 <- NX(multiplex_df$length_inferred, 75) + n90 <- NX(multiplex_df$length_inferred, 90) outrow <- data.frame( sample = samplename, totalreads = tot_valid_reads + tot_invalid_reads, totaluniquemol = tot_mol, + singletons = singletons, + multiplex = nrow(multiplex_df), totaluniquebx = tot_uniq_bx, molecule2bxratio = round(tot_mol / tot_uniq_bx,2), totalvalidreads = tot_valid_reads, totalinvalidreads = tot_invalid_reads, totalvalidpercent = round(tot_valid_reads/(tot_valid_reads + tot_invalid_reads) * 100,2), - averagemolpercont = avg_mol, - semolpercont = sem_mol_per_cont, averagereadspermol = avg_reads_per_mol, sereadspermol = sem_reads_per_mol, + averagemolcov = avg_mol_cov, + semolcov = sem_mol_cov, N50 = n50, N75 = n75, N90 = n90 @@ -92,15 +92,17 @@ aggregate_df <- data.frame( sample = character(), totalreads = integer(), totaluniquemol = integer(), + singletons = integer(), + multiplex = integer(), totaluniquebx = integer(), molecule2bxratio = numeric(), totalvalidreads = integer(), totalinvalidreads = integer(), totalvalidpercent = numeric(), - averagemolpercont = integer(), - semolpercont = numeric(), averagereadspermol = numeric(), sereadspermol = numeric(), + averagemolcov = numeric(), + semolcov = numeric(), N50 = integer(), N75 = integer(), N90 = integer() @@ -109,6 +111,10 @@ aggregate_df <- data.frame( for(i in infiles){ aggregate_df <- rbind(aggregate_df, process_input(i)) } +if(nrow(aggregate_df) == 0){ + print("All input files were empty") + knittr::knittr_exit() +} ``` # General ## General Barcode Information from Alignments @@ -122,6 +128,8 @@ of data for each sample based on their `*.bxstats.gz` file. - `avg` refers to the average (arithmetic mean) - `SEM` refers to the Standard Error of the mean - `molecules` are the unique DNA molecules as inferred from linked-read barcodes + - `singletons` are molecules composed of one single-end or paired-end sequence, effectively not a linked read + - `multiplex` refer to molecules composed of more than two single-end or paired-end sequences - `barcodes` are the linked-read barcodes associated with DNA sequences and are synonymous with `bx` - `valid` refers to a proper haplotag barcode (e.g. `A01C34B92D51`) - `invalid` refers to an invalidated haplotag barcode, where there is a `00` in any of the `ACBD` positions (e.g. 
`A21C00B32D57`) @@ -137,7 +145,7 @@ DT::datatable( options = list(dom = 'Brtip', buttons = c('csv'), scrollX = TRUE), caption = 'Per-Sample Alignment Information Pertaining to Barcodes', fillContainer = T, - colnames = c("sample", "alignments", "unique molecules", "unique barcodes", "molecule:bx ratio", "valid bx alignments", "invalid bx alignments", "% valid bx", "avg molecules/contig", "SEM molecules/contig", "avg reads/molecule", "SEM reads/molecule", "N50", "N75","N90") + colnames = c("sample", "alignments", "unique molecules", "singletons", "multiplex", "unique barcodes", "molecule:bx ratio", "valid bx alignments", "invalid bx alignments", "% valid bx", "avg reads/molecule", "SEM reads/molecule", "avg molecule coverage %","SEM molecule coverage %", "N50", "N75","N90") ) ``` @@ -152,18 +160,19 @@ figheight <- 0.6 + (0.2 * n_samples)
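The summary columns above (singletons, multiplex, reads per molecule, N50/N75/N90) are all derived from each sample's per-molecule table inside `process_input()`. As a rough standalone illustration of those definitions, the sketch below uses base R on a made-up three-molecule data frame; the `nx()` helper and the toy values are hypothetical and only mirror the `reads > 2` cutoff and the NX logic described here, not the report's exact code.

```r
# Toy per-molecule table: one row per inferred molecule (hypothetical values)
mol <- data.frame(
  molecule        = c("A01C34B92D51.1", "A01C34B92D51.2", "A02C11B45D02.1"),
  reads           = c(1, 6, 4),
  length_inferred = c(150, 18000, 9500)
)

# Molecules supported by more than 2 reads are treated as true linked-read
# molecules ("multiplex"); the rest are counted as singletons
multiplex  <- mol[mol$reads > 2, ]
singletons <- mol[mol$reads <= 2, ]

# NX: length of the shortest molecule among the longest molecules that
# together account for at least x% of the total molecule length
nx <- function(lengths, x) {
  lengths <- sort(lengths, decreasing = TRUE)
  cumlen  <- cumsum(lengths)
  lengths[which(cumlen >= sum(lengths) * x / 100)[1]]
}

nrow(singletons)                     # number of singleton molecules
mean(multiplex$reads)                # average reads per non-singleton molecule
nx(multiplex$length_inferred, 50)    # N50 of non-singleton molecule lengths
```

In the report these quantities are read from each sample's `*.bxstats.gz` table rather than a hand-built data frame, so the sketch only makes the column definitions concrete.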

NX Information

The **NX** metric (e.g. **N50**) is the length of the shortest molecule in the group of longest molecules that together represent at least **X%** of the total molecules by length. For example, `N50` would be the shortest molecule in the -group of longest molecules that together represent **50%** of the total molecules by length. Below is the distribution of -three common NX metrics (N50, N75, N90) across all samples. +group of longest molecules that together represent **50%** of the total molecules by length (sort of like a median). +Below is the distributions of three common NX metrics (N50, N75, N90) across all samples. ## NXX plots actual ### NX plots {.no-title} ```{r nxxplot, echo = F, warning = F, message = F} highchart() |> - hc_chart(type = "area") |> + hc_chart(type = "area", animation = F) |> hc_add_series(data = density(aggregate_df$N50), name = "N50", type = "areaspline") |> hc_add_series(data = density(aggregate_df$N75), name = "N75", type = "areaspline") |> hc_add_series(data = density(aggregate_df$N90), name = "N90", color = "#d3805f", type = "areaspline") |> hc_tooltip(enabled = FALSE) |> + hc_caption(text = "Values derived using non-singleton molecules") |> hc_title(text = "NX Stats Across Samples") |> hc_xAxis(title = list(text = "NX value"), min = 0) |> hc_yAxis(title = list(text = "density")) |> @@ -189,7 +198,7 @@ hs <- hist( hs$counts <- round(hs$counts / sum(hs$counts) * 100, 4) hs <- data.frame(val = hs$breaks[-1], freq = hs$counts) -hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Percent Alignments", marker = list(enabled = FALSE)) |> +hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Percent Alignments", marker = list(enabled = FALSE), animation = F) |> hc_title(text = "Percent of Alignments with Valid BX tags") |> hc_xAxis(min = 0, max = 100, title = list(text = "% alignment with valid BX tag")) |> hc_yAxis(max = 100, title = list(text = "% samples")) |> @@ -203,12 +212,12 @@ hchart(hs, "areaspline", hcaes(x = val, y = freq), color = "#8484bd", name = "Pe ## Per-Sample Metrics ### Per-sample desc {.no-title}

Per-Sample Metrics

-Below is a series of plots that shows metrics per-sample. +Below is a series of plots showing per-sample metrics. The meaning of percentages and error bars are provided in the bottom-left captions of the plots. ## Per-sample plots ### percent valid {.no-title} ```{r perc_valid_per_sample, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} -hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, +hchart(aggregate_df, "xrange", animation = F, groupPadding = 0.0001, hcaes(x = 0, x2 = totalreads, y = rev(0:(n_samples-1)), partialFill = totalvalidpercent/100), dataLabels = list(enabled = TRUE)) |> hc_xAxis(title = list(text = "Total Alignments")) |> hc_yAxis(title = FALSE, gridLineWidth = 0, categories = rev(aggregate_df$sample)) |> @@ -221,19 +230,17 @@ hchart(aggregate_df, "xrange", pointPadding = 0.1, groupPadding = 0.0001, ) ``` -### molecules per contig {.no-title} -```{r mol_per_contig, echo = F, warning = F, message = F} -# Create a scatter plot with horizontal error bars -err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagemolpercont, low = aggregate_df$averagemolpercont - aggregate_df$semolpercont, high = aggregate_df$averagemolpercont + aggregate_df$semolpercont) -highchart() |> - hc_chart(inverted=TRUE, pointPadding = 0.1, groupPadding = 0.0001) |> - hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolpercont), marker = list(radius = 8), color = "#c26d9b", zIndex = 1) |> - hc_add_series(data = err_df, type = "errorbar", name = "standard deviation", linkedTo = ":previous", zIndex = 0, stemColor = "#c26d9b", whiskerColor = "#c98fae") |> - hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> - hc_title(text = "Average Molecules Per Contig") |> - hc_caption(text = "Error bars show the standard error of the mean") |> - hc_tooltip(crosshairs = TRUE, pointFormat= '{point.y}') |> - hc_exporting(enabled = T, filename = "avg.molecules.per.contig", +### multiplex percent {.no-title} +```{r perc_multiplex, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} +hchart(aggregate_df, "xrange", animation = F, groupPadding = 0.0001, + hcaes(x = 0, x2 = multiplex + singletons, y = rev(0:(n_samples-1)), partialFill = round(multiplex / (multiplex + singletons), 4)), dataLabels = list(enabled = TRUE)) |> + hc_xAxis(title = list(text = "Total Alignments with Valid Barcodes")) |> + hc_yAxis(title = FALSE, gridLineWidth = 0, categories = rev(aggregate_df$sample)) |> + hc_caption(text = "Percentage represents the percent of molecules composed of more than 2 single-end or paired-end sequences") |> + hc_tooltip(enabled = FALSE) |> + hc_colors(color = "#8dc6f5") |> + hc_title(text = "Percent Non-Singleton Molecules") |> + hc_exporting(enabled = T, filename = "BX.multiplex", buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) ``` @@ -242,14 +249,30 @@ highchart() |> ```{r reads_per_mol_sample, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagereadspermol, low = aggregate_df$averagereadspermol - aggregate_df$sereadspermol, high = aggregate_df$averagereadspermol + aggregate_df$sereadspermol) highchart() |> - hc_chart(inverted=TRUE, pointPadding = 0.0001, groupPadding = 0.0001) |> + hc_chart(inverted=TRUE, animation = F, pointPadding = 0.0001, groupPadding = 0.0001) |> hc_add_series(data = 
aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagereadspermol), marker = list(radius = 8), color = "#6d73c2", zIndex = 1) |> - hc_add_series(data = err_df, type = "errorbar", name = "standard deviation", linkedTo = ":previous", zIndex = 0, stemColor = "#8186c7", whiskerColor = "#8186c7") |> + hc_add_series(data = err_df, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#8186c7", whiskerColor = "#8186c7") |> hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> hc_title(text = "Average Reads Per Molecule") |> - hc_caption(text = "Error bars show the standard error of the mean") |> + hc_caption(text = "Error bars show the standard error of the mean of non-singleton molecules") |> hc_tooltip(crosshairs = TRUE, pointFormat= '{point.y}') |> hc_exporting(enabled = T, filename = "avg.reads.per.molecule", buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) ) +``` + +### average molecule coverage{.no-title} +```{r avg_mol_cov, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} +err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagemolcov, low = aggregate_df$averagemolcov - aggregate_df$semolcov, high = aggregate_df$averagemolcov + aggregate_df$semolcov) +highchart() |> + hc_chart(inverted=TRUE, animation = F, pointPadding = 0.0001, groupPadding = 0.0001) |> + hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolcov), marker = list(radius = 8), color = "#d774ae", zIndex = 1) |> + hc_add_series(data = err_df, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#e39dc6", whiskerColor = "#e39dc6") |> + hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> + hc_title(text = "Average Molecule Percent Coverage") |> + hc_caption(text = "Error bars show the standard error of the mean percent coverage of non-singleton molecules") |> + hc_tooltip(crosshairs = TRUE, pointFormat= '{point.y}') |> + hc_exporting(enabled = T, filename = "avg.molecule.coverage", + buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) + ) ``` \ No newline at end of file From 608d76600d6048251ac1908c7aba94b6c5c28e6f Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 15:08:07 -0400 Subject: [PATCH 27/41] add rmd --- src/harpy/align.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/harpy/align.py b/src/harpy/align.py index 1d618a700..c51bac7cd 100644 --- a/src/harpy/align.py +++ b/src/harpy/align.py @@ -209,8 +209,8 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"]) fetch_rule(workflowdir, "align-ema.smk") fetch_script(workflowdir, "bxStats.py") - for i in ["EmaCount", "AlignStats"]: - fetch_report(workflowdir, f"{i}.Rmd") + fetch_report(workflowdir, "AlignStats.Rmd") + fetch_report(workflowdir, "BxAlignStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align ema\n") @@ -292,6 +292,7 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra fetch_script(workflowdir, "assignMI.py") fetch_script(workflowdir, "bxStats.py") fetch_report(workflowdir, "AlignStats.Rmd") + fetch_report(workflowdir, 
"BxAlignStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align strobe\n") From 31bd599be42b710f21ca95a2bfbecd5593932199 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 15:08:14 -0400 Subject: [PATCH 28/41] piping improvements --- src/harpy/snakefiles/align-bwa.smk | 111 +++-------- src/harpy/snakefiles/align-ema.smk | 208 +++++---------------- src/harpy/snakefiles/align-strobealign.smk | 101 +++------- 3 files changed, 108 insertions(+), 312 deletions(-) diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index 76dc7333a..da7091204 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -116,111 +116,62 @@ rule align: genome = f"Genome/{bn}", genome_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - pipe(outdir + "/samples/{sample}/{sample}.raw.sam") + temp(outdir + "/samples/{sample}/{sample}.bwa.sam") log: outdir + "/logs/{sample}.bwa.log" params: samps = lambda wc: d[wc.get("sample")], + quality = config["alignment_quality"], extra = extra benchmark: ".Benchmark/Mapping/bwa/align.{sample}.txt" threads: - min(10, workflow.cores) - 2 + min(10, workflow.cores) conda: f"{envdir}/align.yaml" message: "Aligning sequences: {wildcards.sample}" shell: """ - bwa mem -C -v 2 -t {threads} {params.extra} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.fastq} > {output} 2> {log} + bwa mem -C -v 2 -t {threads} {params.extra} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.fastq} 2> {log} | + samtools view -h -F 4 -q {params.quality} > {output} """ - -rule quality_filter: - input: - outdir + "/samples/{sample}/{sample}.raw.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.bwa.sam") - params: - quality = config["alignment_quality"] - conda: - f"{envdir}/align.yaml" - message: - "Quality filtering alignments: {wildcards.sample}" - shell: - "samtools view -h -F 4 -q {params.quality} {input} > {output}" -rule collate: - input: - outdir + "/samples/{sample}/{sample}.bwa.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.collate.bam") - conda: - f"{envdir}/align.yaml" - message: - "Collating alignments: {wildcards.sample}" - shell: - "samtools collate -o {output} {input} 2> /dev/null" - -rule fix_mates: - input: - outdir + "/samples/{sample}/{sample}.collate.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.fixmate.bam") - conda: - f"{envdir}/align.yaml" - message: - "Fixing mates in alignments: {wildcards.sample}" - shell: - "samtools fixmate -m {input} {output} 2> /dev/null" - -rule sort_alignments: +rule mark_duplicates: input: - sam = outdir + "/samples/{sample}/{sample}.fixmate.bam", - genome = f"Genome/{bn}", - genome_samidx = f"Genome/{bn_idx}", - genome_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") + sam = outdir + "/samples/{sample}/{sample}.bwa.sam", + genome = f"Genome/{bn}", + faidx = f"Genome/{bn_idx}" output: - bam = temp(outdir + "/samples/{sample}/{sample}.sort.bam"), - bai = temp(outdir + "/samples/{sample}/{sample}.sort.bam.bai") + temp(outdir + "/samples/{sample}/{sample}.markdup.bam") log: - outdir + "/logs/{sample}.bwa.sort.log" + outdir + "/logs/{sample}.markdup.log" params: tmpdir = lambda wc: outdir + "/." 
+ d[wc.sample] resources: mem_mb = 2000 - conda: - f"{envdir}/align.yaml" + container: + None + threads: + 2 message: - "Sorting alignments: {wildcards.sample}" + "Marking duplicates: {wildcards.sample}" shell: """ - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} + samtools collate -O -u {input.sam} | + samtools fixmate -m -u - - | + samtools sort -T {params.tmpdir} -u --reference {input.genome} -l 0 -m {resources.mem_mb}M - | + samtools markdup -@ {threads} -S --barcode-tag BX -f {log} - {output} rm -rf {params.tmpdir} """ -rule mark_duplicates: - input: - outdir + "/samples/{sample}/{sample}.sort.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.markdup.bam") - log: - outdir + "/logs/{sample}.markdup.log" - threads: - 2 - conda: - f"{envdir}/align.yaml" - message: - "Marking duplicates in alignments alignment: {wildcards.sample}" - shell: - "samtools markdup -@ {threads} -S --barcode-tag BX -f {log} {input} {output} 2> /dev/null" - -rule index_markdups: +rule markdups_index: input: outdir + "/samples/{sample}/{sample}.markdup.bam" output: temp(outdir + "/samples/{sample}/{sample}.markdup.bam.bai") - conda: - f"{envdir}/align.yaml" + container: + None message: "Indexing duplicate-marked alignments: {wildcards.sample}" shell: @@ -242,7 +193,7 @@ rule assign_molecules: script: "scripts/assignMI.py" -rule alignment_bxstats: +rule bxstats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -257,7 +208,7 @@ rule alignment_bxstats: script: "scripts/bxStats.py" -rule alignment_coverage: +rule coverage: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -272,7 +223,7 @@ rule alignment_coverage: shell: "samtools depth -a {input.bam} | depthWindows.py {params} | gzip > {output}" -rule reports: +rule report_persample: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -287,15 +238,15 @@ rule reports: script: "report/AlignStats.Rmd" -rule alignment_report: +rule stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" output: stats = temp(outdir + "/reports/data/samtools_stats/{sample}.stats"), flagstat = temp(outdir + "/reports/data/samtools_flagstat/{sample}.flagstat") - conda: - f"{envdir}/align.yaml" + container: + None message: "Calculating alignment stats: {wildcards.sample}" shell: @@ -304,7 +255,7 @@ rule alignment_report: samtools flagstat {input.bam} > {output.flagstat} """ -rule samtools_reports: +rule report_samtools: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) output: @@ -320,7 +271,7 @@ rule samtools_reports: multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." 
--no-data-dir --filename {output} 2> /dev/null """ -rule bx_report: +rule report_bx: input: collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) output: diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index a8de41be2..514296215 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -115,25 +115,9 @@ rule genome_bwa_index: shell: "bwa index {input} 2> {log}" -rule interleave: - input: - fastq = get_fq - output: - pipe(outdir + "/.interleave/{sample}.interleave.fq") - container: - None - message: - "Interleaving input fastq files: {wildcards.sample}" - shell: - "seqtk mergepe {input} > {output}" - -use rule interleave as interleave2 with: - output: - outdir + "/.interleave/{sample}.interleave2.fq" - rule beadtag_count: input: - outdir + "/.interleave/{sample}.interleave.fq" + get_fq output: counts = temp(outdir + "/bxcount/{sample}.ema-ncnt"), logs = temp(outdir + "/logs/count/{sample}.count") @@ -148,24 +132,13 @@ rule beadtag_count: shell: """ mkdir -p {params.prefix} {params.logdir} - ema count {params.beadtech} -o {params.prefix} < {input} 2> {output.logs} + seqtk mergepe {input} | + ema count {params.beadtech} -o {params.prefix} 2> {output.logs} """ -rule beadtag_summary: - input: - countlog = collect(outdir + "/logs/count/{sample}.count", sample = samplenames) - output: - outdir + "/reports/reads.bxcounts.html" - conda: - f"{envdir}/r.yaml" - message: - "Creating sample barcode validation report" - script: - "report/EmaCount.Rmd" - rule preprocess: input: - reads = outdir + "/.interleave/{sample}.interleave2.fq", + reads = get_fq, emacounts = outdir + "/bxcount/{sample}.ema-ncnt" output: bins = temp(collect(outdir + "/preproc/{{sample}}/ema-bin-{bin}", bin = binrange)), @@ -184,7 +157,8 @@ rule preprocess: "Preprocessing for EMA mapping: {wildcards.sample}" shell: """ - ema preproc {params.bxtype} -n {params.bins} -t {threads} -o {params.outdir} {input.emacounts} < {input.reads} 2>&1 | + seqtk mergepe {input.reads} | + ema preproc {params.bxtype} -n {params.bins} -t {threads} -o {params.outdir} {input.emacounts} 2>&1 | cat - > {log} """ @@ -192,52 +166,32 @@ rule align: input: readbin = collect(outdir + "/preproc/{{sample}}/ema-bin-{bin}", bin = binrange), genome = f"Genome/{bn}", - geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") - output: - pipe(outdir + "/align/{sample}.bc.raw.sam"), - log: - outdir + "/logs/{sample}.ema.align.log", - params: - bxtype = f"-p {platform}", - extra = extra - threads: - min(10, workflow.cores) - 2 - conda: - f"{envdir}/align.yaml" - message: - "Aligning barcoded sequences: {wildcards.sample}" - shell: - """ - ema align -t {threads} {params.extra} -d {params.bxtype} -r {input.genome} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" -x {input.readbin} > {output} 2> {log} - """ - -rule sort_raw_ema: - input: - sam = outdir + "/align/{sample}.bc.raw.sam", - genome = f"Genome/{bn}", geno_faidx = f"Genome/{bn_idx}", geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: aln = temp(outdir + "/align/{sample}.bc.bam"), idx = temp(outdir + "/align/{sample}.bc.bam.bai") log: - outdir + "/logs/{sample}.ema.sort.log" + ema = outdir + "/logs/{sample}.ema.align.log", + sort = outdir + "/logs/{sample}.ema.sort.log", + resources: + mem_mb = 2000 params: + bxtype = f"-p {platform}", + tmpdir = lambda wc: outdir + "/." + d[wc.sample], quality = config["quality"], - tmpdir = lambda wc: outdir + "/align/." 
+ d[wc.sample], extra = extra - resources: - mem_mb = 2000 threads: - 2 - container: - None + min(10, workflow.cores) + conda: + f"{envdir}/align.yaml" message: - "Sorting and quality filtering alignments: {wildcards.sample}" + "Aligning barcoded sequences: {wildcards.sample}" shell: """ - samtools view -h -F 4 -q {params.quality} {input.sam} | - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam --write-index -m {resources.mem_mb}M -o {output.aln}##idx##{output.idx} - 2> {log} + ema align -t {threads} {params.extra} -d {params.bxtype} -r {input.genome} -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" -x {input.readbin} 2> {log.ema} | + samtools view -h -F 4 -q {params.quality} | + samtools sort -T {params.tmpdir} --reference {input.genome} -O bam --write-index -m {resources.mem_mb}M -o {output.aln}##idx##{output.idx} - 2> {log.sort} rm -rf {params.tmpdir} """ @@ -248,104 +202,54 @@ rule align_nobarcode: geno_faidx = f"Genome/{bn_idx}", geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - pipe(outdir + "/align/{sample}.nobc.raw.sam") + temp(outdir + "/align/{sample}.bwa.nobc.sam") log: outdir + "/logs/{sample}.bwa.align.log" + params: + quality = config["quality"] benchmark: ".Benchmark/Mapping/ema/bwaAlign.{sample}.txt" threads: - min(10, workflow.cores) - 2 + min(10, workflow.cores) conda: f"{envdir}/align.yaml" message: "Aligning unbarcoded sequences: {wildcards.sample}" shell: """ - bwa mem -t {threads} -v2 -C -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.reads} > {output} 2> {log} + bwa mem -t {threads} -v2 -C -R \"@RG\\tID:{wildcards.sample}\\tSM:{wildcards.sample}\" {input.genome} {input.reads} 2> {log} | + samtools view -h -F 4 -q {params.quality} > {output} """ -rule quality_filter: - input: - outdir + "/align/{sample}.nobc.raw.sam" - output: - temp(outdir + "/align/{sample}.bwa.nobc.sam") - params: - quality = config["quality"] - container: - None - message: - "Quality filtering alignments: {wildcards.sample}" - shell: - "samtools view -h -F 4 -q {params.quality} {input} > {output}" - -rule collate: - input: - outdir + "/align/{sample}.bwa.nobc.sam" - output: - temp(outdir + "/align/{sample}.collate.nobc.bam") - container: - None - message: - "Collating alignments: {wildcards.sample}" - shell: - "samtools collate -o {output} {input} 2> /dev/null" - -rule fix_mates: - input: - outdir + "/align/{sample}.collate.nobc.bam" - output: - temp(outdir + "/align/{sample}.fixmate.nobc.bam") - container: - None - message: - "Fixing mates in alignments: {wildcards.sample}" - shell: - "samtools fixmate -m {input} {output} 2> /dev/null" - -rule sort_nobc_alignments: +rule mark_duplicates: input: - sam = outdir + "/align/{sample}.fixmate.nobc.bam", - genome = f"Genome/{bn}", - genome_samidx = f"Genome/{bn_idx}", - genome_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") + sam = outdir + "/align/{sample}.bwa.nobc.sam", + genome = f"Genome/{bn}", + faidx = f"Genome/{bn_idx}" output: - bam = temp(outdir + "/align/{sample}.sort.nobc.bam"), - bai = temp(outdir + "/align/{sample}.sort.nobc.bam.bai") + temp(outdir + "/align/{sample}.markdup.nobc.bam") log: - outdir + "/logs/{sample}.bwa.sort.log" + outdir + "/logs/{sample}.markdup.log" params: - quality = config["quality"], tmpdir = lambda wc: outdir + "/." 
+ d[wc.sample] resources: mem_mb = 2000 container: None + threads: + 2 message: - "Sorting alignments: {wildcards.sample}" + "Marking duplicates: {wildcards.sample}" shell: """ - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} + samtools collate -O -u {input.sam} | + samtools fixmate -m -u - - | + samtools sort -T {params.tmpdir} -u --reference {input.genome} -l 0 -m {resources.mem_mb}M - | + samtools markdup -@ {threads} -S --barcode-tag BX -f {log} - {output} rm -rf {params.tmpdir} """ -rule mark_duplicates: - input: - bam = outdir + "/align/{sample}.sort.nobc.bam", - bai = outdir + "/align/{sample}.sort.nobc.bam.bai" - output: - temp(outdir + "/align/{sample}.markdup.nobc.bam") - log: - outdir + "/logs/{sample}.markdup.log" - threads: - 2 - container: - None - message: - "Marking duplicates in alignments alignment: {wildcards.sample}" - shell: - "samtools markdup -@ {threads} -S -f {log} {input.bam} {output} 2> /dev/null" - -rule index_markdups: +rule markdups_index: input: outdir + "/align/{sample}.markdup.nobc.bam" output: @@ -362,23 +266,9 @@ rule concatenate_alignments: aln_bc = outdir + "/align/{sample}.bc.bam", idx_bc = outdir + "/align/{sample}.bc.bam.bai", aln_nobc = outdir + "/align/{sample}.markdup.nobc.bam", - idx_nobc = outdir + "/align/{sample}.markdup.nobc.bam.bai" + idx_nobc = outdir + "/align/{sample}.markdup.nobc.bam.bai", + genome = f"Genome/{bn}" output: - bam = temp(outdir + "/align/{sample}.concat.unsort.bam") - threads: - 2 - container: - None - message: - "Concatenating barcoded and unbarcoded alignments: {wildcards.sample}" - shell: - "samtools cat -@ {threads} {input.aln_bc} {input.aln_nobc} > {output.bam}" - -rule sort_concatenated: - input: - bam = outdir + "/align/{sample}.concat.unsort.bam", - genome = f"Genome/{bn}" - output: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" threads: @@ -388,9 +278,12 @@ rule sort_concatenated: container: None message: - "Sorting merged barcoded alignments: {wildcards.sample}" + "Concatenating barcoded and unbarcoded alignments: {wildcards.sample}" shell: - "samtools sort -@ {threads} -O bam --reference {input.genome} -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.bam} 2> /dev/null" + """ + samtools cat -@ 1 {input.aln_bc} {input.aln_nobc} | + samtools sort -@ 1 -O bam --reference {input.genome} -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} - + """ rule alignment_coverage: input: @@ -420,7 +313,7 @@ rule bx_stats: script: "scripts/bxStats.py" -rule alignment_report: +rule report_persample: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -433,7 +326,7 @@ rule alignment_report: script: "report/AlignStats.Rmd" -rule general_stats: +rule stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -450,7 +343,7 @@ rule general_stats: samtools flagstat {input.bam} > {output.flagstat} """ -rule samtools_reports: +rule report_samtools: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]), output: @@ -466,7 +359,7 @@ rule samtools_reports: multiqc {outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all 
alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null """ -rule bx_report: +rule report_bx: input: collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) output: @@ -483,7 +376,6 @@ rule log_workflow: input: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = [ "bam", "bam.bai"] ), cov_report = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - bx_counts = f"{outdir}/reports/reads.bxcounts.html" if not skipreports else [], agg_report = f"{outdir}/reports/ema.stats.html" if not skipreports else [], bx_report = outdir + "/reports/barcodes.summary.html" if not skipreports else [] params: diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index b7ab7700f..f8bbb8e96 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -114,104 +114,57 @@ rule align: genome = f"Genome/{bn}", genome_index = f"Genome/{bn}.r{readlen}.sti" if not autolen else [] output: - pipe(outdir + "/samples/{sample}/{sample}.raw.sam") + temp(outdir + "/samples/{sample}/{sample}.strobe.sam") log: outdir + "/logs/{sample}.strobealign.log" params: samps = lambda wc: d[wc.get("sample")], readlen = "" if autolen else f"--use-index -r {readlen}", + quality = config["quality"], extra = extra benchmark: ".Benchmark/Mapping/strobealign/align.{sample}.txt" threads: - min(10, workflow.cores) - 2 + min(10, workflow.cores) conda: f"{envdir}/align.yaml" message: "Aligning sequences: {wildcards.sample}" shell: - "strobealign {params.readlen} -t {threads} -U -C --rg-id={wildcards.sample} --rg=SM:{wildcards.sample} {params.extra} {input.genome} {input.fastq} > {output} 2> {log}" - -rule quality_filter: - input: - outdir + "/samples/{sample}/{sample}.raw.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.mm2.sam") - params: - quality = config["quality"] - container: - None - message: - "Quality filtering alignments: {wildcards.sample}" - shell: - "samtools view -h -F 4 -q {params.quality} {input} > {output}" - -rule collate: - input: - outdir + "/samples/{sample}/{sample}.mm2.sam" - output: - temp(outdir + "/samples/{sample}/{sample}.collate.bam") - container: - None - message: - "Collating alignments: {wildcards.sample}" - shell: - "samtools collate -o {output} {input} 2> /dev/null" - -rule fix_mates: - input: - outdir + "/samples/{sample}/{sample}.collate.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.fixmate.bam") - container: - None - message: - "Fixing mates in alignments: {wildcards.sample}" - shell: - "samtools fixmate -m {input} {output} 2> /dev/null" + """ + strobealign {params.readlen} -t {threads} -U -C --rg-id={wildcards.sample} --rg=SM:{wildcards.sample} {params.extra} {input.genome} {input.fastq} 2> {log} | + samtools view -h -F 4 -q {params.quality} > {output} + """ -rule sort_alignments: +rule markduplicates: input: - sam = outdir + "/samples/{sample}/{sample}.fixmate.bam", - genome = f"Genome/{bn}", - genome_samidx = f"Genome/{bn}.fai" + sam = outdir + "/samples/{sample}/{sample}.strobe.sam", + genome = f"Genome/{bn}", + faidx = f"Genome/{bn}.fai" output: - bam = temp(outdir + "/samples/{sample}/{sample}.sort.bam"), - bai = temp(outdir + "/samples/{sample}/{sample}.sort.bam.bai") + temp(outdir + "/samples/{sample}/{sample}.markdup.bam") log: - outdir + "/logs/{sample}.strobealign.sort.log" + outdir + "/logs/{sample}.markdup.log" params: - quality = 
config["quality"], tmpdir = lambda wc: outdir + "/." + d[wc.sample] resources: mem_mb = 2000 + threads: + 2 container: None message: - "Sorting alignments: {wildcards.sample}" + "Marking duplicates: {wildcards.sample}" shell: """ - samtools sort -T {params.tmpdir} --reference {input.genome} -O bam -l 0 -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} {input.sam} 2> {log} + samtools collate -O -u {input.sam} | + samtools fixmate -m -u - - | + samtools sort -T {params.tmpdir} -u --reference {input.genome} -l 0 -m {resources.mem_mb}M - | + samtools markdup -@ {threads} -S --barcode-tag BX -f {log} - {output} rm -rf {params.tmpdir} """ -rule mark_duplicates: - input: - outdir + "/samples/{sample}/{sample}.sort.bam" - output: - temp(outdir + "/samples/{sample}/{sample}.markdup.bam") - log: - outdir + "/logs/{sample}.markdup.log" - threads: - 2 - container: - None - message: - "Marking duplicates in alignments alignment: {wildcards.sample}" - shell: - "samtools markdup -@ {threads} -S --barcode-tag BX -f {log} {input} {output} 2> /dev/null" - -rule index_markdups: +rule markdups_index: input: outdir + "/samples/{sample}/{sample}.markdup.bam" output: @@ -239,7 +192,7 @@ rule assign_molecules: script: "scripts/assignMI.py" -rule alignment_bxstats: +rule bxstats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -254,7 +207,7 @@ rule alignment_bxstats: script: "scripts/bxStats.py" -rule alignment_coverage: +rule coverage: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -269,7 +222,7 @@ rule alignment_coverage: shell: "samtools depth -a {input.bam} | depthWindows.py {params} | gzip > {output}" -rule alignment_report: +rule report_persample: input: outdir + "/reports/data/bxstats/{sample}.bxstats.gz", outdir + "/reports/data/coverage/{sample}.cov.gz" @@ -284,7 +237,7 @@ rule alignment_report: script: "report/AlignStats.Rmd" -rule general_alignment_stats: +rule stats: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" @@ -301,7 +254,7 @@ rule general_alignment_stats: samtools flagstat {input.bam} > {output.flagstat} """ -rule samtools_reports: +rule report_samtools: input: collect(outdir + "/reports/data/samtools_{ext}/{sample}.{ext}", sample = samplenames, ext = ["stats", "flagstat"]) output: @@ -317,7 +270,7 @@ rule samtools_reports: multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." 
--no-data-dir --filename {output} 2> /dev/null """ -rule bx_report: +rule report_bx: input: collect(outdir + "/reports/data/bxstats/{sample}.bxstats.gz", sample = samplenames) output: From 117cc8cdec21ed491e48a3e1c030653ae2f7e80e Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 15:25:31 -0400 Subject: [PATCH 29/41] make fq sorted --- .deprecated/align-minimap.smk | 2 +- src/harpy/snakefiles/align-bwa.smk | 2 +- src/harpy/snakefiles/align-ema.smk | 2 +- src/harpy/snakefiles/align-strobealign.smk | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.deprecated/align-minimap.smk b/.deprecated/align-minimap.smk index 46fb46b1e..055a3dc63 100644 --- a/.deprecated/align-minimap.smk +++ b/.deprecated/align-minimap.smk @@ -52,7 +52,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index da7091204..403e979ce 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -52,7 +52,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index 514296215..46d44850f 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -57,7 +57,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index f8bbb8e96..1810cf6f9 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -54,7 +54,7 @@ d = dict(zip(samplenames, samplenames)) def get_fq(wildcards): # returns a list of fastq files for read 1 based on *wildcards.sample* e.g. 
r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE) - return list(filter(r.match, fqlist))[:2] + return sorted(list(filter(r.match, fqlist))[:2]) rule genome_setup: input: From 2ea3ccdd4b62827828ba229d8920a5a0a5a56a64 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 15:36:19 -0400 Subject: [PATCH 30/41] change folder structure --- src/harpy/snakefiles/align-ema.smk | 56 +++++++++++++++--------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index 46d44850f..98acf3fb2 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -119,12 +119,12 @@ rule beadtag_count: input: get_fq output: - counts = temp(outdir + "/bxcount/{sample}.ema-ncnt"), + counts = temp(outdir + "/ema_count/{sample}.ema-ncnt"), logs = temp(outdir + "/logs/count/{sample}.count") params: - prefix = lambda wc: outdir + "/bxcount/" + wc.get("sample"), + prefix = lambda wc: outdir + "/ema_count/" + wc.get("sample"), beadtech = "-p" if platform == "haplotag" else f"-w {whitelist}", - logdir = f"{outdir}/logs/count/" + logdir = f"{outdir}/logs/ema_count/" message: "Counting barcode frequency: {wildcards.sample}" conda: @@ -139,14 +139,14 @@ rule beadtag_count: rule preprocess: input: reads = get_fq, - emacounts = outdir + "/bxcount/{sample}.ema-ncnt" + emacounts = outdir + "/ema_count/{sample}.ema-ncnt" output: - bins = temp(collect(outdir + "/preproc/{{sample}}/ema-bin-{bin}", bin = binrange)), - unbarcoded = temp(outdir + "/preproc/{sample}/ema-nobc") + bins = temp(collect(outdir + "/ema_preproc/{{sample}}/ema-bin-{bin}", bin = binrange)), + unbarcoded = temp(outdir + "/ema_preproc/{sample}/ema-nobc") log: - outdir + "/logs/preproc/{sample}.preproc.log" + outdir + "/logs/ema_preproc/{sample}.preproc.log" params: - outdir = lambda wc: outdir + "/preproc/" + wc.get("sample"), + outdir = lambda wc: outdir + "/ema_preproc/" + wc.get("sample"), bxtype = "-p" if platform == "haplotag" else f"-w {whitelist}", bins = nbins threads: @@ -164,18 +164,18 @@ rule preprocess: rule align: input: - readbin = collect(outdir + "/preproc/{{sample}}/ema-bin-{bin}", bin = binrange), + readbin = collect(outdir + "/ema_preproc/{{sample}}/ema-bin-{bin}", bin = binrange), genome = f"Genome/{bn}", geno_faidx = f"Genome/{bn_idx}", geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - aln = temp(outdir + "/align/{sample}.bc.bam"), - idx = temp(outdir + "/align/{sample}.bc.bam.bai") + aln = temp(outdir + "/ema_align/{sample}.bc.bam"), + idx = temp(outdir + "/ema_align/{sample}.bc.bam.bai") log: - ema = outdir + "/logs/{sample}.ema.align.log", - sort = outdir + "/logs/{sample}.ema.sort.log", + ema = outdir + "/logs/align/{sample}.ema.align.log", + sort = outdir + "/logs/align/{sample}.ema.sort.log", resources: - mem_mb = 2000 + mem_mb = 500 params: bxtype = f"-p {platform}", tmpdir = lambda wc: outdir + "/." 
+ d[wc.sample], @@ -197,14 +197,14 @@ rule align: rule align_nobarcode: input: - reads = outdir + "/preproc/{sample}/ema-nobc", + reads = outdir + "/ema_preproc/{sample}/ema-nobc", genome = f"Genome/{bn}", geno_faidx = f"Genome/{bn_idx}", geno_idx = multiext(f"Genome/{bn}", ".ann", ".bwt", ".pac", ".sa", ".amb") output: - temp(outdir + "/align/{sample}.bwa.nobc.sam") + temp(outdir + "/bwa_align/{sample}.bwa.nobc.sam") log: - outdir + "/logs/{sample}.bwa.align.log" + outdir + "/logs/align/{sample}.bwa.align.log" params: quality = config["quality"] benchmark: @@ -223,17 +223,17 @@ rule align_nobarcode: rule mark_duplicates: input: - sam = outdir + "/align/{sample}.bwa.nobc.sam", + sam = outdir + "/bwa_align/{sample}.bwa.nobc.sam", genome = f"Genome/{bn}", faidx = f"Genome/{bn_idx}" output: - temp(outdir + "/align/{sample}.markdup.nobc.bam") + temp(outdir + "/bwa_align/{sample}.markdup.nobc.bam") log: - outdir + "/logs/{sample}.markdup.log" + outdir + "/logs/align/{sample}.markdup.log" params: tmpdir = lambda wc: outdir + "/." + d[wc.sample] resources: - mem_mb = 2000 + mem_mb = 500 container: None threads: @@ -251,9 +251,9 @@ rule mark_duplicates: rule markdups_index: input: - outdir + "/align/{sample}.markdup.nobc.bam" + outdir + "/bwa_align/{sample}.markdup.nobc.bam" output: - temp(outdir + "/align/{sample}.markdup.nobc.bam.bai") + temp(outdir + "/bwa_align/{sample}.markdup.nobc.bam.bai") container: None message: @@ -263,10 +263,10 @@ rule markdups_index: rule concatenate_alignments: input: - aln_bc = outdir + "/align/{sample}.bc.bam", - idx_bc = outdir + "/align/{sample}.bc.bam.bai", - aln_nobc = outdir + "/align/{sample}.markdup.nobc.bam", - idx_nobc = outdir + "/align/{sample}.markdup.nobc.bam.bai", + aln_bc = outdir + "/ema_align/{sample}.bc.bam", + idx_bc = outdir + "/ema_align/{sample}.bc.bam.bai", + aln_nobc = outdir + "/bwa_align/{sample}.markdup.nobc.bam", + idx_nobc = outdir + "/bwa_align/{sample}.markdup.nobc.bam.bai", genome = f"Genome/{bn}" output: bam = outdir + "/{sample}.bam", @@ -274,7 +274,7 @@ rule concatenate_alignments: threads: 2 resources: - mem_mb = 2000 + mem_mb = 500 container: None message: From a855dfea871e39be1b664f908ca52ad8b78fe461 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 15:39:46 -0400 Subject: [PATCH 31/41] consistent names --- src/harpy/snakefiles/align-ema.smk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index 98acf3fb2..430f00b47 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -115,7 +115,7 @@ rule genome_bwa_index: shell: "bwa index {input} 2> {log}" -rule beadtag_count: +rule ema_count: input: get_fq output: @@ -136,7 +136,7 @@ rule beadtag_count: ema count {params.beadtech} -o {params.prefix} 2> {output.logs} """ -rule preprocess: +rule ema_preprocess: input: reads = get_fq, emacounts = outdir + "/ema_count/{sample}.ema-ncnt" @@ -162,7 +162,7 @@ rule preprocess: cat - > {log} """ -rule align: +rule ema_align: input: readbin = collect(outdir + "/ema_preproc/{{sample}}/ema-bin-{bin}", bin = binrange), genome = f"Genome/{bn}", @@ -195,7 +195,7 @@ rule align: rm -rf {params.tmpdir} """ -rule align_nobarcode: +rule bwa_align: input: reads = outdir + "/ema_preproc/{sample}/ema-nobc", genome = f"Genome/{bn}", @@ -221,7 +221,7 @@ rule align_nobarcode: samtools view -h -F 4 -q {params.quality} > {output} """ -rule mark_duplicates: +rule bwa_markdups: input: sam = outdir + 
"/bwa_align/{sample}.bwa.nobc.sam", genome = f"Genome/{bn}", @@ -249,7 +249,7 @@ rule mark_duplicates: rm -rf {params.tmpdir} """ -rule markdups_index: +rule bwa_markdups_index: input: outdir + "/bwa_align/{sample}.markdup.nobc.bam" output: @@ -285,7 +285,7 @@ rule concatenate_alignments: samtools sort -@ 1 -O bam --reference {input.genome} -m {resources.mem_mb}M --write-index -o {output.bam}##idx##{output.bai} - """ -rule alignment_coverage: +rule coverage: input: bam = outdir + "/{sample}.bam", bai = outdir + "/{sample}.bam.bai" From 40845c7997adbedaf91d6e0be532548f0396ec82 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 16:28:52 -0400 Subject: [PATCH 32/41] add version to multiqc --- src/harpy/conda_deps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harpy/conda_deps.py b/src/harpy/conda_deps.py index 9a3332a30..6132b5696 100644 --- a/src/harpy/conda_deps.py +++ b/src/harpy/conda_deps.py @@ -6,7 +6,7 @@ def generate_conda_deps(): """Create the YAML files of the workflow conda dependencies""" condachannels = ["bioconda","conda-forge","defaults"] environ = { - "qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc", "bioconda::pysam=0.22"], + "qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc=1.22", "bioconda::pysam=0.22"], "align": ["bioconda::bwa", "bioconda::ema","bioconda::strobealign", "conda-forge::icu","conda-forge::libzlib", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"], "snp": ["bioconda::bcftools=1.20", "bioconda::freebayes=1.3.6"], "sv": ["bioconda::leviathan", "bioconda::naibr-plus"], From cdd9a700adb6b4c8246237a2c09f1039fe518f58 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 16:29:21 -0400 Subject: [PATCH 33/41] rm test line --- src/harpy/reports/BxAlignStats.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/harpy/reports/BxAlignStats.Rmd b/src/harpy/reports/BxAlignStats.Rmd index ced3e747b..3677f8efe 100644 --- a/src/harpy/reports/BxAlignStats.Rmd +++ b/src/harpy/reports/BxAlignStats.Rmd @@ -85,8 +85,8 @@ process_input <- function(infile){ ``` ```{r setup_df, echo = F, results = F, message = F} -#infiles <- snakemake@input[[1]] -infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") +infiles <- snakemake@input[[1]] +#infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") aggregate_df <- data.frame( sample = character(), From 452d1c8aecb4c475c24b4556fb9a5d744cbcad51 Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 16:29:28 -0400 Subject: [PATCH 34/41] fix typo --- src/harpy/scripts/bxStats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harpy/scripts/bxStats.py b/src/harpy/scripts/bxStats.py index 5cbd26dd4..52421c6ff 100755 --- a/src/harpy/scripts/bxStats.py +++ b/src/harpy/scripts/bxStats.py @@ -22,7 +22,7 @@ def writestats(x, writechrom): x[_mi]["covered_inserts"] = min(x[_mi]["insert_len"] / x[_mi]["inferred"], 1.0) except: x[_mi]["covered_inferred"] = 0 - outtext = f"{writechrom}\t{mi}\t" + "\t".join([str(x[_mi][i]) for i in ["n", "start","end", "inferred", "bp", "insert_len", "covered_bp", "covered_inserts"]]) + outtext = f"{writechrom}\t{_mi}\t" + "\t".join([str(x[_mi][i]) for i in ["n", "start","end", "inferred", "bp", "insert_len", "covered_bp", "covered_inserts"]]) outfile.write(outtext.encode() + b"\n") for read in alnfile.fetch(): From 28bce576d158846cfd060135a90290bba4303aff Mon Sep 17 00:00:00 2001 From: pdimens Date: Thu, 27 Jun 2024 16:42:49 -0400 Subject: [PATCH 
35/41] cleaner multiqc call --- src/harpy/snakefiles/align-bwa.smk | 9 +++++---- src/harpy/snakefiles/align-ema.smk | 9 +++++---- src/harpy/snakefiles/align-strobealign.smk | 9 +++++---- src/harpy/snakefiles/demultiplex.gen1.smk | 9 +++++---- src/harpy/snakefiles/qc.smk | 11 +++++++---- 5 files changed, 27 insertions(+), 20 deletions(-) diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index 403e979ce..b473729c8 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -261,15 +261,16 @@ rule report_samtools: output: outdir + "/reports/bwa.stats.html" params: - outdir + outdir = f"{outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"Basic Alignment Statistics\"", + comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: - """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" rule report_bx: input: diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index 430f00b47..b64245df1 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -349,15 +349,16 @@ rule report_samtools: output: outdir + "/reports/ema.stats.html" params: - outdir + outdir = f"{outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"Basic Alignment Statistics\"", + comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: - """ - multiqc {outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "General Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" rule report_bx: input: diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index 1810cf6f9..faed9fe29 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -260,15 +260,16 @@ rule report_samtools: output: outdir + "/reports/strobealign.stats.html" params: - outdir + outdir = f"{outdir}/reports/data/samtools_stats {outdir}/reports/data/samtools_flagstat", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"Basic Alignment Statistics\"", + comment = "--comment \"This report aggregates samtools stats and samtools flagstats results for all alignments. 
Samtools stats ignores alignments marked as duplicates.\"" conda: f"{envdir}/qc.yaml" message: "Summarizing samtools stats and flagstat" shell: - """ - multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" rule report_bx: input: diff --git a/src/harpy/snakefiles/demultiplex.gen1.smk b/src/harpy/snakefiles/demultiplex.gen1.smk index 2b859556b..1850437ee 100644 --- a/src/harpy/snakefiles/demultiplex.gen1.smk +++ b/src/harpy/snakefiles/demultiplex.gen1.smk @@ -184,15 +184,16 @@ rule qc_report: output: outdir + "/reports/demultiplex.QC.html" params: - outdir + "/logs/" + logdir = outdir + "/logs/", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"QC for Demultiplexed Samples\"", + comment = "--comment \"This report aggregates the QC results created by falco.\"" conda: f"{envdir}/qc.yaml" message: "Creating final demultiplexing QC report" shell: - """ - multiqc {params} --no-version-check --force --quiet --title "QC for Demultiplexed Samples" --comment "This report aggregates the QC results created by falco." --no-data-dir --filename {output} 2> /dev/null - """ + "multiqc {params} --filename {output} 2> /dev/null" rule log_workflow: default_target: True diff --git a/src/harpy/snakefiles/qc.smk b/src/harpy/snakefiles/qc.smk index ede7c6881..3621ebee9 100644 --- a/src/harpy/snakefiles/qc.smk +++ b/src/harpy/snakefiles/qc.smk @@ -114,15 +114,18 @@ rule create_report: log: outdir + "/logs/mutliqc.log" params: - outdir + logdir = f"{outdir}/reports/data/fastp/", + module = "-m fastp", + options = "--no-version-check --force --quiet --no-data-dir", + title = "--title \"QC Summary\"", + comment = "--comment \"This report aggregates trimming and quality control metrics reported by fastp.\"" + conda: f"{envdir}/qc.yaml" message: "Aggregating fastp reports" shell: - """ - multiqc {params}/reports/data/fastp/ -m fastp --force --filename {output} --quiet --title "QC Summary" --comment "This report aggregates trimming and quality control metrics reported by fastp" --no-data-dir 2> {log} - """ + "multiqc {params} --filename {output} 2> {log}" rule log_workflow: default_target: True From 5ee0efb7963cd11d3b16098b9a81f936f37324d1 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 28 Jun 2024 09:21:49 -0400 Subject: [PATCH 36/41] rename --- src/harpy/align.py | 6 +++--- .../{BxAlignStats.Rmd => AlignBxStats.Rmd} | 16 +++++++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) rename src/harpy/reports/{BxAlignStats.Rmd => AlignBxStats.Rmd} (91%) diff --git a/src/harpy/align.py b/src/harpy/align.py index c51bac7cd..a0e03f29c 100644 --- a/src/harpy/align.py +++ b/src/harpy/align.py @@ -119,7 +119,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality fetch_script(workflowdir, "assignMI.py") fetch_script(workflowdir, "bxStats.py") fetch_report(workflowdir, "AlignStats.Rmd") - fetch_report(workflowdir, "BxAlignStats.Rmd") + fetch_report(workflowdir, "AlignBxStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align bwa\n") @@ -210,7 +210,7 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads, 
fetch_rule(workflowdir, "align-ema.smk") fetch_script(workflowdir, "bxStats.py") fetch_report(workflowdir, "AlignStats.Rmd") - fetch_report(workflowdir, "BxAlignStats.Rmd") + fetch_report(workflowdir, "AlignBxStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align ema\n") @@ -292,7 +292,7 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra fetch_script(workflowdir, "assignMI.py") fetch_script(workflowdir, "bxStats.py") fetch_report(workflowdir, "AlignStats.Rmd") - fetch_report(workflowdir, "BxAlignStats.Rmd") + fetch_report(workflowdir, "AlignBxStats.Rmd") with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config: config.write("workflow: align strobe\n") diff --git a/src/harpy/reports/BxAlignStats.Rmd b/src/harpy/reports/AlignBxStats.Rmd similarity index 91% rename from src/harpy/reports/BxAlignStats.Rmd rename to src/harpy/reports/AlignBxStats.Rmd index 3677f8efe..42759389c 100644 --- a/src/harpy/reports/BxAlignStats.Rmd +++ b/src/harpy/reports/AlignBxStats.Rmd @@ -57,6 +57,8 @@ process_input <- function(infile){ tot_invalid_reads <- sum(tb[tb$valid == "invalid BX", "reads"]) avg_mol_cov <- round(mean(multiplex_df$coverage_inserts), 2) sem_mol_cov <- round(std_error(multiplex_df$coverage_inserts), 4) + avg_mol_cov_bp <- round(mean(multiplex_df$coverage_bp), 2) + sem_mol_cov_bp<- round(std_error(multiplex_df$coverage_bp), 4) n50 <- NX(multiplex_df$length_inferred, 50) n75 <- NX(multiplex_df$length_inferred, 75) n90 <- NX(multiplex_df$length_inferred, 90) @@ -76,6 +78,8 @@ process_input <- function(infile){ sereadspermol = sem_reads_per_mol, averagemolcov = avg_mol_cov, semolcov = sem_mol_cov, + averagemolcovbp = avg_mol_cov_bp, + semolcovbp = sem_mol_cov_bp, N50 = n50, N75 = n75, N90 = n90 @@ -85,7 +89,7 @@ process_input <- function(infile){ ``` ```{r setup_df, echo = F, results = F, message = F} -infiles <- snakemake@input[[1]] +infiles <- unlist(snakemake@input, recursive = FALSE) #infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") aggregate_df <- data.frame( @@ -103,6 +107,8 @@ aggregate_df <- data.frame( sereadspermol = numeric(), averagemolcov = numeric(), semolcov = numeric(), + averagemolcovbp = numeric(), + semolcovbp = numeric(), N50 = integer(), N75 = integer(), N90 = integer() @@ -264,10 +270,14 @@ highchart() |> ### average molecule coverage{.no-title} ```{r avg_mol_cov, echo = F, warning = F, message = F, fig.height=figheight, out.width="100%"} err_df <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagemolcov, low = aggregate_df$averagemolcov - aggregate_df$semolcov, high = aggregate_df$averagemolcov + aggregate_df$semolcov) +err_df_bp <- data.frame(x = 0:(n_samples-1), y = aggregate_df$averagemolcovbp, low = aggregate_df$averagemolcovbp - aggregate_df$semolcovbp, high = aggregate_df$averagemolcovbp + aggregate_df$semolcovbp) + highchart() |> hc_chart(inverted=TRUE, animation = F, pointPadding = 0.0001, groupPadding = 0.0001) |> - hc_add_series(data = aggregate_df, type = "scatter", name = "mean", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolcov), marker = list(radius = 8), color = "#d774ae", zIndex = 1) |> - hc_add_series(data = err_df, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#e39dc6", whiskerColor = "#e39dc6") |> + hc_add_series(data = aggregate_df, type = "scatter", name = "Inferred Fragments", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolcov), marker = list(radius = 8), color = "#df77b5", 
zIndex = 1) |> + hc_add_series(data = err_df, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#ef94ca", whiskerColor = "#ef94ca") |> + hc_add_series(data = aggregate_df, type = "scatter", name = "Aligned Base Pairs", hcaes(x = 0:(nrow(aggregate_df)-1), y = averagemolcovbp), marker = list(radius = 8), color = "#924596", zIndex = 1) |> + hc_add_series(data = err_df_bp, type = "errorbar", name = "standard error", linkedTo = ":previous", zIndex = 0, stemColor = "#916094", whiskerColor = "#916094") |> hc_xAxis(title = FALSE, gridLineWidth = 0, categories = aggregate_df$sample) |> hc_title(text = "Average Molecule Percent Coverage") |> hc_caption(text = "Error bars show the standard error of the mean percent coverage of non-singleton molecules") |> From 0c0aa726afeb8741be810c9f15ff379c453f1e8b Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 28 Jun 2024 09:22:09 -0400 Subject: [PATCH 37/41] add rounding --- src/harpy/scripts/bxStats.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/harpy/scripts/bxStats.py b/src/harpy/scripts/bxStats.py index 52421c6ff..29765cfcb 100755 --- a/src/harpy/scripts/bxStats.py +++ b/src/harpy/scripts/bxStats.py @@ -15,11 +15,11 @@ def writestats(x, writechrom): for _mi in x: x[_mi]["inferred"] = x[_mi]["end"] - x[_mi]["start"] try: - x[_mi]["covered_bp"] = min(x[_mi]["bp"] / x[_mi]["inferred"], 1.0) + x[_mi]["covered_bp"] = round(min(x[_mi]["bp"] / x[_mi]["inferred"], 1.0),4) except: x[_mi]["covered_bp"] = 0 try: - x[_mi]["covered_inserts"] = min(x[_mi]["insert_len"] / x[_mi]["inferred"], 1.0) + x[_mi]["covered_inserts"] = round(min(x[_mi]["insert_len"] / x[_mi]["inferred"], 1.0), 4) except: x[_mi]["covered_inferred"] = 0 outtext = f"{writechrom}\t{_mi}\t" + "\t".join([str(x[_mi][i]) for i in ["n", "start","end", "inferred", "bp", "insert_len", "covered_bp", "covered_inserts"]]) From 8ab36dc65a875a6988fc74fdb4e96b5cc734c59a Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 28 Jun 2024 09:22:25 -0400 Subject: [PATCH 38/41] add condition to fix handling of one file --- src/harpy/snakefiles/align-bwa.smk | 4 ++-- src/harpy/snakefiles/align-ema.smk | 4 ++-- src/harpy/snakefiles/align-strobealign.smk | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/harpy/snakefiles/align-bwa.smk b/src/harpy/snakefiles/align-bwa.smk index b473729c8..3c3617a98 100644 --- a/src/harpy/snakefiles/align-bwa.smk +++ b/src/harpy/snakefiles/align-bwa.smk @@ -282,7 +282,7 @@ rule report_bx: message: "Summarizing all barcode information from alignments" script: - "report/BxAlignStats.Rmd" + "report/AlignBxStats.Rmd" rule log_workflow: default_target: True @@ -290,7 +290,7 @@ rule log_workflow: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam", "bam.bai"]), reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], agg_report = outdir + "/reports/bwa.stats.html" if not skipreports else [], - bx_report = outdir + "/reports/barcodes.summary.html" if not skipreports else [] + bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] params: quality = config["alignment_quality"], extra = extra diff --git a/src/harpy/snakefiles/align-ema.smk b/src/harpy/snakefiles/align-ema.smk index b64245df1..ee410dd5c 100644 --- a/src/harpy/snakefiles/align-ema.smk +++ b/src/harpy/snakefiles/align-ema.smk @@ -370,7 +370,7 @@ rule report_bx: message: "Summarizing all barcode information from alignments" 
script: - "report/BxAlignStats.Rmd" + "report/AlignBxStats.Rmd" rule log_workflow: default_target: True @@ -378,7 +378,7 @@ rule log_workflow: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = [ "bam", "bam.bai"] ), cov_report = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], agg_report = f"{outdir}/reports/ema.stats.html" if not skipreports else [], - bx_report = outdir + "/reports/barcodes.summary.html" if not skipreports else [] + bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] params: beadtech = "-p" if platform == "haplotag" else f"-w {whitelist}" message: diff --git a/src/harpy/snakefiles/align-strobealign.smk b/src/harpy/snakefiles/align-strobealign.smk index faed9fe29..6a5bbc331 100644 --- a/src/harpy/snakefiles/align-strobealign.smk +++ b/src/harpy/snakefiles/align-strobealign.smk @@ -281,7 +281,7 @@ rule report_bx: message: "Summarizing all barcode information from alignments" script: - "report/BxAlignStats.Rmd" + "report/AlignBxStats.Rmd" rule log_workflow: default_target: True @@ -289,7 +289,7 @@ rule log_workflow: bams = collect(outdir + "/{sample}.{ext}", sample = samplenames, ext = ["bam","bam.bai"]), samtools = outdir + "/reports/strobealign.stats.html" if not skipreports else [] , reports = collect(outdir + "/reports/{sample}.html", sample = samplenames) if not skipreports else [], - bx_report = outdir + "/reports/barcodes.summary.html" if not skipreports else [] + bx_report = outdir + "/reports/barcodes.summary.html" if (not skipreports or len(samplenames) == 1) else [] params: readlen = readlen, quality = config["quality"], From 4fbc1c86c72dc8623db82c5f46f740c7e1f960f7 Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 28 Jun 2024 10:09:46 -0400 Subject: [PATCH 39/41] add final newline --- src/harpy/scripts/bxStats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/harpy/scripts/bxStats.py b/src/harpy/scripts/bxStats.py index 29765cfcb..fae907bf8 100755 --- a/src/harpy/scripts/bxStats.py +++ b/src/harpy/scripts/bxStats.py @@ -107,5 +107,5 @@ def writestats(x, writechrom): # print the last entry writestats(d, chrom_last) # write comment on the last line with the total number of unique BX barcodes -outfile.write(f"#total unique barcodes: {len(all_bx)}".encode()) +outfile.write(f"#total unique barcodes: {len(all_bx)}\n".encode()) outfile.close() From 1a03b4d51003000379502e1674716ab20fa8474c Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 28 Jun 2024 10:09:52 -0400 Subject: [PATCH 40/41] simplify with mapreduce --- src/harpy/reports/AlignBxStats.Rmd | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/src/harpy/reports/AlignBxStats.Rmd b/src/harpy/reports/AlignBxStats.Rmd index 42759389c..c68645a5f 100644 --- a/src/harpy/reports/AlignBxStats.Rmd +++ b/src/harpy/reports/AlignBxStats.Rmd @@ -92,36 +92,14 @@ process_input <- function(infile){ infiles <- unlist(snakemake@input, recursive = FALSE) #infiles <- c("~/sample1.bxstats.gz", "~/sample2.bxstats.gz") -aggregate_df <- data.frame( - sample = character(), - totalreads = integer(), - totaluniquemol = integer(), - singletons = integer(), - multiplex = integer(), - totaluniquebx = integer(), - molecule2bxratio = numeric(), - totalvalidreads = integer(), - totalinvalidreads = integer(), - totalvalidpercent = numeric(), - averagereadspermol = numeric(), - sereadspermol = numeric(), - averagemolcov = numeric(), - semolcov = numeric(), - 
averagemolcovbp = numeric(), - semolcovbp = numeric(), - N50 = integer(), - N75 = integer(), - N90 = integer() -) +aggregate_df <- Reduce(rbind, Map(process_input, infiles)) -for(i in infiles){ - aggregate_df <- rbind(aggregate_df, process_input(i)) -} if(nrow(aggregate_df) == 0){ print("All input files were empty") knittr::knittr_exit() } ``` + # General ## General Barcode Information from Alignments ### Desc {.no-title} From ac458c391228c41737fbe87a2a3c850ec2947bfe Mon Sep 17 00:00:00 2001 From: pdimens Date: Fri, 28 Jun 2024 11:42:51 -0400 Subject: [PATCH 41/41] nice-ify --- src/harpy/reports/PreflightBam.Rmd | 142 +++++++++++++++------------ src/harpy/reports/PreflightFastq.Rmd | 135 ++++++++++++++----------- 2 files changed, 154 insertions(+), 123 deletions(-) diff --git a/src/harpy/reports/PreflightBam.Rmd b/src/harpy/reports/PreflightBam.Rmd index 96186a919..446a7512e 100644 --- a/src/harpy/reports/PreflightBam.Rmd +++ b/src/harpy/reports/PreflightBam.Rmd @@ -30,6 +30,7 @@ data <- read.table(infile, header = T) attention_df <- data %>% select(-1,-3) %>% rowwise() %>% summarise(count = sum(c(nameMismatch, noBX, badBX))) attention <- sum(attention_df$count > 0) ``` +# File Checks ## filetop ### fltop {.no-title}

Preflight checks for BAM files
@@ -40,75 +41,24 @@ the files to identify formatting conformity and issues that may require your att ## General Information {data-height=100} ### nfiles ```{r} -valueBox(scales::comma(nrow(data)), caption = "Files", color = "info") -``` - -### fileformat -```{r} -valueBox("BAM", caption = "File Format", color = "info") +valueBox(scales::comma(nrow(data)), caption = "Alignment Files", icon = "fa-file", color = "#C0C0C0") ``` ### no MI tag ```{r} -noMItag <- sum(data$noMI > 0) -valueBox(scales::comma(noMItag), caption = "No MI: tag", color = ifelse(noMItag > 0, "warning", "success")) +noMItag <- sum(data$noMI ==data$alignments) +valueBox(scales::comma(noMItag), caption = "No MI: tag", icon = ifelse(noMItag > 0, "fa-exclamation", "fa-check"), color = ifelse(noMItag > 0, "warning", "success")) ``` ### bxnotlast ```{r} bxnotlast <- sum(data$bxNotLast > 0) -valueBox(scales::comma(bxnotlast), caption = "BX:Z: tag not last", color = ifelse(bxnotlast > 0, "warning", "success")) +valueBox(scales::comma(bxnotlast), caption = "BX:Z: tag not last", icon = ifelse(bxnotlast > 0, "fa-exclamation", "fa-check"), color = ifelse(bxnotlast > 0, "warning", "success")) ``` ### issues ```{r} -valueBox(scales::comma(attention), caption = "Files With Issues", color = ifelse(attention > 0, "warning", "success")) -``` - -## interpret -### interpcol {.no-title} -

Interpreting the Data
- -The `harpy preflight bam -d ...` created a `Preflight` folder within -the supplied directory and within it you will find this report, along with the -`filecheck.bam.tsv` file summing the results that are included in this report. -This file contains a tab-delimited table with the columns: `file`, `nameMismatch`, `alignments`, `noBX`, and `badBX`. These columns are defined by: - -- `file`: the name of the BAM file -- `nameMismatch`: the sample name of the file inferred from name of the file (i.e. Harpy assumes `sample1.bam` to be `sample1`) does not match the `@RG ID` tag in the alignment file. - - severity: critical and will likely cause downstream issues and errors -- `alignments`: the total number of alignment records in the file -- `noMI`: alignment records lack `MI:` tag (`MI:Z:` or `MI:i:`) - - severity: critical and will prevent HapCut2 phasing - - the `MI` tag is the "molecular identifier", meaning it's the unique name of a molecule from which are series of alignments share a barcode. - - `EMA` adds these to alignments, `BWA` does not, but `harpy align` will add them when using `BWA` -- `noBX`: the number of alignments that do not have a `BX:Z:` tag in the record - - severity: minimal and likely won't cause issues unless no alignments have `BX:Z:` tags - - if you expect all of your alignments should have `BX:Z:` tags, then further investigation is necessary -- `badBX`: the haplotag barcode in the `BX:Z:` tag does not adhere to the proper `AxxCxxBxxDxx` format - - severity: critical and will cause downstream issues and errors -- `bxNotLast`: the `BX:Z:` tag in the alignment record is not the last tag - - severity: conditional and only relevant for LEVIATHAN variant calling - - can be ignored if not intending to call structural variants with LEVIATHAN - - if intending to use LEVIATHAN, the `BX:Z:` tag must be the last tag in the alignment records - - can be fixed by editing the BAM file, or fixing FASTQ files prior to alignment - - `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record - - -### Proper format {.no-title} -

Proper BAM file format
- -A proper BAM file will contain a `BX:Z:` tag with an alignment that features a -properly-formatted haplotag barcode `AxxCxxBxxDxx`. If this barcode is not in that -format, then it's likely the input FASTQ used for read mapping is the source of the -issue. You can check those FASTQ files for errors with `harpy preflight fastq ...`. - -Below is an example of a proper alignment record for a file named `sample1.bam`. -Note the tag `RG:Z:sample1`, indicating this alignment is associated with `sample1` and -matches the file name. Also note the correctly formatted haplotag barcode `BX:Z:A19C01B86D78` -and the presence of a `MI:` tag. -``` -A00814:267:HTMH3DRXX:2:1132:26268:10316 113 contig1 6312 60 4S47M1D86M = 6312 0 TAACCCTAACCCTAACCCTAACCCTAAACCTAACCCTAACCCTAACCCTAACCCTATTCCTATTCCTAACCCTAACCCTAAAAACCAACTGCACCCCGGTGCACTTTTGGCTATTACCCCCTATCATACCCTTGTCC FFFF,FFFF:FFFF:FF:FFF:FFFF,,FFFFFFFFFFF:FFFFF:FFFFFFFFFFFFF:FFFF,FFFFFFFFF:FF:F:FFFFF,F:,FFFFFFFFFF:FFF,FFFFFF:FFFFFF:FFFF:FFFFFFFFFF:FFF NM:i:2 MD:Z:23C23^C86 MC:Z:4S47M1D86M AS:i:121 XS:i:86 RG:Z:sample1 MI:i:4040669 RX:Z:GAAACGATGTTGC+CCTAAGCCGAATC QX:Z:FFFFFFFFFFFFF+FFFFF:FFFFFFF BX:Z:A19C01B86D78 +valueBox(scales::comma(attention), caption = "Files With Issues", icon = ifelse(attention > 0, "fa-exclamation", "fa-check"), color = ifelse(attention > 0, "warning", "success")) ``` ## viz desc @@ -140,9 +90,6 @@ knitr::kable( ) ``` -You may click on the plot to expand it in your browser window. Clicking it again will exit -out of the zoomed view. - ## datarow ### plot {.no-title} ```{r status_df} @@ -162,15 +109,15 @@ rownames(m) <- gsub(".bam", "", data$file) ```{r readsper, echo = FALSE, message = FALSE, warning = FALSE, out.width = '100%'} hchart(m) |> - hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2) |> - hc_title(text = "BAM file preflight QC") |> + hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2, min = 0, max = 1) |> + hc_title(text = "Alignment File Checks") |> hc_xAxis(labels = list(style = list(fontSize = '16px')) )|> hc_yAxis(labels = list(style = list(fontSize = '16px')) )|> - hc_plotOptions(heatmap = list(borderWidth = 1, borderColor = "white", animated = F)) |> + hc_plotOptions(heatmap = list(animation = FALSE, borderWidth = 1, borderColor = "white", states = list(inactive = list(animation = FALSE), normal = list(animation = FALSE)))) |> hc_legend(enabled = F) |> hc_tooltip( animation = FALSE, - formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS') + '';}") + formatter = JS("function () {return '' + this.series.xAxis.categories[this.point.x] + '
' + this.series.yAxis.categories[this.point.y] + '
QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS');}") ) |> hc_exporting(enabled = T, filename = "bam.preflight", buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG"))) @@ -188,3 +135,72 @@ DT::datatable( fillContainer = T ) ``` + + +# Details +## interpret +### interpcol {.no-title} +

Interpreting the Data
+ +The `harpy preflight bam ...` command created a `filecheck.bam.tsv` file in the specified +output directory that summarizes the results that are included in this report. +This file contains a tab-delimited table with the columns: `file`, `nameMismatch`, `alignments`, `noBX`, and `badBX`. These columns are defined by: + +#### file +The name of the BAM file + +#### nameMismatch +The sample name of the file inferred from name of the file (i.e. Harpy assumes `sample1.bam` to be `sample1`) does not match the `@RG ID` tag in the alignment file. + +- severity: critical and will likely cause downstream issues and errors + +#### alignments +The total number of alignment records in the file + +#### noMI +Alignment records lack `MI:` tag (`MI:Z:` or `MI:i:`) + +- severity: critical and will prevent HapCut2 phasing +- the `MI` tag is the "molecular identifier", meaning it's the unique name of a molecule from which are series of alignments share a barcode. + - `EMA` adds these to alignments, `BWA` does not, but `harpy align` will add them when using `BWA` + +#### noBX +The number of alignments that do not have a `BX:Z:` tag in the record + +- severity: minimal and likely won't cause issues unless no alignments have `BX:Z:` tags +- if you expect all of your alignments should have `BX:Z:` tags, then further investigation is necessary + +#### badBX +The haplotag barcode in the `BX:Z:` tag does not adhere to the proper `AxxCxxBxxDxx` format + +- severity: critical and will cause downstream issues and errors + +#### bxNotLast +The `BX:Z:` tag in the alignment record is not the last tag + +- severity: conditional and only relevant for LEVIATHAN variant calling +- can be ignored if not intending to call structural variants with LEVIATHAN +- if intending to use LEVIATHAN, the `BX:Z:` tag must be the last tag in the alignment records + - can be fixed by editing the BAM file, or fixing FASTQ files prior to alignment + - `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record + + +### Proper format {.no-title} +

Proper BAM file format
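Outside of Harpy, the checks described in the previous section can be approximated with pysam (already a workflow dependency). The sketch below is purely illustrative: the function name and returned layout are hypothetical, and it is not the code the preflight workflow actually runs. It tallies the `noMI`, `noBX`, `badBX`, and `bxNotLast` columns for a single BAM file:

```python
# Hypothetical helper -- illustrative only, not Harpy's preflight implementation.
import re
import pysam

# proper AxxCxxBxxDxx haplotag barcode
BX_RE = re.compile(r"^A\d{2}C\d{2}B\d{2}D\d{2}$")

def preflight_bam(path):
    """Tally MI/BX tag checks for one BAM file, mirroring the columns described above."""
    alignments = no_mi = no_bx = bad_bx = bx_not_last = 0
    with pysam.AlignmentFile(path, "rb") as bam:
        # until_eof=True lets the loop run even without a .bai index
        for record in bam.fetch(until_eof=True):
            alignments += 1
            if not record.has_tag("MI"):
                no_mi += 1
            if not record.has_tag("BX"):
                no_bx += 1
                continue
            if not BX_RE.match(record.get_tag("BX")):
                bad_bx += 1
            # get_tags() preserves record order, so the last tuple is the final tag
            tags = record.get_tags()
            if tags and tags[-1][0] != "BX":
                bx_not_last += 1
    return {
        "alignments": alignments,
        "noMI": no_mi,
        "noBX": no_bx,
        "badBX": bad_bx,
        "bxNotLast": bx_not_last,
    }
```

The last element of `get_tags()` is what decides whether `BX:Z:` sits in the final position, which is the condition LEVIATHAN cares about.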
+ +#### BX Tag +A proper linked-read alignment file will contain a `BX:Z:` tag with an alignment record +that features a properly-formatted haplotag barcode `AxxCxxBxxDxx`. If this barcode is not in that +format, then it's likely the input FASTQ used for read mapping is the source of the +issue. You can check those FASTQ files for errors with `harpy preflight fastq ...`. + +#### Example Alignment Record +Below is an example of a proper alignment record for a file named `sample1.bam`. +Note the tag `RG:Z:sample1`, indicating this alignment is associated with `sample1` and +matches the file name. Also note the correctly formatted haplotag barcode `BX:Z:A19C01B86D78` +and the presence of a `MI:` tag. To reduce horizontal scrolling, the alignment sequence and Phred +scores have been replaced with `SEQUENCE` and `QUALITY`, respectively. + +``` +A00814:267:HTMH3DRXX:2:1132:26268:10316 113 contig1 6312 60 4S47M1D86M = 6312 0 SEQUENCE QUALITY NM:i:2 MD:Z:23C23^C86 MC:Z:4S47M1D86M AS:i:121 XS:i:86 RG:Z:sample1 MI:i:4040669 RX:Z:GAAACGATGTTGC+CCTAAGCCGAATC QX:Z:FFFFFFFFFFFFF+FFFFF:FFFFFFF BX:Z:A19C01B86D78 +``` \ No newline at end of file diff --git a/src/harpy/reports/PreflightFastq.Rmd b/src/harpy/reports/PreflightFastq.Rmd index 518766680..ab2c97a07 100644 --- a/src/harpy/reports/PreflightFastq.Rmd +++ b/src/harpy/reports/PreflightFastq.Rmd @@ -24,9 +24,9 @@ using<-function(...) { } using("flexdashboard","dplyr","tidyr","DT","highcharter","scales") -#infile <- "~/Documents/harpy/test/fastq/Preflight/filecheck.fastq.tsv" +#infile <- "~/Documents/harpy/Preflight/fastq/filecheck.fastq.tsv" infile <- snakemake@input[[1]] -#dirname <- "filler text" + data <- read.table(infile, header = T) attention_df <- data %>% mutate(allmissing = (reads == noBX)) %>% @@ -35,72 +35,29 @@ attention_df <- data %>% summarise(count = sum(c(badBX,badSamSpec, allmissing))) attention <- sum(attention_df$count > 0) ``` - +# File Checks ## filetop ### fltop {.no-title}

Preflight checks for FASTQ files
This report reflects the FASTQ files provided. Harpy has processed -the files to identify formatting issues that may require your attention. +the files to identify [formatting issues](#details) that may require your attention. ## General Information {data-height=100} ### nfiles ```{r} -valueBox(scales::comma(nrow(data)), caption = "Files", color = "info") -``` -### fileformat -```{r} -valueBox("FASTQ", caption = "File Format", color = "info") +valueBox(scales::comma(nrow(data)), caption = "FASTQ Files", icon = "fa-file", color = "#C0C0C0") ``` + ### issues ```{r} -valueBox(scales::comma(attention), caption = "Files With Issues", color = ifelse(attention > 0, "warning", "success")) +valueBox(scales::comma(attention), caption = "Files With Issues", icon = ifelse(attention > 0, "fa-exclamation", "fa-check"), color = ifelse(attention > 0, "warning", "success")) ``` ### BX not last ```{r} bxnotlast <- sum(data$bxNotLast > 0) -valueBox(bxnotlast, caption = "BX:Z: tag not last", color = ifelse(bxnotlast > 0, "#feda75", "success")) -``` - -## interpret -### interpcol {.no-title} -

Interpreting the Data
-
-The `harpy preflight fastq -d ...` created a `Preflight` folder within
-the supplied directory and within it you will find this report, along with the
-`filecheck.fastq.tsv` file summing the results that are included in this report.
-This file contains a tab-delimited table with the columns: `file`, `reads`, `noBX`, `badBX`, and `badSamSpec`. These columns are defined by:
-
-- `file`: the name of the FASTQ file
-- `reads`: the total number of reads in the file
-- `noBX`: the number of reads that do not have a `BX:Z:` tag in the read header
-  - severity: minimal and likely won't cause issues unless no reads have `BX:Z:` tags
-  - if you expect all of your reads should have `BX:Z:` tags, then further investigation is necessary
-- `badBX`: the haplotag barcode in the `BX:Z:` comment does not adhere to the proper `AxxCxxBxxDxx` format
-  - severity: critical and will cause downstream issues and errors
-- `badSamSpec`: the comments in the read header after the read ID do not conform to the `TAG:TYPE:VALUE` [SAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf)
-  - severity: critical and will cause downstream issues and errors
-- `bxNotLast`: the `BX:Z:` tag in the FASTQ header is not the last comment
-  - severity: conditional and only relevant for LEVIATHAN variant calling
-  - can be ignored if not intending to call structural variants with LEVIATHAN
-  - if intending to use LEVIATHAN, the `BX:Z:` tag must be the last comment in a read header
-  - `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record
-
-### Proper header {.no-title}
-Proper Read Headers
-
-An example of a proper FASTQ read header is like the one below, where all comments
-following the initial read ID (`@A00814...`) have the proper [SAM spec](https://samtools.github.io/hts-specs/SAMv1.pdf)
-`TAG:TYPE:VALUE` format and among them is a `BX:Z:` tag followed by a `AxxCxxBxxDxx`
-formatted haplotag barcode. The comments must be **TAB separated** because the `:Z:` tag type allows a
-whitespace character for its value. The read header below contains the read ID `@A00814...`,
-the `BX:Z:` haplotag barcode tag, and two more comments `RX:Z:` and `QX:Z:` that both
-adhere to the SAM specification. If using LEVIATHAN to call structural variants, the `BX:Z:` tag
-must be the last comment in the read header.
-
-```
-@A00814:267:HTMH3DRXX:2:1101:4580:1000 BX:Z:A02C01B11D46 RX:Z:GAAACGACCAACA+CGAACACGTTAGC QX:Z:F,FFFFFFFFFFF+FF,FFF:FFFFFF
+valueBox(bxnotlast, caption = "BX:Z: tag not last", icon = ifelse(attention > 0, "fa-exclamation", "fa-check"), color = ifelse(bxnotlast > 0, "#feda75", "success"))
 ```
 ## viz desc
@@ -130,9 +87,6 @@ knitr::kable(
 )
 ```
-If the heatmap appears to be completely green, then all of your samples have passed all the preflight checks.
-If any sample fails at least one check, fails will appear in yellow and passes will appear in blue.
-
 ## datarow
 ### plot {.no-title}
 ```{r status_df}
@@ -149,18 +103,17 @@ data2 <- data %>%
 rownames(m) <- data$file
 ```
-
-```{r readsper, echo = FALSE, message = FALSE, warning = FALSE, out.width = '100%'}
+```{r readsper, echo = FALSE, message = FALSE, warning = FALSE}
 hchart(m) |>
-  hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2) |>
-  hc_title(text = "FASTQ file preflight QC") |>
+  hc_colorAxis(stops = list(c(0, "#feda75"), c(1, "#4faad1")), tickAmount = 2, min = 0, max = 1) |>
+  hc_title(text = "FASTQ File Checks") |>
   hc_xAxis(labels = list(style = list(fontSize = '16px')) )|>
   hc_yAxis(labels = list(style = list(fontSize = '16px')) )|>
-  hc_plotOptions(heatmap = list(borderWidth = 1, borderColor = "white", animated = F)) |>
+  hc_plotOptions(heatmap = list(animation = FALSE, borderWidth = 1, borderColor = "white", states = list(inactive = list(animation = FALSE), normal = list(animation = FALSE)))) |>
   hc_legend(enabled = F) |>
   hc_tooltip(
     animation = FALSE,
-    formatter = JS("function () {return '<b>' + this.series.xAxis.categories[this.point.x] + '</b><br>' + this.series.yAxis.categories[this.point.y] + '<br>QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS') + '</b>';}")
+    formatter = JS("function () {return '<b>' + this.series.xAxis.categories[this.point.x] + '</b><br>' + this.series.yAxis.categories[this.point.y] + '<br>QC: ' + (this.point.value<1 ? 'FAIL' : 'PASS');}")
   ) |>
   hc_exporting(enabled = T, filename = "fastq.preflight", buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
@@ -178,3 +131,65 @@ DT::datatable(
   fillContainer = T
 )
 ```
+
+# Details
+## interpret
+### interpcol {.no-title}
+
Interpreting the Data

The `harpy preflight fastq ...` command created a `filecheck.fastq.tsv` file in the specified
output directory, which summarizes the results included in this report.
This file contains a tab-delimited table with the columns `file`, `reads`, `noBX`, `badBX`, `badSamSpec`, and `bxNotLast`. The columns are defined as:

#### file
The name of the FASTQ file

#### reads
The total number of reads in the file

#### noBX
The number of reads that do not have a `BX:Z:` tag in the read header

- severity: minimal and likely won't cause issues unless no reads have `BX:Z:` tags
- if you expect all of your reads to have `BX:Z:` tags, then further investigation is necessary

#### badBX
The number of reads whose haplotag barcode in the `BX:Z:` comment does not adhere to the proper `AxxCxxBxxDxx` format

- severity: critical and will cause downstream issues and errors

#### badSamSpec
The number of reads whose header comments after the read ID do not conform to the `TAG:TYPE:VALUE` [SAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf)

- severity: critical and will cause downstream issues and errors

#### bxNotLast
The number of reads where the `BX:Z:` tag in the FASTQ header is not the last comment

- severity: conditional and only relevant for LEVIATHAN variant calling
- can be ignored if not intending to call structural variants with LEVIATHAN
- if intending to use LEVIATHAN, the `BX:Z:` tag must be the last comment in a read header
- `harpy align -m bwa` automatically moves the `BX:Z:` tag to the end of the alignment record

### Proper header {.no-title}

Proper Read Headers

#### SAM Specification
An example of a proper FASTQ read header is shown below, where all comments
following the initial read ID (`@A00814...`) have the proper [SAM spec](https://samtools.github.io/hts-specs/SAMv1.pdf)
`TAG:TYPE:VALUE` format and among them is a `BX:Z:` tag followed by an `AxxCxxBxxDxx`-formatted
haplotag barcode.

#### Comments
The comments must be **TAB separated** because the `:Z:` tag type allows
whitespace characters within its value.

#### Example Header
The read header below contains the read ID `@A00814...`,
the `BX:Z:` haplotag barcode tag, and two more comments, `RX:Z:` and `QX:Z:`, that both
adhere to the SAM specification. If using LEVIATHAN to call structural variants, the `BX:Z:` tag
must be the last comment in the read header.

```
@A00814:267:HTMH3DRXX:2:1101:4580:1000 BX:Z:A02C01B11D46 RX:Z:GAAACGACCAACA+CGAACACGTTAGC QX:Z:F,FFFFFFFFFFF+FF,FFF:FFFFFF
```
\ No newline at end of file