Skip to content

Commit

Permalink
Merge pull request #92 from pdimens/align_aggregate
Browse files Browse the repository at this point in the history
Add aggregate report for bx stats
  • Loading branch information
pdimens authored Jun 28, 2024
2 parents 01c9cbe + ac458c3 commit 0cf2243
Show file tree
Hide file tree
Showing 26 changed files with 820 additions and 703 deletions.
11 changes: 7 additions & 4 deletions src/harpy/reports/EmaCount.Rmd → .deprecated/EmaCount.Rmd
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
---
title: "EMA Count Barcode Summary"
date: "`r format(Sys.time(), '%d %B, %Y')`"
date: "`r format(Sys.time(), '%d %b, %Y at %H:%M')`"
output:
flexdashboard::flex_dashboard:
theme: lumen
orientation: rows
vertical_layout: scroll
favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_dark.png"
logo: https://raw.githubusercontent.com/pdimens/harpy/docs/static/logo_mini.png
favicon: "https://raw.githubusercontent.com/pdimens/harpy/docs/static/favicon_report.png"
navbar:
- { title : "Docs", icon: "fa-book", href: "https://pdimens.github.io/harpy/", align: right }
---

```{r echo = FALSE, message = FALSE}
Expand Down Expand Up @@ -132,7 +135,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "nor
hc_caption(text = "As determined by ema count, given as counts") |>
hc_tooltip(crosshairs = TRUE) |>
hc_exporting(enabled = T, filename = "ema.bxcount",
buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
)
```
### proportional valid barcodes
Expand All @@ -145,7 +148,7 @@ hchart(tb, "bar", hcaes(x = Sample, y = Count, group = Barcode), stacking = "per
hc_caption(text = "As determined by ema count, given as percents") |>
hc_tooltip(crosshairs = TRUE) |>
hc_exporting(enabled = T, filename = "ema.bxpercent",
buttons = list(contextButton = list(menuItems = c("viewFullscreen", "separator", "downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
buttons = list(contextButton = list(menuItems = c("downloadPNG", "downloadJPEG", "downloadPDF", "downloadSVG")))
)
```

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ d = dict(zip(samplenames, samplenames))
def get_fq(wildcards):
# returns a list of fastq files for read 1 based on *wildcards.sample* e.g.
r = re.compile(fr".*/({re.escape(wildcards.sample)}){bn_r}", flags = re.IGNORECASE)
return list(filter(r.match, fqlist))[:2]
return sorted(list(filter(r.match, fqlist))[:2])

rule genome_setup:
input:
Expand Down Expand Up @@ -316,7 +316,7 @@ rule samtools_reports:
"Summarizing samtools stats and flagstat"
shell:
"""
multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null
multiqc {params}/reports/data/samtools_stats {params}/reports/data/samtools_flagstat --no-version-check --force --quiet --title "Basic Alignment Statistics" --comment "This report aggregates samtools stats and samtools flagstats results for all alignments. Samtools stats ignores alignments marked as duplicates." --no-data-dir --filename {output} 2> /dev/null
"""

rule log_workflow:
Expand Down
94 changes: 10 additions & 84 deletions src/harpy/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def bwa(inputs, output_dir, genome, depth_window, threads, extra_params, quality
fetch_script(workflowdir, "assignMI.py")
fetch_script(workflowdir, "bxStats.py")
fetch_report(workflowdir, "AlignStats.Rmd")
fetch_report(workflowdir, "AlignBxStats.Rmd")

with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
config.write("workflow: align bwa\n")
Expand Down Expand Up @@ -208,8 +209,8 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
fetch_rule(workflowdir, "align-ema.smk")
fetch_script(workflowdir, "bxStats.py")
for i in ["EmaCount", "AlignStats"]:
fetch_report(workflowdir, f"{i}.Rmd")
fetch_report(workflowdir, "AlignStats.Rmd")
fetch_report(workflowdir, "AlignBxStats.Rmd")

with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
config.write("workflow: align ema\n")
Expand Down Expand Up @@ -238,87 +239,11 @@ def ema(inputs, output_dir, platform, whitelist, genome, depth_window, threads,
_module = subprocess.run(command.split())
sys.exit(_module.returncode)

#@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
#@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
#@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
#@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
#@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
#@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
#@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
#@click.option('-q', '--quiet', is_flag = True, show_default = True, default = False, help = 'Don\'t show output text while running')
#@click.option('-o', '--output-dir', type = str, default = "Align/minimap", show_default=True, help = 'Name of output directory')
#@click.option('--snakemake', type = str, help = 'Additional Snakemake parameters, in quotes')
#@click.option('--hpc', type = click.Path(exists = True, file_okay = False), help = 'Config dir for automatic HPC submission')
#@click.option('--conda', is_flag = True, default = False, help = 'Use conda/mamba instead of container')
#@click.option('--skipreports', is_flag = True, show_default = True, default = False, help = 'Don\'t generate HTML reports')
#@click.option('--print-only', is_flag = True, hidden = True, default = False, help = 'Print the generated snakemake command and exit')
#@click.argument('inputs', required=True, type=click.Path(exists=True), nargs=-1)
#def minimap(inputs, output_dir, genome, depth_window, threads, extra_params, quality_filter, molecule_distance, snakemake, skipreports, quiet, hpc, conda, print_only):
# """
# Align sequences to genome using `Minimap2`
#
# Provide the input fastq files and/or directories at the end of the command as individual
# files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
#
# Minimap2 is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does
# not use barcodes when mapping. Harpy will post-processes the alignments using the
# specified `--molecule-distance` to assign alignments to unique molecules.
# """
# output_dir = output_dir.rstrip("/")
# workflowdir = f"{output_dir}/workflow"
# sdm = "conda" if conda else "conda apptainer"
# command = f'snakemake --rerun-incomplete --rerun-triggers input mtime params --nolock --software-deployment-method {sdm} --conda-prefix ./.snakemake/conda --cores {threads} --directory . '
# command += f"--snakefile {workflowdir}/align-minimap.smk "
# command += f"--configfile {workflowdir}/config.yaml "
# if hpc:
# command += f"--workflow-profile {hpc} "
# if quiet:
# command += "--quiet all "
# if snakemake is not None:
# command += snakemake
# if print_only:
# click.echo(command)
# sys.exit(0)
#
# os.makedirs(f"{workflowdir}/", exist_ok= True)
# fqlist, sample_count = parse_fastq_inputs(inputs)
# validate_input_by_ext(genome, "--genome", [".fasta", ".fa", ".fasta.gz", ".fa.gz"])
# fetch_rule(workflowdir, "align-minimap.smk")
# fetch_script(workflowdir, "assignMI.py")
# fetch_script(workflowdir, "bxStats.py")
# fetch_report(workflowdir, "AlignStats.Rmd")
#
# with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
# config.write("workflow: align minimap\n")
# config.write(f"genomefile: {genome}\n")
# config.write(f"seq_directory: {workflowdir}/input\n")
# config.write(f"output_directory: {output_dir}\n")
# config.write(f"quality: {quality_filter}\n")
# config.write(f"molecule_distance: {molecule_distance}\n")
# config.write(f"depth_windowsize: {depth_window}\n")
# config.write(f"skipreports: {skipreports}\n")
# if extra_params is not None:
# config.write(f"extra: {extra_params}\n")
# config.write(f"workflow_call: {command}\n")
# config.write("inputs:\n")
# config.write(f" genome: {Path(genome).resolve()}\n")
# config.write(" fastq:\n")
# for i in fqlist:
# config.write(f" - {i}\n")
#
# print_onstart(
# f"Samples: {sample_count}\nOutput Directory: {output_dir}",
# "align minimap"
# )
# generate_conda_deps()
# _module = subprocess.run(command.split())
# sys.exit(_module.returncode)

@click.command(no_args_is_help = True, epilog= "read the docs for more information: https://pdimens.github.io/harpy/modules/align/minimap/")
@click.option('-g', '--genome', type=click.Path(exists=True, dir_okay=False), required = True, help = 'Genome assembly for read mapping')
@click.option('-m', '--molecule-distance', default = 100000, show_default = True, type = int, help = 'Base-pair distance threshold to separate molecules')
@click.option('-f', '--quality-filter', default = 30, show_default = True, type = click.IntRange(min = 0, max = 40), help = 'Minimum mapping quality to pass filtering')
@click.option('-r', '--read-length', default = "125", show_default = True, type = click.Choice(["50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
@click.option('-r', '--read-length', default = "auto", show_default = True, type = click.Choice(["auto", "50", "75", "100", "125", "150", "250", "400"]), help = 'Average read length for creating index')
@click.option('-d', '--depth-window', default = 50000, show_default = True, type = int, help = 'Interval size (in bp) for depth stats')
@click.option('-x', '--extra-params', type = str, help = 'Additional aligner parameters, in quotes')
@click.option('-t', '--threads', default = 4, show_default = True, type = click.IntRange(min = 4, max_open = True), help = 'Number of threads to use')
Expand All @@ -337,11 +262,12 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra
Provide the input fastq files and/or directories at the end of the command as individual
files/folders, using shell wildcards (e.g. `data/echidna*.fastq.gz`), or both.
strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp. This aligner does
not use barcodes when mapping. Harpy will post-processes the alignments using the
strobealign is an ultra-fast aligner comparable to bwa for sequences >100bp and does
not use barcodes when mapping, so Harpy will post-processes the alignments using the
specified `--molecule-distance` to assign alignments to unique molecules. The `--read-length` is
an *approximate* parameter and should be one of [`50`, `75`, `100`, `125`, `150`, `250`, `400`].
If your input is post-qc sequences, then you should expect the read lengths to be <150.
an *approximate* parameter and should be one of [`auto`, `50`, `75`, `100`, `125`, `150`, `250`, `400`].
The alignment process will be faster and take up less disk/RAM if you specify an `-r` value that isn't
`auto`. If your input has adapters removed, then you should expect the read lengths to be <150.
"""
output_dir = output_dir.rstrip("/")
workflowdir = f"{output_dir}/workflow"
Expand All @@ -366,6 +292,7 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra
fetch_script(workflowdir, "assignMI.py")
fetch_script(workflowdir, "bxStats.py")
fetch_report(workflowdir, "AlignStats.Rmd")
fetch_report(workflowdir, "AlignBxStats.Rmd")

with open(f"{workflowdir}/config.yaml", "w", encoding="utf-8") as config:
config.write("workflow: align strobe\n")
Expand Down Expand Up @@ -396,5 +323,4 @@ def strobe(inputs, output_dir, genome, read_length, depth_window, threads, extra

align.add_command(bwa)
align.add_command(ema)
#align.add_command(minimap)
align.add_command(strobe)
2 changes: 1 addition & 1 deletion src/harpy/conda_deps.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def generate_conda_deps():
"""Create the YAML files of the workflow conda dependencies"""
condachannels = ["bioconda","conda-forge","defaults"]
environ = {
"qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc", "bioconda::pysam=0.22"],
"qc" : ["bioconda::falco", "bioconda::fastp", "bioconda::multiqc=1.22", "bioconda::pysam=0.22"],
"align": ["bioconda::bwa", "bioconda::ema","bioconda::strobealign", "conda-forge::icu","conda-forge::libzlib", "bioconda::samtools=1.20", "bioconda::seqtk", "bioconda::tabix", "conda-forge::xz"],
"snp": ["bioconda::bcftools=1.20", "bioconda::freebayes=1.3.6"],
"sv": ["bioconda::leviathan", "bioconda::naibr-plus"],
Expand Down
63 changes: 29 additions & 34 deletions src/harpy/fileparsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import sys
import subprocess
from pathlib import Path
from collections import Counter
from rich.markdown import Markdown
import rich_click as click
from .printfunctions import print_error, print_solution_with_culprits
Expand Down Expand Up @@ -117,40 +116,36 @@ def parse_alignment_inputs(inputs):
sys.exit(1)
return bam_infiles, len(uniqs)

#def get_samples_from_fastq(directory):
# """Identify the sample names from a directory containing FASTQ files"""
# full_flist = [i for i in glob.iglob(f"{directory}/*") if not os.path.isdir(i)]
# r = re.compile(r".*\.f(?:ast)?q(?:\.gz)?$", flags=re.IGNORECASE)
# full_fqlist = list(filter(r.match, full_flist))
# fqlist = [os.path.basename(i) for i in full_fqlist]
# bn_r = r"[\.\_][RF](?:[12])?(?:\_00[1-9])*\.f(?:ast)?q(?:\.gz)?$"
# if len(fqlist) == 0:
# print_error(f"No fastq files with acceptable names found in [bold]{directory}[/bold]")
# print_solution("Check that the file endings conform to [green].[/green][[green]F[/green][dim]|[/dim][green]R1[/green]][green].[/green][[green]fastq[/green][dim]|[/dim][green]fq[/green]][green].gz[/green]\nRead the documentation for details: https://pdimens.github.io/harpy/haplotagdata/#naming-conventions")
# sys.exit(1)
#
# return set([re.sub(bn_r, "", i, flags = re.IGNORECASE) for i in fqlist])


def biallelic_contigs(vcf, workdir):
"""Identify which contigs have at least 2 biallelic SNPs"""
"""Identify which contigs have at least 2 biallelic SNPs and write them to workdir/vcf.biallelic"""
vbn = os.path.basename(vcf)
if os.path.exists(f"{workdir}/{vbn}.biallelic"):
with open(f"{workdir}/{vbn}.biallelic", "r", encoding="utf-8") as f:
contigs = [line.rstrip() for line in f]
click.echo(f"{workdir}/{vbn}.biallelic exists, using the {len(contigs)} contigs listed in it.", file = sys.stderr)
else:
click.echo("Identifying which contigs have at least 2 biallelic SNPs", file = sys.stderr)
os.makedirs(f"{workdir}/", exist_ok = True)
biallelic = subprocess.Popen(f"bcftools view -M2 -v snps {vcf} -Ob".split(), stdout = subprocess.PIPE)
contigs = subprocess.run("""bcftools query -f '%CHROM\\n'""".split(), stdin = biallelic.stdout, stdout = subprocess.PIPE, check = False).stdout.decode().splitlines()
counts = Counter(contigs)
contigs = [i.replace("\'", "") for i in counts if counts[i] > 1]
with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f:
_ = [f.write(f"{i}\n") for i in contigs]
os.makedirs(f"{workdir}/", exist_ok = True)
valid = []
vcfheader = subprocess.check_output(['bcftools', 'view', '-h', vcf]).decode().split('\n')
header_contigs = [i.split(",")[0].replace("##contig=<ID=","") for i in vcfheader if i.startswith("##contig=")]
if vcf.lower().endswith("bcf") and not os.path.exists(f"{vcf}.csi"):
subprocess.run(f"bcftools index {vcf}".split())
if vcf.lower().endswith("vcf.gz") and not os.path.exists(f"{vcf}.csi"):
subprocess.run(f"bcftools index --tbi {vcf}".split())

if len(contigs) == 0:
print_error("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.")
for contig in header_contigs:
# Use bcftools to count the number of biallelic SNPs in the contig
viewcmd = subprocess.Popen(['bcftools', 'view', '-r', contig, '-v', 'snps', '-m2', '-M2', '-c', '2', vcf], stdout=subprocess.PIPE)
snpcount = 0
while True:
# Read the next line of output
line = viewcmd.stdout.readline().decode()
if not line:
break
snpcount += 1
# If there are at least 2 biallellic snps, terminate the process
if snpcount >= 2:
valid.append(contig)
viewcmd.terminate()
break
if not valid:
click.echo("No contigs with at least 2 biallelic SNPs identified. Cannot continue with imputation.")
sys.exit(1)
else:
return f"{workdir}/{vbn}.biallelic"
with open(f"{workdir}/{vbn}.biallelic", "w", encoding="utf-8") as f:
f.write("\n".join(valid))
return f"{workdir}/{vbn}.biallelic", len(valid)
17 changes: 8 additions & 9 deletions src/harpy/impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,9 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para
individual files/folders, using shell wildcards (e.g. `data/drosophila*.bam`), or both.
Requires a parameter file, use **harpy stitchparams** to generate one and adjust it for your study.
The `--vcf-samples` flag toggles to phase only the samples present in your input `--vcf` file rather than all
The `--vcf-samples` option considers only the samples present in your input `--vcf` file rather than all
the samples identified in `INPUT`. If providing additional STITCH arguments, they must be in quotes and
in R language style. The extra parameters will remain constant across different models.
Use single-quotes (string literals) if supplying an argument that requires quotes. For example:
in R language style. Use single-quotes (string literals) if supplying an argument that requires quotes. For example:
```
harpy ... -x 'switchModelIteration = 39, splitReadIterations = NA, reference_populations = c("CEU","GBR")'...
Expand All @@ -72,12 +71,12 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para
sys.exit(0)

os.makedirs(f"{workflowdir}/", exist_ok= True)
bamlist, n = parse_alignment_inputs(inputs)
validate_input_by_ext(vcf, "--vcf", ["vcf", "bcf", "vcf.gz"])
samplenames = vcf_samplematch(vcf, bamlist, vcf_samples)
validate_bam_RG(bamlist)
check_impute_params(parameters)
biallelic = biallelic_contigs(vcf, f"{workflowdir}")
bamlist, n = parse_alignment_inputs(inputs)
validate_bam_RG(bamlist)
samplenames = vcf_samplematch(vcf, bamlist, vcf_samples)
biallelic, n_biallelic = biallelic_contigs(vcf, f"{workflowdir}")

fetch_rule(workflowdir, "impute.smk")
fetch_script(workflowdir, "stitch_impute.R")
Expand All @@ -99,9 +98,9 @@ def impute(inputs, output_dir, parameters, threads, vcf, vcf_samples, extra_para
config.write(" alignments:\n")
for i in bamlist:
config.write(f" - {i}\n")

print_onstart(
f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nOutput Directory: {output_dir}/",
f"Input VCF: {vcf}\nSamples in VCF: {len(samplenames)}\nAlignments Provided: {n}\nContigs with ≥2 Biallelic SNPs: {n_biallelic}\nOutput Directory: {output_dir}/",
"impute"
)
generate_conda_deps()
Expand Down
Loading

0 comments on commit 0cf2243

Please sign in to comment.