diff --git a/modules/local/combinebeds/shifts/environment.yml b/modules/local/combinebeds/shifts/environment.yml
new file mode 100644
index 00000000..5f93bc08
--- /dev/null
+++ b/modules/local/combinebeds/shifts/environment.yml
@@ -0,0 +1,7 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - conda-forge::polars=1.8.2
+  - conda-forge::altair=5.5.0
+  - conda-forge::vl-convert-python==1.7.0
diff --git a/modules/local/combinebeds/shifts/main.nf b/modules/local/combinebeds/shifts/main.nf
new file mode 100644
index 00000000..56cdee3d
--- /dev/null
+++ b/modules/local/combinebeds/shifts/main.nf
@@ -0,0 +1,21 @@
+process COMBINEBEDS_SHIFTS {
+    tag "$meta.id"
+    label "process_low"
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/altair_polars_vl-convert-python:e6f1dca28de76d13' :
+        'community.wave.seqera.io/library/altair_polars_vl-convert-python:a6c5ee679445250d' }"
+
+    input:
+    tuple val(meta), path(beds)
+
+    output:
+    path "*.png"       , emit: plots
+    path "*.json"      , emit: multiqc
+    path "versions.yml", emit: versions
+
+    script:
+    prefix = task.ext.prefix ?: "${meta.id}"
+    template "shifts.py"
+}
diff --git a/modules/local/combinebeds/shifts/templates/shifts.py b/modules/local/combinebeds/shifts/templates/shifts.py
new file mode 100644
index 00000000..e8b25f85
--- /dev/null
+++ b/modules/local/combinebeds/shifts/templates/shifts.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python
+
+import platform
+import base64
+import json
+from itertools import product
+
+import polars as pl
+import altair as alt
+
+def format_yaml_like(data: dict, indent: int = 0) -> str:
+    """Formats a dictionary to a YAML-like string.
+
+    Args:
+        data (dict): The dictionary to format.
+        indent (int): The current indentation level.
+
+    Returns:
+        str: A string formatted as YAML.
+    """
+    yaml_str = ""
+    for key, value in data.items():
+        spaces = "  " * indent
+        if isinstance(value, dict):
+            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
+        else:
+            yaml_str += f"{spaces}{key}: {value}\\n"
+    return yaml_str
+
+meta_id = "${meta.id}"
+
+df = pl.scan_csv("${beds}".split(" "),
+                 separator="\\t",
+                 has_header=False,
+                 new_columns=["chr", "start", "end", "name", "score", "strand", "sample", "tool"])
+
+df = df.group_by("chr", "start", "end", "strand").agg(tools=pl.col("tool").unique(), samples=pl.col("sample").unique())
+
+def get_group_sizes(df: pl.LazyFrame, max_shift: int, consider_strand: bool) -> pl.LazyFrame:
+    """Count how many distinct samples and tools support each BSJ when
+    junctions whose start and end coordinates each differ by at most
+    `max_shift` bp are treated as identical."""
+    # Coarse pre-clustering: a new group starts wherever the sorted coordinate
+    # jumps by more than max_shift; the self-join below refines the clusters.
+    df = df.sort("end"  ).with_columns(end_group  =pl.col("end"  ).diff().fill_null(0).gt(max_shift).cum_sum())
+    df = df.sort("start").with_columns(start_group=pl.col("start").diff().fill_null(0).gt(max_shift).cum_sum())
+
+    group_cols = ["chr", "start_group", "end_group"] + (["strand"] if consider_strand else [])
+    df = df.join(df, on=group_cols, how="inner")
+    df = df.filter((pl.col("start") - pl.col("start_right")).abs() <= max_shift)
+    df = df.filter((pl.col("end") - pl.col("end_right")).abs() <= max_shift)
+    df = df.select(["chr", "start", "end", "strand", "samples_right", "tools_right"])
+    df = df.group_by(["chr", "start", "end", "strand"]).agg(**{
+        "samples": pl.col("samples_right").flatten().unique(),
+        "tools": pl.col("tools_right").flatten().unique()
+    }).with_columns(n_samples=pl.col("samples").list.len(),
+                    n_tools=pl.col("tools").list.len())
+
+    df_sample_counts = (df.group_by("n_samples").agg(count=pl.col("n_samples").count())
+                          .sort("n_samples")
+                          .rename({"n_samples": "value"})
+                          .with_columns(metric=pl.lit("n_samples")))
+    df_tool_counts = (df.group_by("n_tools").agg(count=pl.col("n_tools").count())
+                        .sort("n_tools")
+                        .rename({"n_tools": "value"})
+                        .with_columns(metric=pl.lit("n_tools")))
+
+    df = (pl.concat([df_sample_counts, df_tool_counts])
+            .with_columns(max_shift=max_shift, consider_strand=consider_strand))
+
+    return df
+
+shifts = [0, 1, 2, 3, 4, 5, 10, 20, 50]
+consider_strands = [True, False]
+
+dfs = []
+for max_shift, consider_strand in product(shifts, consider_strands):
+    df_ = get_group_sizes(df, max_shift, consider_strand)
+    dfs.append(df_.collect())
+
+df = pl.concat(dfs)
+
+metrics = {
+    "n_samples": "Number of samples",
+    "n_tools": "Number of tools"
+}
+
+for metric, title in metrics.items():
+    df_ = df.filter(pl.col("metric") == metric)
+    n_unique = df_["value"].n_unique()
+    plot = df_.plot.bar(x="max_shift:N", y="count", color=f"value:{'Q' if n_unique > 10 else 'N'}", column="consider_strand")
+    plot = plot.properties(title=title)
+
+    plot_file = f"{metric}.png"
+    plot.save(plot_file)
+
+    with open(plot_file, "rb") as image_file:
+        image_string = base64.b64encode(image_file.read()).decode("utf-8")
+    image_html = f'<img src="data:image/png;base64,{image_string}">'
+
+    multiqc = {
+        'id': f"{meta_id}_shifts_{metric}",
+        'parent_id': "shift_plots",
+        'parent_name': 'Shift Plots',
+        'parent_description': 'Stacked bar plots showing the agreement between tools and samples for different shift values',
+        'section_name': title,
+        'description': 'Stacked bar plot showing the agreement between tools and samples for different shift values, faceted by whether strand is considered',
+        'plot_type': 'image',
+        'data': image_html
+    }
+
+    with open(f"{metric}.shifts_mqc.json", "w") as f:
+        f.write(json.dumps(multiqc, indent=4))
+
+# Versions
+
+versions = {
+    "${task.process}": {
+        "python": platform.python_version(),
+        "polars": pl.__version__,
+        "altair": alt.__version__
+    }
+}
+
+with open("versions.yml", "w") as f:
+    f.write(format_yaml_like(versions))
diff --git a/subworkflows/local/bsj_detection.nf b/subworkflows/local/bsj_detection.nf
index 57ed2cfd..416c5163 100644
--- a/subworkflows/local/bsj_detection.nf
+++ b/subworkflows/local/bsj_detection.nf
@@ -4,9 +4,10 @@
 include { CSVTK_JOIN as COMBINE_COUNTS_PER_TOOL          } from '../../modules/nf-core/csvtk/join'
 include { GAWK as FILTER_BSJS                            } from '../../modules/nf-core/gawk'
 include { GAWK as BED_ADD_SAMPLE_TOOL                    } from '../../modules/nf-core/gawk'
 include { COMBINEBEDS_FILTER as COMBINE_TOOLS_PER_SAMPLE } from '../../modules/local/combinebeds/filter'
+include { COMBINEBEDS_SHIFTS as INVESTIGATE_SHIFTS       } from '../../modules/local/combinebeds/shifts'
 include { COMBINEBEDS_FILTER as COMBINE_SAMPLES          } from '../../modules/local/combinebeds/filter'
-include { AGAT_SPADDINTRONS as ADD_INTRONS } from '../../modules/nf-core/agat/spaddintrons'
-include { GAWK as EXTRACT_EXONS_INTRONS } from '../../modules/nf-core/gawk'
+include { AGAT_SPADDINTRONS as ADD_INTRONS               } from '../../modules/nf-core/agat/spaddintrons'
+include { GAWK as EXTRACT_EXONS_INTRONS                  } from '../../modules/nf-core/gawk'
 include { BEDTOOLS_GETFASTA as FASTA_COMBINED            } from '../../modules/nf-core/bedtools/getfasta'
 include { BEDTOOLS_GETFASTA as FASTA_PER_SAMPLE          } from '../../modules/nf-core/bedtools/getfasta'
 include { BEDTOOLS_GETFASTA as FASTA_PER_SAMPLE_TOOL     } from '../../modules/nf-core/bedtools/getfasta'
@@ -148,8 +149,12 @@ workflow BSJ_DETECTION {
     ch_bsj_bed_per_sample = COMBINE_TOOLS_PER_SAMPLE.out.combined
         .filter{ meta, bed -> !bed.isEmpty() }
 
+    ch_all_samples = ch_bsj_bed_per_sample_tool_meta
+        .map{ meta, bed -> [[id: "all"], bed] }
+        .groupTuple()
+
     COMBINE_SAMPLES(
-        ch_bsj_bed_per_sample_tool_meta.map{ meta, bed -> [[id: "all"], bed] }.groupTuple(),
+        ch_all_samples,
         params.max_shift,
         params.consider_strand,
         params.min_tools,
@@ -160,6 +165,10 @@
         .filter{ meta, bed -> !bed.isEmpty() }
         .collect()
 
+    INVESTIGATE_SHIFTS(ch_all_samples)
+    ch_versions = ch_versions.mix(INVESTIGATE_SHIFTS.out.versions)
+    ch_multiqc_files = ch_multiqc_files.mix(INVESTIGATE_SHIFTS.out.multiqc)
+
     //
     // ANNOTATION
     //