From 20d3c597abb0d5413f9e6137954576b1aa8e9e8c Mon Sep 17 00:00:00 2001 From: Matt Welland Date: Tue, 6 Jan 2026 11:18:10 +1000 Subject: [PATCH 01/89] Update FRASER for test build --- images/fraser/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 527cbc4d..561653c5 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -2,7 +2,7 @@ FROM debian:bookworm-slim ENV MAMBA_ROOT_PREFIX /root/micromamba ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH -ARG VERSION=${VERSION:-1.12.1} +ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ rm -r /var/lib/apt/lists/* && \ From 85a965174336c084abfc8b9efb9b6062c14e3771 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 13:50:42 +1100 Subject: [PATCH 02/89] Maybe this should build? syntax fixes for docker --- images/fraser/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 561653c5..04be73b2 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -7,8 +7,9 @@ ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ rm -r /var/lib/apt/lists/* && \ rm -r /var/cache/apt/* && \ - wget -qO- https://api.anaconda.org/download/conda-forge/micromamba/0.8.2/linux-64/micromamba-0.8.2-he9b6cbd_0.tar.bz2 | tar -xvj -C /usr/local bin/micromamba && \ - mkdir ${MAMBA_ROOT_PREFIX} && \ + wget https://api.anaconda.org/download/conda-forge/micromamba/0.8.2/linux-64/micromamba-0.8.2-he9b6cbd_0.tar.bz2 && \ + tar -xvjf micromamba-0.8.2-he9b6cbd_0.tar.bz2 -C /usr/local && \ + mkdir -p ${MAMBA_ROOT_PREFIX} && \ micromamba install -y --prefix ${MAMBA_ROOT_PREFIX} -c bioconda -c conda-forge \ python=3.10 r-base=4.3.1 r-reshape2=1.4.4 r-tidyverse=2.0.0 \ bioconductor-fraser=${VERSION} bioconductor-txdb.hsapiens.ucsc.hg38.knowngene bioconductor-org.hs.eg.db && \ From 
e9476d358e7f01fb2826b4b8abfa4546e1536198 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:02:11 +1100 Subject: [PATCH 03/89] seperating micromamba unpacking and cleaning image --- images/fraser/Dockerfile | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 04be73b2..5d39a5d1 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,12 +5,17 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ - rm -r /var/lib/apt/lists/* && \ - rm -r /var/cache/apt/* && \ - wget https://api.anaconda.org/download/conda-forge/micromamba/0.8.2/linux-64/micromamba-0.8.2-he9b6cbd_0.tar.bz2 && \ - tar -xvjf micromamba-0.8.2-he9b6cbd_0.tar.bz2 -C /usr/local && \ + wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ + chmod +x /usr/local/bin/micromamba && \ + \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ - micromamba install -y --prefix ${MAMBA_ROOT_PREFIX} -c bioconda -c conda-forge \ + \ + # Perform the installation + micromamba install -y --root-prefix ${MAMBA_ROOT_PREFIX} --prefix ${MAMBA_ROOT_PREFIX} \ + -c bioconda -c conda-forge \ python=3.10 r-base=4.3.1 r-reshape2=1.4.4 r-tidyverse=2.0.0 \ bioconductor-fraser=${VERSION} bioconductor-txdb.hsapiens.ucsc.hg38.knowngene bioconductor-org.hs.eg.db && \ - rm -r /root/micromamba/pkgs + \ + # Clean up to keep the image small + micromamba clean --all --yes && \ + rm -rf /var/lib/apt/lists/* From 75b15369fc6b26e0845b077bdf7fc7dc72e0b9f4 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:08:59 +1100 Subject: [PATCH 04/89] version conflict between fraser R and annotationdbi, if this works ill pin the solved versions --- images/fraser/Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/images/fraser/Dockerfile 
b/images/fraser/Dockerfile index 5d39a5d1..9d179938 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,17 +5,17 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ - wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ + wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ chmod +x /usr/local/bin/micromamba && \ - \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ - \ - # Perform the installation micromamba install -y --root-prefix ${MAMBA_ROOT_PREFIX} --prefix ${MAMBA_ROOT_PREFIX} \ -c bioconda -c conda-forge \ - python=3.10 r-base=4.3.1 r-reshape2=1.4.4 r-tidyverse=2.0.0 \ - bioconductor-fraser=${VERSION} bioconductor-txdb.hsapiens.ucsc.hg38.knowngene bioconductor-org.hs.eg.db && \ - \ - # Clean up to keep the image small + python=3.10 \ + r-base=4.4 \ + r-reshape2 \ + r-tidyverse \ + bioconductor-fraser \ + bioconductor-txdb.hsapiens.ucsc.hg38.knowngene \ + bioconductor-org.hs.eg.db && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* From 87ca809b9c60e38fcbe5d3e0112312728fba2f01 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:23:51 +1100 Subject: [PATCH 05/89] pinning the successful specific versions --- images/fraser/Dockerfile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 9d179938..008be1f3 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -10,12 +10,12 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ micromamba install -y --root-prefix ${MAMBA_ROOT_PREFIX} --prefix ${MAMBA_ROOT_PREFIX} \ -c bioconda -c conda-forge \ - python=3.10 \ - r-base=4.4 \ - r-reshape2 \ - r-tidyverse \ - bioconductor-fraser \ - 
bioconductor-txdb.hsapiens.ucsc.hg38.knowngene \ - bioconductor-org.hs.eg.db && \ + python=3.10.19 \ + r-base=4.4.3 \ + r-reshape2=1.4.5 \ + r-tidyverse=2.0.0 \ + bioconductor-fraser=2.4.6 \ + bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ + bioconductor-org.hs.eg.db=3.20.0 && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* From e3960ea00135db7a686bcecf502063d24bb2cdf8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:44:39 +1100 Subject: [PATCH 06/89] fraser version variable and re-adding space saving measures --- images/fraser/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 008be1f3..71b7bb94 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,6 +5,7 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ + rm -r /var/cache/apt/* && \ wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ chmod +x /usr/local/bin/micromamba && \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ @@ -14,7 +15,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ r-base=4.4.3 \ r-reshape2=1.4.5 \ r-tidyverse=2.0.0 \ - bioconductor-fraser=2.4.6 \ + bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ micromamba clean --all --yes && \ From 59d4d3b1db748ad60f105675b4359d77da8b8895 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 16 Jan 2026 17:44:15 +1100 Subject: [PATCH 07/89] Baking the R scripts into the fraser image --- images/fraser/Dockerfile | 9 ++++ images/fraser/__init__.py | 0 images/fraser/count_split_reads.R | 32 ++++++++++++ images/fraser/fraser_analysis.R | 70 ++++++++++++++++++++++++++ images/fraser/fraser_count_non_split.R | 29 +++++++++++ images/fraser/fraser_init.R | 49 
++++++++++++++++++ images/fraser/fraser_merge_non_split.R | 38 ++++++++++++++ images/fraser/fraser_merge_split.R | 50 ++++++++++++++++++ 8 files changed, 277 insertions(+) create mode 100644 images/fraser/__init__.py create mode 100644 images/fraser/count_split_reads.R create mode 100644 images/fraser/fraser_analysis.R create mode 100644 images/fraser/fraser_count_non_split.R create mode 100644 images/fraser/fraser_init.R create mode 100644 images/fraser/fraser_merge_non_split.R create mode 100644 images/fraser/fraser_merge_split.R diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 71b7bb94..ab1b77df 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -20,3 +20,12 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-org.hs.eg.db=3.20.0 && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* + +RUN mkdir /RDrnaseq + +COPY count_split_reads.R RDrnaseq/ +COPY fraser_analysis.R RDrnaseq/ +COPY fraser_count_non_split.R RDrnaseq/ +COPY fraser_init.R RDrnaseq/ +COPY fraser_merge_non_split.R RDrnaseq/ +COPY fraser_merge_split.R RDrnaseq/ \ No newline at end of file diff --git a/images/fraser/__init__.py b/images/fraser/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/images/fraser/count_split_reads.R b/images/fraser/count_split_reads.R new file mode 100644 index 00000000..50afdfaf --- /dev/null +++ b/images/fraser/count_split_reads.R @@ -0,0 +1,32 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Count Split Reads for a Single Sample") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_id", required = TRUE, help = "Sample ID to count") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", 
type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +#Setup FRASER directory structure + +dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) + +#Force HDF5 to save RAM + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) + +fds <- countSplitReads( + sampleID = args$sample_id, + fds = fds, + NcpuPerSample = args$nthreads, + recount = TRUE +) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R new file mode 100644 index 00000000..e012ee8b --- /dev/null +++ b/images/fraser/fraser_analysis.R @@ -0,0 +1,70 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(tidyverse) +library(TxDb.Hsapiens.UCSC.hg38.knownGene) +library(org.Hs.eg.db) +library(ggplot2) + +parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") +parser$add_argument("--fds_dir", required = TRUE, help = "Directory containing savedObjects") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--pval_cutoff", type = "double", default = 0.05) +parser$add_argument("--z_cutoff", type = "double") +parser$add_argument("--delta_psi_cutoff", type = "double", default = 0.3) +parser$add_argument("--min_count", type = "integer", default = 5) +parser$add_argument("--nthreads", type = "integer", default = 1) +args <- parser$parse_args() + +#Force HDF5 + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +z_cutoff <- if (is.null(args$z_cutoff)) NA else args$z_cutoff +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + +#Load + +fds <- loadFraserDataSet(dir = args$fds_dir, name = args$cohort_id) + +#Filter + +fds <- 
filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = FALSE) +dir.create("plots/misc", recursive = TRUE, showWarnings = FALSE) + +png("plots/misc/filter_expression.png", width = 2000, height = 2000, res = 300) +plotFilterExpression(fds, bins = 100) +dev.off() + +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] +psi_types <- c("psi5", "psi3", "jaccard") + +optimal_qs <- c( + psi5 = bestQ(fds_filtered, type = "psi5"), + psi3 = bestQ(fds_filtered, type = "psi3"), + jaccard = bestQ(fds_filtered, type = "jaccard") +) + +fds_fit <- FRASER(fds_filtered, q = optimal_qs, BPPARAM = bp) + +#Annotation + +txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene +orgDb <- org.Hs.eg.db +fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = txdb, orgDb = orgDb) + +#Results + +res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, + zScoreCutoff = z_cutoff, minCount = args$min_count) +res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) + +write_csv(as.data.frame(res), "results.significant.csv") +write_csv(as.data.frame(res_all), "results.all.csv") + +#Save final state + +saveFraserDataSet(fds_fit, dir = getwd(), name = "FraserDataSet") diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R new file mode 100644 index 00000000..a3e743ea --- /dev/null +++ b/images/fraser/fraser_count_non_split.R @@ -0,0 +1,29 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Count Non-Split Reads for a Single Sample") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_id", required = TRUE, help = "Sample ID") +parser$add_argument("--coords_path", required = TRUE, help = "Path to splice_site_coords.RDS") +parser$add_argument("--working_dir", default = "output", help = "Working 
directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) +splice_site_coords <- readRDS(args$coords_path) + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +countNonSplicedReads( + sampleID = args$sample_id, + fds = fds, + spliceSiteCoords = splice_site_coords, + splitCountRanges = NULL, + minAnchor = 5, + NcpuPerSample = args$nthreads +) diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R new file mode 100644 index 00000000..49cf6b9f --- /dev/null +++ b/images/fraser/fraser_init.R @@ -0,0 +1,49 @@ +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Initialize FRASER Data Set") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_ids", required = TRUE, help = "Comma-separated sample IDs") +parser$add_argument("--bam_files", required = TRUE, help = "Comma-separated BAM file paths") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +sample_ids <- unlist(strsplit(args$sample_ids, ",")) +bam_files <- unlist(strsplit(args$bam_files, ",")) + +sample_table <- DataFrame( + sampleID = sample_ids, + bamFile = bam_files, + group = seq_len(length(bam_files)), + pairedEnd = TRUE +) +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +fds <- FraserDataSet( + colData = sample_table, + workingDir = args$working_dir, + name = args$cohort_id +) + +#Setup parallel execution +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + + +#Initialize counts and metadata +fds <- countRNAData(fds, BPPARAM = bp) +fds <- calculatePSIValues(fds,BPPARAM = bp) +fds <- saveFraserDataSet(fds) + 
+#Print location for Python to capture if needed +fds_save_path <- file.path( + args$working_dir, + "savedObjects", + paste0("FRASER_", args$cohort_id), + "fds-object.RDS" +) + +cat(fds_rds_path, "\n") diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R new file mode 100644 index 00000000..59c2b00b --- /dev/null +++ b/images/fraser/fraser_merge_non_split.R @@ -0,0 +1,38 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to g_ranges_non_split_counts.RDS") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) +register(MulticoreParam(workers = args$nthreads)) + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +#1. Merge Split + +fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE) + +#2. Merge Non-Split + +split_count_ranges <- readRDS(args$filtered_ranges_path) +fds <- getNonSplitReadCountsForAllSamples( + fds = fds, + splitCountRanges = split_count_ranges, + minAnchor = 5, + recount = FALSE +) + +#3. 
FRASER 2.0: Calculate Jaccard & PSI + +fds <- calculatePSIValues(fds) +fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R new file mode 100644 index 00000000..55a6df34 --- /dev/null +++ b/images/fraser/fraser_merge_split.R @@ -0,0 +1,50 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Merge Split Read Counts") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) + +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + + +fds <- countRNAData(fds, BPPARAM = bp) + +#Merge split counts + +fds <- getSplitReadCountsForAllSamples(fds = fds, recount = FALSE) + +#Prepare ranges for non-split counting + +split_counts <- asSE(fds, type = "j") +split_count_ranges <- rowRanges(split_counts) +split_count_ranges <- FRASER:::annotateSpliceSite(split_count_ranges) +saveRDS(split_count_ranges, "g_ranges_split_counts.RDS") + +#Filter for expression to reduce non-split counting overhead + +minExpressionInOneSample <- 20 +max_count <- rowMaxs(assay(split_counts, "rawCountsJ")) +passed <- max_count >= minExpressionInOneSample +filtered_ranges <- split_count_ranges[passed, ] +saveRDS(filtered_ranges, 
"g_ranges_non_split_counts.RDS") + +splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) +saveRDS(splice_site_coords, "splice_site_coords.RDS") + +saveFraserDataSet(fds) From 021908f4ae9b7389f2f5e8c7fbb3598b9085d49d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 16 Jan 2026 17:45:28 +1100 Subject: [PATCH 08/89] no need --- images/fraser/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 images/fraser/__init__.py diff --git a/images/fraser/__init__.py b/images/fraser/__init__.py deleted file mode 100644 index e69de29b..00000000 From 264c6d5c4f6856641bc19b56c33c199e4722a014 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 16 Jan 2026 17:47:18 +1100 Subject: [PATCH 09/89] End of file lint --- images/fraser/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index ab1b77df..182f543e 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -28,4 +28,4 @@ COPY fraser_analysis.R RDrnaseq/ COPY fraser_count_non_split.R RDrnaseq/ COPY fraser_init.R RDrnaseq/ COPY fraser_merge_non_split.R RDrnaseq/ -COPY fraser_merge_split.R RDrnaseq/ \ No newline at end of file +COPY fraser_merge_split.R RDrnaseq/ From cb22e31fbeee75cf4f6755fb57040675be0504e7 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 19 Jan 2026 17:48:25 +1100 Subject: [PATCH 10/89] refactoring file paths --- images/fraser/Dockerfile | 2 +- images/fraser/count_split_reads.R | 32 ----------- images/fraser/fraser_analysis.R | 77 ++++++++++++++++---------- images/fraser/fraser_count_non_split.R | 44 +++++++++++++-- images/fraser/fraser_count_split.R | 54 ++++++++++++++++++ images/fraser/fraser_init.R | 35 ++++++------ images/fraser/fraser_merge_non_split.R | 39 +++++++++---- images/fraser/fraser_merge_split.R | 56 ++++++++++++------- 8 files changed, 225 insertions(+), 114 deletions(-) delete mode 100644 images/fraser/count_split_reads.R create mode 
100644 images/fraser/fraser_count_split.R diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 182f543e..49e96a56 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -23,7 +23,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ RUN mkdir /RDrnaseq -COPY count_split_reads.R RDrnaseq/ +COPY fraser_count_split.R RDrnaseq/ COPY fraser_analysis.R RDrnaseq/ COPY fraser_count_non_split.R RDrnaseq/ COPY fraser_init.R RDrnaseq/ diff --git a/images/fraser/count_split_reads.R b/images/fraser/count_split_reads.R deleted file mode 100644 index 50afdfaf..00000000 --- a/images/fraser/count_split_reads.R +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env Rscript - -library(argparse) -library(FRASER) -library(BiocParallel) - -parser <- ArgumentParser(description = "Count Split Reads for a Single Sample") -parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") -parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--sample_id", required = TRUE, help = "Sample ID to count") -parser$add_argument("--working_dir", default = "output", help = "Working directory") -parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") -args <- parser$parse_args() - -#Setup FRASER directory structure - -dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) - -#Force HDF5 to save RAM - -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) - -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) - -fds <- countSplitReads( - sampleID = args$sample_id, - fds = fds, - NcpuPerSample = args$nthreads, - recount = TRUE -) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 
e012ee8b..54818e44 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -8,7 +8,7 @@ library(org.Hs.eg.db) library(ggplot2) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") -parser$add_argument("--fds_dir", required = TRUE, help = "Directory containing savedObjects") +parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--pval_cutoff", type = "double", default = 0.05) parser$add_argument("--z_cutoff", type = "double") @@ -17,8 +17,7 @@ parser$add_argument("--min_count", type = "integer", default = 5) parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() -#Force HDF5 - +# Force HDF5 for large matrix operations options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) @@ -26,45 +25,65 @@ z_cutoff <- if (is.null(args$z_cutoff)) NA else args$z_cutoff bp <- MulticoreParam(workers = args$nthreads) register(bp) -#Load - -fds <- loadFraserDataSet(dir = args$fds_dir, name = args$cohort_id) +# 1. Load the merged FDS +# Python extracted the tar into args$fds_dir. +# Inside is 'savedObjects/FRASER_{cohort_id}/...' +fds_name <- paste0("FRASER_", args$cohort_id) +fds <- loadFraserDataSet(dir = args$fds_dir, name = fds_name) -#Filter +# 2. 
Filter Expression and Variability +# This reduces the number of tests and improves power +message("Filtering junctions...") +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = FALSE) dir.create("plots/misc", recursive = TRUE, showWarnings = FALSE) - png("plots/misc/filter_expression.png", width = 2000, height = 2000, res = 300) plotFilterExpression(fds, bins = 100) dev.off() -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -psi_types <- c("psi5", "psi3", "jaccard") - -optimal_qs <- c( - psi5 = bestQ(fds_filtered, type = "psi5"), - psi3 = bestQ(fds_filtered, type = "psi3"), - jaccard = bestQ(fds_filtered, type = "jaccard") -) - -fds_fit <- FRASER(fds_filtered, q = optimal_qs, BPPARAM = bp) - -#Annotation +# 3. Hyperparameter Optimization (bestQ) and Fitting +# FRASER uses an autoencoder to control for latent confounders (batch effects) +message("Finding optimal hyperparameter Q and fitting autoencoder...") +# In FRASER 2.0, we typically fit psi5, psi3, and jaccard +fds <- FRASER(fds, q = c(psi5=NA, psi3=NA, jaccard=NA), BPPARAM = bp) +# 4. Annotation +message("Annotating results with UCSC hg38...") txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene orgDb <- org.Hs.eg.db -fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = txdb, orgDb = orgDb) +fds <- annotateRangesWithTxDb(fds, txdb = txdb, orgDb = orgDb) -#Results +# 5. 
Extract Results +message("Exporting results...") +res <- results(fds, + padjCutoff = args$pval_cutoff, + deltaPsiCutoff = args$delta_psi_cutoff, + zScoreCutoff = z_cutoff, + minCount = args$min_count) -res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, - zScoreCutoff = z_cutoff, minCount = args$min_count) -res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +res_all <- results(fds, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) write_csv(as.data.frame(res), "results.significant.csv") write_csv(as.data.frame(res_all), "results.all.csv") -#Save final state - -saveFraserDataSet(fds_fit, dir = getwd(), name = "FraserDataSet") +# 6. Generate Diagnostic Plots +message("Generating Volcano and Q-Q plots...") +dir.create("plots/volcano", recursive = TRUE) +for(type in c("psi5", "psi3", "jaccard")){ + png(paste0("plots/volcano/volcano_", type, ".png"), width = 2000, height = 2000, res = 300) + print(plotVolcano(fds, type = type, sampleID = sampleIDs(fds)[1])) # Plots first sample as example + dev.off() +} + +# 7. 
Final Save +# We save this in a generic folder so Python can easily grab it +saveFraserDataSet(fds, dir = "output", name = paste0("Final_", args$cohort_id)) + +# Summary statistics for the log +sink("statistics_summary.txt") +cat("Cohort ID:", args$cohort_id, "\n") +cat("Total Samples:", length(sampleIDs(fds)), "\n") +cat("Significant Events (adj P <", args$pval_cutoff, "):", nrow(res), "\n") +sink() + +message("Analysis complete.") \ No newline at end of file diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index a3e743ea..d272c9cc 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -5,25 +5,57 @@ library(FRASER) library(BiocParallel) parser <- ArgumentParser(description = "Count Non-Split Reads for a Single Sample") -parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") +parser$add_argument("--bam_path", required = TRUE, help = "Path to the localized BAM file") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--sample_id", required = TRUE, help = "Sample ID") parser$add_argument("--coords_path", required = TRUE, help = "Path to splice_site_coords.RDS") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) -splice_site_coords <- readRDS(args$coords_path) +# 1. 
Reconstruct Directory Structure +fds_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + +# Create the specific cache directory for non-split counts +cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) +dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) +# 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) +register(MulticoreParam(workers = args$nthreads)) + +# 3. Load Dataset and Coordinates +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +splice_site_coords <- readRDS(args$coords_path) + +# 4. Synchronize BAM Path +# Ensure the FDS object points to the current local BAM location on the worker +bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path +# 5. Run Non-Split Counting +# This writes the .h5 or .RDS file into the cache_dir created above +message(paste("Counting non-split reads for sample:", args$sample_id)) countNonSplicedReads( sampleID = args$sample_id, fds = fds, spliceSiteCoords = splice_site_coords, - splitCountRanges = NULL, + splitCountRanges = NULL, # Already handled in merge_split minAnchor = 5, - NcpuPerSample = args$nthreads + NcpuPerSample = args$nthreads, + recount = TRUE ) + +# 6. Verification +# The Python script uses a 'find' command to move this output. +# We just need to check if the file was created in the cache. 
+out_file <- list.files(cache_dir, pattern = paste0("nonSplicedCounts-", args$sample_id)) +if (length(out_file) > 0) { + message("Successfully created non-split counts: ", out_file[1]) +} else { + stop("Non-split counts file was not found in: ", cache_dir) +} \ No newline at end of file diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R new file mode 100644 index 00000000..510a4a97 --- /dev/null +++ b/images/fraser/fraser_count_split.R @@ -0,0 +1,54 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Count Split Reads for a Single Sample") +parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") +parser$add_argument("--bam_path", required = TRUE, help = "Path to the specific BAM file for this sample") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_id", required = TRUE, help = "Sample ID to count") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +# 1. Reconstruct the FRASER directory structure so loadFraserDataSet works +# FRASER expects: {work_dir}/savedObjects/FRASER_{cohort_id}/fds-object.RDS +fds_dir_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_dir_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + +# 2. Force HDF5 and Configure Parallelism +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) +register(MulticoreParam(workers = args$nthreads)) + +# 3. Load the dataset +# Note: name must match what was used in fraser_init.R +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) + +# 4. 
Update the BAM path for this specific worker environment +# This ensures that even if the BAM was at a different path during Init, +# the worker uses the current localized path. +bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path + +# 5. Run counting for the specific sample +# countRNAData with sampleId filter is the standard way to run parallel counts +fds <- countRNAData( + fds, + sampleIds = args$sample_id, + NcpuPerSample = args$nthreads, + recount = TRUE +) + +# 6. Verification +# FRASER writes split counts to: {work_dir}/cache/splitCounts/splitCounts-{sample_id}.RDS +expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) + +if (file.exists(expected_out)) { + message("Successfully created split counts at: ", expected_out) +} else { + stop("Split counts file was not found at expected location: ", expected_out) +} \ No newline at end of file diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 49cf6b9f..5deaabff 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -4,28 +4,28 @@ library(BiocParallel) parser <- ArgumentParser(description = "Initialize FRASER Data Set") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--sample_ids", required = TRUE, help = "Comma-separated sample IDs") -parser$add_argument("--bam_files", required = TRUE, help = "Comma-separated BAM file paths") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--sample_map", required = TRUE, help = "Path to CSV with sample_id and bam_path") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory for FRASER") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -sample_ids <- unlist(strsplit(args$sample_ids, ",")) -bam_files <- unlist(strsplit(args$bam_files, ",")) +# 
1. CSV Injection: Read the mapping file instead of parsing strings +# Expected columns: sample_id, bam_path +sample_map <- read.csv(args$sample_map) sample_table <- DataFrame( - sampleID = sample_ids, - bamFile = bam_files, - group = seq_len(length(bam_files)), + sampleID = as.character(sample_map$sample_id), + bamFile = as.character(sample_map$bam_path), + group = "cohort", pairedEnd = TRUE ) options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) fds <- FraserDataSet( - colData = sample_table, - workingDir = args$working_dir, - name = args$cohort_id + colData = sample_table, + workingDir = args$work_dir, + name = paste0("FRASER_", args$cohort_id) ) #Setup parallel execution @@ -33,17 +33,20 @@ bp <- MulticoreParam(workers = args$nthreads) register(bp) -#Initialize counts and metadata -fds <- countRNAData(fds, BPPARAM = bp) -fds <- calculatePSIValues(fds,BPPARAM = bp) +# We calculate initial metadata/PSI skeleton +fds <- calculatePSIValues(fds, BPPARAM = bp) fds <- saveFraserDataSet(fds) #Print location for Python to capture if needed fds_save_path <- file.path( - args$working_dir, + args$work_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS" ) -cat(fds_rds_path, "\n") +if (file.exists(fds_save_path)) { + message("Successfully initialized FDS skeleton at: ", fds_save_path) +} else { + stop("FDS object was not saved to: ", fds_save_path) +} \ No newline at end of file diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 59c2b00b..78c3ed3b 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -5,34 +5,53 @@ library(FRASER) library(BiocParallel) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") -parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") 
parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to g_ranges_non_split_counts.RDS") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) -register(MulticoreParam(workers = args$nthreads)) +# 1. Reconstruct Directory Structure +fds_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +# 2. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) +bp <- MulticoreParam(workers = args$nthreads) +register(bp) -#1. Merge Split +# 3. Load Dataset +# This loads the metadata; the counts will be pulled from the cache symlinked by Python +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# 4. Merge Split Counts +# This step updates the fds with the junctions identified in Step 3 +message("Merging all split-read counts...") fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE) -#2. Merge Non-Split - +# 5. Merge Non-Split Counts +# We use the filtered ranges from Step 3 to define the 'at-site' junctions +message("Merging all non-split-read counts from cache...") split_count_ranges <- readRDS(args$filtered_ranges_path) fds <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = split_count_ranges, minAnchor = 5, - recount = FALSE + recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) -#3. FRASER 2.0: Calculate Jaccard & PSI +# 6. 
Statistical Normalization (PSI and Jaccard) +# This is the 'FRASER 2.0' logic. It calculates the metrics used for outlier detection. +message("Calculating PSI and Jaccard Index values...") +fds <- calculatePSIValues(fds, BPPARAM = bp) -fds <- calculatePSIValues(fds) +# 7. Final Save +# This creates the complete 'fitted' dataset that the analysis script will use +message("Saving final merged FraserDataSet...") fds <- saveFraserDataSet(fds) + +message("Merge non-split and PSI calculation complete.") \ No newline at end of file diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 55a6df34..7053827e 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -3,48 +3,64 @@ library(argparse) library(FRASER) library(BiocParallel) +library(summarizedRT) # Useful for rowMaxs/assay handling parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) +# 1. Reconstruct Directory Structure +fds_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +# 2. 
Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) - -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) - bp <- MulticoreParam(workers = args$nthreads) register(bp) +# 3. Load Dataset +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -fds <- countRNAData(fds, BPPARAM = bp) - -#Merge split counts +# 4. Merge individual split count RDS files from the cache +# FRASER automatically looks in: {work_dir}/cache/splitCounts/ +message("Merging split counts from cache...") +fds <- countRNAData(fds, recount = FALSE, BPPARAM = bp) -fds <- getSplitReadCountsForAllSamples(fds = fds, recount = FALSE) +# 5. Extract and Annotate Junctions +# This identifies which junctions exist across the whole cohort +split_counts_se <- asSE(fds, type = "j") +split_ranges <- rowRanges(split_counts_se) -#Prepare ranges for non-split counting - -split_counts <- asSE(fds, type = "j") -split_count_ranges <- rowRanges(split_counts) -split_count_ranges <- FRASER:::annotateSpliceSite(split_count_ranges) -saveRDS(split_count_ranges, "g_ranges_split_counts.RDS") - -#Filter for expression to reduce non-split counting overhead +# Explicitly annotate splice sites to get donor/acceptor positions +message("Annotating splice sites...") +split_ranges <- FRASER:::annotateSpliceSite(split_ranges) +saveRDS(split_ranges, "g_ranges_split_counts.RDS") +# 6. Filtering for Non-Split Counting (Optimization) +# We only want to count non-split reads for junctions that actually show up +# in our data to save massive amounts of compute time. 
+message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 -max_count <- rowMaxs(assay(split_counts, "rawCountsJ")) +# Use assay(..., "rawCountsJ") to get the counts for junctions +raw_counts <- assay(split_counts_se, "rawCountsJ") +max_count <- rowMaxs(raw_counts) passed <- max_count >= minExpressionInOneSample -filtered_ranges <- split_count_ranges[passed, ] + +filtered_ranges <- split_ranges[passed, ] saveRDS(filtered_ranges, "g_ranges_non_split_counts.RDS") +# 7. Extract Splice Site Coordinates +# These are the specific base-pair positions the next jobs will query in the BAMs +message("Extracting splice site coordinates...") splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) saveRDS(splice_site_coords, "splice_site_coords.RDS") +# 8. Save the updated FDS saveFraserDataSet(fds) +message("Merge split complete.") \ No newline at end of file From 14c1f66909754efc1e76a4a5c5f111d7ec5a2728 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 19 Jan 2026 17:50:01 +1100 Subject: [PATCH 11/89] end of file linting --- images/fraser/fraser_analysis.R | 2 +- images/fraser/fraser_count_non_split.R | 2 +- images/fraser/fraser_count_split.R | 2 +- images/fraser/fraser_init.R | 2 +- images/fraser/fraser_merge_non_split.R | 2 +- images/fraser/fraser_merge_split.R | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 54818e44..0ff7e409 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -86,4 +86,4 @@ cat("Total Samples:", length(sampleIDs(fds)), "\n") cat("Significant Events (adj P <", args$pval_cutoff, "):", nrow(res), "\n") sink() -message("Analysis complete.") \ No newline at end of file +message("Analysis complete.") diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index d272c9cc..486b379a 100644 --- a/images/fraser/fraser_count_non_split.R +++ 
b/images/fraser/fraser_count_non_split.R @@ -58,4 +58,4 @@ if (length(out_file) > 0) { message("Successfully created non-split counts: ", out_file[1]) } else { stop("Non-split counts file was not found in: ", cache_dir) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 510a4a97..7ba2cb2c 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -51,4 +51,4 @@ if (file.exists(expected_out)) { message("Successfully created split counts at: ", expected_out) } else { stop("Split counts file was not found at expected location: ", expected_out) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 5deaabff..6c67b7fd 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -49,4 +49,4 @@ if (file.exists(fds_save_path)) { message("Successfully initialized FDS skeleton at: ", fds_save_path) } else { stop("FDS object was not saved to: ", fds_save_path) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 78c3ed3b..1dd0163f 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -54,4 +54,4 @@ fds <- calculatePSIValues(fds, BPPARAM = bp) message("Saving final merged FraserDataSet...") fds <- saveFraserDataSet(fds) -message("Merge non-split and PSI calculation complete.") \ No newline at end of file +message("Merge non-split and PSI calculation complete.") diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 7053827e..973b2841 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -63,4 +63,4 @@ saveRDS(splice_site_coords, "splice_site_coords.RDS") # 8. 
Save the updated FDS saveFraserDataSet(fds) -message("Merge split complete.") \ No newline at end of file +message("Merge split complete.") From 899ec5214f4150e539e753d2eefe57ccd467088e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 19 Jan 2026 18:36:59 +1100 Subject: [PATCH 12/89] argparse is necessary for image --- images/fraser/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 49e96a56..6aedc2b7 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -15,6 +15,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ r-base=4.4.3 \ r-reshape2=1.4.5 \ r-tidyverse=2.0.0 \ + r-argparse=2.2.3 \ bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ From f71ba087dc02f4c2c2e55ac82deb33ff55e74095 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 20 Jan 2026 23:09:45 +1100 Subject: [PATCH 13/89] cannot calc before count --- images/fraser/fraser_init.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 6c67b7fd..a0aea730 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -34,7 +34,7 @@ register(bp) # We calculate initial metadata/PSI skeleton -fds <- calculatePSIValues(fds, BPPARAM = bp) + fds <- saveFraserDataSet(fds) #Print location for Python to capture if needed From eea8c9be0497086691b9e57b717198549d100b30 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 10:14:41 +1100 Subject: [PATCH 14/89] bamData is deprecated --- images/fraser/fraser_count_split.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 7ba2cb2c..cf4cbe09 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -29,11 +29,16 @@ register(MulticoreParam(workers 
= args$nthreads)) # Note: name must match what was used in fraser_init.R fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) -# 4. Update the BAM path for this specific worker environment -# This ensures that even if the BAM was at a different path during Init, -# the worker uses the current localized path. -bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path +# 4. Update Metadata and Strand +# Important: Ensure the strand matches your actual lab protocol +strandSpecific(fds) <- 0 +# Use colData to update the localized BAM path +if (args$sample_id %in% colData(fds)$sampleID) { + colData(fds)[colData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path +} else { + stop("Sample ID not found in fds object.") +} # 5. Run counting for the specific sample # countRNAData with sampleId filter is the standard way to run parallel counts fds <- countRNAData( From 90bd9c4132b7b0d744bd8ea8148b04eabbf7aff0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 10:33:34 +1100 Subject: [PATCH 15/89] parallelizing a single sample is confusing --- images/fraser/fraser_count_split.R | 33 ++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index cf4cbe09..06fc4ec0 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -23,7 +23,9 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -register(MulticoreParam(workers = args$nthreads)) +# Use MulticoreParam for Linux environments +bpparam <- MulticoreParam(workers = args$nthreads) +register(bpparam) # 3. 
Load the dataset # Note: name must match what was used in fraser_init.R @@ -33,19 +35,23 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) # Important: Ensure the strand matches your actual lab protocol strandSpecific(fds) <- 0 -# Use colData to update the localized BAM path -if (args$sample_id %in% colData(fds)$sampleID) { - colData(fds)[colData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path -} else { - stop("Sample ID not found in fds object.") +# Subset the FDS object to ONLY this sample. +# This prevents the worker from erroring out when looking for other samples' BAMs. +if (!(args$sample_id %in% colData(fds)$sampleID)) { + stop(paste("Sample", args$sample_id, "not found in the provided FDS object.")) } -# 5. Run counting for the specific sample -# countRNAData with sampleId filter is the standard way to run parallel counts +fds <- fds[, fds$sampleID == args$sample_id] + +# Update the localized BAM path in colData +colData(fds)$bamFile <- args$bam_path + +# 5. Run counting +# We pass BPPARAM explicitly to ensure the worker uses the allocated threads fds <- countRNAData( fds, sampleIds = args$sample_id, - NcpuPerSample = args$nthreads, - recount = TRUE + recount = TRUE, + BPPARAM = bpparam ) # 6. Verification @@ -55,5 +61,10 @@ expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCo if (file.exists(expected_out)) { message("Successfully created split counts at: ", expected_out) } else { - stop("Split counts file was not found at expected location: ", expected_out) + # List files in the directory to help debug if it fails + found_files <- list.files(file.path(args$work_dir, "cache", "splitCounts")) + stop(paste0( + "Split counts file not found. 
Expected: ", expected_out, + "\nFound in cache: ", paste(found_files, collapse=", ") + )) } From bc44e653d0220a324c7138feee8bde4803a19ec0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 11:14:07 +1100 Subject: [PATCH 16/89] parallel limit --- images/fraser/fraser_count_split.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 06fc4ec0..283fad74 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -1,5 +1,9 @@ #!/usr/bin/env Rscript +# Set memory limit at the environment level before loading heavy libraries +# 11GB limit +Sys.setenv("R_MAX_VSIZE" = "14Gb") + library(argparse) library(FRASER) library(BiocParallel) @@ -23,8 +27,13 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -# Use MulticoreParam for Linux environments -bpparam <- MulticoreParam(workers = args$nthreads) + +# Configure MulticoreParam with the 11GB limit in mind +# Note: nthreads shares the 11GB pool +bpparam <- MulticoreParam( + workers = args$nthreads, + stop.on.error = TRUE +) register(bpparam) # 3. 
Load the dataset From b4fcd7ab216fbb844191d44917fa37f6a9315824 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 11:35:37 +1100 Subject: [PATCH 17/89] parallel limit --- images/fraser/fraser_count_split.R | 51 ++++++++++++++---------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 283fad74..2955e946 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -22,40 +22,40 @@ args <- parser$parse_args() fds_dir_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_dir_name) dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) +options("FRASER.maxJunctionsNoHDF5" = 0) # Changed from -1 to 0 to be safer -# Configure MulticoreParam with the 11GB limit in mind -# Note: nthreads shares the 11GB pool -bpparam <- MulticoreParam( - workers = args$nthreads, - stop.on.error = TRUE -) -register(bpparam) - -# 3. Load the dataset -# Note: name must match what was used in fraser_init.R +# 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) # 4. Update Metadata and Strand # Important: Ensure the strand matches your actual lab protocol strandSpecific(fds) <- 0 -# Subset the FDS object to ONLY this sample. -# This prevents the worker from erroring out when looking for other samples' BAMs. -if (!(args$sample_id %in% colData(fds)$sampleID)) { - stop(paste("Sample", args$sample_id, "not found in the provided FDS object.")) -} +# SUBSET FIRST: This is the most critical memory-saving step. +# By subsetting here, we drop the metadata of all other samples. 
fds <- fds[, fds$sampleID == args$sample_id] - -# Update the localized BAM path in colData colData(fds)$bamFile <- args$bam_path -# 5. Run counting -# We pass BPPARAM explicitly to ensure the worker uses the allocated threads +# Force Garbage Collection to reclaim memory from the full cohort load +gc() + +# 4. Parallel vs Serial Logic +# If you are hitting 11GB limits, 'MulticoreParam' is risky because it forks the process. +# If nthreads > 1, we use a very conservative setup. +if(args$nthreads > 1) { + # Using 'tasks' helps throttle the memory spikes + bpparam <- MulticoreParam(workers = args$nthreads, tasks = args$nthreads) +} else { + bpparam <- SerialParam() +} + +# 5. Run counting with minimal overhead +# We use recount=FALSE if possible, but keeping TRUE as per your requirement. +# junctionId is set to NULL to ensure a fresh scan of this specific BAM. fds <- countRNAData( fds, sampleIds = args$sample_id, @@ -68,12 +68,7 @@ fds <- countRNAData( expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) if (file.exists(expected_out)) { - message("Successfully created split counts at: ", expected_out) + message("Success: ", expected_out) } else { - # List files in the directory to help debug if it fails - found_files <- list.files(file.path(args$work_dir, "cache", "splitCounts")) - stop(paste0( - "Split counts file not found. 
Expected: ", expected_out, - "\nFound in cache: ", paste(found_files, collapse=", ") - )) + stop("Counting failed to produce output.") } From 11d35a93286f43e30609492d96db672884904280 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 11:55:29 +1100 Subject: [PATCH 18/89] no parallel --- images/fraser/fraser_count_split.R | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 2955e946..ddfc6a36 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -43,16 +43,6 @@ colData(fds)$bamFile <- args$bam_path # Force Garbage Collection to reclaim memory from the full cohort load gc() -# 4. Parallel vs Serial Logic -# If you are hitting 11GB limits, 'MulticoreParam' is risky because it forks the process. -# If nthreads > 1, we use a very conservative setup. -if(args$nthreads > 1) { - # Using 'tasks' helps throttle the memory spikes - bpparam <- MulticoreParam(workers = args$nthreads, tasks = args$nthreads) -} else { - bpparam <- SerialParam() -} - # 5. Run counting with minimal overhead # We use recount=FALSE if possible, but keeping TRUE as per your requirement. # junctionId is set to NULL to ensure a fresh scan of this specific BAM. @@ -60,7 +50,6 @@ fds <- countRNAData( fds, sampleIds = args$sample_id, recount = TRUE, - BPPARAM = bpparam ) # 6. Verification From afb514c649f2b2712132808ee17a980f5cdcf7b0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 12:17:05 +1100 Subject: [PATCH 19/89] force serial --- images/fraser/fraser_count_split.R | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ddfc6a36..ec80204b 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -26,7 +26,7 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. 
Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = 0) # Changed from -1 to 0 to be safer +options("FRASER.maxJunctionsNoHDF5" = 0) # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) @@ -43,13 +43,27 @@ colData(fds)$bamFile <- args$bam_path # Force Garbage Collection to reclaim memory from the full cohort load gc() -# 5. Run counting with minimal overhead -# We use recount=FALSE if possible, but keeping TRUE as per your requirement. -# junctionId is set to NULL to ensure a fresh scan of this specific BAM. +# 4. Configure Conservative Parallelism +# MulticoreParam (forking) is likely causing the 11GB overflow. +# SerialParam is the safest for memory-constrained environments. +if(args$nthreads > 1){ + # If you MUST use multiple threads, SnowParam is often more memory-stable + # than MulticoreParam on some systems, but SerialParam is the safest. + bpparam <- SerialParam() + message("Note: Defaulting to SerialParam to stay within 14GB limit.") +} else { + bpparam <- SerialParam() +} + +# 5. Run counting +# We use NcpuPerSample = 1 to prevent internal sub-parallelization +# which often bypasses the BPPARAM settings. fds <- countRNAData( fds, sampleIds = args$sample_id, recount = TRUE, + NcpuPerSample = 1, + BPPARAM = bpparam ) # 6. Verification @@ -57,7 +71,7 @@ fds <- countRNAData( expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) if (file.exists(expected_out)) { - message("Success: ", expected_out) + message("Success: Created split counts at ", expected_out) } else { - stop("Counting failed to produce output.") + stop("Counting failed. 
Check if BAM index is valid and accessible.") } From f8587a79c4cdbf717d91f34afd24bf6228e219ee Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 12:43:18 +1100 Subject: [PATCH 20/89] making it even leaner --- images/fraser/fraser_count_split.R | 33 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ec80204b..ff2a9086 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -27,6 +27,7 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = 0) +bpparam <- SerialParam() # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) @@ -40,30 +41,28 @@ strandSpecific(fds) <- 0 fds <- fds[, fds$sampleID == args$sample_id] colData(fds)$bamFile <- args$bam_path -# Force Garbage Collection to reclaim memory from the full cohort load +# 4. PRE-SCAN DIAGNOSTIC +# We check the first 1 million reads to estimate junction density +message("--- Memory Diagnostic: Pre-scanning BAM for junction density ---") +sample_bam <- BamFile(args$bam_path, yieldSize = 1000000) +tmp_juncs <- summarizeJunctions(sample_bam, genome = NULL) +message(paste("Unique junctions found in first 1M reads:", length(tmp_juncs))) +rm(tmp_juncs, sample_bam) gc() -# 4. Configure Conservative Parallelism -# MulticoreParam (forking) is likely causing the 11GB overflow. -# SerialParam is the safest for memory-constrained environments. -if(args$nthreads > 1){ - # If you MUST use multiple threads, SnowParam is often more memory-stable - # than MulticoreParam on some systems, but SerialParam is the safest. - bpparam <- SerialParam() - message("Note: Defaulting to SerialParam to stay within 14GB limit.") -} else { - bpparam <- SerialParam() -} - -# 5. 
Run counting -# We use NcpuPerSample = 1 to prevent internal sub-parallelization -# which often bypasses the BPPARAM settings. +# 5. Run Counting with strict filters +# minCount = 2 ignores junctions with only 1 supporting read (saves massive RAM) +message(paste("Starting countRNAData for sample:", args$sample_id)) fds <- countRNAData( fds, sampleIds = args$sample_id, recount = TRUE, NcpuPerSample = 1, - BPPARAM = bpparam + minCount = 2, + BPPARAM = bpparam, + param = ScanBamParam( + flag = scanBamFlag(isDuplicate = FALSE, isSecondaryAlignment = FALSE) + ) ) # 6. Verification From 885788d4fa42ebac1bc5bed26989f4d41f559ab2 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 12:58:11 +1100 Subject: [PATCH 21/89] fix --- images/fraser/fraser_count_split.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ff2a9086..a14f4b72 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -45,9 +45,6 @@ colData(fds)$bamFile <- args$bam_path # We check the first 1 million reads to estimate junction density message("--- Memory Diagnostic: Pre-scanning BAM for junction density ---") sample_bam <- BamFile(args$bam_path, yieldSize = 1000000) -tmp_juncs <- summarizeJunctions(sample_bam, genome = NULL) -message(paste("Unique junctions found in first 1M reads:", length(tmp_juncs))) -rm(tmp_juncs, sample_bam) gc() # 5. 
Run Counting with strict filters From 83d0dd9c65aa3cce2b40e855af4f4014e163580c Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 14:58:47 +1100 Subject: [PATCH 22/89] fix --- images/fraser/fraser_count_non_split.R | 22 ++++++++-------------- images/fraser/fraser_count_split.R | 21 +++------------------ 2 files changed, 11 insertions(+), 32 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 486b379a..4ef11cae 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -27,28 +27,22 @@ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -register(MulticoreParam(workers = args$nthreads)) +bpparam <- SerialParam() # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -splice_site_coords <- readRDS(args$coords_path) +strandSpecific(fds) <- 0 -# 4. Synchronize BAM Path -# Ensure the FDS object points to the current local BAM location on the worker -bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path + + +fds <- fds[, fds$sampleID == args$sample_id] +colData(fds)$bamFile <- args$bam_path # 5. Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above message(paste("Counting non-split reads for sample:", args$sample_id)) -countNonSplicedReads( - sampleID = args$sample_id, - fds = fds, - spliceSiteCoords = splice_site_coords, - splitCountRanges = NULL, # Already handled in merge_split - minAnchor = 5, - NcpuPerSample = args$nthreads, - recount = TRUE -) +getNonSplitReadCountsForAllSamples(fds, + recount = TRUE) # 6. Verification # The Python script uses a 'find' command to move this output. 
diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index a14f4b72..ce1f6613 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -32,8 +32,6 @@ bpparam <- SerialParam() # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) -# 4. Update Metadata and Strand -# Important: Ensure the strand matches your actual lab protocol strandSpecific(fds) <- 0 # SUBSET FIRST: This is the most critical memory-saving step. @@ -41,26 +39,13 @@ strandSpecific(fds) <- 0 fds <- fds[, fds$sampleID == args$sample_id] colData(fds)$bamFile <- args$bam_path -# 4. PRE-SCAN DIAGNOSTIC -# We check the first 1 million reads to estimate junction density -message("--- Memory Diagnostic: Pre-scanning BAM for junction density ---") -sample_bam <- BamFile(args$bam_path, yieldSize = 1000000) -gc() # 5. Run Counting with strict filters # minCount = 2 ignores junctions with only 1 supporting read (saves massive RAM) message(paste("Starting countRNAData for sample:", args$sample_id)) -fds <- countRNAData( - fds, - sampleIds = args$sample_id, - recount = TRUE, - NcpuPerSample = 1, - minCount = 2, - BPPARAM = bpparam, - param = ScanBamParam( - flag = scanBamFlag(isDuplicate = FALSE, isSecondaryAlignment = FALSE) - ) -) + +fds <- getSplitReadCountsForAllSamples(fds, + recount = TRUE) # 6. 
Verification # FRASER writes split counts to: {work_dir}/cache/splitCounts/splitCounts-{sample_id}.RDS From d01466b1607b943ce1dd6d29b3990722393a421b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 09:17:57 +1100 Subject: [PATCH 23/89] adding necessary R libraries --- images/fraser/Dockerfile | 3 +++ images/fraser/fraser_merge_split.R | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 6aedc2b7..b1c454ac 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -16,9 +16,12 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ r-reshape2=1.4.5 \ r-tidyverse=2.0.0 \ r-argparse=2.2.3 \ + r-biocmanager \ bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ + # Install summarizedRT via BiocManager + micromamba run -n base Rscript -e 'BiocManager::install("summarizedRT", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 973b2841..cc566029 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -30,7 +30,7 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") -fds <- countRNAData(fds, recount = FALSE, BPPARAM = bp) +fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) # 5. 
Extract and Annotate Junctions # This identifies which junctions exist across the whole cohort From ee25ec0ab605422343a6687ed1063a5c53ca9622 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 09:24:53 +1100 Subject: [PATCH 24/89] adding coordinate checking precaution --- images/fraser/fraser_count_non_split.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 4ef11cae..4077ebe8 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -31,13 +31,21 @@ bpparam <- SerialParam() # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -strandSpecific(fds) <- 0 - - +filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS +# 4. Critical: Subset and Inject +# First, subset to the specific sample fds <- fds[, fds$sampleID == args$sample_id] colData(fds)$bamFile <- args$bam_path +# Replace the internal splice site ranges with your filtered ones +# This forces FRASER to only query these specific BP positions in the BAM +nonDegenerated <- !duplicated(filtered_coords) +mcols(fds)$spliceSiteCoords <- filtered_coords[nonDegenerated] + +# Set strand specificity (Ensure 0 is correct for your library!) +strandSpecific(fds) <- 0 + # 5. 
Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above message(paste("Counting non-split reads for sample:", args$sample_id)) From c7a8819a555e68e17f86cdb8f95d7fda199b1803 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 10:03:39 +1100 Subject: [PATCH 25/89] library has been renamed --- images/fraser/Dockerfile | 2 +- images/fraser/fraser_merge_split.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index b1c454ac..11b5508b 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -21,7 +21,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ # Install summarizedRT via BiocManager - micromamba run -n base Rscript -e 'BiocManager::install("summarizedRT", update=FALSE, ask=FALSE)' && \ + micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index cc566029..1940770a 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -3,7 +3,7 @@ library(argparse) library(FRASER) library(BiocParallel) -library(summarizedRT) # Useful for rowMaxs/assay handling +library(SummarizedExperiment) # Useful for rowMaxs/assay handling parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") From efd22a8474daefef46ef9eb656abf3b485121260 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 12:24:52 +1100 Subject: [PATCH 26/89] hail file caching fix --- images/fraser/fraser_count_split.R | 44 ++++++++++++++++++++---------- images/fraser/fraser_init.R | 12 ++++---- images/fraser/fraser_merge_split.R | 12 
+++++++- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ce1f6613..b3938303 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -1,8 +1,7 @@ #!/usr/bin/env Rscript -# Set memory limit at the environment level before loading heavy libraries -# 11GB limit -Sys.setenv("R_MAX_VSIZE" = "14Gb") +# Set memory limit - increased slightly to allow for HDF5 overhead +Sys.setenv("R_MAX_VSIZE" = "16Gb") library(argparse) library(FRASER) @@ -26,8 +25,13 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = 0) -bpparam <- SerialParam() +options("FRASER.maxJunctionsNoHDF5" = -1) +# Use MulticoreParam if threads > 1, else Serial +if(args$nthreads > 1){ + bpparam <- MulticoreParam(workers = args$nthreads) +} else { + bpparam <- SerialParam() +} # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) @@ -37,22 +41,34 @@ strandSpecific(fds) <- 0 # SUBSET FIRST: This is the most critical memory-saving step. # By subsetting here, we drop the metadata of all other samples. fds <- fds[, fds$sampleID == args$sample_id] -colData(fds)$bamFile <- args$bam_path +# Validate the BAM path - Ensure the R script sees what Hail localized +if(!file.exists(args$bam_path)){ + stop(paste("BAM file not found at:", args$bam_path)) +} +colData(fds)$bamFile <- args$bam_path -# 5. Run Counting with strict filters -# minCount = 2 ignores junctions with only 1 supporting read (saves massive RAM) -message(paste("Starting countRNAData for sample:", args$sample_id)) +# 4. 
Count Split Reads +message(paste("Starting split read counting for sample:", args$sample_id)) -fds <- getSplitReadCountsForAllSamples(fds, - recount = TRUE) +# In FRASER 2.0, we use getSplitReadCountsForAllSamples with recount=TRUE. +# This writes an RDS file to the cache which we will harvest in the merge step. +fds <- getSplitReadCountsForAllSamples( + fds, + recount = TRUE, + BPPARAM = bpparam +) -# 6. Verification -# FRASER writes split counts to: {work_dir}/cache/splitCounts/splitCounts-{sample_id}.RDS +# 5. Verification +# FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) if (file.exists(expected_out)) { message("Success: Created split counts at ", expected_out) } else { - stop("Counting failed. Check if BAM index is valid and accessible.") + # Check for common Bioinformatic failures + if(!file.exists(paste0(args$bam_path, ".bai"))){ + stop("BAM Index (.bai) missing. FRASER cannot perform random access counting.") + } + stop("Counting failed. The RDS file was not generated in the cache.") } diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index a0aea730..f31c5fc1 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -9,14 +9,12 @@ parser$add_argument("--work_dir", default = "/io/work", help = "Working director parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -# 1. CSV Injection: Read the mapping file instead of parsing strings -# Expected columns: sample_id, bam_path -sample_map <- read.csv(args$sample_map) +# 1. 
Read the static mapping created by the Python job +sample_map <- read.csv(args$sample_map, stringsAsFactors = FALSE) sample_table <- DataFrame( sampleID = as.character(sample_map$sample_id), bamFile = as.character(sample_map$bam_path), - group = "cohort", pairedEnd = TRUE ) options("FRASER.maxSamplesNoHDF5" = 0) @@ -25,7 +23,7 @@ options("FRASER.maxJunctionsNoHDF5" = -1) fds <- FraserDataSet( colData = sample_table, workingDir = args$work_dir, - name = paste0("FRASER_", args$cohort_id) + name = fds_name ) #Setup parallel execution @@ -41,12 +39,14 @@ fds <- saveFraserDataSet(fds) fds_save_path <- file.path( args$work_dir, "savedObjects", - paste0("FRASER_", args$cohort_id), + fds_name, "fds-object.RDS" ) if (file.exists(fds_save_path)) { message("Successfully initialized FDS skeleton at: ", fds_save_path) + # Output path for Hail logs + cat(fds_save_path, "\n") } else { stop("FDS object was not saved to: ", fds_save_path) } diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 1940770a..74ad0776 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -16,7 +16,7 @@ args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) # 2. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) @@ -27,6 +27,16 @@ register(bp) # 3. Load Dataset fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# --- VULNERABILITY FIX: HAIL LOCALIZATION --- +# Re-assert the dummy BAM path to prevent seqlevelsStyle crashes +# during junction metadata extraction. 
+dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) +dummy_bam <- "/io/batch/input_bams/dummy.bam" +if(!file.exists(dummy_bam)) file.create(dummy_bam) +colData(fds)$bamFile <- dummy_bam +strandSpecific(fds) <- 0 +# -------------------------------------------- + # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") From c60b5791fbfff1a682532f58c1009afbefa181bd Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 14:50:46 +1100 Subject: [PATCH 27/89] whoops --- images/fraser/fraser_init.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index f31c5fc1..03be3797 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -11,6 +11,9 @@ args <- parser$parse_args() # 1. Read the static mapping created by the Python job sample_map <- read.csv(args$sample_map, stringsAsFactors = FALSE) +# Define the dataset name consistently with the Python expectations +fds_name <- paste0("FRASER_", args$cohort_id) + sample_table <- DataFrame( sampleID = as.character(sample_map$sample_id), From a0984368832d7a8c28b0edc6b01eba4612f5af88 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 16:51:14 +1100 Subject: [PATCH 28/89] fraser_merge_split.R fix --- images/fraser/fraser_merge_split.R | 37 +++++++++++++++++++----------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 74ad0776..22f1c742 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -21,19 +21,30 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) + + bp <- MulticoreParam(workers = args$nthreads) register(bp) # 3. 
Load Dataset +# Match the name exactly as defined in the init stage +fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# --- VULNERABILITY FIX: HAIL LOCALIZATION --- -# Re-assert the dummy BAM path to prevent seqlevelsStyle crashes -# during junction metadata extraction. -dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) -dummy_bam <- "/io/batch/input_bams/dummy.bam" -if(!file.exists(dummy_bam)) file.create(dummy_bam) -colData(fds)$bamFile <- dummy_bam +# --- VULNERABILITY FIX: BAM Metadata Extraction --- +# Instead of a dummy file, point metadata queries to an existing BAM +# to ensure chromosome naming (seqlevelsStyle) can be validated. +# We assume the symlinks were created in /io/batch/input_bams/ +available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) +if(length(available_bams) > 0){ + colData(fds)$bamFile <- available_bams[1] +} else { + # Fallback if no bams localized (though this shouldn't happen) + dummy_bam <- "/io/batch/input_bams/dummy.bam" + dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) + if(!file.exists(dummy_bam)) file.create(dummy_bam) + colData(fds)$bamFile <- dummy_bam +} strandSpecific(fds) <- 0 # -------------------------------------------- @@ -50,26 +61,24 @@ split_ranges <- rowRanges(split_counts_se) # Explicitly annotate splice sites to get donor/acceptor positions message("Annotating splice sites...") split_ranges <- FRASER:::annotateSpliceSite(split_ranges) -saveRDS(split_ranges, "g_ranges_split_counts.RDS") # 6. Filtering for Non-Split Counting (Optimization) -# We only want to count non-split reads for junctions that actually show up -# in our data to save massive amounts of compute time. 
message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 -# Use assay(..., "rawCountsJ") to get the counts for junctions raw_counts <- assay(split_counts_se, "rawCountsJ") max_count <- rowMaxs(raw_counts) passed <- max_count >= minExpressionInOneSample filtered_ranges <- split_ranges[passed, ] -saveRDS(filtered_ranges, "g_ranges_non_split_counts.RDS") # 7. Extract Splice Site Coordinates -# These are the specific base-pair positions the next jobs will query in the BAMs message("Extracting splice site coordinates...") splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) -saveRDS(splice_site_coords, "splice_site_coords.RDS") + +# Use absolute paths for saving to match Python 'mv' commands +saveRDS(split_ranges, file.path(args$work_dir, "g_ranges_split_counts.RDS")) +saveRDS(filtered_ranges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) +saveRDS(splice_site_coords, file.path(args$work_dir, "splice_site_coords.RDS")) # 8. 
Save the updated FDS saveFraserDataSet(fds) From c2910ed06a45e76bf53e6ab64322fc9592c15d71 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 17:11:22 +1100 Subject: [PATCH 29/89] fraser_merge_split.R fix --- images/fraser/Dockerfile | 1 + images/fraser/fraser_merge_split.R | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 11b5508b..5d26c50f 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -22,6 +22,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-org.hs.eg.db=3.20.0 && \ # Install summarizedRT via BiocManager micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ + micromamba run -n base Rscript -e 'BiocManager::install("Rsamtools", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 22f1c742..cfa931ed 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -3,7 +3,8 @@ library(argparse) library(FRASER) library(BiocParallel) -library(SummarizedExperiment) # Useful for rowMaxs/assay handling +library(SummarizedExperiment) +library(Rsamtools) # Added for BamFile validation parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") @@ -37,13 +38,19 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) # We assume the symlinks were created in /io/batch/input_bams/ available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ - colData(fds)$bamFile <- available_bams[1] + message("Using reference BAM for metadata: ", available_bams[1]) + + # Force-update colData for ALL samples to point to a valid file. 
+ # This prevents the 'dummy.bam' error during internal validation. + new_colData <- colData(fds) + new_colData$bamFile <- available_bams[1] + colData(fds) <- new_colData + + # Manually assert the seqlevelsStyle (e.g., 'UCSC' or 'Ensembl') + # This satisfies the internal check that is currently crashing. + seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) } else { - # Fallback if no bams localized (though this shouldn't happen) - dummy_bam <- "/io/batch/input_bams/dummy.bam" - dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) - if(!file.exists(dummy_bam)) file.create(dummy_bam) - colData(fds)$bamFile <- dummy_bam + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. The merge cannot proceed without at least one valid BAM header for seqlevelsStyle validation.") } strandSpecific(fds) <- 0 # -------------------------------------------- From 62e2fa2608a1783f7dc63491065605ac07d22126 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 18:01:16 +1100 Subject: [PATCH 30/89] fraser_merge_split.R fix --- images/fraser/fraser_merge_split.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index cfa931ed..10095fc5 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -59,10 +59,12 @@ strandSpecific(fds) <- 0 # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) +split_counts_se <- SummarizedExperiment( + assays = list(rawCountsJ = K(fds, type="j")), + rowRanges = rowRanges(fds, type="j"), + colData = colData(fds) +) -# 5. 
Extract and Annotate Junctions -# This identifies which junctions exist across the whole cohort -split_counts_se <- asSE(fds, type = "j") split_ranges <- rowRanges(split_counts_se) # Explicitly annotate splice sites to get donor/acceptor positions @@ -73,8 +75,8 @@ split_ranges <- FRASER:::annotateSpliceSite(split_ranges) message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 raw_counts <- assay(split_counts_se, "rawCountsJ") -max_count <- rowMaxs(raw_counts) -passed <- max_count >= minExpressionInOneSample +max_count <- rowMaxs(raw_counts) +passed <- max_count >= minExpressionInOneSample filtered_ranges <- split_ranges[passed, ] From 999f930effd43898dd019c4226700add63c9eccd Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 00:29:40 +1100 Subject: [PATCH 31/89] fraser_merge_split.R fix --- images/fraser/fraser_merge_split.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 10095fc5..fc9871f9 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -4,7 +4,8 @@ library(argparse) library(FRASER) library(BiocParallel) library(SummarizedExperiment) -library(Rsamtools) # Added for BamFile validation +library(Rsamtools) +library(DelayedMatrixStats) # Crucial for rowMaxs on HDF5 data parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") @@ -32,11 +33,9 @@ register(bp) fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# --- VULNERABILITY FIX: BAM Metadata Extraction --- -# Instead of a dummy file, point metadata queries to an existing BAM -# to ensure chromosome naming (seqlevelsStyle) can be validated. 
-# We assume the symlinks were created in /io/batch/input_bams/ +# --- NUCLEAR FIX: Force Valid Metadata --- available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) + if(length(available_bams) > 0){ message("Using reference BAM for metadata: ", available_bams[1]) @@ -50,7 +49,7 @@ if(length(available_bams) > 0){ # This satisfies the internal check that is currently crashing. seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) } else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. The merge cannot proceed without at least one valid BAM header for seqlevelsStyle validation.") + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } strandSpecific(fds) <- 0 # -------------------------------------------- @@ -60,20 +59,21 @@ strandSpecific(fds) <- 0 message("Merging split counts from cache...") fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) split_counts_se <- SummarizedExperiment( - assays = list(rawCountsJ = K(fds, type="j")), + assays = list(rawCountsJ = K(fds, type="j")), rowRanges = rowRanges(fds, type="j"), colData = colData(fds) ) split_ranges <- rowRanges(split_counts_se) -# Explicitly annotate splice sites to get donor/acceptor positions message("Annotating splice sites...") split_ranges <- FRASER:::annotateSpliceSite(split_ranges) # 6. 
Filtering for Non-Split Counting (Optimization) message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 + +# Use assay() and rowMaxs from DelayedMatrixStats for HDF5 compatibility raw_counts <- assay(split_counts_se, "rawCountsJ") max_count <- rowMaxs(raw_counts) passed <- max_count >= minExpressionInOneSample From ade8becbe329efcccd1ac56982299b519ac1a39d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 11:57:51 +1100 Subject: [PATCH 32/89] good mornin, lets get crackin --- images/fraser/fraser_merge_split.R | 51 +++++++++++------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index fc9871f9..9a2a822a 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -5,7 +5,8 @@ library(FRASER) library(BiocParallel) library(SummarizedExperiment) library(Rsamtools) -library(DelayedMatrixStats) # Crucial for rowMaxs on HDF5 data +library(DelayedMatrixStats) + parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") @@ -33,7 +34,7 @@ register(bp) fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# --- NUCLEAR FIX: Force Valid Metadata --- + available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ @@ -53,42 +54,28 @@ if(length(available_bams) > 0){ } strandSpecific(fds) <- 0 # -------------------------------------------- - +minExpressionInOneSample <- 20 # 4. 
Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") -fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) -split_counts_se <- SummarizedExperiment( - assays = list(rawCountsJ = K(fds, type="j")), - rowRanges = rowRanges(fds, type="j"), - colData = colData(fds) -) - -split_ranges <- rowRanges(split_counts_se) - -message("Annotating splice sites...") -split_ranges <- FRASER:::annotateSpliceSite(split_ranges) - -# 6. Filtering for Non-Split Counting (Optimization) -message("Filtering ranges for non-split counting...") -minExpressionInOneSample <- 20 - -# Use assay() and rowMaxs from DelayedMatrixStats for HDF5 compatibility -raw_counts <- assay(split_counts_se, "rawCountsJ") -max_count <- rowMaxs(raw_counts) -passed <- max_count >= minExpressionInOneSample +splitCounts <- getSplitReadCountsForAllSamples(fds=fds, + recount=FALSE) +splitCountRanges <- rowRanges(splitCounts) +splitCountRangesNonFilt <- FRASER:::annotateSpliceSite(splitCountRanges) -filtered_ranges <- split_ranges[passed, ] +maxCount <- rowMaxs(assay(splitCounts, "rawCountsJ")) +passed <- maxCount >= minExpressionInOneSample +# extract granges after filtering +splitCountRanges <- splitCountRangesNonFilt[passed,] +spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) -# 7. 
Extract Splice Site Coordinates -message("Extracting splice site coordinates...") -splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) # Use absolute paths for saving to match Python 'mv' commands -saveRDS(split_ranges, file.path(args$work_dir, "g_ranges_split_counts.RDS")) -saveRDS(filtered_ranges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) -saveRDS(splice_site_coords, file.path(args$work_dir, "splice_site_coords.RDS")) +#This is sslightly confusing, but the filtered granges will be used to annotate non_split counts +saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) +saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) +saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) -# 8. Save the updated FDS -saveFraserDataSet(fds) +# 8. Save the updated FDS note that the dummy bams were used here so we dont have to localised the bam paths every time +saveFraserDataSet(splitCounts) message("Merge split complete.") From c094b082bd29affe921b789766018383474e9456 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 12:31:06 +1100 Subject: [PATCH 33/89] this should be it --- images/fraser/fraser_merge_split.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 9a2a822a..4eb6a8e4 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -76,6 +76,3 @@ saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_ saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) -# 8. 
Save the updated FDS note that the dummy bams were used here so we dont have to localised the bam paths every time -saveFraserDataSet(splitCounts) -message("Merge split complete.") From ee3b46c6200f45a4698a2bd350223c6ae0f02642 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 13:02:32 +1100 Subject: [PATCH 34/89] count non_split --- images/fraser/fraser_count_non_split.R | 48 +++++++++++++++++--------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 4077ebe8..b1582688 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -1,5 +1,8 @@ #!/usr/bin/env Rscript +# Set memory limit - increased slightly to allow for HDF5 overhead +Sys.setenv("R_MAX_VSIZE" = "16Gb") + library(argparse) library(FRASER) library(BiocParallel) @@ -27,23 +30,31 @@ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -bpparam <- SerialParam() +# Use MulticoreParam if threads > 1, else Serial +if(args$nthreads > 1){ + bpparam <- MulticoreParam(workers = args$nthreads) +} else { + bpparam <- SerialParam() +} # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS -# 4. Critical: Subset and Inject -# First, subset to the specific sample -fds <- fds[, fds$sampleID == args$sample_id] -colData(fds)$bamFile <- args$bam_path - -# Replace the internal splice site ranges with your filtered ones -# This forces FRASER to only query these specific BP positions in the BAM +# 4. Inject coordinates FIRST (before subsetting) nonDegenerated <- !duplicated(filtered_coords) mcols(fds)$spliceSiteCoords <- filtered_coords[nonDegenerated] -# Set strand specificity (Ensure 0 is correct for your library!) 
+# THEN subset to the specific sample +fds <- fds[, fds$sampleID == args$sample_id] + +# Validate the BAM path - Ensure the R script sees what Hail localized +if(!file.exists(args$bam_path)){ + stop(paste("BAM file not found at:", args$bam_path)) +} +colData(fds)$bamFile <- args$bam_path + +# Set strand specificity strandSpecific(fds) <- 0 # 5. Run Non-Split Counting @@ -52,12 +63,17 @@ message(paste("Counting non-split reads for sample:", args$sample_id)) getNonSplitReadCountsForAllSamples(fds, recount = TRUE) + # 6. Verification -# The Python script uses a 'find' command to move this output. -# We just need to check if the file was created in the cache. -out_file <- list.files(cache_dir, pattern = paste0("nonSplicedCounts-", args$sample_id)) -if (length(out_file) > 0) { - message("Successfully created non-split counts: ", out_file[1]) +# FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS +expected_out <- file.path(args$work_dir, "cache", "nonSplicedCounts-", paste0("nonSplicedCounts--", args$sample_id, ".RDS")) + +if (file.exists(expected_out)) { + message("Success: Created split counts at ", expected_out) } else { - stop("Non-split counts file was not found in: ", cache_dir) -} + # Check for common Bioinformatic failures + if(!file.exists(paste0(args$bam_path, ".bai"))){ + stop("BAM Index (.bai) missing. FRASER cannot perform random access counting.") + } + stop("Counting failed. 
The RDS file was not generated in the cache.") +} \ No newline at end of file From 9abdee7ba9c81ce407e288274ef07fb15fb09e78 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 13:38:35 +1100 Subject: [PATCH 35/89] Kiss,ONLY VULNERABILITY HERE IS bampaths --- images/fraser/fraser_count_non_split.R | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index b1582688..915614be 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -41,18 +41,6 @@ if(args$nthreads > 1){ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS -# 4. Inject coordinates FIRST (before subsetting) -nonDegenerated <- !duplicated(filtered_coords) -mcols(fds)$spliceSiteCoords <- filtered_coords[nonDegenerated] - -# THEN subset to the specific sample -fds <- fds[, fds$sampleID == args$sample_id] - -# Validate the BAM path - Ensure the R script sees what Hail localized -if(!file.exists(args$bam_path)){ - stop(paste("BAM file not found at:", args$bam_path)) -} -colData(fds)$bamFile <- args$bam_path # Set strand specificity strandSpecific(fds) <- 0 @@ -63,7 +51,13 @@ message(paste("Counting non-split reads for sample:", args$sample_id)) getNonSplitReadCountsForAllSamples(fds, recount = TRUE) - +sample_result <- countNonSplicedReads(args$sample_id, + splitCountRanges = NULL, + fds = fds, + NcpuPerSample = args$nthreads, + minAnchor=5, + recount= TRUE, + spliceSiteCoords=filtered_coords) # 6. 
Verification # FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS expected_out <- file.path(args$work_dir, "cache", "nonSplicedCounts-", paste0("nonSplicedCounts--", args$sample_id, ".RDS")) From b746910c265a3e4d89dd2088cd4278b974a397d1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 13:59:40 +1100 Subject: [PATCH 36/89] I was the true vulnerability here --- images/fraser/fraser_count_non_split.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 915614be..710d93e1 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -48,8 +48,6 @@ strandSpecific(fds) <- 0 # 5. Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above message(paste("Counting non-split reads for sample:", args$sample_id)) -getNonSplitReadCountsForAllSamples(fds, - recount = TRUE) sample_result <- countNonSplicedReads(args$sample_id, splitCountRanges = NULL, From 8be7cff03dea2f5a7849e23d3516f84e8413958e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 14:42:12 +1100 Subject: [PATCH 37/89] the output checking didnt check the write place --- images/fraser/fraser_count_non_split.R | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 710d93e1..8d3b5a1d 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -57,15 +57,25 @@ sample_result <- countNonSplicedReads(args$sample_id, recount= TRUE, spliceSiteCoords=filtered_coords) # 6. 
Verification -# FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS -expected_out <- file.path(args$work_dir, "cache", "nonSplicedCounts-", paste0("nonSplicedCounts--", args$sample_id, ".RDS")) +# Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ +actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) -if (file.exists(expected_out)) { - message("Success: Created split counts at ", expected_out) +# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS +expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) +expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) + +if (file.exists(expected_h5)) { + message("Success: Created non-split counts (HDF5) at ", expected_h5) +} else if (file.exists(expected_rds)) { + message("Success: Created non-split counts (RDS) at ", expected_rds) } else { - # Check for common Bioinformatic failures + # Debugging: List files in the directory to see what actually happened + if(dir.exists(actual_cache_dir)){ + message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) + } + if(!file.exists(paste0(args$bam_path, ".bai"))){ - stop("BAM Index (.bai) missing. FRASER cannot perform random access counting.") + stop("BAM Index (.bai) missing. FRASER cannot perform random access.") } - stop("Counting failed. The RDS file was not generated in the cache.") + stop(paste("Counting failed. 
Output not found for sample:", args$sample_id)) } \ No newline at end of file From a3b158980d96a245f5a671c4b8975e270f675212 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 27 Jan 2026 13:26:12 +1100 Subject: [PATCH 38/89] merge_non_split --- images/fraser/fraser_merge_non_split.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 1dd0163f..058a4dd7 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -26,8 +26,28 @@ register(bp) # 3. Load Dataset # This loads the metadata; the counts will be pulled from the cache symlinked by Python +fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) + +available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) + +if(length(available_bams) > 0){ + message("Using reference BAM for metadata: ", available_bams[1]) + + # Force-update colData for ALL samples to point to a valid file. + # This prevents the 'dummy.bam' error during internal validation. + new_colData <- colData(fds) + new_colData$bamFile <- available_bams[1] + colData(fds) <- new_colData + + # Manually assert the seqlevelsStyle (e.g., 'UCSC' or 'Ensembl') + # This satisfies the internal check that is currently crashing. + seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) +} else { + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") +} +strandSpecific(fds) <- 0 # 4. 
Merge Split Counts # This step updates the fds with the junctions identified in Step 3 message("Merging all split-read counts...") From 1de313541651a70d9df6d1abe2b6df62949cebd4 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 27 Jan 2026 15:51:27 +1100 Subject: [PATCH 39/89] merge_non_split --- images/fraser/fraser_merge_non_split.R | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 058a4dd7..67169243 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -33,19 +33,18 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ - message("Using reference BAM for metadata: ", available_bams[1]) + # We use the reference BAM to satisfy the internal 'file.exists' checks. + # available_bams[1] should be the 'reference.bam' symlinked by the Python task. + ref_bam <- available_bams[1] + message("Validating metadata against reference BAM: ", ref_bam) - # Force-update colData for ALL samples to point to a valid file. - # This prevents the 'dummy.bam' error during internal validation. - new_colData <- colData(fds) - new_colData$bamFile <- available_bams[1] - colData(fds) <- new_colData + # Update colData paths for all samples to the reference BAM + colData(fds)$bamFile <- ref_bam - # Manually assert the seqlevelsStyle (e.g., 'UCSC' or 'Ensembl') - # This satisfies the internal check that is currently crashing. - seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) + # Ensure seqlevels match (e.g., 'chr1' vs '1') to prevent crashing during merge + seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(ref_bam)) } else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. 
Validation will fail.") } strandSpecific(fds) <- 0 # 4. Merge Split Counts From 57ef4afdf3f54820d881eb0fdab31cb5db36948e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 27 Jan 2026 16:24:58 +1100 Subject: [PATCH 40/89] how did that get in there? --- images/fraser/fraser_merge_non_split.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 67169243..a12c02a6 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -47,10 +47,6 @@ if(length(available_bams) > 0){ stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. Validation will fail.") } strandSpecific(fds) <- 0 -# 4. Merge Split Counts -# This step updates the fds with the junctions identified in Step 3 -message("Merging all split-read counts...") -fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE) # 5. Merge Non-Split Counts # We use the filtered ranges from Step 3 to define the 'at-site' junctions @@ -60,7 +56,7 @@ fds <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = split_count_ranges, minAnchor = 5, - recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache + recount =FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) # 6. Statistical Normalization (PSI and Jaccard) From cac6886e7d6eec4f66ff32d353d61623b0381769 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 28 Jan 2026 12:51:36 +1100 Subject: [PATCH 41/89] name change --- images/fraser/fraser_merge_non_split.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index a12c02a6..a82d065a 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -51,19 +51,14 @@ strandSpecific(fds) <- 0 # 5. 
Merge Non-Split Counts # We use the filtered ranges from Step 3 to define the 'at-site' junctions message("Merging all non-split-read counts from cache...") -split_count_ranges <- readRDS(args$filtered_ranges_path) +non_split_count_ranges <- readRDS(args$filtered_ranges_path) fds <- getNonSplitReadCountsForAllSamples( fds = fds, - splitCountRanges = split_count_ranges, + splitCountRanges = non_split_count_ranges, minAnchor = 5, recount =FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) -# 6. Statistical Normalization (PSI and Jaccard) -# This is the 'FRASER 2.0' logic. It calculates the metrics used for outlier detection. -message("Calculating PSI and Jaccard Index values...") -fds <- calculatePSIValues(fds, BPPARAM = bp) - # 7. Final Save # This creates the complete 'fitted' dataset that the analysis script will use message("Saving final merged FraserDataSet...") From 8aae29f7cefec4e6886a0fb41e8b2613686a83dd Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 11:00:07 +1100 Subject: [PATCH 42/89] lets start again --- images/fraser/fraser_merge_non_split.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index a82d065a..bfbcd4e5 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -15,7 +15,11 @@ args <- parser$parse_args() # 1. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) -dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) + +# Create the specific sub-directory for nonSplitCounts +dir.create(file.path(save_dir, "nonSplitCounts"), recursive = TRUE, showWarnings = FALSE) + +# Copy the object so loadFraserDataSet can find it file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) # 2. 
Configure Backend @@ -24,9 +28,7 @@ options("FRASER.maxJunctionsNoHDF5" = -1) bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 3. Load Dataset -# This loads the metadata; the counts will be pulled from the cache symlinked by Python -fds_name <- paste0("FRASER_", args$cohort_id) +# --- 3. Load Dataset --- fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) @@ -40,11 +42,9 @@ if(length(available_bams) > 0){ # Update colData paths for all samples to the reference BAM colData(fds)$bamFile <- ref_bam - - # Ensure seqlevels match (e.g., 'chr1' vs '1') to prevent crashing during merge - seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(ref_bam)) + seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) } else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. Validation will fail.") + stop("CRITICAL ERROR: No BAM files found.") } strandSpecific(fds) <- 0 @@ -52,16 +52,16 @@ strandSpecific(fds) <- 0 # We use the filtered ranges from Step 3 to define the 'at-site' junctions message("Merging all non-split-read counts from cache...") non_split_count_ranges <- readRDS(args$filtered_ranges_path) -fds <- getNonSplitReadCountsForAllSamples( +non_split_counts <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = non_split_count_ranges, minAnchor = 5, - recount =FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache + recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) # 7. 
Final Save # This creates the complete 'fitted' dataset that the analysis script will use message("Saving final merged FraserDataSet...") -fds <- saveFraserDataSet(fds) +saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) message("Merge non-split and PSI calculation complete.") From ec6da3983ff785fff8a44a733b2ddaaeaf0325f8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 11:21:57 +1100 Subject: [PATCH 43/89] fixing paths --- images/fraser/fraser_merge_non_split.R | 50 +++++++++++++------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index bfbcd4e5..42ae2f14 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -14,43 +14,46 @@ args <- parser$parse_args() # 1. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) -save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +# Where the pipeline put them +old_dir <- file.path(args$work_dir, "savedObjects", "Data_Analysis", "nonSplitCounts") +# Where FRASER expects them +new_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") -# Create the specific sub-directory for nonSplitCounts -dir.create(file.path(save_dir, "nonSplitCounts"), recursive = TRUE, showWarnings = FALSE) +# --- 2. Move Files to Correct Location --- +if (dir.exists(old_dir)) { + message("Moving .h5 files from ", old_dir, " to ", new_dir) + dir.create(new_dir, recursive = TRUE, showWarnings = FALSE) -# Copy the object so loadFraserDataSet can find it -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + h5_files <- list.files(old_dir, pattern = "\\.h5$", full.names = TRUE) + for (f in h5_files) { + dest <- file.path(new_dir, basename(f)) + file.rename(f, dest) + } +} else { + message("Notice: Source directory ", old_dir, " not found. Checking if files are already in place.") +} -# 2. 
Configure Backend -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) -bp <- MulticoreParam(workers = args$nthreads) -register(bp) +# --- 3. Prepare FDS for Loading --- +# FRASER's loadFraserDataSet looks for the .RDS file in a specific subfolder +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) -# --- 3. Load Dataset --- +# --- 4. Load Dataset --- +# dir is the ROOT workdir, name is the cohort folder name fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) - if(length(available_bams) > 0){ - # We use the reference BAM to satisfy the internal 'file.exists' checks. - # available_bams[1] should be the 'reference.bam' symlinked by the Python task. ref_bam <- available_bams[1] - message("Validating metadata against reference BAM: ", ref_bam) - - # Update colData paths for all samples to the reference BAM colData(fds)$bamFile <- ref_bam seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) -} else { - stop("CRITICAL ERROR: No BAM files found.") } strandSpecific(fds) <- 0 -# 5. Merge Non-Split Counts -# We use the filtered ranges from Step 3 to define the 'at-site' junctions -message("Merging all non-split-read counts from cache...") +# --- 6. Merge Non-Split Counts --- +message("Merging counts using HDF5 files in: ", new_dir) non_split_count_ranges <- readRDS(args$filtered_ranges_path) non_split_counts <- getNonSplitReadCountsForAllSamples( fds = fds, @@ -61,7 +64,6 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # 7. 
Final Save # This creates the complete 'fitted' dataset that the analysis script will use -message("Saving final merged FraserDataSet...") + saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) -message("Merge non-split and PSI calculation complete.") From c2a728461f1295eb3ccafa9d77b8e0bf8f4fade2 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 11:54:29 +1100 Subject: [PATCH 44/89] metadata file was missing? --- images/fraser/fraser_merge_non_split.R | 56 ++++++++++++++------------ 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 42ae2f14..64177a6b 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -3,6 +3,7 @@ library(argparse) library(FRASER) library(BiocParallel) +library(SummarizedExperiment) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") @@ -12,43 +13,48 @@ parser$add_argument("--work_dir", default = "/io/work", help = "Working director parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -# 1. Reconstruct Directory Structure +# 1. Load the ranges first so they are available for all steps +non_split_count_ranges <- readRDS(args$filtered_ranges_path) + +# 2. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) -# Where the pipeline put them -old_dir <- file.path(args$work_dir, "savedObjects", "Data_Analysis", "nonSplitCounts") -# Where FRASER expects them -new_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") - -# --- 2. 
Move Files to Correct Location --- -if (dir.exists(old_dir)) { - message("Moving .h5 files from ", old_dir, " to ", new_dir) - dir.create(new_dir, recursive = TRUE, showWarnings = FALSE) - - h5_files <- list.files(old_dir, pattern = "\\.h5$", full.names = TRUE) - for (f in h5_files) { - dest <- file.path(new_dir, basename(f)) - file.rename(f, dest) - } -} else { - message("Notice: Source directory ", old_dir, " not found. Checking if files are already in place.") +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +out_dir <- file.path(save_dir, "nonSplitCounts") + +dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + +# --- MINIMUM FIX: Move files and create the missing 'se.rds' metadata --- +h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) +for(f in h5_files) { + file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# --- 3. Prepare FDS for Loading --- -# FRASER's loadFraserDataSet looks for the .RDS file in a specific subfolder -save_dir <- file.path(args$work_dir, "savedObjects", fds_name) -dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) +# Create the se.rds 'glue' file that FRASER/HDF5Array looks for +non_split_count_ranges <- readRDS(args$filtered_ranges_path) +tmp_se <- SummarizedExperiment(rowRanges = non_split_count_ranges) +saveRDS(tmp_se, file.path(out_dir, "se.rds")) +# ----------------------------------------------------------------------- -# --- 4. Load Dataset --- -# dir is the ROOT workdir, name is the cohort folder name +# 2. Configure Backend +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + +# 3. 
Load Dataset fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ ref_bam <- available_bams[1] + message("Validating metadata against reference BAM: ", ref_bam) colData(fds)$bamFile <- ref_bam seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) +} else { + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } strandSpecific(fds) <- 0 From 127cb74cf49d9146df63c70ca4e929d7061c6256 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 12:13:45 +1100 Subject: [PATCH 45/89] paths --- images/fraser/fraser_merge_non_split.R | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 64177a6b..e6b68478 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -22,8 +22,7 @@ save_dir <- file.path(args$work_dir, "savedObjects", fds_name) out_dir <- file.path(save_dir, "nonSplitCounts") dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) - -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) # --- MINIMUM FIX: Move files and create the missing 'se.rds' metadata --- h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) @@ -31,8 +30,7 @@ for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# Create the se.rds 'glue' file that FRASER/HDF5Array looks for -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +# Create the se.rds 'glue' file that loadHDF5SummarizedExperiment requires tmp_se <- SummarizedExperiment(rowRanges = non_split_count_ranges) saveRDS(tmp_se, file.path(out_dir, "se.rds")) # ----------------------------------------------------------------------- @@ -59,8 +57,7 
@@ if(length(available_bams) > 0){ strandSpecific(fds) <- 0 # --- 6. Merge Non-Split Counts --- -message("Merging counts using HDF5 files in: ", new_dir) -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +message("Merging counts using HDF5 files in: ", out_dir) non_split_counts <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = non_split_count_ranges, @@ -71,5 +68,5 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # 7. Final Save # This creates the complete 'fitted' dataset that the analysis script will use -saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) +saveRDS(non_split_counts, file.path(args$work_dir, "non_split_counts.RDS")) From 379900a33cf6297f4389547d90e2a4dcde14d7a6 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 12:45:31 +1100 Subject: [PATCH 46/89] make a better simmarized exp --- images/fraser/fraser_merge_non_split.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index e6b68478..4530e0e8 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -30,8 +30,19 @@ for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# Create the se.rds 'glue' file that loadHDF5SummarizedExperiment requires -tmp_se <- SummarizedExperiment(rowRanges = non_split_count_ranges) +# 1. Create a dummy matrix with the right number of samples (columns) +# This satisfies the: if(all(samples(fds) %in% colnames(siteCounts))) check +sample_names <- samples(fds) +dummy_matrix <- matrix(0, nrow=length(non_split_count_ranges), ncol=length(sample_names)) +colnames(dummy_matrix) <- sample_names + +# 2. Create the SE object with the dummy matrix +tmp_se <- SummarizedExperiment( + assays = list(rawCountsSS = dummy_matrix), # Name must match what FRASER expects + rowRanges = non_split_count_ranges +) + +# 3. 
Save it to the cache directory saveRDS(tmp_se, file.path(out_dir, "se.rds")) # ----------------------------------------------------------------------- From f32506ec7aed195dc3ce3787b89ed072df67df52 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 12:59:04 +1100 Subject: [PATCH 47/89] whoops --- images/fraser/fraser_merge_non_split.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 4530e0e8..5099d849 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -29,6 +29,8 @@ h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.n for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } +# 3. Load Dataset +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) # 1. Create a dummy matrix with the right number of samples (columns) # This satisfies the: if(all(samples(fds) %in% colnames(siteCounts))) check @@ -52,9 +54,6 @@ options("FRASER.maxJunctionsNoHDF5" = -1) bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 3. 
Load Dataset -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) - available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ From f162368ccc70b4f4423d140478c4892cc4d756a1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 13:39:49 +1100 Subject: [PATCH 48/89] handshake fix --- images/fraser/fraser_merge_non_split.R | 33 ++++++++++---------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 5099d849..71af95ec 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -4,6 +4,7 @@ library(argparse) library(FRASER) library(BiocParallel) library(SummarizedExperiment) +library(HDF5Array) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") @@ -15,40 +16,32 @@ args <- parser$parse_args() # 1. Load the ranges first so they are available for all steps non_split_count_ranges <- readRDS(args$filtered_ranges_path) - -# 2. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) out_dir <- file.path(save_dir, "nonSplitCounts") - dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) -# --- MINIMUM FIX: Move files and create the missing 'se.rds' metadata --- +# 2. Load FDS +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) + +# 3. Anchor HDF5 Directory h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# 3. Load Dataset -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) - -# 1. 
Create a dummy matrix with the right number of samples (columns) -# This satisfies the: if(all(samples(fds) %in% colnames(siteCounts))) check -sample_names <- samples(fds) -dummy_matrix <- matrix(0, nrow=length(non_split_count_ranges), ncol=length(sample_names)) -colnames(dummy_matrix) <- sample_names -# 2. Create the SE object with the dummy matrix -tmp_se <- SummarizedExperiment( - assays = list(rawCountsSS = dummy_matrix), # Name must match what FRASER expects +anchor_se <- SummarizedExperiment( + assays = list(rawCountsSS = matrix(0, + nrow = length(non_split_count_ranges), + ncol = length(samples(fds)), + dimnames = list(NULL, samples(fds)))), rowRanges = non_split_count_ranges ) +# This creates the se.rds that FRASER's loadHDF5SummarizedExperiment needs +saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "") -# 3. Save it to the cache directory -saveRDS(tmp_se, file.path(out_dir, "se.rds")) -# ----------------------------------------------------------------------- - -# 2. Configure Backend +# 4. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) bp <- MulticoreParam(workers = args$nthreads) From 7c09fd82c7c26e0cac4e72470d164e6f0dd0c19a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 14:17:56 +1100 Subject: [PATCH 49/89] save fix --- images/fraser/fraser_merge_non_split.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 71af95ec..40d92e77 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -71,5 +71,13 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # 7. 
Final Save # This creates the complete 'fitted' dataset that the analysis script will use -saveRDS(non_split_counts, file.path(args$work_dir, "non_split_counts.RDS")) +# Use the HDF5-safe saving method for the counts object specifically +# We save this to a new directory to avoid clobbering the input cache +saveHDF5SummarizedExperiment( + non_split_counts, + dir = file.path(args$work_dir, "merged_non_split_counts"), + replace = TRUE +) +# Also save the updated FDS object (this updates the internal fds-object.RDS) +fds <- saveFraserDataSet(fds) From 9b5e7b1efc201d396a81def2ae8f1e9b9084b3d7 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 16:35:12 +1100 Subject: [PATCH 50/89] Add join --- images/fraser/fraser_join_counts.R | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 images/fraser/fraser_join_counts.R diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R new file mode 100644 index 00000000..3ef817c0 --- /dev/null +++ b/images/fraser/fraser_join_counts.R @@ -0,0 +1,30 @@ +#!/usr/bin/env Rscript +library(argparse) +library(FRASER) +library(SummarizedExperiment) +library(HDF5Array) + +parser <- ArgumentParser(description = "Join Split and Non-Split Counts") +parser$add_argument("--fds_path", required = TRUE) +parser$add_argument("--cohort_id", required = TRUE) +parser$add_argument("--work_dir", default = "/io/work") +parser$add_argument("--nthreads", type = "integer", default = 1) +args <- parser$parse_args() + +# Set number of threads +register(MulticoreParam(args$nthreads)) + +fds_name <- paste0("FRASER_", args$cohort_id) + +# Load the FDS object +message(paste("Loading FraserDataSet from:", args$work_dir)) +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) + +# calculatePSIValues will look for HDF5 arrays in: +# 1. work_dir/savedObjects/fds_name/splitCounts +# 2. 
work_dir/savedObjects/fds_name/nonSplitCounts +message("Calculating PSI values using HDF5-backed assays...") +fds <- calculatePSIValues(fds) + +message("Saving integrated FraserDataSet...") +fds <- saveFraserDataSet(fds) From 98972c71e8e9bbae40e57ddbf0fe462d893e2a9c Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 16:45:08 +1100 Subject: [PATCH 51/89] Add join --- images/fraser/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 5d26c50f..630f1f38 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -34,3 +34,4 @@ COPY fraser_count_non_split.R RDrnaseq/ COPY fraser_init.R RDrnaseq/ COPY fraser_merge_non_split.R RDrnaseq/ COPY fraser_merge_split.R RDrnaseq/ +COPY fraser_join_counts.R RDrnaseq/ From 87097f02fa24a18e956462dd28a7790ad2e4c75b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 10:24:15 +1100 Subject: [PATCH 52/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 43 +++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 3ef817c0..8e1b2869 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -7,23 +7,46 @@ library(HDF5Array) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) parser$add_argument("--cohort_id", required = TRUE) +parser$add_argument("--filtered_ranges_path", required = TRUE) parser$add_argument("--work_dir", default = "/io/work") -parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() -# Set number of threads -register(MulticoreParam(args$nthreads)) - fds_name <- paste0("FRASER_", args$cohort_id) +# Paths to the HDF5 anchor directories +split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "splitCounts") +non_split_dir <- 
file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") -# Load the FDS object -message(paste("Loading FraserDataSet from:", args$work_dir)) +# 1. Load the FDS object fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# calculatePSIValues will look for HDF5 arrays in: -# 1. work_dir/savedObjects/fds_name/splitCounts -# 2. work_dir/savedObjects/fds_name/nonSplitCounts -message("Calculating PSI values using HDF5-backed assays...") +# 2. Re-anchor Split Counts (The 'j' counts) +# If the splitCounts folder was moved/localized, fds might lose the pointer. +# We reload the HDF5 SummarizedExperiment and inject it back into fds. +if(file.exists(file.path(split_dir, "se.rds"))){ + message("Re-anchoring split counts (rawCountsJ)...") + split_se <- loadHDF5SummarizedExperiment(dir = split_dir) + assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se) +} else { + stop("CRITICAL ERROR: splitCounts/se.rds not found. Cannot calculate PSI.") +} + +# 3. Handle Non-Split Counts (The 'ss' counts) +# Use the shim logic from before to ensure the 'rawCountsSS' is present +non_split_count_ranges <- readRDS(args$filtered_ranges_path) +if(!file.exists(file.path(non_split_dir, "se.rds"))){ + message("Creating anchor for non-split counts (rawCountsSS)...") + anchor_se <- SummarizedExperiment( + assays = list(rawCountsSS = matrix(0, nrow=length(non_split_count_ranges), + ncol=length(samples(fds)), + dimnames=list(NULL, samples(fds)))), + rowRanges = non_split_count_ranges + ) + saveHDF5SummarizedExperiment(anchor_se, dir=non_split_dir, replace=TRUE, prefix="") +} + +# 4. 
Calculate PSI +# Now that both 'rawCountsJ' and 'rawCountsSS' are linked, this will succeed +message("Calculating PSI values...") fds <- calculatePSIValues(fds) message("Saving integrated FraserDataSet...") From 15f83aea4886af2d1e9aeac6c380d1ad001b346e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 10:30:50 +1100 Subject: [PATCH 53/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 47 +++++++++++++++++------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 8e1b2869..c1310b9e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -7,8 +7,9 @@ library(HDF5Array) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) parser$add_argument("--cohort_id", required = TRUE) -parser$add_argument("--filtered_ranges_path", required = TRUE) parser$add_argument("--work_dir", default = "/io/work") +parser$add_argument("--filtered_ranges_path", required = TRUE) # Added this back +parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) @@ -19,35 +20,41 @@ non_split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCou # 1. Load the FDS object fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# 2. Re-anchor Split Counts (The 'j' counts) -# If the splitCounts folder was moved/localized, fds might lose the pointer. -# We reload the HDF5 SummarizedExperiment and inject it back into fds. +# 2. 
Re-attach Split Counts (Junctions) +# This prevents the "Missing rawCounts for 'j'" error if(file.exists(file.path(split_dir, "se.rds"))){ - message("Re-anchoring split counts (rawCountsJ)...") + message("Linking split counts (rawCountsJ)...") split_se <- loadHDF5SummarizedExperiment(dir = split_dir) - assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se) -} else { - stop("CRITICAL ERROR: splitCounts/se.rds not found. Cannot calculate PSI.") + assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se[, samples(fds)]) } -# 3. Handle Non-Split Counts (The 'ss' counts) -# Use the shim logic from before to ensure the 'rawCountsSS' is present +# 3. Handle Non-Split Counts (Splice Sites - 'ss') +# We need to ensure the se.rds anchor exists for the non-split files moved from Step 5 non_split_count_ranges <- readRDS(args$filtered_ranges_path) + if(!file.exists(file.path(non_split_dir, "se.rds"))){ - message("Creating anchor for non-split counts (rawCountsSS)...") + message("Creating HDF5 anchor for non-split counts (rawCountsSS)...") + # This shim tells FRASER how to read the loose .h5 files as a single assay anchor_se <- SummarizedExperiment( - assays = list(rawCountsSS = matrix(0, nrow=length(non_split_count_ranges), - ncol=length(samples(fds)), - dimnames=list(NULL, samples(fds)))), + assays = list(rawCountsSS = matrix(0, + nrow = length(non_split_count_ranges), + ncol = length(samples(fds)), + dimnames = list(NULL, samples(fds)))), rowRanges = non_split_count_ranges ) - saveHDF5SummarizedExperiment(anchor_se, dir=non_split_dir, replace=TRUE, prefix="") + # This creates the se.rds that FRASER needs to 'see' the .h5 files + saveHDF5SummarizedExperiment(anchor_se, dir = non_split_dir, replace = TRUE, prefix = "") } -# 4. Calculate PSI -# Now that both 'rawCountsJ' and 'rawCountsSS' are linked, this will succeed -message("Calculating PSI values...") -fds <- calculatePSIValues(fds) +# 4. 
Final Join and PSI Calculation +message("Calculating PSI values using both assays...") +# Link the non-split counts now that the anchor is ready +non_split_se <- loadHDF5SummarizedExperiment(dir = non_split_dir) +assay(fds, "rawCountsSS", withDimnames=FALSE) <- assay(non_split_se[, samples(fds)]) + +# Calculate ratios (PSI) +bp <- MulticoreParam(workers = args$nthreads) +fds <- calculatePSIValues(fds, BPPARAM = bp) -message("Saving integrated FraserDataSet...") +# 5. Save the final integrated object fds <- saveFraserDataSet(fds) From 8eb6413a9c299d445a501fa7366ca83caa0b2437 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 12:20:26 +1100 Subject: [PATCH 54/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index c1310b9e..f3ece970 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -3,16 +3,19 @@ library(argparse) library(FRASER) library(SummarizedExperiment) library(HDF5Array) +library(BiocParallel) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) parser$add_argument("--cohort_id", required = TRUE) parser$add_argument("--work_dir", default = "/io/work") -parser$add_argument("--filtered_ranges_path", required = TRUE) # Added this back parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) +# The orchestrator puts the non-split ranges here: +filtered_ranges_path <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") + # Paths to the HDF5 anchor directories split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "splitCounts") non_split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") @@ -29,12 +32,16 @@ 
if(file.exists(file.path(split_dir, "se.rds"))){ } # 3. Handle Non-Split Counts (Splice Sites - 'ss') -# We need to ensure the se.rds anchor exists for the non-split files moved from Step 5 -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +# We must use the ranges generated in the 'Merge Split' step +if(!file.exists(filtered_ranges_path)){ + stop(paste("Required non-split ranges not found at:", filtered_ranges_path)) +} +non_split_count_ranges <- readRDS(filtered_ranges_path) if(!file.exists(file.path(non_split_dir, "se.rds"))){ message("Creating HDF5 anchor for non-split counts (rawCountsSS)...") - # This shim tells FRASER how to read the loose .h5 files as a single assay + + # Create the container matching the dimensions FRASER expects anchor_se <- SummarizedExperiment( assays = list(rawCountsSS = matrix(0, nrow = length(non_split_count_ranges), @@ -47,12 +54,12 @@ if(!file.exists(file.path(non_split_dir, "se.rds"))){ } # 4. Final Join and PSI Calculation -message("Calculating PSI values using both assays...") -# Link the non-split counts now that the anchor is ready +message("Loading non-split counts...") non_split_se <- loadHDF5SummarizedExperiment(dir = non_split_dir) assay(fds, "rawCountsSS", withDimnames=FALSE) <- assay(non_split_se[, samples(fds)]) # Calculate ratios (PSI) +message("Calculating PSI values...") bp <- MulticoreParam(workers = args$nthreads) fds <- calculatePSIValues(fds, BPPARAM = bp) From a0632e03e49342847db325c86a88f3f631d5a9a0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 12:51:26 +1100 Subject: [PATCH 55/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 82 ++++++++++++++++-------------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index f3ece970..67a39755 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -3,7 +3,6 @@ library(argparse) library(FRASER) 
library(SummarizedExperiment) library(HDF5Array) -library(BiocParallel) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) @@ -13,55 +12,64 @@ parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) -# The orchestrator puts the non-split ranges here: -filtered_ranges_path <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") +saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # Paths to the HDF5 anchor directories -split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "splitCounts") -non_split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") +split_dir <- file.path(saveDir, "splitCounts") +non_split_dir <- file.path(saveDir, "nonSplitCounts") + +# Paths to the RDS files generated in previous steps +gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") +spliceSitePath <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") # 1. Load the FDS object +message("Loading Fraser Data Set...") fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# 2. Re-attach Split Counts (Junctions) -# This prevents the "Missing rawCounts for 'j'" error -if(file.exists(file.path(split_dir, "se.rds"))){ - message("Linking split counts (rawCountsJ)...") - split_se <- loadHDF5SummarizedExperiment(dir = split_dir) - assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se[, samples(fds)]) -} +# Load the ranges +splitCounts_gRanges <- readRDS(gRangesSplitPath) +spliceSiteCoords <- readRDS(spliceSitePath) + +# 2. Get splitReads counts +# The HDF5 file is named 'rawCountsJ.h5' inside the splitCounts subdir +splitCounts_h5_path <- file.path(split_dir, "rawCountsJ.h5") +message(paste("Loading split counts from:", splitCounts_h5_path)) -# 3. 
Handle Non-Split Counts (Splice Sites - 'ss') -# We must use the ranges generated in the 'Merge Split' step -if(!file.exists(filtered_ranges_path)){ - stop(paste("Required non-split ranges not found at:", filtered_ranges_path)) -} -non_split_count_ranges <- readRDS(filtered_ranges_path) +splitCounts_h5 <- HDF5Array::HDF5Array(splitCounts_h5_path, "rawCountsJ") +splitCounts_se <- SummarizedExperiment( + colData = colData(fds), + rowRanges = splitCounts_gRanges, + assays = list(rawCountsJ = splitCounts_h5) +) -if(!file.exists(file.path(non_split_dir, "se.rds"))){ - message("Creating HDF5 anchor for non-split counts (rawCountsSS)...") +# 3. Get nonSplitRead counts +# The HDF5 file is named 'rawCountsSS.h5' inside the nonSplitCounts subdir +nonSplitCounts_h5_path <- file.path(non_split_dir, "rawCountsSS.h5") +message(paste("Loading non-split counts from:", nonSplitCounts_h5_path)) - # Create the container matching the dimensions FRASER expects - anchor_se <- SummarizedExperiment( - assays = list(rawCountsSS = matrix(0, - nrow = length(non_split_count_ranges), - ncol = length(samples(fds)), - dimnames = list(NULL, samples(fds)))), - rowRanges = non_split_count_ranges - ) - # This creates the se.rds that FRASER needs to 'see' the .h5 files - saveHDF5SummarizedExperiment(anchor_se, dir = non_split_dir, replace = TRUE, prefix = "") -} +nonSplitCounts_h5 <- HDF5Array::HDF5Array(nonSplitCounts_h5_path, "rawCountsSS") +nonSplitCounts_se <- SummarizedExperiment( + colData = colData(fds), + rowRanges = spliceSiteCoords, + assays = list(rawCountsSS = nonSplitCounts_h5) +) -# 4. Final Join and PSI Calculation -message("Loading non-split counts...") -non_split_se <- loadHDF5SummarizedExperiment(dir = non_split_dir) -assay(fds, "rawCountsSS", withDimnames=FALSE) <- assay(non_split_se[, samples(fds)]) +# 4. 
Add Counts to FRASER object +# This function automatically populates the internal slots correctly +message("Joining assays into FDS object...") +fds <- addCountsToFraserDataSet( + fds = fds, + splitCounts = splitCounts_se, + nonSplitCounts = nonSplitCounts_se +) -# Calculate ratios (PSI) +# 5. Calculate PSI values message("Calculating PSI values...") bp <- MulticoreParam(workers = args$nthreads) fds <- calculatePSIValues(fds, BPPARAM = bp) -# 5. Save the final integrated object +# 6. Save final FRASER object +message("Saving integrated FDS...") fds <- saveFraserDataSet(fds) + +message("FRASER join complete.") From 0c832c3d3c107b83517cd5999412a07d316bc4cb Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 13:34:02 +1100 Subject: [PATCH 56/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 67a39755..b413f327 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -31,11 +31,8 @@ splitCounts_gRanges <- readRDS(gRangesSplitPath) spliceSiteCoords <- readRDS(spliceSitePath) # 2. Get splitReads counts -# The HDF5 file is named 'rawCountsJ.h5' inside the splitCounts subdir -splitCounts_h5_path <- file.path(split_dir, "rawCountsJ.h5") -message(paste("Loading split counts from:", splitCounts_h5_path)) - -splitCounts_h5 <- HDF5Array::HDF5Array(splitCounts_h5_path, "rawCountsJ") +message("Loading split counts...") +splitCounts_h5 <- HDF5Array(file.path(split_dir, "rawCountsJ"), "rawCountsJ") splitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = splitCounts_gRanges, @@ -43,11 +40,8 @@ splitCounts_se <- SummarizedExperiment( ) # 3. 
Get nonSplitRead counts -# The HDF5 file is named 'rawCountsSS.h5' inside the nonSplitCounts subdir -nonSplitCounts_h5_path <- file.path(non_split_dir, "rawCountsSS.h5") -message(paste("Loading non-split counts from:", nonSplitCounts_h5_path)) - -nonSplitCounts_h5 <- HDF5Array::HDF5Array(nonSplitCounts_h5_path, "rawCountsSS") +message("Loading non-split counts...") +nonSplitCounts_h5 <- HDF5Array(file.path(non_split_dir, "rawCountsSS"), "rawCountsSS") nonSplitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = spliceSiteCoords, From d5b51ef9d942ff4cd074b3d11a89a809017622d5 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 14:57:41 +1100 Subject: [PATCH 57/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 42 +++++++++++++----------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index b413f327..1fee3300 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -14,13 +14,14 @@ args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) -# Paths to the HDF5 anchor directories -split_dir <- file.path(saveDir, "splitCounts") -non_split_dir <- file.path(saveDir, "nonSplitCounts") - # Paths to the RDS files generated in previous steps gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -spliceSitePath <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") +spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") # Correct file + +# Load the ranges +splitCounts_gRanges <- readRDS(gRangesSplitPath) +spliceSiteCoords <- readRDS(spliceSitePath) # Now loads the actual splice site coords + # 1. Load the FDS object message("Loading Fraser Data Set...") @@ -32,21 +33,19 @@ spliceSiteCoords <- readRDS(spliceSitePath) # 2. 
Get splitReads counts message("Loading split counts...") -splitCounts_h5 <- HDF5Array(file.path(split_dir, "rawCountsJ"), "rawCountsJ") -splitCounts_se <- SummarizedExperiment( - colData = colData(fds), - rowRanges = splitCounts_gRanges, - assays = list(rawCountsJ = splitCounts_h5) -) +split_se_path <- file.path(saveDir, "splitCounts", "se.rds") +if(!file.exists(split_se_path)){ + stop(paste("Missing splitCounts anchor at:", split_se_path)) +} +splitCounts_se <- readRDS(split_se_path) # 3. Get nonSplitRead counts message("Loading non-split counts...") -nonSplitCounts_h5 <- HDF5Array(file.path(non_split_dir, "rawCountsSS"), "rawCountsSS") -nonSplitCounts_se <- SummarizedExperiment( - colData = colData(fds), - rowRanges = spliceSiteCoords, - assays = list(rawCountsSS = nonSplitCounts_h5) -) +non_split_se_path <- file.path(saveDir, "nonSplitCounts", "se.rds") +if(!file.exists(non_split_se_path)){ + stop(paste("Missing nonSplitCounts anchor at:", non_split_se_path)) +} +nonSplitCounts_se <- readRDS(non_split_se_path) # 4. Add Counts to FRASER object # This function automatically populates the internal slots correctly @@ -57,13 +56,8 @@ fds <- addCountsToFraserDataSet( nonSplitCounts = nonSplitCounts_se ) -# 5. Calculate PSI values -message("Calculating PSI values...") -bp <- MulticoreParam(workers = args$nthreads) -fds <- calculatePSIValues(fds, BPPARAM = bp) - -# 6. Save final FRASER object -message("Saving integrated FDS...") +# 5. 
Save final FRASER object +message("Saving final integrated FDS...") fds <- saveFraserDataSet(fds) message("FRASER join complete.") From 20a8af25b54709c3662f55f816c7f0387b4c04c5 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 15:54:32 +1100 Subject: [PATCH 58/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 1fee3300..2994d28a 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -16,16 +16,14 @@ saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # Paths to the RDS files generated in previous steps gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") # Correct file - -# Load the ranges -splitCounts_gRanges <- readRDS(gRangesSplitPath) -spliceSiteCoords <- readRDS(spliceSitePath) # Now loads the actual splice site coords +spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") +# 1. Load the FDS object DIRECTLY from RDS (not using loadFraserDataSet) +message("Loading Fraser Data Set from RDS...") +fds <- readRDS(args$fds_path) -# 1. 
Load the FDS object -message("Loading Fraser Data Set...") -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# Update the internal directory path to match the current work directory +workingDir(fds) <- saveDir # Load the ranges splitCounts_gRanges <- readRDS(gRangesSplitPath) From 1f418f4ba498b639c1eaf5d0420e1b2e5dc75b8a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 16:57:28 +1100 Subject: [PATCH 59/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 47 ++++++++++++++++-------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 2994d28a..b8eef65e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -14,39 +14,42 @@ args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) -# Paths to the RDS files generated in previous steps -gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") - -# 1. Load the FDS object DIRECTLY from RDS (not using loadFraserDataSet) +# 1. Load the FDS object message("Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) # Update the internal directory path to match the current work directory workingDir(fds) <- saveDir -# Load the ranges -splitCounts_gRanges <- readRDS(gRangesSplitPath) -spliceSiteCoords <- readRDS(spliceSitePath) - -# 2. Get splitReads counts +# 2. 
Load split counts message("Loading split counts...") -split_se_path <- file.path(saveDir, "splitCounts", "se.rds") -if(!file.exists(split_se_path)){ - stop(paste("Missing splitCounts anchor at:", split_se_path)) +split_counts_path <- file.path(args$work_dir, "g_ranges_split_counts.RDS") +if(!file.exists(split_counts_path)){ + stop(paste("Missing split counts at:", split_counts_path)) +} +splitCounts_se <- readRDS(split_counts_path) + +# CRITICAL: Materialize the assays if they're HDF5-backed with broken paths +message("Materializing split counts assays...") +for(assay_name in names(assays(splitCounts_se))){ + assays(splitCounts_se)[[assay_name]] <- as.matrix(assays(splitCounts_se)[[assay_name]]) +} + +# 3. Load merged non-split counts +message("Loading merged non-split counts...") +merged_non_split_dir <- file.path(args$work_dir, "merged_non_split_counts") +if(!dir.exists(merged_non_split_dir)){ + stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) } -splitCounts_se <- readRDS(split_se_path) +nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) -# 3. Get nonSplitRead counts -message("Loading non-split counts...") -non_split_se_path <- file.path(saveDir, "nonSplitCounts", "se.rds") -if(!file.exists(non_split_se_path)){ - stop(paste("Missing nonSplitCounts anchor at:", non_split_se_path)) +# CRITICAL: Materialize non-split counts assays too +message("Materializing non-split counts assays...") +for(assay_name in names(assays(nonSplitCounts_se))){ + assays(nonSplitCounts_se)[[assay_name]] <- as.matrix(assays(nonSplitCounts_se)[[assay_name]]) } -nonSplitCounts_se <- readRDS(non_split_se_path) -# 4. Add Counts to FRASER object -# This function automatically populates the internal slots correctly +# 4. 
Add counts to FRASER object message("Joining assays into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, From 0e7bdb2f7a51a3c0d12b0f5e6c3df2c0edd87b3b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 17:26:37 +1100 Subject: [PATCH 60/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 23 +++++------------------ images/fraser/fraser_merge_split.R | 13 ++++++++++--- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index b8eef65e..70d6c06d 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -17,23 +17,16 @@ saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # 1. Load the FDS object message("Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) - -# Update the internal directory path to match the current work directory workingDir(fds) <- saveDir # 2. Load split counts message("Loading split counts...") -split_counts_path <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -if(!file.exists(split_counts_path)){ - stop(paste("Missing split counts at:", split_counts_path)) -} -splitCounts_se <- readRDS(split_counts_path) - -# CRITICAL: Materialize the assays if they're HDF5-backed with broken paths -message("Materializing split counts assays...") -for(assay_name in names(assays(splitCounts_se))){ - assays(splitCounts_se)[[assay_name]] <- as.matrix(assays(splitCounts_se)[[assay_name]]) +split_se_dir <- file.path(saveDir, "splitCounts") +if(!dir.exists(split_se_dir)){ + stop(paste("Missing splitCounts directory at:", split_se_dir, + "\nYou need to run the split counting step first")) } +splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) # 3. 
Load merged non-split counts message("Loading merged non-split counts...") @@ -43,12 +36,6 @@ if(!dir.exists(merged_non_split_dir)){ } nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) -# CRITICAL: Materialize non-split counts assays too -message("Materializing non-split counts assays...") -for(assay_name in names(assays(nonSplitCounts_se))){ - assays(nonSplitCounts_se)[[assay_name]] <- as.matrix(assays(nonSplitCounts_se)[[assay_name]]) -} - # 4. Add counts to FRASER object message("Joining assays into FDS object...") fds <- addCountsToFraserDataSet( diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 4eb6a8e4..3b0758d4 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -58,8 +58,15 @@ minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") -splitCounts <- getSplitReadCountsForAllSamples(fds=fds, - recount=FALSE) +splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE) + +# SAVE THE SPLIT COUNTS BEFORE FILTERING +message("Saving merged split counts SummarizedExperiment...") +split_counts_dir <- file.path(save_dir, "splitCounts") +dir.create(split_counts_dir, recursive = TRUE, showWarnings = FALSE) +saveHDF5SummarizedExperiment(splitCounts, dir = split_counts_dir, replace = TRUE) + +# Now continue with filtering for annotations splitCountRanges <- rowRanges(splitCounts) splitCountRangesNonFilt <- FRASER:::annotateSpliceSite(splitCountRanges) @@ -71,7 +78,7 @@ spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) # Use absolute paths for saving to match Python 'mv' commands -#This is sslightly confusing, but the filtered granges will be used to annotate non_split counts +#This is slightly confusing, but the filtered granges will be used to annotate non_split counts 
saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) From 3c025364b0b1ba296e87bdaddd0c115e1955f68d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 17:45:24 +1100 Subject: [PATCH 61/89] fix file path handling in join --- images/fraser/Dockerfile | 1 + images/fraser/fraser_merge_split.R | 1 + 2 files changed, 2 insertions(+) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 630f1f38..8e7505df 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -23,6 +23,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ # Install summarizedRT via BiocManager micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ micromamba run -n base Rscript -e 'BiocManager::install("Rsamtools", update=FALSE, ask=FALSE)' && \ + micromamba run -n base Rscript -e 'BiocManager::install("HDF5Array", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 3b0758d4..a1ea45e8 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -6,6 +6,7 @@ library(BiocParallel) library(SummarizedExperiment) library(Rsamtools) library(DelayedMatrixStats) +library(HDF5Array) parser <- ArgumentParser(description = "Merge Split Read Counts") From b7b7bd5104b1ea7c7d3d301b3f25ebd29abf6e7e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 18:19:34 +1100 Subject: [PATCH 62/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 70d6c06d..1429b558 100644 
--- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -23,14 +23,13 @@ workingDir(fds) <- saveDir message("Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ - stop(paste("Missing splitCounts directory at:", split_se_dir, - "\nYou need to run the split counting step first")) + stop(paste("Missing splitCounts directory at:", split_se_dir)) } splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) # 3. Load merged non-split counts message("Loading merged non-split counts...") -merged_non_split_dir <- file.path(args$work_dir, "merged_non_split_counts") +merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) } From d574eaff4625d88ee3d27e19d3484d078a7ea553 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 2 Feb 2026 15:54:09 +1100 Subject: [PATCH 63/89] fixing the analysis script --- images/fraser/fraser_analysis.R | 117 ++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 52 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 0ff7e409..dbdd7cda 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -5,6 +5,7 @@ library(FRASER) library(tidyverse) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) +library(HDF5Array) library(ggplot2) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") @@ -17,73 +18,85 @@ parser$add_argument("--min_count", type = "integer", default = 5) parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() -# Force HDF5 for large matrix operations -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) - -z_cutoff <- if (is.null(args$z_cutoff)) NA else args$z_cutoff +# --- 1. 
Parallelization Setup --- +# Use the user-provided nthreads for all BiocParallel operations bp <- MulticoreParam(workers = args$nthreads) register(bp) # 1. Load the merged FDS # Python extracted the tar into args$fds_dir. # Inside is 'savedObjects/FRASER_{cohort_id}/...' +options(delayedArray.block.size = 1e9) # 1GB blocks fds_name <- paste0("FRASER_", args$cohort_id) -fds <- loadFraserDataSet(dir = args$fds_dir, name = fds_name) +message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) +fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) + + +# --- 3. Filtering --- +# It is critical to filter before fitting to reduce the size of the latent space matrices +fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) + +# Create QC directory +dir.create("qc_plots", showWarnings = FALSE) + +# DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast +set.seed(42) +plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) +fds_plot_subset <- fds[plot_idx, ] -# 2. Filter Expression and Variability -# This reduces the number of tests and improves power -message("Filtering junctions...") +message("Generating QC plots using downsampled subset...") +png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) +plotFilterExpression(fds_plot_subset, bins = 100) +dev.off() + +# Apply actual filtering to full dataset fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) -dir.create("plots/misc", recursive = TRUE, showWarnings = FALSE) -png("plots/misc/filter_expression.png", width = 2000, height = 2000, res = 300) -plotFilterExpression(fds, bins = 100) +# --- 4. 
Dimensionality Message --- +raw_dim <- nrow(fds) +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] +filtered_dim <- nrow(fds_filtered) + +message(paste0("\n--- Filtering Summary ---")) +message(paste0("Original junctions: ", raw_dim)) +message(paste0("Filtered junctions: ", filtered_dim)) +message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) + +# --- 5. Hyperparameter Optimization --- +# Optimization must run on the filtered set +opt_q <- bestQ(fds_filtered, type = "jaccard") + +png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) +plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() -# 3. Hyperparameter Optimization (bestQ) and Fitting -# FRASER uses an autoencoder to control for latent confounders (batch effects) -message("Finding optimal hyperparameter Q and fitting autoencoder...") -# In FRASER 2.0, we typically fit psi5, psi3, and jaccard -fds <- FRASER(fds, q = c(psi5=NA, psi3=NA, jaccard=NA), BPPARAM = bp) - -# 4. Annotation -message("Annotating results with UCSC hg38...") -txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene -orgDb <- org.Hs.eg.db -fds <- annotateRangesWithTxDb(fds, txdb = txdb, orgDb = orgDb) - -# 5. Extract Results -message("Exporting results...") -res <- results(fds, - padjCutoff = args$pval_cutoff, - deltaPsiCutoff = args$delta_psi_cutoff, - zScoreCutoff = z_cutoff, - minCount = args$min_count) +# --- 6. Fitting --- +fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -res_all <- results(fds, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) +png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) +plotQQ(fds_fit, type = "jaccard") +dev.off() -write_csv(as.data.frame(res), "results.significant.csv") -write_csv(as.data.frame(res_all), "results.all.csv") +# --- 7. 
Annotation --- +fds_fit <- annotateRangesWithTxDb(fds_fit, + txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, + orgDb = org.Hs.eg.db) -# 6. Generate Diagnostic Plots -message("Generating Volcano and Q-Q plots...") -dir.create("plots/volcano", recursive = TRUE) -for(type in c("psi5", "psi3", "jaccard")){ - png(paste0("plots/volcano/volcano_", type, ".png"), width = 2000, height = 2000, res = 300) - print(plotVolcano(fds, type = type, sampleID = sampleIDs(fds)[1])) # Plots first sample as example - dev.off() -} +# --- 8. Results & Compressed Export --- +res <- results(fds_fit, + padjCutoff = args$pval_cutoff, + deltaPsiCutoff = args$delta_psi_cutoff, + zScoreCutoff = args$z_cutoff, + minCount = args$min_count) -# 7. Final Save -# We save this in a generic folder so Python can easily grab it -saveFraserDataSet(fds, dir = "output", name = paste0("Final_", args$cohort_id)) +# Extract all results for Jaccard +res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) -# Summary statistics for the log -sink("statistics_summary.txt") -cat("Cohort ID:", args$cohort_id, "\n") -cat("Total Samples:", length(sampleIDs(fds)), "\n") -cat("Significant Events (adj P <", args$pval_cutoff, "):", nrow(res), "\n") -sink() +message("Saving results...") +write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) +write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -message("Analysis complete.") +# --- 9. Final Save --- +saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) +message("Analysis Complete.") \ No newline at end of file From 58c4462e728e6e83f0e3e11b547b110905107952 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 3 Feb 2026 10:12:45 +1100 Subject: [PATCH 64/89] Analysis steps are out of order? 
--- images/fraser/fraser_analysis.R | 56 ++++++++++++++------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index dbdd7cda..cad479ec 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -7,6 +7,7 @@ library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) library(HDF5Array) library(ggplot2) +library(BiocParallel) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") @@ -19,51 +20,39 @@ parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() # --- 1. Parallelization Setup --- -# Use the user-provided nthreads for all BiocParallel operations bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 1. Load the merged FDS -# Python extracted the tar into args$fds_dir. -# Inside is 'savedObjects/FRASER_{cohort_id}/...' -options(delayedArray.block.size = 1e9) # 1GB blocks +# --- 2. Load the Dataset --- +options(delayedArray.block.size = 1e9) fds_name <- paste0("FRASER_", args$cohort_id) message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) +# --- 3. Filtering & PSI Calculation --- +# Filter first to keep the Jaccard matrix calculation lean +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -# --- 3. Filtering --- -# It is critical to filter before fitting to reduce the size of the latent space matrices -fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) +# Calculate Jaccard metrics only on filtered junctions +fitMetrics(fds_filtered) <- "jaccard" +fds_filtered <- calculatePSIValues(fds_filtered, types = "jaccard", BPPARAM = bp) -# Create QC directory +# --- 4. 
QC Plots (Downsampled) --- dir.create("qc_plots", showWarnings = FALSE) - -# DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast set.seed(42) -plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) -fds_plot_subset <- fds[plot_idx, ] +plot_idx <- sample(nrow(fds_filtered), min(nrow(fds_filtered), 30000)) +fds_plot_subset <- fds_filtered[plot_idx, ] -message("Generating QC plots using downsampled subset...") +message("Generating QC plots...") png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) plotFilterExpression(fds_plot_subset, bins = 100) dev.off() -# Apply actual filtering to full dataset -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) - -# --- 4. Dimensionality Message --- -raw_dim <- nrow(fds) -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -filtered_dim <- nrow(fds_filtered) - -message(paste0("\n--- Filtering Summary ---")) -message(paste0("Original junctions: ", raw_dim)) -message(paste0("Filtered junctions: ", filtered_dim)) -message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) - # --- 5. Hyperparameter Optimization --- -# Optimization must run on the filtered set +message("Optimizing Hyperparameters (q) for Jaccard...") +# This step is required before calling bestQ +fds_filtered <- optimHyperParams(fds_filtered, type = "jaccard", BPPARAM = bp) opt_q <- bestQ(fds_filtered, type = "jaccard") png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) @@ -71,29 +60,30 @@ plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() # --- 6. Fitting --- +message(paste0("Fitting model with q=", opt_q)) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) plotQQ(fds_fit, type = "jaccard") dev.off() # --- 7. 
Annotation --- +message("Annotating junctions...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 8. Results & Compressed Export --- +# --- 8. Results Export --- +message("Extracting results...") res <- results(fds_fit, + type = "jaccard", padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, zScoreCutoff = args$z_cutoff, minCount = args$min_count) -# Extract all results for Jaccard -res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +res_all <- results(fds_fit, type = "jaccard", padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) -message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) From 696a1f7bd8d1a1e2544a5b3752aa15a0ccab55e1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 3 Feb 2026 15:40:25 +1100 Subject: [PATCH 65/89] they were not, putting it back --- images/fraser/fraser_analysis.R | 56 +++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index cad479ec..dbdd7cda 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -7,7 +7,6 @@ library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) library(HDF5Array) library(ggplot2) -library(BiocParallel) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") @@ -20,39 +19,51 @@ parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() # --- 1. Parallelization Setup --- +# Use the user-provided nthreads for all BiocParallel operations bp <- MulticoreParam(workers = args$nthreads) register(bp) -# --- 2. Load the Dataset --- -options(delayedArray.block.size = 1e9) +# 1. 
Load the merged FDS +# Python extracted the tar into args$fds_dir. +# Inside is 'savedObjects/FRASER_{cohort_id}/...' +options(delayedArray.block.size = 1e9) # 1GB blocks fds_name <- paste0("FRASER_", args$cohort_id) message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) -# --- 3. Filtering & PSI Calculation --- -# Filter first to keep the Jaccard matrix calculation lean -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -# Calculate Jaccard metrics only on filtered junctions -fitMetrics(fds_filtered) <- "jaccard" -fds_filtered <- calculatePSIValues(fds_filtered, types = "jaccard", BPPARAM = bp) +# --- 3. Filtering --- +# It is critical to filter before fitting to reduce the size of the latent space matrices +fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) -# --- 4. QC Plots (Downsampled) --- +# Create QC directory dir.create("qc_plots", showWarnings = FALSE) + +# DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast set.seed(42) -plot_idx <- sample(nrow(fds_filtered), min(nrow(fds_filtered), 30000)) -fds_plot_subset <- fds_filtered[plot_idx, ] +plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) +fds_plot_subset <- fds[plot_idx, ] -message("Generating QC plots...") +message("Generating QC plots using downsampled subset...") png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) plotFilterExpression(fds_plot_subset, bins = 100) dev.off() +# Apply actual filtering to full dataset +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) + +# --- 4. 
Dimensionality Message --- +raw_dim <- nrow(fds) +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] +filtered_dim <- nrow(fds_filtered) + +message(paste0("\n--- Filtering Summary ---")) +message(paste0("Original junctions: ", raw_dim)) +message(paste0("Filtered junctions: ", filtered_dim)) +message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) + # --- 5. Hyperparameter Optimization --- -message("Optimizing Hyperparameters (q) for Jaccard...") -# This step is required before calling bestQ -fds_filtered <- optimHyperParams(fds_filtered, type = "jaccard", BPPARAM = bp) +# Optimization must run on the filtered set opt_q <- bestQ(fds_filtered, type = "jaccard") png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) @@ -60,30 +71,29 @@ plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() # --- 6. Fitting --- -message(paste0("Fitting model with q=", opt_q)) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) +# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) plotQQ(fds_fit, type = "jaccard") dev.off() # --- 7. Annotation --- -message("Annotating junctions...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 8. Results Export --- -message("Extracting results...") +# --- 8. 
Results & Compressed Export --- res <- results(fds_fit, - type = "jaccard", padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, zScoreCutoff = args$z_cutoff, minCount = args$min_count) -res_all <- results(fds_fit, type = "jaccard", padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +# Extract all results for Jaccard +res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) From f6674ad0144e50d6ddbd1b5247e26e72a5e4f8f4 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 12:02:14 +1100 Subject: [PATCH 66/89] adds rescue block --- images/fraser/fraser_analysis.R | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index dbdd7cda..c73e1f30 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -26,11 +26,23 @@ register(bp) # 1. Load the merged FDS # Python extracted the tar into args$fds_dir. # Inside is 'savedObjects/FRASER_{cohort_id}/...' 
-options(delayedArray.block.size = 1e9) # 1GB blocks + fds_name <- paste0("FRASER_", args$cohort_id) -message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) +message(paste0("Loading Fraser Data Set: ", fds_name)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) +# REPAIR: If spliceSiteID is missing, FRASER 2.0 cannot calculate Jaccard/PSI +if(is.null(nonSplicedReads(fds)) || !("spliceSiteID" %in% colnames(mcols(fds, type="ss")))){ + message("Restoring missing Splice Site IDs for Jaccard calculation...") + + # We re-extract the splice site map from the existing junctions + # this forces FRASER to re-index the donor/acceptor relationships + fds <- makeFraserDataSet(colData=colData(fds), + junctions=rowRanges(fds, type="j")) + + # Re-assign counts if they were lost during the re-make + # (Usually not necessary if using loadFraserDataSet, but good for safety) +} # --- 3. Filtering --- # It is critical to filter before fitting to reduce the size of the latent space matrices From cc80fc96aeb95e0fac96d65689090f8762e0a94b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 13:06:25 +1100 Subject: [PATCH 67/89] rescuing the rescue block --- images/fraser/fraser_analysis.R | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index c73e1f30..7f534135 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -23,25 +23,26 @@ args <- parser$parse_args() bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 1. Load the merged FDS -# Python extracted the tar into args$fds_dir. -# Inside is 'savedObjects/FRASER_{cohort_id}/...' - +# --- 2. 
Load the Dataset --- +options(delayedArray.block.size = 1e9) fds_name <- paste0("FRASER_", args$cohort_id) message(paste0("Loading Fraser Data Set: ", fds_name)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) # REPAIR: If spliceSiteID is missing, FRASER 2.0 cannot calculate Jaccard/PSI -if(is.null(nonSplicedReads(fds)) || !("spliceSiteID" %in% colnames(mcols(fds, type="ss")))){ - message("Restoring missing Splice Site IDs for Jaccard calculation...") - - # We re-extract the splice site map from the existing junctions - # this forces FRASER to re-index the donor/acceptor relationships - fds <- makeFraserDataSet(colData=colData(fds), - junctions=rowRanges(fds, type="j")) - - # Re-assign counts if they were lost during the re-make - # (Usually not necessary if using loadFraserDataSet, but good for safety) +if(! "spliceSiteID" %in% colnames(mcols(fds, type="ss"))){ + message("Manually injecting Splice Site IDs...") + + # This internal FRASER call generates the mapping without needing the full constructor + # It populates the 'spliceSiteCoords' slot which calculatePSIValues needs + fds <- FRASER:::annotateSpliceSite(fds) + + # We also need to ensure the Jaccard-specific metadata is initialized + # Often, this is what's missing when the merge.data.table fails + if(is.null(mcols(fds, type="j")$startID)){ + # This maps junctions to the spliceSiteIDs we just generated + fds <- FRASER:::updateIndices(fds) + } } # --- 3. 
Filtering --- From 7365090d0e8ae4ac8b192c6442a77cbf9b41a647 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 13:46:58 +1100 Subject: [PATCH 68/89] unrescuing the rescue block --- images/fraser/fraser_join_counts.R | 48 ++++++++++++++++++++++++++ images/fraser/fraser_merge_non_split.R | 4 ++- images/fraser/fraser_merge_split.R | 15 +++++--- 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 1429b558..d1ae66a0 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -11,6 +11,48 @@ parser$add_argument("--work_dir", default = "/io/work") parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() +check_fds_integrity <- function(fds) { + message("\n--- Running FDS Integrity Check ---") + + # 1. Check for basic ID existence + j_cols <- colnames(mcols(fds, type="j")) + ss_cols <- colnames(mcols(fds, type="ss")) + + has_ids <- all(c("startID", "endID") %in% j_cols) && ("spliceSiteID" %in% ss_cols) + + if(has_ids) { + message("SUCCESS: SpliceSiteIDs found in both Junctions and SpliceSites.") + } else { + stop("CRITICAL ERROR: SpliceSiteID metadata is missing. Analysis will fail.") + } + + # 2. Check for ID alignment (The "by.y" error test) + # Are all junction startIDs present in the Splice Site map? + missing_starts <- !all(mcols(fds, type="j")$startID %in% mcols(fds, type="ss")$spliceSiteID) + if(missing_starts) { + stop("CRITICAL ERROR: Junction startIDs do not match SpliceSiteIDs. Map is broken.") + } + message("SUCCESS: Junction IDs are correctly mapped to Splice Sites.") + + # 3. 
Check HDF5 Backend connectivity + # Ensure the pointers to the .h5 files are valid and readable + tryCatch({ + test_val <- as.matrix(counts(fds, type="j")[1, 1]) + message("SUCCESS: HDF5 backends are reachable and readable.") + }, error = function(e) { + stop("CRITICAL ERROR: HDF5 files are missing or paths are broken: ", e$message) + }) + + # 4. Check for Non-Split Counts + if("rawCountsSS" %in% assayNames(fds)) { + message("SUCCESS: Non-split counts (rawCountsSS) are present.") + } else { + warning("WARNING: Non-split counts are missing. Jaccard/PSI calculation may fail.") + } + + message("--- Integrity Check Passed ---\n") +} + fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) @@ -43,6 +85,12 @@ fds <- addCountsToFraserDataSet( nonSplitCounts = nonSplitCounts_se ) +# Instead of updateIndices, use this to force the internal ID mapping +# This will populate the 'ss' (splice site) metadata correctly +fds <- calculatePSIValues(fds, types="psi5") + +# --- Call the check before finishing --- +check_fds_integrity(fds) # 5. Save final FRASER object message("Saving final integrated FDS...") fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 40d92e77..2519ce2f 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -69,7 +69,8 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( ) # 7. 
Final Save -# This creates the complete 'fitted' dataset that the analysis script will use +# This populates the internal 'nonSplicedReads' slot and the SS map +nonSplicedReads(fds) <- non_split_counts # Use the HDF5-safe saving method for the counts object specifically # We save this to a new directory to avoid clobbering the input cache @@ -79,5 +80,6 @@ saveHDF5SummarizedExperiment( replace = TRUE ) +fds <- calculatePSIValues(fds) # Also save the updated FDS object (this updates the internal fds-object.RDS) fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index a1ea45e8..b403066e 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -61,15 +61,20 @@ minExpressionInOneSample <- 20 message("Merging split counts from cache...") splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE) -# SAVE THE SPLIT COUNTS BEFORE FILTERING -message("Saving merged split counts SummarizedExperiment...") +# --- CRITICAL FIX: Annotate the object ITSELF, not a temporary variable --- +# This generates startID and endID and attaches them to the rowRanges +message("Generating Splice Site IDs...") +rowRanges(splitCounts) <- FRASER:::annotateSpliceSite(rowRanges(splitCounts)) + +# 5. Now save the Annotated SE +message("Saving merged split counts SummarizedExperiment with IDs...") split_counts_dir <- file.path(save_dir, "splitCounts") dir.create(split_counts_dir, recursive = TRUE, showWarnings = FALSE) saveHDF5SummarizedExperiment(splitCounts, dir = split_counts_dir, replace = TRUE) -# Now continue with filtering for annotations -splitCountRanges <- rowRanges(splitCounts) -splitCountRangesNonFilt <- FRASER:::annotateSpliceSite(splitCountRanges) +# 6. 
Extract coordinates for the next pipeline steps +# We use the rowRanges that NOW HAVE the startID/endID columns +splitCountRangesNonFilt <- rowRanges(splitCounts) maxCount <- rowMaxs(assay(splitCounts, "rawCountsJ")) passed <- maxCount >= minExpressionInOneSample From f273223d7b957f57f5231bd4a3c971c07c20af41 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 14:48:42 +1100 Subject: [PATCH 69/89] merging non split cannot calculate psi --- images/fraser/fraser_merge_non_split.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 2519ce2f..8e22c44c 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -80,6 +80,6 @@ saveHDF5SummarizedExperiment( replace = TRUE ) -fds <- calculatePSIValues(fds) + # Also save the updated FDS object (this updates the internal fds-object.RDS) fds <- saveFraserDataSet(fds) From 149615db48c80c2b3054a35fb097b95ca0f933e1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 15:24:18 +1100 Subject: [PATCH 70/89] join counts still cant calculate psi --- images/fraser/fraser_join_counts.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index d1ae66a0..80d9c019 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -77,7 +77,13 @@ if(!dir.exists(merged_non_split_dir)){ } nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) -# 4. Add counts to FRASER object +# 4. Annotate the Split Counts +# This is the missing link. It generates the spliceSiteID mapping +# that calculatePSIValues needs later. +message("Annotating splice sites...") +splitCounts_se <- annotateSpliceSites(splitCounts_se) + +# 5. 
Add counts to FRASER object message("Joining assays into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, @@ -85,9 +91,11 @@ fds <- addCountsToFraserDataSet( nonSplitCounts = nonSplitCounts_se ) -# Instead of updateIndices, use this to force the internal ID mapping +# --- Call the check before finishing --- +check_fds_integrity(fds) + # This will populate the 'ss' (splice site) metadata correctly -fds <- calculatePSIValues(fds, types="psi5") +fds <- calculatePSIValues(fds, types="jaccard") # --- Call the check before finishing --- check_fds_integrity(fds) From 34f7991fbdb425be4388bbfc42d13111ecfb5423 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 15:27:04 +1100 Subject: [PATCH 71/89] explicitly annotating non-split counts --- images/fraser/fraser_merge_non_split.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 8e22c44c..f33c3254 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -68,6 +68,11 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) +# CRITICAL ADDITION: Annotate the non-split counts object! +# This assigns 'spliceSiteID' to the rowRanges of your non-split object. +message("Annotating non-split splice site IDs...") +non_split_counts <- annotateSpliceSites(non_split_counts) + # 7. 
Final Save # This populates the internal 'nonSplicedReads' slot and the SS map nonSplicedReads(fds) <- non_split_counts From 9ce1a016cc28bbfda2e6842ab842ab0dafca2c4a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 5 Feb 2026 13:33:47 +1100 Subject: [PATCH 72/89] explicitly annotating non-split counts, must refer to fraser function while I do that --- images/fraser/fraser_join_counts.R | 2 +- images/fraser/fraser_merge_non_split.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 80d9c019..296211aa 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -81,7 +81,7 @@ nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) # This is the missing link. It generates the spliceSiteID mapping # that calculatePSIValues needs later. message("Annotating splice sites...") -splitCounts_se <- annotateSpliceSites(splitCounts_se) +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 5. Add counts to FRASER object message("Joining assays into FDS object...") diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index f33c3254..2fae4033 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -71,7 +71,7 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # CRITICAL ADDITION: Annotate the non-split counts object! # This assigns 'spliceSiteID' to the rowRanges of your non-split object. message("Annotating non-split splice site IDs...") -non_split_counts <- annotateSpliceSites(non_split_counts) +non_split_counts <- FRASER:::annotateSpliceSite(non_split_counts) # 7. 
Final Save # This populates the internal 'nonSplicedReads' slot and the SS map From baab41398cc3092a69340052c651a966c0ed4640 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 5 Feb 2026 13:38:44 +1100 Subject: [PATCH 73/89] Always assume paired end samples --- images/fraser/fraser_count_non_split.R | 2 +- images/fraser/fraser_count_split.R | 2 +- images/fraser/fraser_merge_non_split.R | 2 +- images/fraser/fraser_merge_split.R | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 8d3b5a1d..7caa0c94 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -43,7 +43,7 @@ filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.R # Set strand specificity -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # 5. Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index b3938303..cf2e8760 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -36,7 +36,7 @@ if(args$nthreads > 1){ # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # SUBSET FIRST: This is the most critical memory-saving step. # By subsetting here, we drop the metadata of all other samples. diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 2fae4033..3768d467 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -57,7 +57,7 @@ if(length(available_bams) > 0){ } else { stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # --- 6. 
Merge Non-Split Counts --- message("Merging counts using HDF5 files in: ", out_dir) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index b403066e..791961c4 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -53,7 +53,7 @@ if(length(available_bams) > 0){ } else { stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # -------------------------------------------- minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache From c6f64b90f4552ec1dd1312b31c5cdbf43af634a5 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 5 Feb 2026 14:56:09 +1100 Subject: [PATCH 74/89] mini changes for data integrity --- images/fraser/fraser_count_non_split.R | 3 ++- images/fraser/fraser_merge_non_split.R | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 7caa0c94..9b6891af 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -40,7 +40,8 @@ if(args$nthreads > 1){ # 3. 
Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS - +# Add this before calling countNonSplicedReads +colData(fds)$bamFile <- args$bam_path # Set strand specificity strandSpecific(fds) <- 2 diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 3768d467..93dd6774 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -39,7 +39,7 @@ anchor_se <- SummarizedExperiment( rowRanges = non_split_count_ranges ) # This creates the se.rds that FRASER's loadHDF5SummarizedExperiment needs -saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "") +saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "rawCountsSS") # 4. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) @@ -73,6 +73,21 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( message("Annotating non-split splice site IDs...") non_split_counts <- FRASER:::annotateSpliceSite(non_split_counts) +# --- Verification Block --- +# 1. Check if the assay exists +if(!"rawCountsSS" %in% assayNames(non_split_counts)){ + stop("Merge failed: 'rawCountsSS' assay not found in merged object.") +} + +# 2. Check for zeros (Common sign of a path/naming mismatch) +total_counts <- sum(assay(non_split_counts, "rawCountsSS")) +if(total_counts == 0){ + stop("Merge Error: The merged non-split counts matrix is empty (all zeros). + Check if sample IDs in fds match the filenames in the cache.") +} + +message("Merge verified: Found ", total_counts, " total site coverage counts.") + # 7. 
Final Save # This populates the internal 'nonSplicedReads' slot and the SS map nonSplicedReads(fds) <- non_split_counts From 908c59f3f48b687084c2f760b45d5efdd8302666 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 6 Feb 2026 16:22:29 +1100 Subject: [PATCH 75/89] here we go --- images/fraser/fraser_merge_non_split.R | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 93dd6774..85c3aed9 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -25,8 +25,10 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Load FDS fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# 3. Anchor HDF5 Directory -h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) +# 3. Copy H5 files from cache to the proper output directory +cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts") +h5_files <- list.files(cache_dir, pattern = "\\.h5$", full.names = TRUE) +message("Copying ", length(h5_files), " HDF5 files from cache to output directory...") for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } @@ -82,24 +84,21 @@ if(!"rawCountsSS" %in% assayNames(non_split_counts)){ # 2. Check for zeros (Common sign of a path/naming mismatch) total_counts <- sum(assay(non_split_counts, "rawCountsSS")) if(total_counts == 0){ - stop("Merge Error: The merged non-split counts matrix is empty (all zeros). - Check if sample IDs in fds match the filenames in the cache.") + stop("Merge Error: The merged non-split counts matrix is empty (all zeros).") } message("Merge verified: Found ", total_counts, " total site coverage counts.") -# 7. 
Final Save -# This populates the internal 'nonSplicedReads' slot and the SS map -nonSplicedReads(fds) <- non_split_counts - -# Use the HDF5-safe saving method for the counts object specifically -# We save this to a new directory to avoid clobbering the input cache +# 8. Save the merged non-split counts to the correct location for join step +message("Saving merged non-split counts to: ", out_dir) saveHDF5SummarizedExperiment( non_split_counts, - dir = file.path(args$work_dir, "merged_non_split_counts"), + dir = out_dir, replace = TRUE ) - -# Also save the updated FDS object (this updates the internal fds-object.RDS) +# 9. Update FDS object and save +nonSplicedReads(fds) <- non_split_counts fds <- saveFraserDataSet(fds) + +message("Non-split merge complete!") From aaf1dcaf6d8f9d36caefcf39d9fc6d64f25e80f1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 18:25:00 +1100 Subject: [PATCH 76/89] Updates to merge non-split and join_counts --- images/fraser/fraser_join_counts.R | 65 +++++--- images/fraser/fraser_merge_non_split.R | 207 +++++++++++++++++-------- 2 files changed, 191 insertions(+), 81 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 296211aa..d4d495ca 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -3,6 +3,7 @@ library(argparse) library(FRASER) library(SummarizedExperiment) library(HDF5Array) +library(BiocParallel) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) @@ -11,6 +12,14 @@ parser$add_argument("--work_dir", default = "/io/work") parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() +# Configure FRASER to use HDF5 backend +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +# Setup parallel processing +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + check_fds_integrity <- 
function(fds) { message("\n--- Running FDS Integrity Check ---") @@ -27,7 +36,6 @@ check_fds_integrity <- function(fds) { } # 2. Check for ID alignment (The "by.y" error test) - # Are all junction startIDs present in the Splice Site map? missing_starts <- !all(mcols(fds, type="j")$startID %in% mcols(fds, type="ss")$spliceSiteID) if(missing_starts) { stop("CRITICAL ERROR: Junction startIDs do not match SpliceSiteIDs. Map is broken.") @@ -35,7 +43,6 @@ check_fds_integrity <- function(fds) { message("SUCCESS: Junction IDs are correctly mapped to Splice Sites.") # 3. Check HDF5 Backend connectivity - # Ensure the pointers to the .h5 files are valid and readable tryCatch({ test_val <- as.matrix(counts(fds, type="j")[1, 1]) message("SUCCESS: HDF5 backends are reachable and readable.") @@ -57,50 +64,70 @@ fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # 1. Load the FDS object -message("Loading Fraser Data Set from RDS...") +message("\n[1/5] Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) workingDir(fds) <- saveDir # 2. Load split counts -message("Loading split counts...") +message("\n[2/5] Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ stop(paste("Missing splitCounts directory at:", split_se_dir)) } splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) +message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") + +# Annotate split counts with splice site IDs +message("Annotating split counts with splice site IDs...") +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 3. 
Load merged non-split counts -message("Loading merged non-split counts...") +message("\n[3/5] Loading merged non-split counts...") merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) } nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) +message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", ncol(nonSplitCounts_se), " samples") -# 4. Annotate the Split Counts -# This is the missing link. It generates the spliceSiteID mapping -# that calculatePSIValues needs later. -message("Annotating splice sites...") -splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) +# Verify non-split counts are annotated +if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { + message("Annotating non-split counts with splice site IDs...") + nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) +} -# 5. Add counts to FRASER object -message("Joining assays into FDS object...") +# 4. Add counts to FRASER object +message("\n[4/5] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, nonSplitCounts = nonSplitCounts_se ) -# --- Call the check before finishing --- +message("Counts successfully joined!") +message(" - Split junctions: ", nrow(counts(fds, type = "j"))) +message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) + +# Run integrity check check_fds_integrity(fds) -# This will populate the 'ss' (splice site) metadata correctly -fds <- calculatePSIValues(fds, types="jaccard") +# 5. 
Calculate PSI values +message("\n[5/5] Calculating PSI values...") +fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) -# --- Call the check before finishing --- +message("PSI values calculated successfully!") +message("Available assays: ", paste(assayNames(fds), collapse = ", ")) + +# Final integrity check check_fds_integrity(fds) -# 5. Save final FRASER object -message("Saving final integrated FDS...") + +# 6. Save final FRASER object +message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) -message("FRASER join complete.") +message("\n=== FRASER join complete! ===") +message("FDS object saved in: ", workingDir(fds)) +message("\nNext steps for analysis:") +message(" 1. Filter junctions: fds <- filterExpressionAndVariability(fds)") +message(" 2. Fit model: fds <- FRASER(fds)") +message(" 3. Get results: results(fds)") diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 85c3aed9..68b243d8 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -5,100 +5,183 @@ library(FRASER) library(BiocParallel) library(SummarizedExperiment) library(HDF5Array) +library(rhdf5) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to g_ranges_non_split_counts.RDS") +parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to splice_site_coords.RDS") parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -# 1. 
Load the ranges first so they are available for all steps -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +# Configure FRASER to use HDF5 backend +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +# 1. Setup directories +message("\n[1/6] Setting up directories...") fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) out_dir <- file.path(save_dir, "nonSplitCounts") dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) -# 2. Load FDS -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# Copy FDS file only if source and destination are different +fds_dest <- file.path(save_dir, "fds-object.RDS") +if(normalizePath(args$fds_path, mustWork = FALSE) != normalizePath(fds_dest, mustWork = FALSE)) { + file.copy(args$fds_path, fds_dest, overwrite = TRUE) + message("Copied FDS from: ", args$fds_path) +} else { + message("FDS already in correct location: ", fds_dest) +} -# 3. Copy H5 files from cache to the proper output directory +# 2. Load FDS to get sample information +message("\n[2/6] Loading FraserDataSet...") +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +sample_ids <- samples(fds) +n_samples <- length(sample_ids) +message("Samples: ", n_samples) + +# 3. Load the genomic ranges +message("\n[3/6] Loading genomic ranges...") +non_split_ranges <- readRDS(args$filtered_ranges_path) +n_sites <- length(non_split_ranges) +message("Non-split sites: ", n_sites) + +# 4. 
Copy H5 files from cache to the proper output directory +message("\n[4/6] Copying and organizing H5 files...") cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts") h5_files <- list.files(cache_dir, pattern = "\\.h5$", full.names = TRUE) -message("Copying ", length(h5_files), " HDF5 files from cache to output directory...") -for(f in h5_files) { - file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) +message("Found ", length(h5_files), " HDF5 files in cache") + +if(length(h5_files) == 0) { + stop("ERROR: No H5 files found in cache directory: ", cache_dir) } -anchor_se <- SummarizedExperiment( - assays = list(rawCountsSS = matrix(0, - nrow = length(non_split_count_ranges), - ncol = length(samples(fds)), - dimnames = list(NULL, samples(fds)))), - rowRanges = non_split_count_ranges -) -# This creates the se.rds that FRASER's loadHDF5SummarizedExperiment needs -saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "rawCountsSS") +# Copy with standardized naming convention +for(i in seq_along(sample_ids)) { + sid <- sample_ids[i] + src_file <- h5_files[grepl(sid, h5_files)][1] -# 4. Configure Backend -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) -bp <- MulticoreParam(workers = args$nthreads) -register(bp) + if(is.na(src_file)) { + warning("No H5 file found for sample: ", sid) + next + } + dest <- file.path(out_dir, paste0("nonSplicedCounts-", sid, ".h5")) + file.copy(src_file, dest, overwrite = TRUE) + message(" Copied: ", basename(src_file), " -> ", basename(dest)) +} -available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) -if(length(available_bams) > 0){ - ref_bam <- available_bams[1] - message("Validating metadata against reference BAM: ", ref_bam) - colData(fds)$bamFile <- ref_bam - seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) -} else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") +# 5. 
Validate H5 files and build combined count matrix +message("\n[5/6] Building combined count matrix (memory-efficient)...") + +# Read first file to validate dimensions +first_h5 <- file.path(out_dir, paste0("nonSplicedCounts-", sample_ids[1], ".h5")) +if(!file.exists(first_h5)) { + stop("ERROR: First H5 file not found: ", first_h5) } -strandSpecific(fds) <- 2 -# --- 6. Merge Non-Split Counts --- -message("Merging counts using HDF5 files in: ", out_dir) -non_split_counts <- getNonSplitReadCountsForAllSamples( - fds = fds, - splitCountRanges = non_split_count_ranges, - minAnchor = 5, - recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache -) +h5_info <- h5ls(first_h5) +dataset_row <- h5_info[h5_info$name == "nonSplicedCounts", ] +actual_rows <- as.numeric(strsplit(dataset_row$dim, " x ")[[1]][1]) + +if(actual_rows != n_sites) { + stop("ERROR: H5 files have ", actual_rows, " rows but ranges have ", n_sites, + " sites. Mismatch!") +} -# CRITICAL ADDITION: Annotate the non-split counts object! -# This assigns 'spliceSiteID' to the rowRanges of your non-split object. -message("Annotating non-split splice site IDs...") -non_split_counts <- FRASER:::annotateSpliceSite(non_split_counts) +message("Validated: H5 dimensions match genomic ranges (", actual_rows, " sites)") -# --- Verification Block --- -# 1. 
Check if the assay exists -if(!"rawCountsSS" %in% assayNames(non_split_counts)){ - stop("Merge failed: 'rawCountsSS' assay not found in merged object.") +# Create a combined count matrix by reading each sample one at a time +count_matrix <- matrix(0, nrow = n_sites, ncol = n_samples) +colnames(count_matrix) <- sample_ids + +total_counts <- 0 +for(i in seq_along(sample_ids)) { + sid <- sample_ids[i] + h5_file <- file.path(out_dir, paste0("nonSplicedCounts-", sid, ".h5")) + + if(!file.exists(h5_file)) { + warning("H5 file not found for sample ", sid, ", using zeros") + next + } + + message(" Reading [", i, "/", n_samples, "]: ", sid) + + # Read the counts data for this sample only + counts <- h5read(h5_file, "nonSplicedCounts") + if(is.matrix(counts)) { + counts <- counts[, 1] + } + + count_matrix[, i] <- as.vector(counts) + total_counts <- total_counts + sum(counts) } -# 2. Check for zeros (Common sign of a path/naming mismatch) -total_counts <- sum(assay(non_split_counts, "rawCountsSS")) -if(total_counts == 0){ - stop("Merge Error: The merged non-split counts matrix is empty (all zeros).") +message("Combined matrix dimensions: ", nrow(count_matrix), " x ", ncol(count_matrix)) +message("Total counts: ", total_counts) + +if(total_counts == 0) { + stop("ERROR: The merged non-split counts matrix is empty (all zeros).") } -message("Merge verified: Found ", total_counts, " total site coverage counts.") +# 6. Create HDF5-backed SummarizedExperiment +message("\n[6/6] Creating HDF5-backed SummarizedExperiment...") -# 8. Save the merged non-split counts to the correct location for join step -message("Saving merged non-split counts to: ", out_dir) -saveHDF5SummarizedExperiment( - non_split_counts, - dir = out_dir, - replace = TRUE +# Create SE with in-memory matrix +nonSplitCounts_se <- SummarizedExperiment( + assays = list(rawCountsSS = count_matrix), + rowRanges = non_split_ranges, + colData = DataFrame(sampleID = sample_ids, row.names = sample_ids) ) -# 9. 
Update FDS object and save +message("Non-split SE created with dimensions: ", nrow(nonSplitCounts_se), + " x ", ncol(nonSplitCounts_se)) + +# Annotate non-split counts with splice site IDs +message("Annotating non-split counts with splice site IDs...") +nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) + +# Save the SummarizedExperiment - this will automatically create HDF5 backend +message("Saving non-split HDF5 SummarizedExperiment...") +saveHDF5SummarizedExperiment(nonSplitCounts_se, dir = out_dir, replace = TRUE) + +# Reload it to get the proper HDF5-backed version +message("Reloading HDF5-backed version...") +non_split_counts <- loadHDF5SummarizedExperiment(dir = out_dir) + +# Final verification +if(!"rawCountsSS" %in% assayNames(non_split_counts)) { + stop("ERROR: 'rawCountsSS' assay not found in reloaded object.") +} + +message("Verification: Reloaded object has ", sum(assay(non_split_counts, "rawCountsSS")), " total counts") + +# 7. Configure Backend for FDS +message("\nConfiguring FRASER backend...") +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + +# Set BAM file metadata (for validation purposes) +available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) +if(length(available_bams) > 0) { + ref_bam <- available_bams[1] + message("Validating metadata against reference BAM: ", ref_bam) + colData(fds)$bamFile <- ref_bam + seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) +} else { + warning("WARNING: No BAM files found in /io/batch/input_bams.") +} +strandSpecific(fds) <- 2 + +# 8. Update FDS object with the merged non-split counts +message("\nUpdating FDS object with merged non-split counts...") nonSplicedReads(fds) <- non_split_counts fds <- saveFraserDataSet(fds) -message("Non-split merge complete!") +message("\n=== Non-split merge complete! 
===") +message("Final stats:") +message(" - Sites: ", nrow(non_split_counts)) +message(" - Samples: ", ncol(non_split_counts)) +message(" - Total counts: ", sum(assay(non_split_counts, "rawCountsSS"))) From b1d55ee48d82b8dd9480dbac6945bd622b537321 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 21:48:11 +1100 Subject: [PATCH 77/89] Updates to merge non-split and join_counts --- images/fraser/fraser_count_non_split.R | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 9b6891af..d51f34bf 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -23,8 +23,9 @@ save_dir <- file.path(args$work_dir, "savedObjects", fds_name) dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) -# Create the specific cache directory for non-split counts -cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) +# Create the cache directory WITHOUT the fds_name subdirectory +# This matches what the merge script expects +cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts") dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 @@ -58,12 +59,9 @@ sample_result <- countNonSplicedReads(args$sample_id, recount= TRUE, spliceSiteCoords=filtered_coords) # 6. 
Verification -# Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ -actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) - -# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS -expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) -expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) +# The H5 file should now be at cache_dir/nonSplicedCounts-{sample_id}.h5 +expected_h5 <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) +expected_rds <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) if (file.exists(expected_h5)) { message("Success: Created non-split counts (HDF5) at ", expected_h5) @@ -71,8 +69,8 @@ if (file.exists(expected_h5)) { message("Success: Created non-split counts (RDS) at ", expected_rds) } else { # Debugging: List files in the directory to see what actually happened - if(dir.exists(actual_cache_dir)){ - message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) + if(dir.exists(cache_dir)){ + message("Files found in cache dir: ", paste(list.files(cache_dir), collapse=", ")) } if(!file.exists(paste0(args$bam_path, ".bai"))){ From 85743706cb80185fa9d905cb5e462f0d7e45bf61 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 22:25:34 +1100 Subject: [PATCH 78/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 74 +++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index d4d495ca..e1f960e9 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -64,12 +64,21 @@ fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # 1. 
Load the FDS object -message("\n[1/5] Loading Fraser Data Set from RDS...") +message("\n[1/7] Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) workingDir(fds) <- saveDir -# 2. Load split counts -message("\n[2/5] Loading split counts...") +# 2. Load the master splice site coordinates that were used to create non-split counts +message("\n[2/7] Loading master splice site coordinates...") +splice_coords_path <- file.path(dirname(dirname(saveDir)), "splice_site_coords.RDS") +if(!file.exists(splice_coords_path)) { + stop("ERROR: splice_site_coords.RDS not found at: ", splice_coords_path) +} +master_splice_coords <- readRDS(splice_coords_path) +message("Master coordinates loaded: ", length(master_splice_coords), " sites") + +# 3. Load split counts +message("\n[3/7] Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ stop(paste("Missing splitCounts directory at:", split_se_dir)) @@ -77,12 +86,16 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts with splice site IDs -message("Annotating split counts with splice site IDs...") -splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) +# Annotate split counts using the master coordinates to ensure consistency +message("Annotating split counts with master splice site IDs...") +# Use FRASER's internal function but with our master coordinates +splitCountRanges <- rowRanges(splitCounts_se) +# Generate IDs based on the genomic positions +splitCountRanges <- FRASER:::annotateSpliceSite(splitCountRanges, master_splice_coords) +rowRanges(splitCounts_se) <- splitCountRanges -# 3. Load merged non-split counts -message("\n[3/5] Loading merged non-split counts...") +# 4. 
Load merged non-split counts +message("\n[4/7] Loading merged non-split counts...") merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) @@ -90,14 +103,43 @@ if(!dir.exists(merged_non_split_dir)){ nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", ncol(nonSplitCounts_se), " samples") -# Verify non-split counts are annotated +# Verify non-split counts match the master coordinates +if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { + stop("ERROR: Non-split counts dimensions (", nrow(nonSplitCounts_se), + ") don't match master coordinates (", length(master_splice_coords), ")") +} + +# Verify non-split counts are annotated - they should already be from the merge step if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("Annotating non-split counts with splice site IDs...") - nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) + message("Annotating non-split counts with master splice site IDs...") + nonSplitRanges <- rowRanges(nonSplitCounts_se) + nonSplitRanges <- FRASER:::annotateSpliceSite(nonSplitRanges, master_splice_coords) + rowRanges(nonSplitCounts_se) <- nonSplitRanges +} + +# 5. 
Verify ID consistency before joining +message("\n[5/7] Verifying ID consistency...") +split_start_ids <- unique(mcols(splitCounts_se)$startID) +split_end_ids <- unique(mcols(splitCounts_se)$endID) +nonsplit_ids <- unique(mcols(nonSplitCounts_se)$spliceSiteID) + +message(" Split startIDs: ", length(split_start_ids)) +message(" Split endIDs: ", length(split_end_ids)) +message(" Non-split spliceSiteIDs: ", length(nonsplit_ids)) + +# Check overlap +missing_start <- sum(!split_start_ids %in% nonsplit_ids) +missing_end <- sum(!split_end_ids %in% nonsplit_ids) + +if(missing_start > 0 || missing_end > 0) { + warning("WARNING: Some split junction IDs are not in non-split IDs:") + warning(" Missing startIDs: ", missing_start) + warning(" Missing endIDs: ", missing_end) + warning(" This may cause issues but attempting to continue...") } -# 4. Add counts to FRASER object -message("\n[4/5] Joining split and non-split counts into FDS object...") +# 6. Add counts to FRASER object +message("\n[6/7] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, @@ -111,8 +153,8 @@ message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) # Run integrity check check_fds_integrity(fds) -# 5. Calculate PSI values -message("\n[5/5] Calculating PSI values...") +# 7. Calculate PSI values +message("\n[7/7] Calculating PSI values...") fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) message("PSI values calculated successfully!") @@ -121,7 +163,7 @@ message("Available assays: ", paste(assayNames(fds), collapse = ", ")) # Final integrity check check_fds_integrity(fds) -# 6. Save final FRASER object +# 8. 
Save final FRASER object message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) From 4c1dd5c8cc18ad83dc8ff220edef3306a9ae8a99 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 23:09:56 +1100 Subject: [PATCH 79/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index e1f960e9..a8efc386 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -86,13 +86,10 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts using the master coordinates to ensure consistency -message("Annotating split counts with master splice site IDs...") -# Use FRASER's internal function but with our master coordinates -splitCountRanges <- rowRanges(splitCounts_se) -# Generate IDs based on the genomic positions -splitCountRanges <- FRASER:::annotateSpliceSite(splitCountRanges, master_splice_coords) -rowRanges(splitCounts_se) <- splitCountRanges +# Annotate split counts with splice site IDs +# NOTE: This must be done AFTER loading and works on the whole object +message("Annotating split counts with splice site IDs...") +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 4. 
Load merged non-split counts message("\n[4/7] Loading merged non-split counts...") @@ -109,12 +106,12 @@ if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { ") don't match master coordinates (", length(master_splice_coords), ")") } -# Verify non-split counts are annotated - they should already be from the merge step +# Non-split counts should already be annotated from the merge step +# Verify the annotation is present if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("Annotating non-split counts with master splice site IDs...") - nonSplitRanges <- rowRanges(nonSplitCounts_se) - nonSplitRanges <- FRASER:::annotateSpliceSite(nonSplitRanges, master_splice_coords) - rowRanges(nonSplitCounts_se) <- nonSplitRanges + message("WARNING: Non-split counts missing spliceSiteID annotation!") + message("Annotating non-split counts with splice site IDs...") + nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) } # 5. Verify ID consistency before joining From 80230c28abe7e331a9351e52e1d36486d3db00a3 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 10:04:08 +1100 Subject: [PATCH 80/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 77 ++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index a8efc386..069064e6 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -86,10 +86,70 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts with splice site IDs -# NOTE: This must be done AFTER loading and works on the whole object -message("Annotating split counts with splice site IDs...") -splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) +# Annotate split counts with 
splice site IDs using the master coordinates +# This is CRITICAL: we must use the same coordinate reference as the non-split counts +message("Annotating split counts with splice site IDs using master coordinates...") +# Extract the start and end coordinates from each junction +splitRanges <- rowRanges(splitCounts_se) + +# Create a mapping of genomic positions to splice site IDs using master coordinates +# The master coordinates define the universe of valid splice sites +master_coords_df <- data.frame( + seqnames = as.character(seqnames(master_splice_coords)), + pos = start(master_splice_coords), + strand = as.character(strand(master_splice_coords)), + spliceSiteID = mcols(master_splice_coords)$spliceSiteID +) + +# If master coordinates don't have spliceSiteIDs yet, generate them +if(is.null(master_coords_df$spliceSiteID)) { + message("Generating spliceSiteIDs for master coordinates...") + master_coords_df$spliceSiteID <- paste0( + "ss_", master_coords_df$seqnames, "_", + master_coords_df$pos, "_", + master_coords_df$strand + ) + mcols(master_splice_coords)$spliceSiteID <- master_coords_df$spliceSiteID +} + +# Now annotate the split counts by matching their start/end positions to the master coordinates +message("Mapping junction start positions to master coordinates...") +start_df <- data.frame( + seqnames = as.character(seqnames(splitRanges)), + pos = start(splitRanges), + strand = as.character(strand(splitRanges)) +) +start_df$startID <- master_coords_df$spliceSiteID[ + match( + paste0(start_df$seqnames, "_", start_df$pos, "_", start_df$strand), + paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) + ) +] + +message("Mapping junction end positions to master coordinates...") +end_df <- data.frame( + seqnames = as.character(seqnames(splitRanges)), + pos = end(splitRanges), + strand = as.character(strand(splitRanges)) +) +end_df$endID <- master_coords_df$spliceSiteID[ + match( + paste0(end_df$seqnames, "_", end_df$pos, "_", 
end_df$strand), + paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) + ) +] + +# Add the IDs to the splitCounts metadata +mcols(splitCounts_se)$startID <- start_df$startID +mcols(splitCounts_se)$endID <- end_df$endID + +# Count how many junctions have valid IDs +valid_junctions <- sum(!is.na(start_df$startID) & !is.na(end_df$endID)) +message(" Junctions with valid start/end IDs: ", valid_junctions, " / ", nrow(splitCounts_se)) + +if(valid_junctions == 0) { + stop("ERROR: No junctions could be mapped to master coordinates!") +} # 4. Load merged non-split counts message("\n[4/7] Loading merged non-split counts...") @@ -107,11 +167,12 @@ if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { } # Non-split counts should already be annotated from the merge step -# Verify the annotation is present +# If not, annotate them using the master coordinates if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("WARNING: Non-split counts missing spliceSiteID annotation!") - message("Annotating non-split counts with splice site IDs...") - nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) + message("Annotating non-split counts with master coordinate IDs...") + mcols(nonSplitCounts_se)$spliceSiteID <- master_coords_df$spliceSiteID +} else { + message("Non-split counts already annotated with spliceSiteIDs") } # 5. 
Verify ID consistency before joining From e9abd104681e4a8a2a68aad46e78314096dff5f8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 10:28:29 +1100 Subject: [PATCH 81/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 103 +++++------------------------ 1 file changed, 17 insertions(+), 86 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 069064e6..bc23df5e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -86,70 +86,11 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts with splice site IDs using the master coordinates -# This is CRITICAL: we must use the same coordinate reference as the non-split counts -message("Annotating split counts with splice site IDs using master coordinates...") -# Extract the start and end coordinates from each junction -splitRanges <- rowRanges(splitCounts_se) - -# Create a mapping of genomic positions to splice site IDs using master coordinates -# The master coordinates define the universe of valid splice sites -master_coords_df <- data.frame( - seqnames = as.character(seqnames(master_splice_coords)), - pos = start(master_splice_coords), - strand = as.character(strand(master_splice_coords)), - spliceSiteID = mcols(master_splice_coords)$spliceSiteID -) - -# If master coordinates don't have spliceSiteIDs yet, generate them -if(is.null(master_coords_df$spliceSiteID)) { - message("Generating spliceSiteIDs for master coordinates...") - master_coords_df$spliceSiteID <- paste0( - "ss_", master_coords_df$seqnames, "_", - master_coords_df$pos, "_", - master_coords_df$strand - ) - mcols(master_splice_coords)$spliceSiteID <- master_coords_df$spliceSiteID -} - -# Now annotate the split counts by matching their start/end positions to the 
master coordinates -message("Mapping junction start positions to master coordinates...") -start_df <- data.frame( - seqnames = as.character(seqnames(splitRanges)), - pos = start(splitRanges), - strand = as.character(strand(splitRanges)) -) -start_df$startID <- master_coords_df$spliceSiteID[ - match( - paste0(start_df$seqnames, "_", start_df$pos, "_", start_df$strand), - paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) - ) -] - -message("Mapping junction end positions to master coordinates...") -end_df <- data.frame( - seqnames = as.character(seqnames(splitRanges)), - pos = end(splitRanges), - strand = as.character(strand(splitRanges)) -) -end_df$endID <- master_coords_df$spliceSiteID[ - match( - paste0(end_df$seqnames, "_", end_df$pos, "_", end_df$strand), - paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) - ) -] - -# Add the IDs to the splitCounts metadata -mcols(splitCounts_se)$startID <- start_df$startID -mcols(splitCounts_se)$endID <- end_df$endID - -# Count how many junctions have valid IDs -valid_junctions <- sum(!is.na(start_df$startID) & !is.na(end_df$endID)) -message(" Junctions with valid start/end IDs: ", valid_junctions, " / ", nrow(splitCounts_se)) - -if(valid_junctions == 0) { - stop("ERROR: No junctions could be mapped to master coordinates!") -} +# Annotate split counts with splice site IDs +# NOTE: annotateSpliceSite() automatically extracts splice sites from junctions +# and generates consistent startID/endID mappings +message("Annotating split counts with splice site IDs...") +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 4. 
Load merged non-split counts message("\n[4/7] Loading merged non-split counts...") @@ -160,40 +101,30 @@ if(!dir.exists(merged_non_split_dir)){ nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", ncol(nonSplitCounts_se), " samples") -# Verify non-split counts match the master coordinates -if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { - stop("ERROR: Non-split counts dimensions (", nrow(nonSplitCounts_se), - ") don't match master coordinates (", length(master_splice_coords), ")") -} - -# Non-split counts should already be annotated from the merge step -# If not, annotate them using the master coordinates -if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("Annotating non-split counts with master coordinate IDs...") - mcols(nonSplitCounts_se)$spliceSiteID <- master_coords_df$spliceSiteID -} else { - message("Non-split counts already annotated with spliceSiteIDs") -} +# CRITICAL FIX: Always re-annotate after loading to ensure consistency +# The saved HDF5 object might have stale or incompatible IDs +message("Re-annotating non-split counts with splice site IDs...") +nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) # 5. 
Verify ID consistency before joining message("\n[5/7] Verifying ID consistency...") -split_start_ids <- unique(mcols(splitCounts_se)$startID) -split_end_ids <- unique(mcols(splitCounts_se)$endID) -nonsplit_ids <- unique(mcols(nonSplitCounts_se)$spliceSiteID) +split_start_ids <- na.omit(unique(mcols(splitCounts_se)$startID)) +split_end_ids <- na.omit(unique(mcols(splitCounts_se)$endID)) +nonsplit_ids <- na.omit(unique(mcols(nonSplitCounts_se)$spliceSiteID)) message(" Split startIDs: ", length(split_start_ids)) message(" Split endIDs: ", length(split_end_ids)) message(" Non-split spliceSiteIDs: ", length(nonsplit_ids)) -# Check overlap +# Check overlap - both split start/end should be subsets of nonsplit missing_start <- sum(!split_start_ids %in% nonsplit_ids) missing_end <- sum(!split_end_ids %in% nonsplit_ids) if(missing_start > 0 || missing_end > 0) { - warning("WARNING: Some split junction IDs are not in non-split IDs:") - warning(" Missing startIDs: ", missing_start) - warning(" Missing endIDs: ", missing_end) - warning(" This may cause issues but attempting to continue...") + message("NOTE: Some split junction IDs are not in non-split IDs:") + message(" Missing startIDs: ", missing_start, " out of ", length(split_start_ids)) + message(" Missing endIDs: ", missing_end, " out of ", length(split_end_ids)) + message(" This is expected if junctions reference splice sites not in the filtered set") } # 6. 
Add counts to FRASER object From 6c2ec6365c135786081205a9240726cda3ffd833 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 10:58:26 +1100 Subject: [PATCH 82/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 72 +++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index bc23df5e..e18c6b2e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -106,29 +106,59 @@ message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", n message("Re-annotating non-split counts with splice site IDs...") nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) -# 5. Verify ID consistency before joining -message("\n[5/7] Verifying ID consistency...") -split_start_ids <- na.omit(unique(mcols(splitCounts_se)$startID)) -split_end_ids <- na.omit(unique(mcols(splitCounts_se)$endID)) -nonsplit_ids <- na.omit(unique(mcols(nonSplitCounts_se)$spliceSiteID)) - -message(" Split startIDs: ", length(split_start_ids)) -message(" Split endIDs: ", length(split_end_ids)) +# 5. 
Filter split counts to only include junctions that reference the non-split splice sites +message("\n[5/7] Filtering split counts to match non-split coordinate space...") + +# Get the valid splice site IDs from non-split counts +nonsplit_ids <- mcols(nonSplitCounts_se)$spliceSiteID + +# Get split junction IDs +split_start_ids <- mcols(splitCounts_se)$startID +split_end_ids <- mcols(splitCounts_se)$endID + +# Find junctions where BOTH start and end are in the non-split coordinate set +valid_junctions <- (!is.na(split_start_ids) & !is.na(split_end_ids) & + split_start_ids %in% nonsplit_ids & + split_end_ids %in% nonsplit_ids) + +n_valid <- sum(valid_junctions) +n_total <- nrow(splitCounts_se) +message(" Junctions with both start/end in non-split coords: ", n_valid, " / ", n_total) +message(" Filtering out ", n_total - n_valid, " junctions that reference sites outside the coordinate set") + +if(n_valid == 0) { + stop("ERROR: No junctions have both start and end in the non-split coordinate set!") +} + +# Filter the split counts to only valid junctions +splitCounts_se_filtered <- splitCounts_se[valid_junctions, ] +message(" Filtered split counts dimensions: ", nrow(splitCounts_se_filtered), " junctions x ", + ncol(splitCounts_se_filtered), " samples") + +# Update the splitCounts_se to use the filtered version +splitCounts_se <- splitCounts_se_filtered + +# 6. 
Verify ID consistency after filtering +message("\n[6/7] Verifying ID consistency after filtering...") +split_start_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$startID)) +split_end_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$endID)) + +message(" Filtered split startIDs: ", length(split_start_ids_filtered)) +message(" Filtered split endIDs: ", length(split_end_ids_filtered)) message(" Non-split spliceSiteIDs: ", length(nonsplit_ids)) -# Check overlap - both split start/end should be subsets of nonsplit -missing_start <- sum(!split_start_ids %in% nonsplit_ids) -missing_end <- sum(!split_end_ids %in% nonsplit_ids) +# Check overlap - should be 100% now +missing_start <- sum(!split_start_ids_filtered %in% nonsplit_ids) +missing_end <- sum(!split_end_ids_filtered %in% nonsplit_ids) if(missing_start > 0 || missing_end > 0) { - message("NOTE: Some split junction IDs are not in non-split IDs:") - message(" Missing startIDs: ", missing_start, " out of ", length(split_start_ids)) - message(" Missing endIDs: ", missing_end, " out of ", length(split_end_ids)) - message(" This is expected if junctions reference splice sites not in the filtered set") + stop("ERROR: After filtering, still have missing IDs! startIDs: ", missing_start, ", endIDs: ", missing_end) } -# 6. Add counts to FRASER object -message("\n[6/7] Joining split and non-split counts into FDS object...") +message("SUCCESS: All split junction IDs now exist in non-split coordinate set!") + +# 7. Add counts to FRASER object +message("\n[7/7] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, @@ -139,11 +169,9 @@ message("Counts successfully joined!") message(" - Split junctions: ", nrow(counts(fds, type = "j"))) message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) -# Run integrity check -check_fds_integrity(fds) -# 7. Calculate PSI values -message("\n[7/7] Calculating PSI values...") +# 8. 
Calculate PSI values +message("\n[8/8] Calculating PSI values...") fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) message("PSI values calculated successfully!") @@ -152,7 +180,7 @@ message("Available assays: ", paste(assayNames(fds), collapse = ", ")) # Final integrity check check_fds_integrity(fds) -# 8. Save final FRASER object +# 9. Save final FRASER object message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) From aa3320a81399798a8dbf32f0d8b91df37205da3a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 13:37:58 +1100 Subject: [PATCH 83/89] Finally at the analysis step! --- images/fraser/fraser_analysis.R | 35 ++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 7f534135..f7fd9fb0 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -45,13 +45,14 @@ if(! "spliceSiteID" %in% colnames(mcols(fds, type="ss"))){ } } -# --- 3. Filtering --- -# It is critical to filter before fitting to reduce the size of the latent space matrices -fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) - # Create QC directory dir.create("qc_plots", showWarnings = FALSE) +# --- 3. Calculate Filter Values FIRST (required for plotting) --- +message("Calculating filter expression values...") +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = FALSE) + +# --- 4. Generate QC Plots --- # DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast set.seed(42) plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) @@ -62,12 +63,12 @@ png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) plotFilterExpression(fds_plot_subset, bins = 100) dev.off() -# Apply actual filtering to full dataset -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) +# --- 5. 
Apply Filtering --- +message("Applying filtering based on calculated values...") +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -# --- 4. Dimensionality Message --- +# Dimensionality Message raw_dim <- nrow(fds) -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] filtered_dim <- nrow(fds_filtered) message(paste0("\n--- Filtering Summary ---")) @@ -75,28 +76,33 @@ message(paste0("Original junctions: ", raw_dim)) message(paste0("Filtered junctions: ", filtered_dim)) message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) -# --- 5. Hyperparameter Optimization --- +# --- 6. Hyperparameter Optimization --- # Optimization must run on the filtered set +message("Optimizing encoding dimension (q)...") opt_q <- bestQ(fds_filtered, type = "jaccard") png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() -# --- 6. Fitting --- +# --- 7. Fitting --- +message(paste0("Fitting FRASER model with q = ", opt_q, "...")) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) +# QQ Plot +message("Generating QQ plot...") png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) plotQQ(fds_fit, type = "jaccard") dev.off() -# --- 7. Annotation --- +# --- 8. Annotation --- +message("Annotating results with gene information...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 8. Results & Compressed Export --- +# --- 9. Results & Compressed Export --- +message("Extracting results...") res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, @@ -110,6 +116,7 @@ message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -# --- 9. 
Final Save --- +# --- 10. Final Save --- +message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) message("Analysis Complete.") \ No newline at end of file From b7d016c74e86a998e96a3ce6b39da3e7292da89e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 13:53:29 +1100 Subject: [PATCH 84/89] Finally at the analysis step! (Fixing step order) --- images/fraser/fraser_analysis.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index f7fd9fb0..533fa7d5 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -89,19 +89,20 @@ dev.off() message(paste0("Fitting FRASER model with q = ", opt_q, "...")) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# QQ Plot -message("Generating QQ plot...") -png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) -plotQQ(fds_fit, type = "jaccard") -dev.off() - -# --- 8. Annotation --- +# --- 8. Annotation (MUST happen before plotting QQ) --- message("Annotating results with gene information...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 9. Results & Compressed Export --- +# --- 9. QQ Plot (after annotation) --- +message("Generating QQ plot...") +png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) +# plotQQ expects the FDS object and sample/gene arguments, not 'type' +plotQQ(fds_fit, aggregate = TRUE, global = TRUE) +dev.off() + +# --- 10. Results & Compressed Export --- message("Extracting results...") res <- results(fds_fit, padjCutoff = args$pval_cutoff, @@ -116,7 +117,7 @@ message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -# --- 10. Final Save --- +# --- 11. 
Final Save --- message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) message("Analysis Complete.") \ No newline at end of file From da6dae4aa007b31b48c58ffe4473cf727ea7e2e8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 15:39:20 +1100 Subject: [PATCH 85/89] fingers crossed --- images/fraser/fraser_analysis.R | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 533fa7d5..aa41ec73 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -2,7 +2,7 @@ library(argparse) library(FRASER) -library(tidyverse) +library(readr) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) library(HDF5Array) @@ -89,25 +89,33 @@ dev.off() message(paste0("Fitting FRASER model with q = ", opt_q, "...")) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# --- 8. Annotation (MUST happen before plotting QQ) --- +# --- 8. Annotation (MUST happen BEFORE calculating gene-level p-values) --- message("Annotating results with gene information...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 9. QQ Plot (after annotation) --- -message("Generating QQ plot...") -png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) -# plotQQ expects the FDS object and sample/gene arguments, not 'type' -plotQQ(fds_fit, aggregate = TRUE, global = TRUE) -dev.off() +# --- 9. Calculate gene-level p-values (after annotation) --- +message("Calculating gene-level p-values...") +fds_fit <- calculatePadjValues(fds_fit, type = "jaccard") -# --- 10. Results & Compressed Export --- +# --- 10. 
QQ Plot (after gene-level p-values are calculated) --- +message("Generating QQ plot...") +tryCatch({ + png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) + plotQQ(fds_fit, aggregate = TRUE, global = TRUE) + dev.off() +}, error = function(e) { + dev.off() # Close any open graphics device + message("Warning: QQ plot generation failed: ", e$message) + message("Skipping QQ plot and continuing with analysis...") +}) + +# --- 11. Results & Compressed Export --- message("Extracting results...") res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, - zScoreCutoff = args$z_cutoff, minCount = args$min_count) # Extract all results for Jaccard @@ -117,7 +125,7 @@ message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -# --- 11. Final Save --- +# --- 12. Final Save --- message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) message("Analysis Complete.") \ No newline at end of file From 53fca54e56208163d5b76379b7dcd6fa54147eac Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 17:18:33 +1100 Subject: [PATCH 86/89] linting --- images/fraser/fraser_analysis.R | 2 +- images/fraser/fraser_count_non_split.R | 2 +- images/fraser/fraser_merge_split.R | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index aa41ec73..c0c721d2 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -128,4 +128,4 @@ write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) # --- 12. 
Final Save --- message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) -message("Analysis Complete.") \ No newline at end of file +message("Analysis Complete.") diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index d51f34bf..acc28857 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -77,4 +77,4 @@ if (file.exists(expected_h5)) { stop("BAM Index (.bai) missing. FRASER cannot perform random access.") } stop(paste("Counting failed. Output not found for sample:", args$sample_id)) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 791961c4..685f833a 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -88,4 +88,3 @@ spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) - From 39e788563541db5d00fe17122c1d121d89ae76ff Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 16 Feb 2026 13:39:49 +1100 Subject: [PATCH 87/89] reverting the validation check to commit c6f64b90 to match last successful run batch 1125169 --- images/fraser/fraser_count_non_split.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index acc28857..508f59d6 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -59,9 +59,12 @@ sample_result <- countNonSplicedReads(args$sample_id, recount= TRUE, spliceSiteCoords=filtered_coords) # 6. 
Verification -# The H5 file should now be at cache_dir/nonSplicedCounts-{sample_id}.h5 -expected_h5 <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) -expected_rds <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) +# Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ +actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) + +# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS +expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) +expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) if (file.exists(expected_h5)) { message("Success: Created non-split counts (HDF5) at ", expected_h5) @@ -69,8 +72,8 @@ if (file.exists(expected_h5)) { message("Success: Created non-split counts (RDS) at ", expected_rds) } else { # Debugging: List files in the directory to see what actually happened - if(dir.exists(cache_dir)){ - message("Files found in cache dir: ", paste(list.files(cache_dir), collapse=", ")) + if(dir.exists(actual_cache_dir)){ + message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) } if(!file.exists(paste0(args$bam_path, ".bai"))){ From f4d978db1a4dadc280fa6274e4125f69194b8c3d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 17 Feb 2026 14:56:50 +1100 Subject: [PATCH 88/89] Co-pilot review for all scripts before final run --- images/fraser/Dockerfile | 3 +-- images/fraser/fraser_analysis.R | 3 +-- images/fraser/fraser_count_non_split.R | 17 +++++-------- images/fraser/fraser_init.R | 4 +-- images/fraser/fraser_join_counts.R | 34 ++++++++++---------------- images/fraser/fraser_merge_non_split.R | 26 +++++++++++++++++--- images/fraser/fraser_merge_split.R | 3 +-- 7 files changed, 46 insertions(+), 44 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 8e7505df..eb01c846 
100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,7 +5,7 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ - rm -r /var/cache/apt/* && \ + rm -r /var/cache/apt/* && \ wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ chmod +x /usr/local/bin/micromamba && \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ @@ -20,7 +20,6 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ - # Install summarizedRT via BiocManager micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ micromamba run -n base Rscript -e 'BiocManager::install("Rsamtools", update=FALSE, ask=FALSE)' && \ micromamba run -n base Rscript -e 'BiocManager::install("HDF5Array", update=FALSE, ask=FALSE)' && \ diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index c0c721d2..548b8c32 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -12,7 +12,6 @@ parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--pval_cutoff", type = "double", default = 0.05) -parser$add_argument("--z_cutoff", type = "double") parser$add_argument("--delta_psi_cutoff", type = "double", default = 0.3) parser$add_argument("--min_count", type = "integer", default = 5) parser$add_argument("--nthreads", type = "integer", default = 1) @@ -74,7 +73,7 @@ filtered_dim <- nrow(fds_filtered) message(paste0("\n--- Filtering Summary ---")) message(paste0("Original junctions: ", raw_dim)) 
message(paste0("Filtered junctions: ", filtered_dim)) -message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) +if (raw_dim > 0) { message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) } # --- 6. Hyperparameter Optimization --- # Optimization must run on the filtered set diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 508f59d6..9456df66 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -31,12 +31,6 @@ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -# Use MulticoreParam if threads > 1, else Serial -if(args$nthreads > 1){ - bpparam <- MulticoreParam(workers = args$nthreads) -} else { - bpparam <- SerialParam() -} # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) @@ -55,14 +49,12 @@ sample_result <- countNonSplicedReads(args$sample_id, splitCountRanges = NULL, fds = fds, NcpuPerSample = args$nthreads, - minAnchor=5, - recount= TRUE, - spliceSiteCoords=filtered_coords) + minAnchor = 5, + recount = TRUE, + spliceSiteCoords = filtered_coords) # 6. 
Verification # Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) - -# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) @@ -75,6 +67,9 @@ if (file.exists(expected_h5)) { if(dir.exists(actual_cache_dir)){ message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) } + else { + message("Cache directory not found: ", actual_cache_dir) + } if(!file.exists(paste0(args$bam_path, ".bai"))){ stop("BAM Index (.bai) missing. FRASER cannot perform random access.") diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 03be3797..243ce91f 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -29,7 +29,7 @@ fds <- FraserDataSet( name = fds_name ) -#Setup parallel execution +# Setup parallel execution bp <- MulticoreParam(workers = args$nthreads) register(bp) @@ -38,7 +38,7 @@ register(bp) fds <- saveFraserDataSet(fds) -#Print location for Python to capture if needed +# Print location for Python to capture if needed fds_save_path <- file.path( args$work_dir, "savedObjects", diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index e18c6b2e..895d682f 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -68,17 +68,9 @@ message("\n[1/7] Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) workingDir(fds) <- saveDir -# 2. 
Load the master splice site coordinates that were used to create non-split counts -message("\n[2/7] Loading master splice site coordinates...") -splice_coords_path <- file.path(dirname(dirname(saveDir)), "splice_site_coords.RDS") -if(!file.exists(splice_coords_path)) { - stop("ERROR: splice_site_coords.RDS not found at: ", splice_coords_path) -} -master_splice_coords <- readRDS(splice_coords_path) -message("Master coordinates loaded: ", length(master_splice_coords), " sites") -# 3. Load split counts -message("\n[3/7] Loading split counts...") +# 2. Load split counts +message("\n[2/7] Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ stop(paste("Missing splitCounts directory at:", split_se_dir)) @@ -92,8 +84,8 @@ message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol message("Annotating split counts with splice site IDs...") splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) -# 4. Load merged non-split counts -message("\n[4/7] Loading merged non-split counts...") +# 3. Load merged non-split counts +message("\n[3/7] Loading merged non-split counts...") merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) @@ -106,8 +98,8 @@ message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", n message("Re-annotating non-split counts with splice site IDs...") nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) -# 5. Filter split counts to only include junctions that reference the non-split splice sites -message("\n[5/7] Filtering split counts to match non-split coordinate space...") +# 4. 
Filter split counts to only include junctions that reference the non-split splice sites +message("\n[4/7] Filtering split counts to match non-split coordinate space...") # Get the valid splice site IDs from non-split counts nonsplit_ids <- mcols(nonSplitCounts_se)$spliceSiteID @@ -138,8 +130,8 @@ message(" Filtered split counts dimensions: ", nrow(splitCounts_se_filtered), " # Update the splitCounts_se to use the filtered version splitCounts_se <- splitCounts_se_filtered -# 6. Verify ID consistency after filtering -message("\n[6/7] Verifying ID consistency after filtering...") +# 5. Verify ID consistency after filtering +message("\n[5/7] Verifying ID consistency after filtering...") split_start_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$startID)) split_end_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$endID)) @@ -157,8 +149,8 @@ if(missing_start > 0 || missing_end > 0) { message("SUCCESS: All split junction IDs now exist in non-split coordinate set!") -# 7. Add counts to FRASER object -message("\n[7/7] Joining split and non-split counts into FDS object...") +# 6. Add counts to FRASER object +message("\n[6/7] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, @@ -170,8 +162,8 @@ message(" - Split junctions: ", nrow(counts(fds, type = "j"))) message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) -# 8. Calculate PSI values -message("\n[8/8] Calculating PSI values...") +# 7. Calculate PSI values +message("\n[7/7] Calculating PSI values...") fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) message("PSI values calculated successfully!") @@ -180,7 +172,7 @@ message("Available assays: ", paste(assayNames(fds), collapse = ", ")) # Final integrity check check_fds_integrity(fds) -# 9. Save final FRASER object +# 8. 
Save final FRASER object message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 68b243d8..5415c9b6 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -61,16 +61,34 @@ if(length(h5_files) == 0) { # Copy with standardized naming convention for(i in seq_along(sample_ids)) { sid <- sample_ids[i] - src_file <- h5_files[grepl(sid, h5_files)][1] - if(is.na(src_file)) { + # 1. Precise matching using word boundaries (\\b) + # This ensures "S1" doesn't match "S10" + match_idx <- grep(paste0("\\b", sid, "\\b"), h5_files) + + # 2. Safety check: Did we find a match? + if(length(match_idx) == 0) { warning("No H5 file found for sample: ", sid) next } + # 3. Extract the first matching file path + src_file <- h5_files[match_idx[1]] + + # 4. Define destination and copy + # Constructing the new filename: nonSplicedCounts-[sid].h5 dest <- file.path(out_dir, paste0("nonSplicedCounts-", sid, ".h5")) - file.copy(src_file, dest, overwrite = TRUE) - message(" Copied: ", basename(src_file), " -> ", basename(dest)) + + success <- file.copy(src_file, dest, overwrite = TRUE) + + # 5. Provide feedback + if(success) { + message("Successfully Copied: ", sid) + message(" From: ", basename(src_file)) + message(" To: ", basename(dest)) + } else { + warning("Failed to copy file for sample: ", sid) + } } # 5. Validate H5 files and build combined count matrix diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 685f833a..667e4059 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -32,7 +32,6 @@ register(bp) # 3. 
Load Dataset # Match the name exactly as defined in the init stage -fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) @@ -58,6 +57,7 @@ strandSpecific(fds) <- 2 minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ +#TODO: We could add an explicit check here to ensure the expected files are present before calling this function. message("Merging split counts from cache...") splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE) @@ -84,7 +84,6 @@ spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) # Use absolute paths for saving to match Python 'mv' commands -#This is slightly confusing, but the filtered granges will be used to annotate non_split counts saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) From 1b8375eab1a9cb26cd2da73df33dea422fc14b8a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 17 Feb 2026 17:21:03 +1100 Subject: [PATCH 89/89] Adding a readme --- images/fraser/README_Fraser.txt | 42 ++++++++++++++++++++++++++++++ images/fraser/fraser_merge_split.R | 1 - 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 images/fraser/README_Fraser.txt diff --git a/images/fraser/README_Fraser.txt b/images/fraser/README_Fraser.txt new file mode 100644 index 00000000..85f6c06f --- /dev/null +++ b/images/fraser/README_Fraser.txt @@ -0,0 +1,42 @@ +The tool is called "Fraser" and it is an R package that can be used to analyze RNA-seq data for splicing events. +It is designed to identify and quantify splicing events in a cohort of samples, and can be used to compare +splicing patterns between different groups of samples. 
+ +This image and tool was made to run Cohort-Level analyses on Exome data for the RD team. +When using these scripts and tools, please make sure to have the correct version of R and the necessary packages installed. +They were designed to be used in conjunction with the cpg-flow-rdrnaseq and specifically the fraser stage of that workflow and run in this order + +1. fraser_init.R: +Initializes the FRASER dataset for a cohort. +Inputs: --cohort_id, --sample_map (CSV with sample_id and bam_path), --work_dir, --nthreads +What it does: Reads the sample map, creates a FRASER dataset skeleton, and saves it for downstream steps. + +2. fraser_count_split.R: +Counts split (spliced) reads for a single sample. +Inputs: --fds_path, --bam_path, --cohort_id, --sample_id, --work_dir, --nthreads +What it does: Loads the dataset, subsets to the sample, counts split reads, and saves results to the cache. + +3. fraser_merge_split.R: +Merges split read counts from all samples. +Inputs: --fds_path, --cohort_id, --work_dir, --nthreads +What it does: Loads and merges all split count files, annotates splice sites, and saves the merged data for the cohort. + +4. fraser_count_non_split.R: +Counts non-split (unspliced) reads for a single sample. +Inputs: --fds_path, --bam_path, --cohort_id, --sample_id, --coords_path, --work_dir, --nthreads +What it does: Counts non-split reads at splice site coordinates and saves results to the cache. + +5. fraser_merge_non_split.R: +Merges non-split read counts and calculates PSI values. +Inputs: --fds_path, --cohort_id, --filtered_ranges_path, --work_dir, --nthreads +What it does: Merges non-split counts for all samples, organizes HDF5 files, and prepares data for analysis. + +6. fraser_join_counts.R: +Joins split and non-split counts into a single dataset. +Inputs: --fds_path, --cohort_id, --work_dir, --nthreads +What it does: Checks data integrity, joins split and non-split counts, and ensures all metadata is present for analysis. + +7. 
fraser_analysis.R: +Performs statistical analysis and generates QC plots. +Inputs: --fds_dir, --cohort_id, --pval_cutoff, --delta_psi_cutoff, --min_count, --nthreads +What it does: Loads the final dataset, calculates splicing statistics, applies filters, and generates quality control plots and results. diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 667e4059..13771414 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -57,7 +57,6 @@ strandSpecific(fds) <- 2 minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ -#TODO: We could add an explicit check here to ensure the expected files are present before calling this function. message("Merging split counts from cache...") splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE)