From 20d3c597abb0d5413f9e6137954576b1aa8e9e8c Mon Sep 17 00:00:00 2001 From: Matt Welland Date: Tue, 6 Jan 2026 11:18:10 +1000 Subject: [PATCH 01/89] Update FRASER for test build --- images/fraser/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 527cbc4d..561653c5 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -2,7 +2,7 @@ FROM debian:bookworm-slim ENV MAMBA_ROOT_PREFIX /root/micromamba ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH -ARG VERSION=${VERSION:-1.12.1} +ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ rm -r /var/lib/apt/lists/* && \ From 85a965174336c084abfc8b9efb9b6062c14e3771 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 13:50:42 +1100 Subject: [PATCH 02/89] Maybe this should build? syntax fixes for docker --- images/fraser/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 561653c5..04be73b2 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -7,8 +7,9 @@ ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ rm -r /var/lib/apt/lists/* && \ rm -r /var/cache/apt/* && \ - wget -qO- https://api.anaconda.org/download/conda-forge/micromamba/0.8.2/linux-64/micromamba-0.8.2-he9b6cbd_0.tar.bz2 | tar -xvj -C /usr/local bin/micromamba && \ - mkdir ${MAMBA_ROOT_PREFIX} && \ + wget https://api.anaconda.org/download/conda-forge/micromamba/0.8.2/linux-64/micromamba-0.8.2-he9b6cbd_0.tar.bz2 && \ + tar -xvjf micromamba-0.8.2-he9b6cbd_0.tar.bz2 -C /usr/local && \ + mkdir -p ${MAMBA_ROOT_PREFIX} && \ micromamba install -y --prefix ${MAMBA_ROOT_PREFIX} -c bioconda -c conda-forge \ python=3.10 r-base=4.3.1 r-reshape2=1.4.4 r-tidyverse=2.0.0 \ bioconductor-fraser=${VERSION} bioconductor-txdb.hsapiens.ucsc.hg38.knowngene bioconductor-org.hs.eg.db && \ From 
e9476d358e7f01fb2826b4b8abfa4546e1536198 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:02:11 +1100 Subject: [PATCH 03/89] seperating micromamba unpacking and cleaning image --- images/fraser/Dockerfile | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 04be73b2..5d39a5d1 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,12 +5,17 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ - rm -r /var/lib/apt/lists/* && \ - rm -r /var/cache/apt/* && \ - wget https://api.anaconda.org/download/conda-forge/micromamba/0.8.2/linux-64/micromamba-0.8.2-he9b6cbd_0.tar.bz2 && \ - tar -xvjf micromamba-0.8.2-he9b6cbd_0.tar.bz2 -C /usr/local && \ + wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ + chmod +x /usr/local/bin/micromamba && \ + \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ - micromamba install -y --prefix ${MAMBA_ROOT_PREFIX} -c bioconda -c conda-forge \ + \ + # Perform the installation + micromamba install -y --root-prefix ${MAMBA_ROOT_PREFIX} --prefix ${MAMBA_ROOT_PREFIX} \ + -c bioconda -c conda-forge \ python=3.10 r-base=4.3.1 r-reshape2=1.4.4 r-tidyverse=2.0.0 \ bioconductor-fraser=${VERSION} bioconductor-txdb.hsapiens.ucsc.hg38.knowngene bioconductor-org.hs.eg.db && \ - rm -r /root/micromamba/pkgs + \ + # Clean up to keep the image small + micromamba clean --all --yes && \ + rm -rf /var/lib/apt/lists/* From 75b15369fc6b26e0845b077bdf7fc7dc72e0b9f4 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:08:59 +1100 Subject: [PATCH 04/89] version conflict between fraser R and annotationdbi, if this works ill pin the solved versions --- images/fraser/Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/images/fraser/Dockerfile 
b/images/fraser/Dockerfile index 5d39a5d1..9d179938 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,17 +5,17 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ - wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ + wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ chmod +x /usr/local/bin/micromamba && \ - \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ - \ - # Perform the installation micromamba install -y --root-prefix ${MAMBA_ROOT_PREFIX} --prefix ${MAMBA_ROOT_PREFIX} \ -c bioconda -c conda-forge \ - python=3.10 r-base=4.3.1 r-reshape2=1.4.4 r-tidyverse=2.0.0 \ - bioconductor-fraser=${VERSION} bioconductor-txdb.hsapiens.ucsc.hg38.knowngene bioconductor-org.hs.eg.db && \ - \ - # Clean up to keep the image small + python=3.10 \ + r-base=4.4 \ + r-reshape2 \ + r-tidyverse \ + bioconductor-fraser \ + bioconductor-txdb.hsapiens.ucsc.hg38.knowngene \ + bioconductor-org.hs.eg.db && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* From 87ca809b9c60e38fcbe5d3e0112312728fba2f01 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:23:51 +1100 Subject: [PATCH 05/89] pinning the successful specific versions --- images/fraser/Dockerfile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 9d179938..008be1f3 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -10,12 +10,12 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ micromamba install -y --root-prefix ${MAMBA_ROOT_PREFIX} --prefix ${MAMBA_ROOT_PREFIX} \ -c bioconda -c conda-forge \ - python=3.10 \ - r-base=4.4 \ - r-reshape2 \ - r-tidyverse \ - bioconductor-fraser \ - 
bioconductor-txdb.hsapiens.ucsc.hg38.knowngene \ - bioconductor-org.hs.eg.db && \ + python=3.10.19 \ + r-base=4.4.3 \ + r-reshape2=1.4.5 \ + r-tidyverse=2.0.0 \ + bioconductor-fraser=2.4.6 \ + bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ + bioconductor-org.hs.eg.db=3.20.0 && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* From e3960ea00135db7a686bcecf502063d24bb2cdf8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 6 Jan 2026 14:44:39 +1100 Subject: [PATCH 06/89] fraser version variable and re-adding space saving measures --- images/fraser/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 008be1f3..71b7bb94 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,6 +5,7 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ + rm -r /var/cache/apt/* && \ wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ chmod +x /usr/local/bin/micromamba && \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ @@ -14,7 +15,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ r-base=4.4.3 \ r-reshape2=1.4.5 \ r-tidyverse=2.0.0 \ - bioconductor-fraser=2.4.6 \ + bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ micromamba clean --all --yes && \ From 59d4d3b1db748ad60f105675b4359d77da8b8895 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 16 Jan 2026 17:44:15 +1100 Subject: [PATCH 07/89] Baking the R scripts into the fraser image --- images/fraser/Dockerfile | 9 ++++ images/fraser/__init__.py | 0 images/fraser/count_split_reads.R | 32 ++++++++++++ images/fraser/fraser_analysis.R | 70 ++++++++++++++++++++++++++ images/fraser/fraser_count_non_split.R | 29 +++++++++++ images/fraser/fraser_init.R | 49 
++++++++++++++++++ images/fraser/fraser_merge_non_split.R | 38 ++++++++++++++ images/fraser/fraser_merge_split.R | 50 ++++++++++++++++++ 8 files changed, 277 insertions(+) create mode 100644 images/fraser/__init__.py create mode 100644 images/fraser/count_split_reads.R create mode 100644 images/fraser/fraser_analysis.R create mode 100644 images/fraser/fraser_count_non_split.R create mode 100644 images/fraser/fraser_init.R create mode 100644 images/fraser/fraser_merge_non_split.R create mode 100644 images/fraser/fraser_merge_split.R diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 71b7bb94..ab1b77df 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -20,3 +20,12 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-org.hs.eg.db=3.20.0 && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* + +RUN mkdir /RDrnaseq + +COPY count_split_reads.R RDrnaseq/ +COPY fraser_analysis.R RDrnaseq/ +COPY fraser_count_non_split.R RDrnaseq/ +COPY fraser_init.R RDrnaseq/ +COPY fraser_merge_non_split.R RDrnaseq/ +COPY fraser_merge_split.R RDrnaseq/ \ No newline at end of file diff --git a/images/fraser/__init__.py b/images/fraser/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/images/fraser/count_split_reads.R b/images/fraser/count_split_reads.R new file mode 100644 index 00000000..50afdfaf --- /dev/null +++ b/images/fraser/count_split_reads.R @@ -0,0 +1,32 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Count Split Reads for a Single Sample") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_id", required = TRUE, help = "Sample ID to count") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", 
type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +#Setup FRASER directory structure + +dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) + +#Force HDF5 to save RAM + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) + +fds <- countSplitReads( + sampleID = args$sample_id, + fds = fds, + NcpuPerSample = args$nthreads, + recount = TRUE +) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R new file mode 100644 index 00000000..e012ee8b --- /dev/null +++ b/images/fraser/fraser_analysis.R @@ -0,0 +1,70 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(tidyverse) +library(TxDb.Hsapiens.UCSC.hg38.knownGene) +library(org.Hs.eg.db) +library(ggplot2) + +parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") +parser$add_argument("--fds_dir", required = TRUE, help = "Directory containing savedObjects") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--pval_cutoff", type = "double", default = 0.05) +parser$add_argument("--z_cutoff", type = "double") +parser$add_argument("--delta_psi_cutoff", type = "double", default = 0.3) +parser$add_argument("--min_count", type = "integer", default = 5) +parser$add_argument("--nthreads", type = "integer", default = 1) +args <- parser$parse_args() + +#Force HDF5 + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +z_cutoff <- if (is.null(args$z_cutoff)) NA else args$z_cutoff +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + +#Load + +fds <- loadFraserDataSet(dir = args$fds_dir, name = args$cohort_id) + +#Filter + +fds <- 
filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = FALSE) +dir.create("plots/misc", recursive = TRUE, showWarnings = FALSE) + +png("plots/misc/filter_expression.png", width = 2000, height = 2000, res = 300) +plotFilterExpression(fds, bins = 100) +dev.off() + +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] +psi_types <- c("psi5", "psi3", "jaccard") + +optimal_qs <- c( + psi5 = bestQ(fds_filtered, type = "psi5"), + psi3 = bestQ(fds_filtered, type = "psi3"), + jaccard = bestQ(fds_filtered, type = "jaccard") +) + +fds_fit <- FRASER(fds_filtered, q = optimal_qs, BPPARAM = bp) + +#Annotation + +txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene +orgDb <- org.Hs.eg.db +fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = txdb, orgDb = orgDb) + +#Results + +res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, + zScoreCutoff = z_cutoff, minCount = args$min_count) +res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) + +write_csv(as.data.frame(res), "results.significant.csv") +write_csv(as.data.frame(res_all), "results.all.csv") + +#Save final state + +saveFraserDataSet(fds_fit, dir = getwd(), name = "FraserDataSet") diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R new file mode 100644 index 00000000..a3e743ea --- /dev/null +++ b/images/fraser/fraser_count_non_split.R @@ -0,0 +1,29 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Count Non-Split Reads for a Single Sample") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_id", required = TRUE, help = "Sample ID") +parser$add_argument("--coords_path", required = TRUE, help = "Path to splice_site_coords.RDS") +parser$add_argument("--working_dir", default = "output", help = "Working 
directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) +splice_site_coords <- readRDS(args$coords_path) + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +countNonSplicedReads( + sampleID = args$sample_id, + fds = fds, + spliceSiteCoords = splice_site_coords, + splitCountRanges = NULL, + minAnchor = 5, + NcpuPerSample = args$nthreads +) diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R new file mode 100644 index 00000000..49cf6b9f --- /dev/null +++ b/images/fraser/fraser_init.R @@ -0,0 +1,49 @@ +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Initialize FRASER Data Set") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_ids", required = TRUE, help = "Comma-separated sample IDs") +parser$add_argument("--bam_files", required = TRUE, help = "Comma-separated BAM file paths") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +sample_ids <- unlist(strsplit(args$sample_ids, ",")) +bam_files <- unlist(strsplit(args$bam_files, ",")) + +sample_table <- DataFrame( + sampleID = sample_ids, + bamFile = bam_files, + group = seq_len(length(bam_files)), + pairedEnd = TRUE +) +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +fds <- FraserDataSet( + colData = sample_table, + workingDir = args$working_dir, + name = args$cohort_id +) + +#Setup parallel execution +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + + +#Initialize counts and metadata +fds <- countRNAData(fds, BPPARAM = bp) +fds <- calculatePSIValues(fds,BPPARAM = bp) +fds <- saveFraserDataSet(fds) + 
+#Print location for Python to capture if needed +fds_save_path <- file.path( + args$working_dir, + "savedObjects", + paste0("FRASER_", args$cohort_id), + "fds-object.RDS" +) + +cat(fds_rds_path, "\n") diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R new file mode 100644 index 00000000..59c2b00b --- /dev/null +++ b/images/fraser/fraser_merge_non_split.R @@ -0,0 +1,38 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to g_ranges_non_split_counts.RDS") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) +register(MulticoreParam(workers = args$nthreads)) + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +#1. Merge Split + +fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE) + +#2. Merge Non-Split + +split_count_ranges <- readRDS(args$filtered_ranges_path) +fds <- getNonSplitReadCountsForAllSamples( + fds = fds, + splitCountRanges = split_count_ranges, + minAnchor = 5, + recount = FALSE +) + +#3. 
FRASER 2.0: Calculate Jaccard & PSI + +fds <- calculatePSIValues(fds) +fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R new file mode 100644 index 00000000..55a6df34 --- /dev/null +++ b/images/fraser/fraser_merge_split.R @@ -0,0 +1,50 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Merge Split Read Counts") +parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) + +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) + +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + + +fds <- countRNAData(fds, BPPARAM = bp) + +#Merge split counts + +fds <- getSplitReadCountsForAllSamples(fds = fds, recount = FALSE) + +#Prepare ranges for non-split counting + +split_counts <- asSE(fds, type = "j") +split_count_ranges <- rowRanges(split_counts) +split_count_ranges <- FRASER:::annotateSpliceSite(split_count_ranges) +saveRDS(split_count_ranges, "g_ranges_split_counts.RDS") + +#Filter for expression to reduce non-split counting overhead + +minExpressionInOneSample <- 20 +max_count <- rowMaxs(assay(split_counts, "rawCountsJ")) +passed <- max_count >= minExpressionInOneSample +filtered_ranges <- split_count_ranges[passed, ] +saveRDS(filtered_ranges, 
"g_ranges_non_split_counts.RDS") + +splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) +saveRDS(splice_site_coords, "splice_site_coords.RDS") + +saveFraserDataSet(fds) From 021908f4ae9b7389f2f5e8c7fbb3598b9085d49d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 16 Jan 2026 17:45:28 +1100 Subject: [PATCH 08/89] no need --- images/fraser/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 images/fraser/__init__.py diff --git a/images/fraser/__init__.py b/images/fraser/__init__.py deleted file mode 100644 index e69de29b..00000000 From 264c6d5c4f6856641bc19b56c33c199e4722a014 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 16 Jan 2026 17:47:18 +1100 Subject: [PATCH 09/89] End of file lint --- images/fraser/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index ab1b77df..182f543e 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -28,4 +28,4 @@ COPY fraser_analysis.R RDrnaseq/ COPY fraser_count_non_split.R RDrnaseq/ COPY fraser_init.R RDrnaseq/ COPY fraser_merge_non_split.R RDrnaseq/ -COPY fraser_merge_split.R RDrnaseq/ \ No newline at end of file +COPY fraser_merge_split.R RDrnaseq/ From cb22e31fbeee75cf4f6755fb57040675be0504e7 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 19 Jan 2026 17:48:25 +1100 Subject: [PATCH 10/89] refactoring file paths --- images/fraser/Dockerfile | 2 +- images/fraser/count_split_reads.R | 32 ----------- images/fraser/fraser_analysis.R | 77 ++++++++++++++++---------- images/fraser/fraser_count_non_split.R | 44 +++++++++++++-- images/fraser/fraser_count_split.R | 54 ++++++++++++++++++ images/fraser/fraser_init.R | 35 ++++++------ images/fraser/fraser_merge_non_split.R | 39 +++++++++---- images/fraser/fraser_merge_split.R | 56 ++++++++++++------- 8 files changed, 225 insertions(+), 114 deletions(-) delete mode 100644 images/fraser/count_split_reads.R create mode 
100644 images/fraser/fraser_count_split.R diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 182f543e..49e96a56 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -23,7 +23,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ RUN mkdir /RDrnaseq -COPY count_split_reads.R RDrnaseq/ +COPY fraser_count_split.R RDrnaseq/ COPY fraser_analysis.R RDrnaseq/ COPY fraser_count_non_split.R RDrnaseq/ COPY fraser_init.R RDrnaseq/ diff --git a/images/fraser/count_split_reads.R b/images/fraser/count_split_reads.R deleted file mode 100644 index 50afdfaf..00000000 --- a/images/fraser/count_split_reads.R +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env Rscript - -library(argparse) -library(FRASER) -library(BiocParallel) - -parser <- ArgumentParser(description = "Count Split Reads for a Single Sample") -parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") -parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--sample_id", required = TRUE, help = "Sample ID to count") -parser$add_argument("--working_dir", default = "output", help = "Working directory") -parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") -args <- parser$parse_args() - -#Setup FRASER directory structure - -dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) - -#Force HDF5 to save RAM - -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) - -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) - -fds <- countSplitReads( - sampleID = args$sample_id, - fds = fds, - NcpuPerSample = args$nthreads, - recount = TRUE -) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 
e012ee8b..54818e44 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -8,7 +8,7 @@ library(org.Hs.eg.db) library(ggplot2) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") -parser$add_argument("--fds_dir", required = TRUE, help = "Directory containing savedObjects") +parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--pval_cutoff", type = "double", default = 0.05) parser$add_argument("--z_cutoff", type = "double") @@ -17,8 +17,7 @@ parser$add_argument("--min_count", type = "integer", default = 5) parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() -#Force HDF5 - +# Force HDF5 for large matrix operations options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) @@ -26,45 +25,65 @@ z_cutoff <- if (is.null(args$z_cutoff)) NA else args$z_cutoff bp <- MulticoreParam(workers = args$nthreads) register(bp) -#Load - -fds <- loadFraserDataSet(dir = args$fds_dir, name = args$cohort_id) +# 1. Load the merged FDS +# Python extracted the tar into args$fds_dir. +# Inside is 'savedObjects/FRASER_{cohort_id}/...' +fds_name <- paste0("FRASER_", args$cohort_id) +fds <- loadFraserDataSet(dir = args$fds_dir, name = fds_name) -#Filter +# 2. 
Filter Expression and Variability +# This reduces the number of tests and improves power +message("Filtering junctions...") +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = FALSE) dir.create("plots/misc", recursive = TRUE, showWarnings = FALSE) - png("plots/misc/filter_expression.png", width = 2000, height = 2000, res = 300) plotFilterExpression(fds, bins = 100) dev.off() -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -psi_types <- c("psi5", "psi3", "jaccard") - -optimal_qs <- c( - psi5 = bestQ(fds_filtered, type = "psi5"), - psi3 = bestQ(fds_filtered, type = "psi3"), - jaccard = bestQ(fds_filtered, type = "jaccard") -) - -fds_fit <- FRASER(fds_filtered, q = optimal_qs, BPPARAM = bp) - -#Annotation +# 3. Hyperparameter Optimization (bestQ) and Fitting +# FRASER uses an autoencoder to control for latent confounders (batch effects) +message("Finding optimal hyperparameter Q and fitting autoencoder...") +# In FRASER 2.0, we typically fit psi5, psi3, and jaccard +fds <- FRASER(fds, q = c(psi5=NA, psi3=NA, jaccard=NA), BPPARAM = bp) +# 4. Annotation +message("Annotating results with UCSC hg38...") txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene orgDb <- org.Hs.eg.db -fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = txdb, orgDb = orgDb) +fds <- annotateRangesWithTxDb(fds, txdb = txdb, orgDb = orgDb) -#Results +# 5. 
Extract Results +message("Exporting results...") +res <- results(fds, + padjCutoff = args$pval_cutoff, + deltaPsiCutoff = args$delta_psi_cutoff, + zScoreCutoff = z_cutoff, + minCount = args$min_count) -res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, - zScoreCutoff = z_cutoff, minCount = args$min_count) -res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +res_all <- results(fds, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) write_csv(as.data.frame(res), "results.significant.csv") write_csv(as.data.frame(res_all), "results.all.csv") -#Save final state - -saveFraserDataSet(fds_fit, dir = getwd(), name = "FraserDataSet") +# 6. Generate Diagnostic Plots +message("Generating Volcano and Q-Q plots...") +dir.create("plots/volcano", recursive = TRUE) +for(type in c("psi5", "psi3", "jaccard")){ + png(paste0("plots/volcano/volcano_", type, ".png"), width = 2000, height = 2000, res = 300) + print(plotVolcano(fds, type = type, sampleID = sampleIDs(fds)[1])) # Plots first sample as example + dev.off() +} + +# 7. 
Final Save +# We save this in a generic folder so Python can easily grab it +saveFraserDataSet(fds, dir = "output", name = paste0("Final_", args$cohort_id)) + +# Summary statistics for the log +sink("statistics_summary.txt") +cat("Cohort ID:", args$cohort_id, "\n") +cat("Total Samples:", length(sampleIDs(fds)), "\n") +cat("Significant Events (adj P <", args$pval_cutoff, "):", nrow(res), "\n") +sink() + +message("Analysis complete.") \ No newline at end of file diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index a3e743ea..d272c9cc 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -5,25 +5,57 @@ library(FRASER) library(BiocParallel) parser <- ArgumentParser(description = "Count Non-Split Reads for a Single Sample") -parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") +parser$add_argument("--bam_path", required = TRUE, help = "Path to the localized BAM file") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--sample_id", required = TRUE, help = "Sample ID") parser$add_argument("--coords_path", required = TRUE, help = "Path to splice_site_coords.RDS") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) -splice_site_coords <- readRDS(args$coords_path) +# 1. 
Reconstruct Directory Structure +fds_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + +# Create the specific cache directory for non-split counts +cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) +dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) +# 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) +register(MulticoreParam(workers = args$nthreads)) + +# 3. Load Dataset and Coordinates +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +splice_site_coords <- readRDS(args$coords_path) + +# 4. Synchronize BAM Path +# Ensure the FDS object points to the current local BAM location on the worker +bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path +# 5. Run Non-Split Counting +# This writes the .h5 or .RDS file into the cache_dir created above +message(paste("Counting non-split reads for sample:", args$sample_id)) countNonSplicedReads( sampleID = args$sample_id, fds = fds, spliceSiteCoords = splice_site_coords, - splitCountRanges = NULL, + splitCountRanges = NULL, # Already handled in merge_split minAnchor = 5, - NcpuPerSample = args$nthreads + NcpuPerSample = args$nthreads, + recount = TRUE ) + +# 6. Verification +# The Python script uses a 'find' command to move this output. +# We just need to check if the file was created in the cache. 
+out_file <- list.files(cache_dir, pattern = paste0("nonSplicedCounts-", args$sample_id)) +if (length(out_file) > 0) { + message("Successfully created non-split counts: ", out_file[1]) +} else { + stop("Non-split counts file was not found in: ", cache_dir) +} \ No newline at end of file diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R new file mode 100644 index 00000000..510a4a97 --- /dev/null +++ b/images/fraser/fraser_count_split.R @@ -0,0 +1,54 @@ +#!/usr/bin/env Rscript + +library(argparse) +library(FRASER) +library(BiocParallel) + +parser <- ArgumentParser(description = "Count Split Reads for a Single Sample") +parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") +parser$add_argument("--bam_path", required = TRUE, help = "Path to the specific BAM file for this sample") +parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") +parser$add_argument("--sample_id", required = TRUE, help = "Sample ID to count") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") +parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") +args <- parser$parse_args() + +# 1. Reconstruct the FRASER directory structure so loadFraserDataSet works +# FRASER expects: {work_dir}/savedObjects/FRASER_{cohort_id}/fds-object.RDS +fds_dir_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_dir_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + +# 2. Force HDF5 and Configure Parallelism +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) +register(MulticoreParam(workers = args$nthreads)) + +# 3. Load the dataset +# Note: name must match what was used in fraser_init.R +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) + +# 4. 
Update the BAM path for this specific worker environment +# This ensures that even if the BAM was at a different path during Init, +# the worker uses the current localized path. +bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path + +# 5. Run counting for the specific sample +# countRNAData with sampleId filter is the standard way to run parallel counts +fds <- countRNAData( + fds, + sampleIds = args$sample_id, + NcpuPerSample = args$nthreads, + recount = TRUE +) + +# 6. Verification +# FRASER writes split counts to: {work_dir}/cache/splitCounts/splitCounts-{sample_id}.RDS +expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) + +if (file.exists(expected_out)) { + message("Successfully created split counts at: ", expected_out) +} else { + stop("Split counts file was not found at expected location: ", expected_out) +} \ No newline at end of file diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 49cf6b9f..5deaabff 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -4,28 +4,28 @@ library(BiocParallel) parser <- ArgumentParser(description = "Initialize FRASER Data Set") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--sample_ids", required = TRUE, help = "Comma-separated sample IDs") -parser$add_argument("--bam_files", required = TRUE, help = "Comma-separated BAM file paths") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--sample_map", required = TRUE, help = "Path to CSV with sample_id and bam_path") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory for FRASER") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -sample_ids <- unlist(strsplit(args$sample_ids, ",")) -bam_files <- unlist(strsplit(args$bam_files, ",")) +# 
1. CSV Injection: Read the mapping file instead of parsing strings +# Expected columns: sample_id, bam_path +sample_map <- read.csv(args$sample_map) sample_table <- DataFrame( - sampleID = sample_ids, - bamFile = bam_files, - group = seq_len(length(bam_files)), + sampleID = as.character(sample_map$sample_id), + bamFile = as.character(sample_map$bam_path), + group = "cohort", pairedEnd = TRUE ) options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) fds <- FraserDataSet( - colData = sample_table, - workingDir = args$working_dir, - name = args$cohort_id + colData = sample_table, + workingDir = args$work_dir, + name = paste0("FRASER_", args$cohort_id) ) #Setup parallel execution @@ -33,17 +33,20 @@ bp <- MulticoreParam(workers = args$nthreads) register(bp) -#Initialize counts and metadata -fds <- countRNAData(fds, BPPARAM = bp) -fds <- calculatePSIValues(fds,BPPARAM = bp) +# We calculate initial metadata/PSI skeleton +fds <- calculatePSIValues(fds, BPPARAM = bp) fds <- saveFraserDataSet(fds) #Print location for Python to capture if needed fds_save_path <- file.path( - args$working_dir, + args$work_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS" ) -cat(fds_rds_path, "\n") +if (file.exists(fds_save_path)) { + message("Successfully initialized FDS skeleton at: ", fds_save_path) +} else { + stop("FDS object was not saved to: ", fds_save_path) +} \ No newline at end of file diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 59c2b00b..78c3ed3b 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -5,34 +5,53 @@ library(FRASER) library(BiocParallel) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") -parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") +parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") 
parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to g_ranges_non_split_counts.RDS") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) -register(MulticoreParam(workers = args$nthreads)) +# 1. Reconstruct Directory Structure +fds_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +# 2. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) +bp <- MulticoreParam(workers = args$nthreads) +register(bp) -#1. Merge Split +# 3. Load Dataset +# This loads the metadata; the counts will be pulled from the cache symlinked by Python +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# 4. Merge Split Counts +# This step updates the fds with the junctions identified in Step 3 +message("Merging all split-read counts...") fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE) -#2. Merge Non-Split - +# 5. Merge Non-Split Counts +# We use the filtered ranges from Step 3 to define the 'at-site' junctions +message("Merging all non-split-read counts from cache...") split_count_ranges <- readRDS(args$filtered_ranges_path) fds <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = split_count_ranges, minAnchor = 5, - recount = FALSE + recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) -#3. FRASER 2.0: Calculate Jaccard & PSI +# 6. 
Statistical Normalization (PSI and Jaccard) +# This is the 'FRASER 2.0' logic. It calculates the metrics used for outlier detection. +message("Calculating PSI and Jaccard Index values...") +fds <- calculatePSIValues(fds, BPPARAM = bp) -fds <- calculatePSIValues(fds) +# 7. Final Save +# This creates the complete 'fitted' dataset that the analysis script will use +message("Saving final merged FraserDataSet...") fds <- saveFraserDataSet(fds) + +message("Merge non-split and PSI calculation complete.") \ No newline at end of file diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 55a6df34..7053827e 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -3,48 +3,64 @@ library(argparse) library(FRASER) library(BiocParallel) +library(summarizedRT) # Useful for rowMaxs/assay handling parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--working_dir", default = "output", help = "Working directory") +parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -dir.create(file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id)), recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(args$working_dir, "savedObjects", paste0("FRASER_", args$cohort_id), "fds-object.RDS")) +# 1. Reconstruct Directory Structure +fds_name <- paste0("FRASER_", args$cohort_id) +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +# 2. 
Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) - -fds <- loadFraserDataSet(dir = args$working_dir, name = args$cohort_id) - bp <- MulticoreParam(workers = args$nthreads) register(bp) +# 3. Load Dataset +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -fds <- countRNAData(fds, BPPARAM = bp) - -#Merge split counts +# 4. Merge individual split count RDS files from the cache +# FRASER automatically looks in: {work_dir}/cache/splitCounts/ +message("Merging split counts from cache...") +fds <- countRNAData(fds, recount = FALSE, BPPARAM = bp) -fds <- getSplitReadCountsForAllSamples(fds = fds, recount = FALSE) +# 5. Extract and Annotate Junctions +# This identifies which junctions exist across the whole cohort +split_counts_se <- asSE(fds, type = "j") +split_ranges <- rowRanges(split_counts_se) -#Prepare ranges for non-split counting - -split_counts <- asSE(fds, type = "j") -split_count_ranges <- rowRanges(split_counts) -split_count_ranges <- FRASER:::annotateSpliceSite(split_count_ranges) -saveRDS(split_count_ranges, "g_ranges_split_counts.RDS") - -#Filter for expression to reduce non-split counting overhead +# Explicitly annotate splice sites to get donor/acceptor positions +message("Annotating splice sites...") +split_ranges <- FRASER:::annotateSpliceSite(split_ranges) +saveRDS(split_ranges, "g_ranges_split_counts.RDS") +# 6. Filtering for Non-Split Counting (Optimization) +# We only want to count non-split reads for junctions that actually show up +# in our data to save massive amounts of compute time. 
+message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 -max_count <- rowMaxs(assay(split_counts, "rawCountsJ")) +# Use assay(..., "rawCountsJ") to get the counts for junctions +raw_counts <- assay(split_counts_se, "rawCountsJ") +max_count <- rowMaxs(raw_counts) passed <- max_count >= minExpressionInOneSample -filtered_ranges <- split_count_ranges[passed, ] + +filtered_ranges <- split_ranges[passed, ] saveRDS(filtered_ranges, "g_ranges_non_split_counts.RDS") +# 7. Extract Splice Site Coordinates +# These are the specific base-pair positions the next jobs will query in the BAMs +message("Extracting splice site coordinates...") splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) saveRDS(splice_site_coords, "splice_site_coords.RDS") +# 8. Save the updated FDS saveFraserDataSet(fds) +message("Merge split complete.") \ No newline at end of file From 14c1f66909754efc1e76a4a5c5f111d7ec5a2728 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 19 Jan 2026 17:50:01 +1100 Subject: [PATCH 11/89] end of file linting --- images/fraser/fraser_analysis.R | 2 +- images/fraser/fraser_count_non_split.R | 2 +- images/fraser/fraser_count_split.R | 2 +- images/fraser/fraser_init.R | 2 +- images/fraser/fraser_merge_non_split.R | 2 +- images/fraser/fraser_merge_split.R | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 54818e44..0ff7e409 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -86,4 +86,4 @@ cat("Total Samples:", length(sampleIDs(fds)), "\n") cat("Significant Events (adj P <", args$pval_cutoff, "):", nrow(res), "\n") sink() -message("Analysis complete.") \ No newline at end of file +message("Analysis complete.") diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index d272c9cc..486b379a 100644 --- a/images/fraser/fraser_count_non_split.R +++ 
b/images/fraser/fraser_count_non_split.R @@ -58,4 +58,4 @@ if (length(out_file) > 0) { message("Successfully created non-split counts: ", out_file[1]) } else { stop("Non-split counts file was not found in: ", cache_dir) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 510a4a97..7ba2cb2c 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -51,4 +51,4 @@ if (file.exists(expected_out)) { message("Successfully created split counts at: ", expected_out) } else { stop("Split counts file was not found at expected location: ", expected_out) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 5deaabff..6c67b7fd 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -49,4 +49,4 @@ if (file.exists(fds_save_path)) { message("Successfully initialized FDS skeleton at: ", fds_save_path) } else { stop("FDS object was not saved to: ", fds_save_path) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 78c3ed3b..1dd0163f 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -54,4 +54,4 @@ fds <- calculatePSIValues(fds, BPPARAM = bp) message("Saving final merged FraserDataSet...") fds <- saveFraserDataSet(fds) -message("Merge non-split and PSI calculation complete.") \ No newline at end of file +message("Merge non-split and PSI calculation complete.") diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 7053827e..973b2841 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -63,4 +63,4 @@ saveRDS(splice_site_coords, "splice_site_coords.RDS") # 8. 
Save the updated FDS saveFraserDataSet(fds) -message("Merge split complete.") \ No newline at end of file +message("Merge split complete.") From 899ec5214f4150e539e753d2eefe57ccd467088e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 19 Jan 2026 18:36:59 +1100 Subject: [PATCH 12/89] argparse is necessary for image --- images/fraser/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 49e96a56..6aedc2b7 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -15,6 +15,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ r-base=4.4.3 \ r-reshape2=1.4.5 \ r-tidyverse=2.0.0 \ + r-argparse=2.2.3 \ bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ From f71ba087dc02f4c2c2e55ac82deb33ff55e74095 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 20 Jan 2026 23:09:45 +1100 Subject: [PATCH 13/89] cannot calc before count --- images/fraser/fraser_init.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 6c67b7fd..a0aea730 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -34,7 +34,7 @@ register(bp) # We calculate initial metadata/PSI skeleton -fds <- calculatePSIValues(fds, BPPARAM = bp) + fds <- saveFraserDataSet(fds) #Print location for Python to capture if needed From eea8c9be0497086691b9e57b717198549d100b30 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 10:14:41 +1100 Subject: [PATCH 14/89] bamData is deprecated --- images/fraser/fraser_count_split.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 7ba2cb2c..cf4cbe09 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -29,11 +29,16 @@ register(MulticoreParam(workers 
= args$nthreads)) # Note: name must match what was used in fraser_init.R fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) -# 4. Update the BAM path for this specific worker environment -# This ensures that even if the BAM was at a different path during Init, -# the worker uses the current localized path. -bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path +# 4. Update Metadata and Strand +# Important: Ensure the strand matches your actual lab protocol +strandSpecific(fds) <- 0 +# Use colData to update the localized BAM path +if (args$sample_id %in% colData(fds)$sampleID) { + colData(fds)[colData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path +} else { + stop("Sample ID not found in fds object.") +} # 5. Run counting for the specific sample # countRNAData with sampleId filter is the standard way to run parallel counts fds <- countRNAData( From 90bd9c4132b7b0d744bd8ea8148b04eabbf7aff0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 10:33:34 +1100 Subject: [PATCH 15/89] parallelizing a single sample is confusing --- images/fraser/fraser_count_split.R | 33 ++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index cf4cbe09..06fc4ec0 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -23,7 +23,9 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -register(MulticoreParam(workers = args$nthreads)) +# Use MulticoreParam for Linux environments +bpparam <- MulticoreParam(workers = args$nthreads) +register(bpparam) # 3. 
Load the dataset # Note: name must match what was used in fraser_init.R @@ -33,19 +35,23 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) # Important: Ensure the strand matches your actual lab protocol strandSpecific(fds) <- 0 -# Use colData to update the localized BAM path -if (args$sample_id %in% colData(fds)$sampleID) { - colData(fds)[colData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path -} else { - stop("Sample ID not found in fds object.") +# Subset the FDS object to ONLY this sample. +# This prevents the worker from erroring out when looking for other samples' BAMs. +if (!(args$sample_id %in% colData(fds)$sampleID)) { + stop(paste("Sample", args$sample_id, "not found in the provided FDS object.")) } -# 5. Run counting for the specific sample -# countRNAData with sampleId filter is the standard way to run parallel counts +fds <- fds[, fds$sampleID == args$sample_id] + +# Update the localized BAM path in colData +colData(fds)$bamFile <- args$bam_path + +# 5. Run counting +# We pass BPPARAM explicitly to ensure the worker uses the allocated threads fds <- countRNAData( fds, sampleIds = args$sample_id, - NcpuPerSample = args$nthreads, - recount = TRUE + recount = TRUE, + BPPARAM = bpparam ) # 6. Verification @@ -55,5 +61,10 @@ expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCo if (file.exists(expected_out)) { message("Successfully created split counts at: ", expected_out) } else { - stop("Split counts file was not found at expected location: ", expected_out) + # List files in the directory to help debug if it fails + found_files <- list.files(file.path(args$work_dir, "cache", "splitCounts")) + stop(paste0( + "Split counts file not found. 
Expected: ", expected_out, + "\nFound in cache: ", paste(found_files, collapse=", ") + )) } From bc44e653d0220a324c7138feee8bde4803a19ec0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 11:14:07 +1100 Subject: [PATCH 16/89] parallel limit --- images/fraser/fraser_count_split.R | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 06fc4ec0..283fad74 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -1,5 +1,9 @@ #!/usr/bin/env Rscript +# Set memory limit at the environment level before loading heavy libraries +# 11GB limit +Sys.setenv("R_MAX_VSIZE" = "14Gb") + library(argparse) library(FRASER) library(BiocParallel) @@ -23,8 +27,13 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -# Use MulticoreParam for Linux environments -bpparam <- MulticoreParam(workers = args$nthreads) + +# Configure MulticoreParam with the 11GB limit in mind +# Note: nthreads shares the 11GB pool +bpparam <- MulticoreParam( + workers = args$nthreads, + stop.on.error = TRUE +) register(bpparam) # 3. 
Load the dataset From b4fcd7ab216fbb844191d44917fa37f6a9315824 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 11:35:37 +1100 Subject: [PATCH 17/89] parallel limit --- images/fraser/fraser_count_split.R | 51 ++++++++++++++---------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 283fad74..2955e946 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -22,40 +22,40 @@ args <- parser$parse_args() fds_dir_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_dir_name) dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) +options("FRASER.maxJunctionsNoHDF5" = 0) # Changed from -1 to 0 to be safer -# Configure MulticoreParam with the 11GB limit in mind -# Note: nthreads shares the 11GB pool -bpparam <- MulticoreParam( - workers = args$nthreads, - stop.on.error = TRUE -) -register(bpparam) - -# 3. Load the dataset -# Note: name must match what was used in fraser_init.R +# 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) # 4. Update Metadata and Strand # Important: Ensure the strand matches your actual lab protocol strandSpecific(fds) <- 0 -# Subset the FDS object to ONLY this sample. -# This prevents the worker from erroring out when looking for other samples' BAMs. -if (!(args$sample_id %in% colData(fds)$sampleID)) { - stop(paste("Sample", args$sample_id, "not found in the provided FDS object.")) -} +# SUBSET FIRST: This is the most critical memory-saving step. +# By subsetting here, we drop the metadata of all other samples. 
fds <- fds[, fds$sampleID == args$sample_id] - -# Update the localized BAM path in colData colData(fds)$bamFile <- args$bam_path -# 5. Run counting -# We pass BPPARAM explicitly to ensure the worker uses the allocated threads +# Force Garbage Collection to reclaim memory from the full cohort load +gc() + +# 4. Parallel vs Serial Logic +# If you are hitting 11GB limits, 'MulticoreParam' is risky because it forks the process. +# If nthreads > 1, we use a very conservative setup. +if(args$nthreads > 1) { + # Using 'tasks' helps throttle the memory spikes + bpparam <- MulticoreParam(workers = args$nthreads, tasks = args$nthreads) +} else { + bpparam <- SerialParam() +} + +# 5. Run counting with minimal overhead +# We use recount=FALSE if possible, but keeping TRUE as per your requirement. +# junctionId is set to NULL to ensure a fresh scan of this specific BAM. fds <- countRNAData( fds, sampleIds = args$sample_id, @@ -68,12 +68,7 @@ fds <- countRNAData( expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) if (file.exists(expected_out)) { - message("Successfully created split counts at: ", expected_out) + message("Success: ", expected_out) } else { - # List files in the directory to help debug if it fails - found_files <- list.files(file.path(args$work_dir, "cache", "splitCounts")) - stop(paste0( - "Split counts file not found. 
Expected: ", expected_out, - "\nFound in cache: ", paste(found_files, collapse=", ") - )) + stop("Counting failed to produce output.") } From 11d35a93286f43e30609492d96db672884904280 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 11:55:29 +1100 Subject: [PATCH 18/89] no parallel --- images/fraser/fraser_count_split.R | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index 2955e946..ddfc6a36 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -43,16 +43,6 @@ colData(fds)$bamFile <- args$bam_path # Force Garbage Collection to reclaim memory from the full cohort load gc() -# 4. Parallel vs Serial Logic -# If you are hitting 11GB limits, 'MulticoreParam' is risky because it forks the process. -# If nthreads > 1, we use a very conservative setup. -if(args$nthreads > 1) { - # Using 'tasks' helps throttle the memory spikes - bpparam <- MulticoreParam(workers = args$nthreads, tasks = args$nthreads) -} else { - bpparam <- SerialParam() -} - # 5. Run counting with minimal overhead # We use recount=FALSE if possible, but keeping TRUE as per your requirement. # junctionId is set to NULL to ensure a fresh scan of this specific BAM. @@ -60,7 +50,6 @@ fds <- countRNAData( fds, sampleIds = args$sample_id, recount = TRUE, - BPPARAM = bpparam ) # 6. Verification From afb514c649f2b2712132808ee17a980f5cdcf7b0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 12:17:05 +1100 Subject: [PATCH 19/89] force serial --- images/fraser/fraser_count_split.R | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ddfc6a36..ec80204b 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -26,7 +26,7 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. 
Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = 0) # Changed from -1 to 0 to be safer +options("FRASER.maxJunctionsNoHDF5" = 0) # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) @@ -43,13 +43,27 @@ colData(fds)$bamFile <- args$bam_path # Force Garbage Collection to reclaim memory from the full cohort load gc() -# 5. Run counting with minimal overhead -# We use recount=FALSE if possible, but keeping TRUE as per your requirement. -# junctionId is set to NULL to ensure a fresh scan of this specific BAM. +# 4. Configure Conservative Parallelism +# MulticoreParam (forking) is likely causing the 11GB overflow. +# SerialParam is the safest for memory-constrained environments. +if(args$nthreads > 1){ + # If you MUST use multiple threads, SnowParam is often more memory-stable + # than MulticoreParam on some systems, but SerialParam is the safest. + bpparam <- SerialParam() + message("Note: Defaulting to SerialParam to stay within 14GB limit.") +} else { + bpparam <- SerialParam() +} + +# 5. Run counting +# We use NcpuPerSample = 1 to prevent internal sub-parallelization +# which often bypasses the BPPARAM settings. fds <- countRNAData( fds, sampleIds = args$sample_id, recount = TRUE, + NcpuPerSample = 1, + BPPARAM = bpparam ) # 6. Verification @@ -57,7 +71,7 @@ fds <- countRNAData( expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) if (file.exists(expected_out)) { - message("Success: ", expected_out) + message("Success: Created split counts at ", expected_out) } else { - stop("Counting failed to produce output.") + stop("Counting failed. 
Check if BAM index is valid and accessible.") } From f8587a79c4cdbf717d91f34afd24bf6228e219ee Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 12:43:18 +1100 Subject: [PATCH 20/89] making it even leaner --- images/fraser/fraser_count_split.R | 33 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ec80204b..ff2a9086 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -27,6 +27,7 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = 0) +bpparam <- SerialParam() # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) @@ -40,30 +41,28 @@ strandSpecific(fds) <- 0 fds <- fds[, fds$sampleID == args$sample_id] colData(fds)$bamFile <- args$bam_path -# Force Garbage Collection to reclaim memory from the full cohort load +# 4. PRE-SCAN DIAGNOSTIC +# We check the first 1 million reads to estimate junction density +message("--- Memory Diagnostic: Pre-scanning BAM for junction density ---") +sample_bam <- BamFile(args$bam_path, yieldSize = 1000000) +tmp_juncs <- summarizeJunctions(sample_bam, genome = NULL) +message(paste("Unique junctions found in first 1M reads:", length(tmp_juncs))) +rm(tmp_juncs, sample_bam) gc() -# 4. Configure Conservative Parallelism -# MulticoreParam (forking) is likely causing the 11GB overflow. -# SerialParam is the safest for memory-constrained environments. -if(args$nthreads > 1){ - # If you MUST use multiple threads, SnowParam is often more memory-stable - # than MulticoreParam on some systems, but SerialParam is the safest. - bpparam <- SerialParam() - message("Note: Defaulting to SerialParam to stay within 14GB limit.") -} else { - bpparam <- SerialParam() -} - -# 5. 
Run counting -# We use NcpuPerSample = 1 to prevent internal sub-parallelization -# which often bypasses the BPPARAM settings. +# 5. Run Counting with strict filters +# minCount = 2 ignores junctions with only 1 supporting read (saves massive RAM) +message(paste("Starting countRNAData for sample:", args$sample_id)) fds <- countRNAData( fds, sampleIds = args$sample_id, recount = TRUE, NcpuPerSample = 1, - BPPARAM = bpparam + minCount = 2, + BPPARAM = bpparam, + param = ScanBamParam( + flag = scanBamFlag(isDuplicate = FALSE, isSecondaryAlignment = FALSE) + ) ) # 6. Verification From 885788d4fa42ebac1bc5bed26989f4d41f559ab2 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 12:58:11 +1100 Subject: [PATCH 21/89] fix --- images/fraser/fraser_count_split.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ff2a9086..a14f4b72 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -45,9 +45,6 @@ colData(fds)$bamFile <- args$bam_path # We check the first 1 million reads to estimate junction density message("--- Memory Diagnostic: Pre-scanning BAM for junction density ---") sample_bam <- BamFile(args$bam_path, yieldSize = 1000000) -tmp_juncs <- summarizeJunctions(sample_bam, genome = NULL) -message(paste("Unique junctions found in first 1M reads:", length(tmp_juncs))) -rm(tmp_juncs, sample_bam) gc() # 5. 
Run Counting with strict filters From 83d0dd9c65aa3cce2b40e855af4f4014e163580c Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 21 Jan 2026 14:58:47 +1100 Subject: [PATCH 22/89] fix --- images/fraser/fraser_count_non_split.R | 22 ++++++++-------------- images/fraser/fraser_count_split.R | 21 +++------------------ 2 files changed, 11 insertions(+), 32 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 486b379a..4ef11cae 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -27,28 +27,22 @@ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -register(MulticoreParam(workers = args$nthreads)) +bpparam <- SerialParam() # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -splice_site_coords <- readRDS(args$coords_path) +strandSpecific(fds) <- 0 -# 4. Synchronize BAM Path -# Ensure the FDS object points to the current local BAM location on the worker -bamData(fds)[bamData(fds)$sampleID == args$sample_id, "bamFile"] <- args$bam_path + + +fds <- fds[, fds$sampleID == args$sample_id] +colData(fds)$bamFile <- args$bam_path # 5. Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above message(paste("Counting non-split reads for sample:", args$sample_id)) -countNonSplicedReads( - sampleID = args$sample_id, - fds = fds, - spliceSiteCoords = splice_site_coords, - splitCountRanges = NULL, # Already handled in merge_split - minAnchor = 5, - NcpuPerSample = args$nthreads, - recount = TRUE -) +getNonSplitReadCountsForAllSamples(fds, + recount = TRUE) # 6. Verification # The Python script uses a 'find' command to move this output. 
diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index a14f4b72..ce1f6613 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -32,8 +32,6 @@ bpparam <- SerialParam() # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) -# 4. Update Metadata and Strand -# Important: Ensure the strand matches your actual lab protocol strandSpecific(fds) <- 0 # SUBSET FIRST: This is the most critical memory-saving step. @@ -41,26 +39,13 @@ strandSpecific(fds) <- 0 fds <- fds[, fds$sampleID == args$sample_id] colData(fds)$bamFile <- args$bam_path -# 4. PRE-SCAN DIAGNOSTIC -# We check the first 1 million reads to estimate junction density -message("--- Memory Diagnostic: Pre-scanning BAM for junction density ---") -sample_bam <- BamFile(args$bam_path, yieldSize = 1000000) -gc() # 5. Run Counting with strict filters # minCount = 2 ignores junctions with only 1 supporting read (saves massive RAM) message(paste("Starting countRNAData for sample:", args$sample_id)) -fds <- countRNAData( - fds, - sampleIds = args$sample_id, - recount = TRUE, - NcpuPerSample = 1, - minCount = 2, - BPPARAM = bpparam, - param = ScanBamParam( - flag = scanBamFlag(isDuplicate = FALSE, isSecondaryAlignment = FALSE) - ) -) + +fds <- getSplitReadCountsForAllSamples(fds, + recount = TRUE) # 6. 
Verification # FRASER writes split counts to: {work_dir}/cache/splitCounts/splitCounts-{sample_id}.RDS From d01466b1607b943ce1dd6d29b3990722393a421b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 09:17:57 +1100 Subject: [PATCH 23/89] adding necessary R libraries --- images/fraser/Dockerfile | 3 +++ images/fraser/fraser_merge_split.R | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 6aedc2b7..b1c454ac 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -16,9 +16,12 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ r-reshape2=1.4.5 \ r-tidyverse=2.0.0 \ r-argparse=2.2.3 \ + r-biocmanager \ bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ + # Install summarizedRT via BiocManager + micromamba run -n base Rscript -e 'BiocManager::install("summarizedRT", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 973b2841..cc566029 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -30,7 +30,7 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") -fds <- countRNAData(fds, recount = FALSE, BPPARAM = bp) +fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) # 5. 
Extract and Annotate Junctions # This identifies which junctions exist across the whole cohort From ee25ec0ab605422343a6687ed1063a5c53ca9622 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 09:24:53 +1100 Subject: [PATCH 24/89] adding coordinate checking precaution --- images/fraser/fraser_count_non_split.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 4ef11cae..4077ebe8 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -31,13 +31,21 @@ bpparam <- SerialParam() # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -strandSpecific(fds) <- 0 - - +filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS +# 4. Critical: Subset and Inject +# First, subset to the specific sample fds <- fds[, fds$sampleID == args$sample_id] colData(fds)$bamFile <- args$bam_path +# Replace the internal splice site ranges with your filtered ones +# This forces FRASER to only query these specific BP positions in the BAM +nonDegenerated <- !duplicated(filtered_coords) +mcols(fds)$spliceSiteCoords <- filtered_coords[nonDegenerated] + +# Set strand specificity (Ensure 0 is correct for your library!) +strandSpecific(fds) <- 0 + # 5. 
Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above message(paste("Counting non-split reads for sample:", args$sample_id)) From c7a8819a555e68e17f86cdb8f95d7fda199b1803 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 10:03:39 +1100 Subject: [PATCH 25/89] library has been renamed --- images/fraser/Dockerfile | 2 +- images/fraser/fraser_merge_split.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index b1c454ac..11b5508b 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -21,7 +21,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ # Install summarizedRT via BiocManager - micromamba run -n base Rscript -e 'BiocManager::install("summarizedRT", update=FALSE, ask=FALSE)' && \ + micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index cc566029..1940770a 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -3,7 +3,7 @@ library(argparse) library(FRASER) library(BiocParallel) -library(summarizedRT) # Useful for rowMaxs/assay handling +library(SummarizedExperiment) # Useful for rowMaxs/assay handling parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") From efd22a8474daefef46ef9eb656abf3b485121260 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 12:24:52 +1100 Subject: [PATCH 26/89] hail file caching fix --- images/fraser/fraser_count_split.R | 44 ++++++++++++++++++++---------- images/fraser/fraser_init.R | 12 ++++---- images/fraser/fraser_merge_split.R | 12 
+++++++- 3 files changed, 47 insertions(+), 21 deletions(-) diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index ce1f6613..b3938303 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -1,8 +1,7 @@ #!/usr/bin/env Rscript -# Set memory limit at the environment level before loading heavy libraries -# 11GB limit -Sys.setenv("R_MAX_VSIZE" = "14Gb") +# Set memory limit - increased slightly to allow for HDF5 overhead +Sys.setenv("R_MAX_VSIZE" = "16Gb") library(argparse) library(FRASER) @@ -26,8 +25,13 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Force HDF5 and Configure Parallelism options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = 0) -bpparam <- SerialParam() +options("FRASER.maxJunctionsNoHDF5" = -1) +# Use MulticoreParam if threads > 1, else Serial +if(args$nthreads > 1){ + bpparam <- MulticoreParam(workers = args$nthreads) +} else { + bpparam <- SerialParam() +} # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) @@ -37,22 +41,34 @@ strandSpecific(fds) <- 0 # SUBSET FIRST: This is the most critical memory-saving step. # By subsetting here, we drop the metadata of all other samples. fds <- fds[, fds$sampleID == args$sample_id] -colData(fds)$bamFile <- args$bam_path +# Validate the BAM path - Ensure the R script sees what Hail localized +if(!file.exists(args$bam_path)){ + stop(paste("BAM file not found at:", args$bam_path)) +} +colData(fds)$bamFile <- args$bam_path -# 5. Run Counting with strict filters -# minCount = 2 ignores junctions with only 1 supporting read (saves massive RAM) -message(paste("Starting countRNAData for sample:", args$sample_id)) +# 4. 
Count Split Reads +message(paste("Starting split read counting for sample:", args$sample_id)) -fds <- getSplitReadCountsForAllSamples(fds, - recount = TRUE) +# In FRASER 2.0, we use getSplitReadCountsForAllSamples with recount=TRUE. +# This writes an RDS file to the cache which we will harvest in the merge step. +fds <- getSplitReadCountsForAllSamples( + fds, + recount = TRUE, + BPPARAM = bpparam +) -# 6. Verification -# FRASER writes split counts to: {work_dir}/cache/splitCounts/splitCounts-{sample_id}.RDS +# 5. Verification +# FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS")) if (file.exists(expected_out)) { message("Success: Created split counts at ", expected_out) } else { - stop("Counting failed. Check if BAM index is valid and accessible.") + # Check for common Bioinformatic failures + if(!file.exists(paste0(args$bam_path, ".bai"))){ + stop("BAM Index (.bai) missing. FRASER cannot perform random access counting.") + } + stop("Counting failed. The RDS file was not generated in the cache.") } diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index a0aea730..f31c5fc1 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -9,14 +9,12 @@ parser$add_argument("--work_dir", default = "/io/work", help = "Working director parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -# 1. CSV Injection: Read the mapping file instead of parsing strings -# Expected columns: sample_id, bam_path -sample_map <- read.csv(args$sample_map) +# 1. 
Read the static mapping created by the Python job +sample_map <- read.csv(args$sample_map, stringsAsFactors = FALSE) sample_table <- DataFrame( sampleID = as.character(sample_map$sample_id), bamFile = as.character(sample_map$bam_path), - group = "cohort", pairedEnd = TRUE ) options("FRASER.maxSamplesNoHDF5" = 0) @@ -25,7 +23,7 @@ options("FRASER.maxJunctionsNoHDF5" = -1) fds <- FraserDataSet( colData = sample_table, workingDir = args$work_dir, - name = paste0("FRASER_", args$cohort_id) + name = fds_name ) #Setup parallel execution @@ -41,12 +39,14 @@ fds <- saveFraserDataSet(fds) fds_save_path <- file.path( args$work_dir, "savedObjects", - paste0("FRASER_", args$cohort_id), + fds_name, "fds-object.RDS" ) if (file.exists(fds_save_path)) { message("Successfully initialized FDS skeleton at: ", fds_save_path) + # Output path for Hail logs + cat(fds_save_path, "\n") } else { stop("FDS object was not saved to: ", fds_save_path) } diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 1940770a..74ad0776 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -16,7 +16,7 @@ args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) # 2. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) @@ -27,6 +27,16 @@ register(bp) # 3. Load Dataset fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# --- VULNERABILITY FIX: HAIL LOCALIZATION --- +# Re-assert the dummy BAM path to prevent seqlevelsStyle crashes +# during junction metadata extraction. 
+dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) +dummy_bam <- "/io/batch/input_bams/dummy.bam" +if(!file.exists(dummy_bam)) file.create(dummy_bam) +colData(fds)$bamFile <- dummy_bam +strandSpecific(fds) <- 0 +# -------------------------------------------- + # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") From c60b5791fbfff1a682532f58c1009afbefa181bd Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 14:50:46 +1100 Subject: [PATCH 27/89] whoops --- images/fraser/fraser_init.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index f31c5fc1..03be3797 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -11,6 +11,9 @@ args <- parser$parse_args() # 1. Read the static mapping created by the Python job sample_map <- read.csv(args$sample_map, stringsAsFactors = FALSE) +# Define the dataset name consistently with the Python expectations +fds_name <- paste0("FRASER_", args$cohort_id) + sample_table <- DataFrame( sampleID = as.character(sample_map$sample_id), From a0984368832d7a8c28b0edc6b01eba4612f5af88 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 16:51:14 +1100 Subject: [PATCH 28/89] fraser_merge_split.R fix --- images/fraser/fraser_merge_split.R | 37 +++++++++++++++++++----------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 74ad0776..22f1c742 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -21,19 +21,30 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) + + bp <- MulticoreParam(workers = args$nthreads) register(bp) # 3. 
Load Dataset +# Match the name exactly as defined in the init stage +fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# --- VULNERABILITY FIX: HAIL LOCALIZATION --- -# Re-assert the dummy BAM path to prevent seqlevelsStyle crashes -# during junction metadata extraction. -dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) -dummy_bam <- "/io/batch/input_bams/dummy.bam" -if(!file.exists(dummy_bam)) file.create(dummy_bam) -colData(fds)$bamFile <- dummy_bam +# --- VULNERABILITY FIX: BAM Metadata Extraction --- +# Instead of a dummy file, point metadata queries to an existing BAM +# to ensure chromosome naming (seqlevelsStyle) can be validated. +# We assume the symlinks were created in /io/batch/input_bams/ +available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) +if(length(available_bams) > 0){ + colData(fds)$bamFile <- available_bams[1] +} else { + # Fallback if no bams localized (though this shouldn't happen) + dummy_bam <- "/io/batch/input_bams/dummy.bam" + dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) + if(!file.exists(dummy_bam)) file.create(dummy_bam) + colData(fds)$bamFile <- dummy_bam +} strandSpecific(fds) <- 0 # -------------------------------------------- @@ -50,26 +61,24 @@ split_ranges <- rowRanges(split_counts_se) # Explicitly annotate splice sites to get donor/acceptor positions message("Annotating splice sites...") split_ranges <- FRASER:::annotateSpliceSite(split_ranges) -saveRDS(split_ranges, "g_ranges_split_counts.RDS") # 6. Filtering for Non-Split Counting (Optimization) -# We only want to count non-split reads for junctions that actually show up -# in our data to save massive amounts of compute time. 
message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 -# Use assay(..., "rawCountsJ") to get the counts for junctions raw_counts <- assay(split_counts_se, "rawCountsJ") max_count <- rowMaxs(raw_counts) passed <- max_count >= minExpressionInOneSample filtered_ranges <- split_ranges[passed, ] -saveRDS(filtered_ranges, "g_ranges_non_split_counts.RDS") # 7. Extract Splice Site Coordinates -# These are the specific base-pair positions the next jobs will query in the BAMs message("Extracting splice site coordinates...") splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) -saveRDS(splice_site_coords, "splice_site_coords.RDS") + +# Use absolute paths for saving to match Python 'mv' commands +saveRDS(split_ranges, file.path(args$work_dir, "g_ranges_split_counts.RDS")) +saveRDS(filtered_ranges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) +saveRDS(splice_site_coords, file.path(args$work_dir, "splice_site_coords.RDS")) # 8. 
Save the updated FDS saveFraserDataSet(fds) From c2910ed06a45e76bf53e6ab64322fc9592c15d71 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 17:11:22 +1100 Subject: [PATCH 29/89] fraser_merge_split.R fix --- images/fraser/Dockerfile | 1 + images/fraser/fraser_merge_split.R | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 11b5508b..5d26c50f 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -22,6 +22,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-org.hs.eg.db=3.20.0 && \ # Install summarizedRT via BiocManager micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ + micromamba run -n base Rscript -e 'BiocManager::install("Rsamtools", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 22f1c742..cfa931ed 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -3,7 +3,8 @@ library(argparse) library(FRASER) library(BiocParallel) -library(SummarizedExperiment) # Useful for rowMaxs/assay handling +library(SummarizedExperiment) +library(Rsamtools) # Added for BamFile validation parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") @@ -37,13 +38,19 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) # We assume the symlinks were created in /io/batch/input_bams/ available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ - colData(fds)$bamFile <- available_bams[1] + message("Using reference BAM for metadata: ", available_bams[1]) + + # Force-update colData for ALL samples to point to a valid file. 
+ # This prevents the 'dummy.bam' error during internal validation. + new_colData <- colData(fds) + new_colData$bamFile <- available_bams[1] + colData(fds) <- new_colData + + # Manually assert the seqlevelsStyle (e.g., 'UCSC' or 'Ensembl') + # This satisfies the internal check that is currently crashing. + seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) } else { - # Fallback if no bams localized (though this shouldn't happen) - dummy_bam <- "/io/batch/input_bams/dummy.bam" - dir.create("/io/batch/input_bams", recursive = TRUE, showWarnings = FALSE) - if(!file.exists(dummy_bam)) file.create(dummy_bam) - colData(fds)$bamFile <- dummy_bam + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. The merge cannot proceed without at least one valid BAM header for seqlevelsStyle validation.") } strandSpecific(fds) <- 0 # -------------------------------------------- From 62e2fa2608a1783f7dc63491065605ac07d22126 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 22 Jan 2026 18:01:16 +1100 Subject: [PATCH 30/89] fraser_merge_split.R fix --- images/fraser/fraser_merge_split.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index cfa931ed..10095fc5 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -59,10 +59,12 @@ strandSpecific(fds) <- 0 # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) +split_counts_se <- SummarizedExperiment( + assays = list(rawCountsJ = K(fds, type="j")), + rowRanges = rowRanges(fds, type="j"), + colData = colData(fds) +) -# 5. 
Extract and Annotate Junctions -# This identifies which junctions exist across the whole cohort -split_counts_se <- asSE(fds, type = "j") split_ranges <- rowRanges(split_counts_se) # Explicitly annotate splice sites to get donor/acceptor positions @@ -73,8 +75,8 @@ split_ranges <- FRASER:::annotateSpliceSite(split_ranges) message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 raw_counts <- assay(split_counts_se, "rawCountsJ") -max_count <- rowMaxs(raw_counts) -passed <- max_count >= minExpressionInOneSample +max_count <- rowMaxs(raw_counts) +passed <- max_count >= minExpressionInOneSample filtered_ranges <- split_ranges[passed, ] From 999f930effd43898dd019c4226700add63c9eccd Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 00:29:40 +1100 Subject: [PATCH 31/89] fraser_merge_split.R fix --- images/fraser/fraser_merge_split.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 10095fc5..fc9871f9 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -4,7 +4,8 @@ library(argparse) library(FRASER) library(BiocParallel) library(SummarizedExperiment) -library(Rsamtools) # Added for BamFile validation +library(Rsamtools) +library(DelayedMatrixStats) # Crucial for rowMaxs on HDF5 data parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") @@ -32,11 +33,9 @@ register(bp) fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# --- VULNERABILITY FIX: BAM Metadata Extraction --- -# Instead of a dummy file, point metadata queries to an existing BAM -# to ensure chromosome naming (seqlevelsStyle) can be validated. 
-# We assume the symlinks were created in /io/batch/input_bams/ +# --- NUCLEAR FIX: Force Valid Metadata --- available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) + if(length(available_bams) > 0){ message("Using reference BAM for metadata: ", available_bams[1]) @@ -50,7 +49,7 @@ if(length(available_bams) > 0){ # This satisfies the internal check that is currently crashing. seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) } else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. The merge cannot proceed without at least one valid BAM header for seqlevelsStyle validation.") + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } strandSpecific(fds) <- 0 # -------------------------------------------- @@ -60,20 +59,21 @@ strandSpecific(fds) <- 0 message("Merging split counts from cache...") fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) split_counts_se <- SummarizedExperiment( - assays = list(rawCountsJ = K(fds, type="j")), + assays = list(rawCountsJ = K(fds, type="j")), rowRanges = rowRanges(fds, type="j"), colData = colData(fds) ) split_ranges <- rowRanges(split_counts_se) -# Explicitly annotate splice sites to get donor/acceptor positions message("Annotating splice sites...") split_ranges <- FRASER:::annotateSpliceSite(split_ranges) # 6. 
Filtering for Non-Split Counting (Optimization) message("Filtering ranges for non-split counting...") minExpressionInOneSample <- 20 + +# Use assay() and rowMaxs from DelayedMatrixStats for HDF5 compatibility raw_counts <- assay(split_counts_se, "rawCountsJ") max_count <- rowMaxs(raw_counts) passed <- max_count >= minExpressionInOneSample From ade8becbe329efcccd1ac56982299b519ac1a39d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 11:57:51 +1100 Subject: [PATCH 32/89] good mornin, lets get crackin --- images/fraser/fraser_merge_split.R | 51 +++++++++++------------------- 1 file changed, 19 insertions(+), 32 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index fc9871f9..9a2a822a 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -5,7 +5,8 @@ library(FRASER) library(BiocParallel) library(SummarizedExperiment) library(Rsamtools) -library(DelayedMatrixStats) # Crucial for rowMaxs on HDF5 data +library(DelayedMatrixStats) + parser <- ArgumentParser(description = "Merge Split Read Counts") parser$add_argument("--fds_path", required = TRUE, help = "Path to FDS RDS file") @@ -33,7 +34,7 @@ register(bp) fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# --- NUCLEAR FIX: Force Valid Metadata --- + available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ @@ -53,42 +54,28 @@ if(length(available_bams) > 0){ } strandSpecific(fds) <- 0 # -------------------------------------------- - +minExpressionInOneSample <- 20 # 4. 
Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") -fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE, BPPARAM = bp) -split_counts_se <- SummarizedExperiment( - assays = list(rawCountsJ = K(fds, type="j")), - rowRanges = rowRanges(fds, type="j"), - colData = colData(fds) -) - -split_ranges <- rowRanges(split_counts_se) - -message("Annotating splice sites...") -split_ranges <- FRASER:::annotateSpliceSite(split_ranges) - -# 6. Filtering for Non-Split Counting (Optimization) -message("Filtering ranges for non-split counting...") -minExpressionInOneSample <- 20 - -# Use assay() and rowMaxs from DelayedMatrixStats for HDF5 compatibility -raw_counts <- assay(split_counts_se, "rawCountsJ") -max_count <- rowMaxs(raw_counts) -passed <- max_count >= minExpressionInOneSample +splitCounts <- getSplitReadCountsForAllSamples(fds=fds, + recount=FALSE) +splitCountRanges <- rowRanges(splitCounts) +splitCountRangesNonFilt <- FRASER:::annotateSpliceSite(splitCountRanges) -filtered_ranges <- split_ranges[passed, ] +maxCount <- rowMaxs(assay(splitCounts, "rawCountsJ")) +passed <- maxCount >= minExpressionInOneSample +# extract granges after filtering +splitCountRanges <- splitCountRangesNonFilt[passed,] +spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) -# 7. 
Extract Splice Site Coordinates -message("Extracting splice site coordinates...") -splice_site_coords <- FRASER:::extractSpliceSiteCoordinates(filtered_ranges, fds) # Use absolute paths for saving to match Python 'mv' commands -saveRDS(split_ranges, file.path(args$work_dir, "g_ranges_split_counts.RDS")) -saveRDS(filtered_ranges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) -saveRDS(splice_site_coords, file.path(args$work_dir, "splice_site_coords.RDS")) +#This is sslightly confusing, but the filtered granges will be used to annotate non_split counts +saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) +saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) +saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) -# 8. Save the updated FDS -saveFraserDataSet(fds) +# 8. Save the updated FDS note that the dummy bams were used here so we dont have to localised the bam paths every time +saveFraserDataSet(splitCounts) message("Merge split complete.") From c094b082bd29affe921b789766018383474e9456 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 12:31:06 +1100 Subject: [PATCH 33/89] this should be it --- images/fraser/fraser_merge_split.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 9a2a822a..4eb6a8e4 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -76,6 +76,3 @@ saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_ saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) -# 8. 
Save the updated FDS note that the dummy bams were used here so we dont have to localised the bam paths every time -saveFraserDataSet(splitCounts) -message("Merge split complete.") From ee3b46c6200f45a4698a2bd350223c6ae0f02642 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 13:02:32 +1100 Subject: [PATCH 34/89] count non_split --- images/fraser/fraser_count_non_split.R | 48 +++++++++++++++++--------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 4077ebe8..b1582688 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -1,5 +1,8 @@ #!/usr/bin/env Rscript +# Set memory limit - increased slightly to allow for HDF5 overhead +Sys.setenv("R_MAX_VSIZE" = "16Gb") + library(argparse) library(FRASER) library(BiocParallel) @@ -27,23 +30,31 @@ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -bpparam <- SerialParam() +# Use MulticoreParam if threads > 1, else Serial +if(args$nthreads > 1){ + bpparam <- MulticoreParam(workers = args$nthreads) +} else { + bpparam <- SerialParam() +} # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS -# 4. Critical: Subset and Inject -# First, subset to the specific sample -fds <- fds[, fds$sampleID == args$sample_id] -colData(fds)$bamFile <- args$bam_path - -# Replace the internal splice site ranges with your filtered ones -# This forces FRASER to only query these specific BP positions in the BAM +# 4. Inject coordinates FIRST (before subsetting) nonDegenerated <- !duplicated(filtered_coords) mcols(fds)$spliceSiteCoords <- filtered_coords[nonDegenerated] -# Set strand specificity (Ensure 0 is correct for your library!) 
+# THEN subset to the specific sample +fds <- fds[, fds$sampleID == args$sample_id] + +# Validate the BAM path - Ensure the R script sees what Hail localized +if(!file.exists(args$bam_path)){ + stop(paste("BAM file not found at:", args$bam_path)) +} +colData(fds)$bamFile <- args$bam_path + +# Set strand specificity strandSpecific(fds) <- 0 # 5. Run Non-Split Counting @@ -52,12 +63,17 @@ message(paste("Counting non-split reads for sample:", args$sample_id)) getNonSplitReadCountsForAllSamples(fds, recount = TRUE) + # 6. Verification -# The Python script uses a 'find' command to move this output. -# We just need to check if the file was created in the cache. -out_file <- list.files(cache_dir, pattern = paste0("nonSplicedCounts-", args$sample_id)) -if (length(out_file) > 0) { - message("Successfully created non-split counts: ", out_file[1]) +# FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS +expected_out <- file.path(args$work_dir, "cache", "nonSplicedCounts-", paste0("nonSplicedCounts--", args$sample_id, ".RDS")) + +if (file.exists(expected_out)) { + message("Success: Created split counts at ", expected_out) } else { - stop("Non-split counts file was not found in: ", cache_dir) -} + # Check for common Bioinformatic failures + if(!file.exists(paste0(args$bam_path, ".bai"))){ + stop("BAM Index (.bai) missing. FRASER cannot perform random access counting.") + } + stop("Counting failed. 
The RDS file was not generated in the cache.") +} \ No newline at end of file From 9abdee7ba9c81ce407e288274ef07fb15fb09e78 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 13:38:35 +1100 Subject: [PATCH 35/89] Kiss,ONLY VULNERABILITY HERE IS bampaths --- images/fraser/fraser_count_non_split.R | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index b1582688..915614be 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -41,18 +41,6 @@ if(args$nthreads > 1){ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS -# 4. Inject coordinates FIRST (before subsetting) -nonDegenerated <- !duplicated(filtered_coords) -mcols(fds)$spliceSiteCoords <- filtered_coords[nonDegenerated] - -# THEN subset to the specific sample -fds <- fds[, fds$sampleID == args$sample_id] - -# Validate the BAM path - Ensure the R script sees what Hail localized -if(!file.exists(args$bam_path)){ - stop(paste("BAM file not found at:", args$bam_path)) -} -colData(fds)$bamFile <- args$bam_path # Set strand specificity strandSpecific(fds) <- 0 @@ -63,7 +51,13 @@ message(paste("Counting non-split reads for sample:", args$sample_id)) getNonSplitReadCountsForAllSamples(fds, recount = TRUE) - +sample_result <- countNonSplicedReads(args$sample_id, + splitCountRanges = NULL, + fds = fds, + NcpuPerSample = args$nthreads, + minAnchor=5, + recount= TRUE, + spliceSiteCoords=filtered_coords) # 6. 
Verification # FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS expected_out <- file.path(args$work_dir, "cache", "nonSplicedCounts-", paste0("nonSplicedCounts--", args$sample_id, ".RDS")) From b746910c265a3e4d89dd2088cd4278b974a397d1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 13:59:40 +1100 Subject: [PATCH 36/89] I was the true vulnerability here --- images/fraser/fraser_count_non_split.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 915614be..710d93e1 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -48,8 +48,6 @@ strandSpecific(fds) <- 0 # 5. Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above message(paste("Counting non-split reads for sample:", args$sample_id)) -getNonSplitReadCountsForAllSamples(fds, - recount = TRUE) sample_result <- countNonSplicedReads(args$sample_id, splitCountRanges = NULL, From 8be7cff03dea2f5a7849e23d3516f84e8413958e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 23 Jan 2026 14:42:12 +1100 Subject: [PATCH 37/89] the output checking didnt check the write place --- images/fraser/fraser_count_non_split.R | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 710d93e1..8d3b5a1d 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -57,15 +57,25 @@ sample_result <- countNonSplicedReads(args$sample_id, recount= TRUE, spliceSiteCoords=filtered_coords) # 6. 
Verification -# FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS -expected_out <- file.path(args$work_dir, "cache", "nonSplicedCounts-", paste0("nonSplicedCounts--", args$sample_id, ".RDS")) +# Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ +actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) -if (file.exists(expected_out)) { - message("Success: Created split counts at ", expected_out) +# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS +expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) +expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) + +if (file.exists(expected_h5)) { + message("Success: Created non-split counts (HDF5) at ", expected_h5) +} else if (file.exists(expected_rds)) { + message("Success: Created non-split counts (RDS) at ", expected_rds) } else { - # Check for common Bioinformatic failures + # Debugging: List files in the directory to see what actually happened + if(dir.exists(actual_cache_dir)){ + message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) + } + if(!file.exists(paste0(args$bam_path, ".bai"))){ - stop("BAM Index (.bai) missing. FRASER cannot perform random access counting.") + stop("BAM Index (.bai) missing. FRASER cannot perform random access.") } - stop("Counting failed. The RDS file was not generated in the cache.") + stop(paste("Counting failed. 
Output not found for sample:", args$sample_id)) } \ No newline at end of file From a3b158980d96a245f5a671c4b8975e270f675212 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 27 Jan 2026 13:26:12 +1100 Subject: [PATCH 38/89] merge_non_split --- images/fraser/fraser_merge_non_split.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 1dd0163f..058a4dd7 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -26,8 +26,28 @@ register(bp) # 3. Load Dataset # This loads the metadata; the counts will be pulled from the cache symlinked by Python +fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) + +available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) + +if(length(available_bams) > 0){ + message("Using reference BAM for metadata: ", available_bams[1]) + + # Force-update colData for ALL samples to point to a valid file. + # This prevents the 'dummy.bam' error during internal validation. + new_colData <- colData(fds) + new_colData$bamFile <- available_bams[1] + colData(fds) <- new_colData + + # Manually assert the seqlevelsStyle (e.g., 'UCSC' or 'Ensembl') + # This satisfies the internal check that is currently crashing. + seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) +} else { + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") +} +strandSpecific(fds) <- 0 # 4. 
Merge Split Counts # This step updates the fds with the junctions identified in Step 3 message("Merging all split-read counts...") From 1de313541651a70d9df6d1abe2b6df62949cebd4 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 27 Jan 2026 15:51:27 +1100 Subject: [PATCH 39/89] merge_non_split --- images/fraser/fraser_merge_non_split.R | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 058a4dd7..67169243 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -33,19 +33,18 @@ fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ - message("Using reference BAM for metadata: ", available_bams[1]) + # We use the reference BAM to satisfy the internal 'file.exists' checks. + # available_bams[1] should be the 'reference.bam' symlinked by the Python task. + ref_bam <- available_bams[1] + message("Validating metadata against reference BAM: ", ref_bam) - # Force-update colData for ALL samples to point to a valid file. - # This prevents the 'dummy.bam' error during internal validation. - new_colData <- colData(fds) - new_colData$bamFile <- available_bams[1] - colData(fds) <- new_colData + # Update colData paths for all samples to the reference BAM + colData(fds)$bamFile <- ref_bam - # Manually assert the seqlevelsStyle (e.g., 'UCSC' or 'Ensembl') - # This satisfies the internal check that is currently crashing. - seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(available_bams[1])) + # Ensure seqlevels match (e.g., 'chr1' vs '1') to prevent crashing during merge + seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(ref_bam)) } else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. 
Validation will fail.") } strandSpecific(fds) <- 0 # 4. Merge Split Counts From 57ef4afdf3f54820d881eb0fdab31cb5db36948e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 27 Jan 2026 16:24:58 +1100 Subject: [PATCH 40/89] how did that get in there? --- images/fraser/fraser_merge_non_split.R | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 67169243..a12c02a6 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -47,10 +47,6 @@ if(length(available_bams) > 0){ stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. Validation will fail.") } strandSpecific(fds) <- 0 -# 4. Merge Split Counts -# This step updates the fds with the junctions identified in Step 3 -message("Merging all split-read counts...") -fds <- getSplitReadCountsForAllSamples(fds, recount = FALSE) # 5. Merge Non-Split Counts # We use the filtered ranges from Step 3 to define the 'at-site' junctions @@ -60,7 +56,7 @@ fds <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = split_count_ranges, minAnchor = 5, - recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache + recount =FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) # 6. Statistical Normalization (PSI and Jaccard) From cac6886e7d6eec4f66ff32d353d61623b0381769 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 28 Jan 2026 12:51:36 +1100 Subject: [PATCH 41/89] name change --- images/fraser/fraser_merge_non_split.R | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index a12c02a6..a82d065a 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -51,19 +51,14 @@ strandSpecific(fds) <- 0 # 5. 
Merge Non-Split Counts # We use the filtered ranges from Step 3 to define the 'at-site' junctions message("Merging all non-split-read counts from cache...") -split_count_ranges <- readRDS(args$filtered_ranges_path) +non_split_count_ranges <- readRDS(args$filtered_ranges_path) fds <- getNonSplitReadCountsForAllSamples( fds = fds, - splitCountRanges = split_count_ranges, + splitCountRanges = non_split_count_ranges, minAnchor = 5, recount =FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) -# 6. Statistical Normalization (PSI and Jaccard) -# This is the 'FRASER 2.0' logic. It calculates the metrics used for outlier detection. -message("Calculating PSI and Jaccard Index values...") -fds <- calculatePSIValues(fds, BPPARAM = bp) - # 7. Final Save # This creates the complete 'fitted' dataset that the analysis script will use message("Saving final merged FraserDataSet...") From 8aae29f7cefec4e6886a0fb41e8b2613686a83dd Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 11:00:07 +1100 Subject: [PATCH 42/89] lets start again --- images/fraser/fraser_merge_non_split.R | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index a82d065a..bfbcd4e5 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -15,7 +15,11 @@ args <- parser$parse_args() # 1. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) -dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) + +# Create the specific sub-directory for nonSplitCounts +dir.create(file.path(save_dir, "nonSplitCounts"), recursive = TRUE, showWarnings = FALSE) + +# Copy the object so loadFraserDataSet can find it file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) # 2. 
Configure Backend @@ -24,9 +28,7 @@ options("FRASER.maxJunctionsNoHDF5" = -1) bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 3. Load Dataset -# This loads the metadata; the counts will be pulled from the cache symlinked by Python -fds_name <- paste0("FRASER_", args$cohort_id) +# --- 3. Load Dataset --- fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) @@ -40,11 +42,9 @@ if(length(available_bams) > 0){ # Update colData paths for all samples to the reference BAM colData(fds)$bamFile <- ref_bam - - # Ensure seqlevels match (e.g., 'chr1' vs '1') to prevent crashing during merge - seqlevelsStyle(fds) <- seqlevelsStyle(BamFile(ref_bam)) + seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) } else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams. Validation will fail.") + stop("CRITICAL ERROR: No BAM files found.") } strandSpecific(fds) <- 0 @@ -52,16 +52,16 @@ strandSpecific(fds) <- 0 # We use the filtered ranges from Step 3 to define the 'at-site' junctions message("Merging all non-split-read counts from cache...") non_split_count_ranges <- readRDS(args$filtered_ranges_path) -fds <- getNonSplitReadCountsForAllSamples( +non_split_counts <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = non_split_count_ranges, minAnchor = 5, - recount =FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache + recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) # 7. 
Final Save # This creates the complete 'fitted' dataset that the analysis script will use message("Saving final merged FraserDataSet...") -fds <- saveFraserDataSet(fds) +saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) message("Merge non-split and PSI calculation complete.") From ec6da3983ff785fff8a44a733b2ddaaeaf0325f8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 11:21:57 +1100 Subject: [PATCH 43/89] fixing paths --- images/fraser/fraser_merge_non_split.R | 50 +++++++++++++------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index bfbcd4e5..42ae2f14 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -14,43 +14,46 @@ args <- parser$parse_args() # 1. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) -save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +# Where the pipeline put them +old_dir <- file.path(args$work_dir, "savedObjects", "Data_Analysis", "nonSplitCounts") +# Where FRASER expects them +new_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") -# Create the specific sub-directory for nonSplitCounts -dir.create(file.path(save_dir, "nonSplitCounts"), recursive = TRUE, showWarnings = FALSE) +# --- 2. Move Files to Correct Location --- +if (dir.exists(old_dir)) { + message("Moving .h5 files from ", old_dir, " to ", new_dir) + dir.create(new_dir, recursive = TRUE, showWarnings = FALSE) -# Copy the object so loadFraserDataSet can find it -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + h5_files <- list.files(old_dir, pattern = "\\.h5$", full.names = TRUE) + for (f in h5_files) { + dest <- file.path(new_dir, basename(f)) + file.rename(f, dest) + } +} else { + message("Notice: Source directory ", old_dir, " not found. Checking if files are already in place.") +} -# 2. 
Configure Backend -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) -bp <- MulticoreParam(workers = args$nthreads) -register(bp) +# --- 3. Prepare FDS for Loading --- +# FRASER's loadFraserDataSet looks for the .RDS file in a specific subfolder +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) -# --- 3. Load Dataset --- +# --- 4. Load Dataset --- +# dir is the ROOT workdir, name is the cohort folder name fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) - if(length(available_bams) > 0){ - # We use the reference BAM to satisfy the internal 'file.exists' checks. - # available_bams[1] should be the 'reference.bam' symlinked by the Python task. ref_bam <- available_bams[1] - message("Validating metadata against reference BAM: ", ref_bam) - - # Update colData paths for all samples to the reference BAM colData(fds)$bamFile <- ref_bam seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) -} else { - stop("CRITICAL ERROR: No BAM files found.") } strandSpecific(fds) <- 0 -# 5. Merge Non-Split Counts -# We use the filtered ranges from Step 3 to define the 'at-site' junctions -message("Merging all non-split-read counts from cache...") +# --- 6. Merge Non-Split Counts --- +message("Merging counts using HDF5 files in: ", new_dir) non_split_count_ranges <- readRDS(args$filtered_ranges_path) non_split_counts <- getNonSplitReadCountsForAllSamples( fds = fds, @@ -61,7 +64,6 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # 7. 
Final Save # This creates the complete 'fitted' dataset that the analysis script will use -message("Saving final merged FraserDataSet...") + saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) -message("Merge non-split and PSI calculation complete.") From c2a728461f1295eb3ccafa9d77b8e0bf8f4fade2 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 11:54:29 +1100 Subject: [PATCH 44/89] metadata file was missing? --- images/fraser/fraser_merge_non_split.R | 56 ++++++++++++++------------ 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 42ae2f14..64177a6b 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -3,6 +3,7 @@ library(argparse) library(FRASER) library(BiocParallel) +library(SummarizedExperiment) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") @@ -12,43 +13,48 @@ parser$add_argument("--work_dir", default = "/io/work", help = "Working director parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -# 1. Reconstruct Directory Structure +# 1. Load the ranges first so they are available for all steps +non_split_count_ranges <- readRDS(args$filtered_ranges_path) + +# 2. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) -# Where the pipeline put them -old_dir <- file.path(args$work_dir, "savedObjects", "Data_Analysis", "nonSplitCounts") -# Where FRASER expects them -new_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") - -# --- 2. 
Move Files to Correct Location --- -if (dir.exists(old_dir)) { - message("Moving .h5 files from ", old_dir, " to ", new_dir) - dir.create(new_dir, recursive = TRUE, showWarnings = FALSE) - - h5_files <- list.files(old_dir, pattern = "\\.h5$", full.names = TRUE) - for (f in h5_files) { - dest <- file.path(new_dir, basename(f)) - file.rename(f, dest) - } -} else { - message("Notice: Source directory ", old_dir, " not found. Checking if files are already in place.") +save_dir <- file.path(args$work_dir, "savedObjects", fds_name) +out_dir <- file.path(save_dir, "nonSplitCounts") + +dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) + +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) + +# --- MINIMUM FIX: Move files and create the missing 'se.rds' metadata --- +h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) +for(f in h5_files) { + file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# --- 3. Prepare FDS for Loading --- -# FRASER's loadFraserDataSet looks for the .RDS file in a specific subfolder -save_dir <- file.path(args$work_dir, "savedObjects", fds_name) -dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) +# Create the se.rds 'glue' file that FRASER/HDF5Array looks for +non_split_count_ranges <- readRDS(args$filtered_ranges_path) +tmp_se <- SummarizedExperiment(rowRanges = non_split_count_ranges) +saveRDS(tmp_se, file.path(out_dir, "se.rds")) +# ----------------------------------------------------------------------- -# --- 4. Load Dataset --- -# dir is the ROOT workdir, name is the cohort folder name +# 2. Configure Backend +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + +# 3. 
Load Dataset fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ ref_bam <- available_bams[1] + message("Validating metadata against reference BAM: ", ref_bam) colData(fds)$bamFile <- ref_bam seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) +} else { + stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } strandSpecific(fds) <- 0 From 127cb74cf49d9146df63c70ca4e929d7061c6256 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 12:13:45 +1100 Subject: [PATCH 45/89] paths --- images/fraser/fraser_merge_non_split.R | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 64177a6b..e6b68478 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -22,8 +22,7 @@ save_dir <- file.path(args$work_dir, "savedObjects", fds_name) out_dir <- file.path(save_dir, "nonSplitCounts") dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) - -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) +file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) # --- MINIMUM FIX: Move files and create the missing 'se.rds' metadata --- h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) @@ -31,8 +30,7 @@ for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# Create the se.rds 'glue' file that FRASER/HDF5Array looks for -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +# Create the se.rds 'glue' file that loadHDF5SummarizedExperiment requires tmp_se <- SummarizedExperiment(rowRanges = non_split_count_ranges) saveRDS(tmp_se, file.path(out_dir, "se.rds")) # ----------------------------------------------------------------------- @@ -59,8 +57,7 
@@ if(length(available_bams) > 0){ strandSpecific(fds) <- 0 # --- 6. Merge Non-Split Counts --- -message("Merging counts using HDF5 files in: ", new_dir) -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +message("Merging counts using HDF5 files in: ", out_dir) non_split_counts <- getNonSplitReadCountsForAllSamples( fds = fds, splitCountRanges = non_split_count_ranges, @@ -71,5 +68,5 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # 7. Final Save # This creates the complete 'fitted' dataset that the analysis script will use -saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) +saveRDS(non_split_counts, file.path(args$work_dir, "non_split_counts.RDS")) From 379900a33cf6297f4389547d90e2a4dcde14d7a6 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 12:45:31 +1100 Subject: [PATCH 46/89] make a better simmarized exp --- images/fraser/fraser_merge_non_split.R | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index e6b68478..4530e0e8 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -30,8 +30,19 @@ for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# Create the se.rds 'glue' file that loadHDF5SummarizedExperiment requires -tmp_se <- SummarizedExperiment(rowRanges = non_split_count_ranges) +# 1. Create a dummy matrix with the right number of samples (columns) +# This satisfies the: if(all(samples(fds) %in% colnames(siteCounts))) check +sample_names <- samples(fds) +dummy_matrix <- matrix(0, nrow=length(non_split_count_ranges), ncol=length(sample_names)) +colnames(dummy_matrix) <- sample_names + +# 2. Create the SE object with the dummy matrix +tmp_se <- SummarizedExperiment( + assays = list(rawCountsSS = dummy_matrix), # Name must match what FRASER expects + rowRanges = non_split_count_ranges +) + +# 3. 
Save it to the cache directory saveRDS(tmp_se, file.path(out_dir, "se.rds")) # ----------------------------------------------------------------------- From f32506ec7aed195dc3ce3787b89ed072df67df52 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 12:59:04 +1100 Subject: [PATCH 47/89] whoops --- images/fraser/fraser_merge_non_split.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 4530e0e8..5099d849 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -29,6 +29,8 @@ h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.n for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } +# 3. Load Dataset +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) # 1. Create a dummy matrix with the right number of samples (columns) # This satisfies the: if(all(samples(fds) %in% colnames(siteCounts))) check @@ -52,9 +54,6 @@ options("FRASER.maxJunctionsNoHDF5" = -1) bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 3. 
Load Dataset -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) - available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) if(length(available_bams) > 0){ From f162368ccc70b4f4423d140478c4892cc4d756a1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 13:39:49 +1100 Subject: [PATCH 48/89] handshake fix --- images/fraser/fraser_merge_non_split.R | 33 ++++++++++---------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 5099d849..71af95ec 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -4,6 +4,7 @@ library(argparse) library(FRASER) library(BiocParallel) library(SummarizedExperiment) +library(HDF5Array) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") @@ -15,40 +16,32 @@ args <- parser$parse_args() # 1. Load the ranges first so they are available for all steps non_split_count_ranges <- readRDS(args$filtered_ranges_path) - -# 2. Reconstruct Directory Structure fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) out_dir <- file.path(save_dir, "nonSplitCounts") - dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) -# --- MINIMUM FIX: Move files and create the missing 'se.rds' metadata --- +# 2. Load FDS +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) + +# 3. Anchor HDF5 Directory h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } -# 3. Load Dataset -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) - -# 1. 
Create a dummy matrix with the right number of samples (columns) -# This satisfies the: if(all(samples(fds) %in% colnames(siteCounts))) check -sample_names <- samples(fds) -dummy_matrix <- matrix(0, nrow=length(non_split_count_ranges), ncol=length(sample_names)) -colnames(dummy_matrix) <- sample_names -# 2. Create the SE object with the dummy matrix -tmp_se <- SummarizedExperiment( - assays = list(rawCountsSS = dummy_matrix), # Name must match what FRASER expects +anchor_se <- SummarizedExperiment( + assays = list(rawCountsSS = matrix(0, + nrow = length(non_split_count_ranges), + ncol = length(samples(fds)), + dimnames = list(NULL, samples(fds)))), rowRanges = non_split_count_ranges ) +# This creates the se.rds that FRASER's loadHDF5SummarizedExperiment needs +saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "") -# 3. Save it to the cache directory -saveRDS(tmp_se, file.path(out_dir, "se.rds")) -# ----------------------------------------------------------------------- - -# 2. Configure Backend +# 4. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) bp <- MulticoreParam(workers = args$nthreads) From 7c09fd82c7c26e0cac4e72470d164e6f0dd0c19a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 14:17:56 +1100 Subject: [PATCH 49/89] save fix --- images/fraser/fraser_merge_non_split.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 71af95ec..40d92e77 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -71,5 +71,13 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # 7. 
Final Save # This creates the complete 'fitted' dataset that the analysis script will use -saveRDS(non_split_counts, file.path(args$work_dir, "non_split_counts.RDS")) +# Use the HDF5-safe saving method for the counts object specifically +# We save this to a new directory to avoid clobbering the input cache +saveHDF5SummarizedExperiment( + non_split_counts, + dir = file.path(args$work_dir, "merged_non_split_counts"), + replace = TRUE +) +# Also save the updated FDS object (this updates the internal fds-object.RDS) +fds <- saveFraserDataSet(fds) From 9b5e7b1efc201d396a81def2ae8f1e9b9084b3d7 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 16:35:12 +1100 Subject: [PATCH 50/89] Add join --- images/fraser/fraser_join_counts.R | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 images/fraser/fraser_join_counts.R diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R new file mode 100644 index 00000000..3ef817c0 --- /dev/null +++ b/images/fraser/fraser_join_counts.R @@ -0,0 +1,30 @@ +#!/usr/bin/env Rscript +library(argparse) +library(FRASER) +library(SummarizedExperiment) +library(HDF5Array) + +parser <- ArgumentParser(description = "Join Split and Non-Split Counts") +parser$add_argument("--fds_path", required = TRUE) +parser$add_argument("--cohort_id", required = TRUE) +parser$add_argument("--work_dir", default = "/io/work") +parser$add_argument("--nthreads", type = "integer", default = 1) +args <- parser$parse_args() + +# Set number of threads +register(MulticoreParam(args$nthreads)) + +fds_name <- paste0("FRASER_", args$cohort_id) + +# Load the FDS object +message(paste("Loading FraserDataSet from:", args$work_dir)) +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) + +# calculatePSIValues will look for HDF5 arrays in: +# 1. work_dir/savedObjects/fds_name/splitCounts +# 2. 
work_dir/savedObjects/fds_name/nonSplitCounts +message("Calculating PSI values using HDF5-backed assays...") +fds <- calculatePSIValues(fds) + +message("Saving integrated FraserDataSet...") +fds <- saveFraserDataSet(fds) From 98972c71e8e9bbae40e57ddbf0fe462d893e2a9c Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 29 Jan 2026 16:45:08 +1100 Subject: [PATCH 51/89] Add join --- images/fraser/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 5d26c50f..630f1f38 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -34,3 +34,4 @@ COPY fraser_count_non_split.R RDrnaseq/ COPY fraser_init.R RDrnaseq/ COPY fraser_merge_non_split.R RDrnaseq/ COPY fraser_merge_split.R RDrnaseq/ +COPY fraser_join_counts.R RDrnaseq/ From 87097f02fa24a18e956462dd28a7790ad2e4c75b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 10:24:15 +1100 Subject: [PATCH 52/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 43 +++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 3ef817c0..8e1b2869 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -7,23 +7,46 @@ library(HDF5Array) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) parser$add_argument("--cohort_id", required = TRUE) +parser$add_argument("--filtered_ranges_path", required = TRUE) parser$add_argument("--work_dir", default = "/io/work") -parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() -# Set number of threads -register(MulticoreParam(args$nthreads)) - fds_name <- paste0("FRASER_", args$cohort_id) +# Paths to the HDF5 anchor directories +split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "splitCounts") +non_split_dir <- 
file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") -# Load the FDS object -message(paste("Loading FraserDataSet from:", args$work_dir)) +# 1. Load the FDS object fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# calculatePSIValues will look for HDF5 arrays in: -# 1. work_dir/savedObjects/fds_name/splitCounts -# 2. work_dir/savedObjects/fds_name/nonSplitCounts -message("Calculating PSI values using HDF5-backed assays...") +# 2. Re-anchor Split Counts (The 'j' counts) +# If the splitCounts folder was moved/localized, fds might lose the pointer. +# We reload the HDF5 SummarizedExperiment and inject it back into fds. +if(file.exists(file.path(split_dir, "se.rds"))){ + message("Re-anchoring split counts (rawCountsJ)...") + split_se <- loadHDF5SummarizedExperiment(dir = split_dir) + assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se) +} else { + stop("CRITICAL ERROR: splitCounts/se.rds not found. Cannot calculate PSI.") +} + +# 3. Handle Non-Split Counts (The 'ss' counts) +# Use the shim logic from before to ensure the 'rawCountsSS' is present +non_split_count_ranges <- readRDS(args$filtered_ranges_path) +if(!file.exists(file.path(non_split_dir, "se.rds"))){ + message("Creating anchor for non-split counts (rawCountsSS)...") + anchor_se <- SummarizedExperiment( + assays = list(rawCountsSS = matrix(0, nrow=length(non_split_count_ranges), + ncol=length(samples(fds)), + dimnames=list(NULL, samples(fds)))), + rowRanges = non_split_count_ranges + ) + saveHDF5SummarizedExperiment(anchor_se, dir=non_split_dir, replace=TRUE, prefix="") +} + +# 4. 
Calculate PSI +# Now that both 'rawCountsJ' and 'rawCountsSS' are linked, this will succeed +message("Calculating PSI values...") fds <- calculatePSIValues(fds) message("Saving integrated FraserDataSet...") From 15f83aea4886af2d1e9aeac6c380d1ad001b346e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 10:30:50 +1100 Subject: [PATCH 53/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 47 +++++++++++++++++------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 8e1b2869..c1310b9e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -7,8 +7,9 @@ library(HDF5Array) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) parser$add_argument("--cohort_id", required = TRUE) -parser$add_argument("--filtered_ranges_path", required = TRUE) parser$add_argument("--work_dir", default = "/io/work") +parser$add_argument("--filtered_ranges_path", required = TRUE) # Added this back +parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) @@ -19,35 +20,41 @@ non_split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCou # 1. Load the FDS object fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# 2. Re-anchor Split Counts (The 'j' counts) -# If the splitCounts folder was moved/localized, fds might lose the pointer. -# We reload the HDF5 SummarizedExperiment and inject it back into fds. +# 2. 
Re-attach Split Counts (Junctions) +# This prevents the "Missing rawCounts for 'j'" error if(file.exists(file.path(split_dir, "se.rds"))){ - message("Re-anchoring split counts (rawCountsJ)...") + message("Linking split counts (rawCountsJ)...") split_se <- loadHDF5SummarizedExperiment(dir = split_dir) - assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se) -} else { - stop("CRITICAL ERROR: splitCounts/se.rds not found. Cannot calculate PSI.") + assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se[, samples(fds)]) } -# 3. Handle Non-Split Counts (The 'ss' counts) -# Use the shim logic from before to ensure the 'rawCountsSS' is present +# 3. Handle Non-Split Counts (Splice Sites - 'ss') +# We need to ensure the se.rds anchor exists for the non-split files moved from Step 5 non_split_count_ranges <- readRDS(args$filtered_ranges_path) + if(!file.exists(file.path(non_split_dir, "se.rds"))){ - message("Creating anchor for non-split counts (rawCountsSS)...") + message("Creating HDF5 anchor for non-split counts (rawCountsSS)...") + # This shim tells FRASER how to read the loose .h5 files as a single assay anchor_se <- SummarizedExperiment( - assays = list(rawCountsSS = matrix(0, nrow=length(non_split_count_ranges), - ncol=length(samples(fds)), - dimnames=list(NULL, samples(fds)))), + assays = list(rawCountsSS = matrix(0, + nrow = length(non_split_count_ranges), + ncol = length(samples(fds)), + dimnames = list(NULL, samples(fds)))), rowRanges = non_split_count_ranges ) - saveHDF5SummarizedExperiment(anchor_se, dir=non_split_dir, replace=TRUE, prefix="") + # This creates the se.rds that FRASER needs to 'see' the .h5 files + saveHDF5SummarizedExperiment(anchor_se, dir = non_split_dir, replace = TRUE, prefix = "") } -# 4. Calculate PSI -# Now that both 'rawCountsJ' and 'rawCountsSS' are linked, this will succeed -message("Calculating PSI values...") -fds <- calculatePSIValues(fds) +# 4. 
Final Join and PSI Calculation +message("Calculating PSI values using both assays...") +# Link the non-split counts now that the anchor is ready +non_split_se <- loadHDF5SummarizedExperiment(dir = non_split_dir) +assay(fds, "rawCountsSS", withDimnames=FALSE) <- assay(non_split_se[, samples(fds)]) + +# Calculate ratios (PSI) +bp <- MulticoreParam(workers = args$nthreads) +fds <- calculatePSIValues(fds, BPPARAM = bp) -message("Saving integrated FraserDataSet...") +# 5. Save the final integrated object fds <- saveFraserDataSet(fds) From 8eb6413a9c299d445a501fa7366ca83caa0b2437 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 12:20:26 +1100 Subject: [PATCH 54/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index c1310b9e..f3ece970 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -3,16 +3,19 @@ library(argparse) library(FRASER) library(SummarizedExperiment) library(HDF5Array) +library(BiocParallel) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) parser$add_argument("--cohort_id", required = TRUE) parser$add_argument("--work_dir", default = "/io/work") -parser$add_argument("--filtered_ranges_path", required = TRUE) # Added this back parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) +# The orchestrator puts the non-split ranges here: +filtered_ranges_path <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") + # Paths to the HDF5 anchor directories split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "splitCounts") non_split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") @@ -29,12 +32,16 @@ 
if(file.exists(file.path(split_dir, "se.rds"))){ } # 3. Handle Non-Split Counts (Splice Sites - 'ss') -# We need to ensure the se.rds anchor exists for the non-split files moved from Step 5 -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +# We must use the ranges generated in the 'Merge Split' step +if(!file.exists(filtered_ranges_path)){ + stop(paste("Required non-split ranges not found at:", filtered_ranges_path)) +} +non_split_count_ranges <- readRDS(filtered_ranges_path) if(!file.exists(file.path(non_split_dir, "se.rds"))){ message("Creating HDF5 anchor for non-split counts (rawCountsSS)...") - # This shim tells FRASER how to read the loose .h5 files as a single assay + + # Create the container matching the dimensions FRASER expects anchor_se <- SummarizedExperiment( assays = list(rawCountsSS = matrix(0, nrow = length(non_split_count_ranges), @@ -47,12 +54,12 @@ if(!file.exists(file.path(non_split_dir, "se.rds"))){ } # 4. Final Join and PSI Calculation -message("Calculating PSI values using both assays...") -# Link the non-split counts now that the anchor is ready +message("Loading non-split counts...") non_split_se <- loadHDF5SummarizedExperiment(dir = non_split_dir) assay(fds, "rawCountsSS", withDimnames=FALSE) <- assay(non_split_se[, samples(fds)]) # Calculate ratios (PSI) +message("Calculating PSI values...") bp <- MulticoreParam(workers = args$nthreads) fds <- calculatePSIValues(fds, BPPARAM = bp) From a0632e03e49342847db325c86a88f3f631d5a9a0 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 12:51:26 +1100 Subject: [PATCH 55/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 82 ++++++++++++++++-------------- 1 file changed, 45 insertions(+), 37 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index f3ece970..67a39755 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -3,7 +3,6 @@ library(argparse) library(FRASER) 
library(SummarizedExperiment) library(HDF5Array) -library(BiocParallel) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) @@ -13,55 +12,64 @@ parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) -# The orchestrator puts the non-split ranges here: -filtered_ranges_path <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") +saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # Paths to the HDF5 anchor directories -split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "splitCounts") -non_split_dir <- file.path(args$work_dir, "savedObjects", fds_name, "nonSplitCounts") +split_dir <- file.path(saveDir, "splitCounts") +non_split_dir <- file.path(saveDir, "nonSplitCounts") + +# Paths to the RDS files generated in previous steps +gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") +spliceSitePath <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") # 1. Load the FDS object +message("Loading Fraser Data Set...") fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# 2. Re-attach Split Counts (Junctions) -# This prevents the "Missing rawCounts for 'j'" error -if(file.exists(file.path(split_dir, "se.rds"))){ - message("Linking split counts (rawCountsJ)...") - split_se <- loadHDF5SummarizedExperiment(dir = split_dir) - assay(fds, "rawCountsJ", withDimnames=FALSE) <- assay(split_se[, samples(fds)]) -} +# Load the ranges +splitCounts_gRanges <- readRDS(gRangesSplitPath) +spliceSiteCoords <- readRDS(spliceSitePath) + +# 2. Get splitReads counts +# The HDF5 file is named 'rawCountsJ.h5' inside the splitCounts subdir +splitCounts_h5_path <- file.path(split_dir, "rawCountsJ.h5") +message(paste("Loading split counts from:", splitCounts_h5_path)) -# 3. 
Handle Non-Split Counts (Splice Sites - 'ss') -# We must use the ranges generated in the 'Merge Split' step -if(!file.exists(filtered_ranges_path)){ - stop(paste("Required non-split ranges not found at:", filtered_ranges_path)) -} -non_split_count_ranges <- readRDS(filtered_ranges_path) +splitCounts_h5 <- HDF5Array::HDF5Array(splitCounts_h5_path, "rawCountsJ") +splitCounts_se <- SummarizedExperiment( + colData = colData(fds), + rowRanges = splitCounts_gRanges, + assays = list(rawCountsJ = splitCounts_h5) +) -if(!file.exists(file.path(non_split_dir, "se.rds"))){ - message("Creating HDF5 anchor for non-split counts (rawCountsSS)...") +# 3. Get nonSplitRead counts +# The HDF5 file is named 'rawCountsSS.h5' inside the nonSplitCounts subdir +nonSplitCounts_h5_path <- file.path(non_split_dir, "rawCountsSS.h5") +message(paste("Loading non-split counts from:", nonSplitCounts_h5_path)) - # Create the container matching the dimensions FRASER expects - anchor_se <- SummarizedExperiment( - assays = list(rawCountsSS = matrix(0, - nrow = length(non_split_count_ranges), - ncol = length(samples(fds)), - dimnames = list(NULL, samples(fds)))), - rowRanges = non_split_count_ranges - ) - # This creates the se.rds that FRASER needs to 'see' the .h5 files - saveHDF5SummarizedExperiment(anchor_se, dir = non_split_dir, replace = TRUE, prefix = "") -} +nonSplitCounts_h5 <- HDF5Array::HDF5Array(nonSplitCounts_h5_path, "rawCountsSS") +nonSplitCounts_se <- SummarizedExperiment( + colData = colData(fds), + rowRanges = spliceSiteCoords, + assays = list(rawCountsSS = nonSplitCounts_h5) +) -# 4. Final Join and PSI Calculation -message("Loading non-split counts...") -non_split_se <- loadHDF5SummarizedExperiment(dir = non_split_dir) -assay(fds, "rawCountsSS", withDimnames=FALSE) <- assay(non_split_se[, samples(fds)]) +# 4. 
Add Counts to FRASER object +# This function automatically populates the internal slots correctly +message("Joining assays into FDS object...") +fds <- addCountsToFraserDataSet( + fds = fds, + splitCounts = splitCounts_se, + nonSplitCounts = nonSplitCounts_se +) -# Calculate ratios (PSI) +# 5. Calculate PSI values message("Calculating PSI values...") bp <- MulticoreParam(workers = args$nthreads) fds <- calculatePSIValues(fds, BPPARAM = bp) -# 5. Save the final integrated object +# 6. Save final FRASER object +message("Saving integrated FDS...") fds <- saveFraserDataSet(fds) + +message("FRASER join complete.") From 0c832c3d3c107b83517cd5999412a07d316bc4cb Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 13:34:02 +1100 Subject: [PATCH 56/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 67a39755..b413f327 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -31,11 +31,8 @@ splitCounts_gRanges <- readRDS(gRangesSplitPath) spliceSiteCoords <- readRDS(spliceSitePath) # 2. Get splitReads counts -# The HDF5 file is named 'rawCountsJ.h5' inside the splitCounts subdir -splitCounts_h5_path <- file.path(split_dir, "rawCountsJ.h5") -message(paste("Loading split counts from:", splitCounts_h5_path)) - -splitCounts_h5 <- HDF5Array::HDF5Array(splitCounts_h5_path, "rawCountsJ") +message("Loading split counts...") +splitCounts_h5 <- HDF5Array(file.path(split_dir, "rawCountsJ"), "rawCountsJ") splitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = splitCounts_gRanges, @@ -43,11 +40,8 @@ splitCounts_se <- SummarizedExperiment( ) # 3. 
Get nonSplitRead counts -# The HDF5 file is named 'rawCountsSS.h5' inside the nonSplitCounts subdir -nonSplitCounts_h5_path <- file.path(non_split_dir, "rawCountsSS.h5") -message(paste("Loading non-split counts from:", nonSplitCounts_h5_path)) - -nonSplitCounts_h5 <- HDF5Array::HDF5Array(nonSplitCounts_h5_path, "rawCountsSS") +message("Loading non-split counts...") +nonSplitCounts_h5 <- HDF5Array(file.path(non_split_dir, "rawCountsSS"), "rawCountsSS") nonSplitCounts_se <- SummarizedExperiment( colData = colData(fds), rowRanges = spliceSiteCoords, From d5b51ef9d942ff4cd074b3d11a89a809017622d5 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 14:57:41 +1100 Subject: [PATCH 57/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 42 +++++++++++++----------------- 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index b413f327..1fee3300 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -14,13 +14,14 @@ args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) -# Paths to the HDF5 anchor directories -split_dir <- file.path(saveDir, "splitCounts") -non_split_dir <- file.path(saveDir, "nonSplitCounts") - # Paths to the RDS files generated in previous steps gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -spliceSitePath <- file.path(args$work_dir, "g_ranges_non_split_counts.RDS") +spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") # Correct file + +# Load the ranges +splitCounts_gRanges <- readRDS(gRangesSplitPath) +spliceSiteCoords <- readRDS(spliceSitePath) # Now loads the actual splice site coords + # 1. Load the FDS object message("Loading Fraser Data Set...") @@ -32,21 +33,19 @@ spliceSiteCoords <- readRDS(spliceSitePath) # 2. 
Get splitReads counts message("Loading split counts...") -splitCounts_h5 <- HDF5Array(file.path(split_dir, "rawCountsJ"), "rawCountsJ") -splitCounts_se <- SummarizedExperiment( - colData = colData(fds), - rowRanges = splitCounts_gRanges, - assays = list(rawCountsJ = splitCounts_h5) -) +split_se_path <- file.path(saveDir, "splitCounts", "se.rds") +if(!file.exists(split_se_path)){ + stop(paste("Missing splitCounts anchor at:", split_se_path)) +} +splitCounts_se <- readRDS(split_se_path) # 3. Get nonSplitRead counts message("Loading non-split counts...") -nonSplitCounts_h5 <- HDF5Array(file.path(non_split_dir, "rawCountsSS"), "rawCountsSS") -nonSplitCounts_se <- SummarizedExperiment( - colData = colData(fds), - rowRanges = spliceSiteCoords, - assays = list(rawCountsSS = nonSplitCounts_h5) -) +non_split_se_path <- file.path(saveDir, "nonSplitCounts", "se.rds") +if(!file.exists(non_split_se_path)){ + stop(paste("Missing nonSplitCounts anchor at:", non_split_se_path)) +} +nonSplitCounts_se <- readRDS(non_split_se_path) # 4. Add Counts to FRASER object # This function automatically populates the internal slots correctly @@ -57,13 +56,8 @@ fds <- addCountsToFraserDataSet( nonSplitCounts = nonSplitCounts_se ) -# 5. Calculate PSI values -message("Calculating PSI values...") -bp <- MulticoreParam(workers = args$nthreads) -fds <- calculatePSIValues(fds, BPPARAM = bp) - -# 6. Save final FRASER object -message("Saving integrated FDS...") +# 5. 
Save final FRASER object +message("Saving final integrated FDS...") fds <- saveFraserDataSet(fds) message("FRASER join complete.") From 20a8af25b54709c3662f55f816c7f0387b4c04c5 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 15:54:32 +1100 Subject: [PATCH 58/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 1fee3300..2994d28a 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -16,16 +16,14 @@ saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # Paths to the RDS files generated in previous steps gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") # Correct file - -# Load the ranges -splitCounts_gRanges <- readRDS(gRangesSplitPath) -spliceSiteCoords <- readRDS(spliceSitePath) # Now loads the actual splice site coords +spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") +# 1. Load the FDS object DIRECTLY from RDS (not using loadFraserDataSet) +message("Loading Fraser Data Set from RDS...") +fds <- readRDS(args$fds_path) -# 1. 
Load the FDS object -message("Loading Fraser Data Set...") -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# Update the internal directory path to match the current work directory +workingDir(fds) <- saveDir # Load the ranges splitCounts_gRanges <- readRDS(gRangesSplitPath) From 1f418f4ba498b639c1eaf5d0420e1b2e5dc75b8a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 16:57:28 +1100 Subject: [PATCH 59/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 47 ++++++++++++++++-------------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 2994d28a..b8eef65e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -14,39 +14,42 @@ args <- parser$parse_args() fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) -# Paths to the RDS files generated in previous steps -gRangesSplitPath <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -spliceSitePath <- file.path(args$work_dir, "splice_site_coords.RDS") - -# 1. Load the FDS object DIRECTLY from RDS (not using loadFraserDataSet) +# 1. Load the FDS object message("Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) # Update the internal directory path to match the current work directory workingDir(fds) <- saveDir -# Load the ranges -splitCounts_gRanges <- readRDS(gRangesSplitPath) -spliceSiteCoords <- readRDS(spliceSitePath) - -# 2. Get splitReads counts +# 2. 
Load split counts message("Loading split counts...") -split_se_path <- file.path(saveDir, "splitCounts", "se.rds") -if(!file.exists(split_se_path)){ - stop(paste("Missing splitCounts anchor at:", split_se_path)) +split_counts_path <- file.path(args$work_dir, "g_ranges_split_counts.RDS") +if(!file.exists(split_counts_path)){ + stop(paste("Missing split counts at:", split_counts_path)) +} +splitCounts_se <- readRDS(split_counts_path) + +# CRITICAL: Materialize the assays if they're HDF5-backed with broken paths +message("Materializing split counts assays...") +for(assay_name in names(assays(splitCounts_se))){ + assays(splitCounts_se)[[assay_name]] <- as.matrix(assays(splitCounts_se)[[assay_name]]) +} + +# 3. Load merged non-split counts +message("Loading merged non-split counts...") +merged_non_split_dir <- file.path(args$work_dir, "merged_non_split_counts") +if(!dir.exists(merged_non_split_dir)){ + stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) } -splitCounts_se <- readRDS(split_se_path) +nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) -# 3. Get nonSplitRead counts -message("Loading non-split counts...") -non_split_se_path <- file.path(saveDir, "nonSplitCounts", "se.rds") -if(!file.exists(non_split_se_path)){ - stop(paste("Missing nonSplitCounts anchor at:", non_split_se_path)) +# CRITICAL: Materialize non-split counts assays too +message("Materializing non-split counts assays...") +for(assay_name in names(assays(nonSplitCounts_se))){ + assays(nonSplitCounts_se)[[assay_name]] <- as.matrix(assays(nonSplitCounts_se)[[assay_name]]) } -nonSplitCounts_se <- readRDS(non_split_se_path) -# 4. Add Counts to FRASER object -# This function automatically populates the internal slots correctly +# 4. 
Add counts to FRASER object message("Joining assays into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, From 0e7bdb2f7a51a3c0d12b0f5e6c3df2c0edd87b3b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 17:26:37 +1100 Subject: [PATCH 60/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 23 +++++------------------ images/fraser/fraser_merge_split.R | 13 ++++++++++--- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index b8eef65e..70d6c06d 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -17,23 +17,16 @@ saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # 1. Load the FDS object message("Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) - -# Update the internal directory path to match the current work directory workingDir(fds) <- saveDir # 2. Load split counts message("Loading split counts...") -split_counts_path <- file.path(args$work_dir, "g_ranges_split_counts.RDS") -if(!file.exists(split_counts_path)){ - stop(paste("Missing split counts at:", split_counts_path)) -} -splitCounts_se <- readRDS(split_counts_path) - -# CRITICAL: Materialize the assays if they're HDF5-backed with broken paths -message("Materializing split counts assays...") -for(assay_name in names(assays(splitCounts_se))){ - assays(splitCounts_se)[[assay_name]] <- as.matrix(assays(splitCounts_se)[[assay_name]]) +split_se_dir <- file.path(saveDir, "splitCounts") +if(!dir.exists(split_se_dir)){ + stop(paste("Missing splitCounts directory at:", split_se_dir, + "\nYou need to run the split counting step first")) } +splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) # 3. 
Load merged non-split counts message("Loading merged non-split counts...") @@ -43,12 +36,6 @@ if(!dir.exists(merged_non_split_dir)){ } nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) -# CRITICAL: Materialize non-split counts assays too -message("Materializing non-split counts assays...") -for(assay_name in names(assays(nonSplitCounts_se))){ - assays(nonSplitCounts_se)[[assay_name]] <- as.matrix(assays(nonSplitCounts_se)[[assay_name]]) -} - # 4. Add counts to FRASER object message("Joining assays into FDS object...") fds <- addCountsToFraserDataSet( diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 4eb6a8e4..3b0758d4 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -58,8 +58,15 @@ minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ message("Merging split counts from cache...") -splitCounts <- getSplitReadCountsForAllSamples(fds=fds, - recount=FALSE) +splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE) + +# SAVE THE SPLIT COUNTS BEFORE FILTERING +message("Saving merged split counts SummarizedExperiment...") +split_counts_dir <- file.path(save_dir, "splitCounts") +dir.create(split_counts_dir, recursive = TRUE, showWarnings = FALSE) +saveHDF5SummarizedExperiment(splitCounts, dir = split_counts_dir, replace = TRUE) + +# Now continue with filtering for annotations splitCountRanges <- rowRanges(splitCounts) splitCountRangesNonFilt <- FRASER:::annotateSpliceSite(splitCountRanges) @@ -71,7 +78,7 @@ spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) # Use absolute paths for saving to match Python 'mv' commands -#This is sslightly confusing, but the filtered granges will be used to annotate non_split counts +#This is slightly confusing, but the filtered granges will be used to annotate non_split counts 
saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) From 3c025364b0b1ba296e87bdaddd0c115e1955f68d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 17:45:24 +1100 Subject: [PATCH 61/89] fix file path handling in join --- images/fraser/Dockerfile | 1 + images/fraser/fraser_merge_split.R | 1 + 2 files changed, 2 insertions(+) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 630f1f38..8e7505df 100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -23,6 +23,7 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ # Install summarizedRT via BiocManager micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ micromamba run -n base Rscript -e 'BiocManager::install("Rsamtools", update=FALSE, ask=FALSE)' && \ + micromamba run -n base Rscript -e 'BiocManager::install("HDF5Array", update=FALSE, ask=FALSE)' && \ micromamba clean --all --yes && \ rm -rf /var/lib/apt/lists/* diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 3b0758d4..a1ea45e8 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -6,6 +6,7 @@ library(BiocParallel) library(SummarizedExperiment) library(Rsamtools) library(DelayedMatrixStats) +library(HDF5Array) parser <- ArgumentParser(description = "Merge Split Read Counts") From b7b7bd5104b1ea7c7d3d301b3f25ebd29abf6e7e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 30 Jan 2026 18:19:34 +1100 Subject: [PATCH 62/89] fix file path handling in join --- images/fraser/fraser_join_counts.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 70d6c06d..1429b558 100644 
--- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -23,14 +23,13 @@ workingDir(fds) <- saveDir message("Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ - stop(paste("Missing splitCounts directory at:", split_se_dir, - "\nYou need to run the split counting step first")) + stop(paste("Missing splitCounts directory at:", split_se_dir)) } splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) # 3. Load merged non-split counts message("Loading merged non-split counts...") -merged_non_split_dir <- file.path(args$work_dir, "merged_non_split_counts") +merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) } From d574eaff4625d88ee3d27e19d3484d078a7ea553 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 2 Feb 2026 15:54:09 +1100 Subject: [PATCH 63/89] fixing the analysis script --- images/fraser/fraser_analysis.R | 117 ++++++++++++++++++-------------- 1 file changed, 65 insertions(+), 52 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 0ff7e409..dbdd7cda 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -5,6 +5,7 @@ library(FRASER) library(tidyverse) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) +library(HDF5Array) library(ggplot2) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") @@ -17,73 +18,85 @@ parser$add_argument("--min_count", type = "integer", default = 5) parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() -# Force HDF5 for large matrix operations -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) - -z_cutoff <- if (is.null(args$z_cutoff)) NA else args$z_cutoff +# --- 1. 
Parallelization Setup --- +# Use the user-provided nthreads for all BiocParallel operations bp <- MulticoreParam(workers = args$nthreads) register(bp) # 1. Load the merged FDS # Python extracted the tar into args$fds_dir. # Inside is 'savedObjects/FRASER_{cohort_id}/...' +options(delayedArray.block.size = 1e9) # 1GB blocks fds_name <- paste0("FRASER_", args$cohort_id) -fds <- loadFraserDataSet(dir = args$fds_dir, name = fds_name) +message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) +fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) + + +# --- 3. Filtering --- +# It is critical to filter before fitting to reduce the size of the latent space matrices +fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) + +# Create QC directory +dir.create("qc_plots", showWarnings = FALSE) + +# DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast +set.seed(42) +plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) +fds_plot_subset <- fds[plot_idx, ] -# 2. Filter Expression and Variability -# This reduces the number of tests and improves power -message("Filtering junctions...") +message("Generating QC plots using downsampled subset...") +png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) +plotFilterExpression(fds_plot_subset, bins = 100) +dev.off() + +# Apply actual filtering to full dataset fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) -dir.create("plots/misc", recursive = TRUE, showWarnings = FALSE) -png("plots/misc/filter_expression.png", width = 2000, height = 2000, res = 300) -plotFilterExpression(fds, bins = 100) +# --- 4. 
Dimensionality Message --- +raw_dim <- nrow(fds) +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] +filtered_dim <- nrow(fds_filtered) + +message(paste0("\n--- Filtering Summary ---")) +message(paste0("Original junctions: ", raw_dim)) +message(paste0("Filtered junctions: ", filtered_dim)) +message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) + +# --- 5. Hyperparameter Optimization --- +# Optimization must run on the filtered set +opt_q <- bestQ(fds_filtered, type = "jaccard") + +png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) +plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() -# 3. Hyperparameter Optimization (bestQ) and Fitting -# FRASER uses an autoencoder to control for latent confounders (batch effects) -message("Finding optimal hyperparameter Q and fitting autoencoder...") -# In FRASER 2.0, we typically fit psi5, psi3, and jaccard -fds <- FRASER(fds, q = c(psi5=NA, psi3=NA, jaccard=NA), BPPARAM = bp) - -# 4. Annotation -message("Annotating results with UCSC hg38...") -txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene -orgDb <- org.Hs.eg.db -fds <- annotateRangesWithTxDb(fds, txdb = txdb, orgDb = orgDb) - -# 5. Extract Results -message("Exporting results...") -res <- results(fds, - padjCutoff = args$pval_cutoff, - deltaPsiCutoff = args$delta_psi_cutoff, - zScoreCutoff = z_cutoff, - minCount = args$min_count) +# --- 6. Fitting --- +fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -res_all <- results(fds, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) +png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) +plotQQ(fds_fit, type = "jaccard") +dev.off() -write_csv(as.data.frame(res), "results.significant.csv") -write_csv(as.data.frame(res_all), "results.all.csv") +# --- 7. 
Annotation --- +fds_fit <- annotateRangesWithTxDb(fds_fit, + txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, + orgDb = org.Hs.eg.db) -# 6. Generate Diagnostic Plots -message("Generating Volcano and Q-Q plots...") -dir.create("plots/volcano", recursive = TRUE) -for(type in c("psi5", "psi3", "jaccard")){ - png(paste0("plots/volcano/volcano_", type, ".png"), width = 2000, height = 2000, res = 300) - print(plotVolcano(fds, type = type, sampleID = sampleIDs(fds)[1])) # Plots first sample as example - dev.off() -} +# --- 8. Results & Compressed Export --- +res <- results(fds_fit, + padjCutoff = args$pval_cutoff, + deltaPsiCutoff = args$delta_psi_cutoff, + zScoreCutoff = args$z_cutoff, + minCount = args$min_count) -# 7. Final Save -# We save this in a generic folder so Python can easily grab it -saveFraserDataSet(fds, dir = "output", name = paste0("Final_", args$cohort_id)) +# Extract all results for Jaccard +res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) -# Summary statistics for the log -sink("statistics_summary.txt") -cat("Cohort ID:", args$cohort_id, "\n") -cat("Total Samples:", length(sampleIDs(fds)), "\n") -cat("Significant Events (adj P <", args$pval_cutoff, "):", nrow(res), "\n") -sink() +message("Saving results...") +write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) +write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -message("Analysis complete.") +# --- 9. Final Save --- +saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) +message("Analysis Complete.") \ No newline at end of file From 58c4462e728e6e83f0e3e11b547b110905107952 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 3 Feb 2026 10:12:45 +1100 Subject: [PATCH 64/89] Analysis steps are out of order? 
--- images/fraser/fraser_analysis.R | 56 ++++++++++++++------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index dbdd7cda..cad479ec 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -7,6 +7,7 @@ library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) library(HDF5Array) library(ggplot2) +library(BiocParallel) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") @@ -19,51 +20,39 @@ parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() # --- 1. Parallelization Setup --- -# Use the user-provided nthreads for all BiocParallel operations bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 1. Load the merged FDS -# Python extracted the tar into args$fds_dir. -# Inside is 'savedObjects/FRASER_{cohort_id}/...' -options(delayedArray.block.size = 1e9) # 1GB blocks +# --- 2. Load the Dataset --- +options(delayedArray.block.size = 1e9) fds_name <- paste0("FRASER_", args$cohort_id) message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) +# --- 3. Filtering & PSI Calculation --- +# Filter first to keep the Jaccard matrix calculation lean +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -# --- 3. Filtering --- -# It is critical to filter before fitting to reduce the size of the latent space matrices -fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) +# Calculate Jaccard metrics only on filtered junctions +fitMetrics(fds_filtered) <- "jaccard" +fds_filtered <- calculatePSIValues(fds_filtered, types = "jaccard", BPPARAM = bp) -# Create QC directory +# --- 4. 
QC Plots (Downsampled) --- dir.create("qc_plots", showWarnings = FALSE) - -# DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast set.seed(42) -plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) -fds_plot_subset <- fds[plot_idx, ] +plot_idx <- sample(nrow(fds_filtered), min(nrow(fds_filtered), 30000)) +fds_plot_subset <- fds_filtered[plot_idx, ] -message("Generating QC plots using downsampled subset...") +message("Generating QC plots...") png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) plotFilterExpression(fds_plot_subset, bins = 100) dev.off() -# Apply actual filtering to full dataset -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) - -# --- 4. Dimensionality Message --- -raw_dim <- nrow(fds) -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -filtered_dim <- nrow(fds_filtered) - -message(paste0("\n--- Filtering Summary ---")) -message(paste0("Original junctions: ", raw_dim)) -message(paste0("Filtered junctions: ", filtered_dim)) -message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) - # --- 5. Hyperparameter Optimization --- -# Optimization must run on the filtered set +message("Optimizing Hyperparameters (q) for Jaccard...") +# This step is required before calling bestQ +fds_filtered <- optimHyperParams(fds_filtered, type = "jaccard", BPPARAM = bp) opt_q <- bestQ(fds_filtered, type = "jaccard") png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) @@ -71,29 +60,30 @@ plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() # --- 6. Fitting --- +message(paste0("Fitting model with q=", opt_q)) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) plotQQ(fds_fit, type = "jaccard") dev.off() # --- 7. 
Annotation --- +message("Annotating junctions...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 8. Results & Compressed Export --- +# --- 8. Results Export --- +message("Extracting results...") res <- results(fds_fit, + type = "jaccard", padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, zScoreCutoff = args$z_cutoff, minCount = args$min_count) -# Extract all results for Jaccard -res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +res_all <- results(fds_fit, type = "jaccard", padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) -message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) From 696a1f7bd8d1a1e2544a5b3752aa15a0ccab55e1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 3 Feb 2026 15:40:25 +1100 Subject: [PATCH 65/89] they were not, putting it back --- images/fraser/fraser_analysis.R | 56 +++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index cad479ec..dbdd7cda 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -7,7 +7,6 @@ library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) library(HDF5Array) library(ggplot2) -library(BiocParallel) parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") @@ -20,39 +19,51 @@ parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() # --- 1. Parallelization Setup --- +# Use the user-provided nthreads for all BiocParallel operations bp <- MulticoreParam(workers = args$nthreads) register(bp) -# --- 2. Load the Dataset --- -options(delayedArray.block.size = 1e9) +# 1. 
Load the merged FDS +# Python extracted the tar into args$fds_dir. +# Inside is 'savedObjects/FRASER_{cohort_id}/...' +options(delayedArray.block.size = 1e9) # 1GB blocks fds_name <- paste0("FRASER_", args$cohort_id) message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) -# --- 3. Filtering & PSI Calculation --- -# Filter first to keep the Jaccard matrix calculation lean -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -# Calculate Jaccard metrics only on filtered junctions -fitMetrics(fds_filtered) <- "jaccard" -fds_filtered <- calculatePSIValues(fds_filtered, types = "jaccard", BPPARAM = bp) +# --- 3. Filtering --- +# It is critical to filter before fitting to reduce the size of the latent space matrices +fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) -# --- 4. QC Plots (Downsampled) --- +# Create QC directory dir.create("qc_plots", showWarnings = FALSE) + +# DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast set.seed(42) -plot_idx <- sample(nrow(fds_filtered), min(nrow(fds_filtered), 30000)) -fds_plot_subset <- fds_filtered[plot_idx, ] +plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) +fds_plot_subset <- fds[plot_idx, ] -message("Generating QC plots...") +message("Generating QC plots using downsampled subset...") png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) plotFilterExpression(fds_plot_subset, bins = 100) dev.off() +# Apply actual filtering to full dataset +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) + +# --- 4. 
Dimensionality Message --- +raw_dim <- nrow(fds) +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] +filtered_dim <- nrow(fds_filtered) + +message(paste0("\n--- Filtering Summary ---")) +message(paste0("Original junctions: ", raw_dim)) +message(paste0("Filtered junctions: ", filtered_dim)) +message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) + # --- 5. Hyperparameter Optimization --- -message("Optimizing Hyperparameters (q) for Jaccard...") -# This step is required before calling bestQ -fds_filtered <- optimHyperParams(fds_filtered, type = "jaccard", BPPARAM = bp) +# Optimization must run on the filtered set opt_q <- bestQ(fds_filtered, type = "jaccard") png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) @@ -60,30 +71,29 @@ plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() # --- 6. Fitting --- -message(paste0("Fitting model with q=", opt_q)) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) +# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) plotQQ(fds_fit, type = "jaccard") dev.off() # --- 7. Annotation --- -message("Annotating junctions...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 8. Results Export --- -message("Extracting results...") +# --- 8. 
Results & Compressed Export --- res <- results(fds_fit, - type = "jaccard", padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, zScoreCutoff = args$z_cutoff, minCount = args$min_count) -res_all <- results(fds_fit, type = "jaccard", padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +# Extract all results for Jaccard +res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0) +message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) From f6674ad0144e50d6ddbd1b5247e26e72a5e4f8f4 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 12:02:14 +1100 Subject: [PATCH 66/89] adds rescue block --- images/fraser/fraser_analysis.R | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index dbdd7cda..c73e1f30 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -26,11 +26,23 @@ register(bp) # 1. Load the merged FDS # Python extracted the tar into args$fds_dir. # Inside is 'savedObjects/FRASER_{cohort_id}/...' 
-options(delayedArray.block.size = 1e9) # 1GB blocks + fds_name <- paste0("FRASER_", args$cohort_id) -message(paste0("Loading Fraser Data Set: ", fds_name, " from ", args$fds_dir)) +message(paste0("Loading Fraser Data Set: ", fds_name)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) +# REPAIR: If spliceSiteID is missing, FRASER 2.0 cannot calculate Jaccard/PSI +if(is.null(nonSplicedReads(fds)) || !("spliceSiteID" %in% colnames(mcols(fds, type="ss")))){ + message("Restoring missing Splice Site IDs for Jaccard calculation...") + + # We re-extract the splice site map from the existing junctions + # this forces FRASER to re-index the donor/acceptor relationships + fds <- makeFraserDataSet(colData=colData(fds), + junctions=rowRanges(fds, type="j")) + + # Re-assign counts if they were lost during the re-make + # (Usually not necessary if using loadFraserDataSet, but good for safety) +} # --- 3. Filtering --- # It is critical to filter before fitting to reduce the size of the latent space matrices From cc80fc96aeb95e0fac96d65689090f8762e0a94b Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 13:06:25 +1100 Subject: [PATCH 67/89] rescuing the rescue block --- images/fraser/fraser_analysis.R | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index c73e1f30..7f534135 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -23,25 +23,26 @@ args <- parser$parse_args() bp <- MulticoreParam(workers = args$nthreads) register(bp) -# 1. Load the merged FDS -# Python extracted the tar into args$fds_dir. -# Inside is 'savedObjects/FRASER_{cohort_id}/...' - +# --- 2. 
Load the Dataset --- +options(delayedArray.block.size = 1e9) fds_name <- paste0("FRASER_", args$cohort_id) message(paste0("Loading Fraser Data Set: ", fds_name)) fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name) # REPAIR: If spliceSiteID is missing, FRASER 2.0 cannot calculate Jaccard/PSI -if(is.null(nonSplicedReads(fds)) || !("spliceSiteID" %in% colnames(mcols(fds, type="ss")))){ - message("Restoring missing Splice Site IDs for Jaccard calculation...") - - # We re-extract the splice site map from the existing junctions - # this forces FRASER to re-index the donor/acceptor relationships - fds <- makeFraserDataSet(colData=colData(fds), - junctions=rowRanges(fds, type="j")) - - # Re-assign counts if they were lost during the re-make - # (Usually not necessary if using loadFraserDataSet, but good for safety) +if(! "spliceSiteID" %in% colnames(mcols(fds, type="ss"))){ + message("Manually injecting Splice Site IDs...") + + # This internal FRASER call generates the mapping without needing the full constructor + # It populates the 'spliceSiteCoords' slot which calculatePSIValues needs + fds <- FRASER:::annotateSpliceSite(fds) + + # We also need to ensure the Jaccard-specific metadata is initialized + # Often, this is what's missing when the merge.data.table fails + if(is.null(mcols(fds, type="j")$startID)){ + # This maps junctions to the spliceSiteIDs we just generated + fds <- FRASER:::updateIndices(fds) + } } # --- 3. 
Filtering --- From 7365090d0e8ae4ac8b192c6442a77cbf9b41a647 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 13:46:58 +1100 Subject: [PATCH 68/89] unrescuing the rescue block --- images/fraser/fraser_join_counts.R | 48 ++++++++++++++++++++++++++ images/fraser/fraser_merge_non_split.R | 4 ++- images/fraser/fraser_merge_split.R | 15 +++++--- 3 files changed, 61 insertions(+), 6 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 1429b558..d1ae66a0 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -11,6 +11,48 @@ parser$add_argument("--work_dir", default = "/io/work") parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() +check_fds_integrity <- function(fds) { + message("\n--- Running FDS Integrity Check ---") + + # 1. Check for basic ID existence + j_cols <- colnames(mcols(fds, type="j")) + ss_cols <- colnames(mcols(fds, type="ss")) + + has_ids <- all(c("startID", "endID") %in% j_cols) && ("spliceSiteID" %in% ss_cols) + + if(has_ids) { + message("SUCCESS: SpliceSiteIDs found in both Junctions and SpliceSites.") + } else { + stop("CRITICAL ERROR: SpliceSiteID metadata is missing. Analysis will fail.") + } + + # 2. Check for ID alignment (The "by.y" error test) + # Are all junction startIDs present in the Splice Site map? + missing_starts <- !all(mcols(fds, type="j")$startID %in% mcols(fds, type="ss")$spliceSiteID) + if(missing_starts) { + stop("CRITICAL ERROR: Junction startIDs do not match SpliceSiteIDs. Map is broken.") + } + message("SUCCESS: Junction IDs are correctly mapped to Splice Sites.") + + # 3. 
Check HDF5 Backend connectivity + # Ensure the pointers to the .h5 files are valid and readable + tryCatch({ + test_val <- as.matrix(counts(fds, type="j")[1, 1]) + message("SUCCESS: HDF5 backends are reachable and readable.") + }, error = function(e) { + stop("CRITICAL ERROR: HDF5 files are missing or paths are broken: ", e$message) + }) + + # 4. Check for Non-Split Counts + if("rawCountsSS" %in% assayNames(fds)) { + message("SUCCESS: Non-split counts (rawCountsSS) are present.") + } else { + warning("WARNING: Non-split counts are missing. Jaccard/PSI calculation may fail.") + } + + message("--- Integrity Check Passed ---\n") +} + fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) @@ -43,6 +85,12 @@ fds <- addCountsToFraserDataSet( nonSplitCounts = nonSplitCounts_se ) +# Instead of updateIndices, use this to force the internal ID mapping +# This will populate the 'ss' (splice site) metadata correctly +fds <- calculatePSIValues(fds, types="psi5") + +# --- Call the check before finishing --- +check_fds_integrity(fds) # 5. Save final FRASER object message("Saving final integrated FDS...") fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 40d92e77..2519ce2f 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -69,7 +69,8 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( ) # 7. 
Final Save -# This creates the complete 'fitted' dataset that the analysis script will use +# This populates the internal 'nonSplicedReads' slot and the SS map +nonSplicedReads(fds) <- non_split_counts # Use the HDF5-safe saving method for the counts object specifically # We save this to a new directory to avoid clobbering the input cache @@ -79,5 +80,6 @@ saveHDF5SummarizedExperiment( replace = TRUE ) +fds <- calculatePSIValues(fds) # Also save the updated FDS object (this updates the internal fds-object.RDS) fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index a1ea45e8..b403066e 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -61,15 +61,20 @@ minExpressionInOneSample <- 20 message("Merging split counts from cache...") splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE) -# SAVE THE SPLIT COUNTS BEFORE FILTERING -message("Saving merged split counts SummarizedExperiment...") +# --- CRITICAL FIX: Annotate the object ITSELF, not a temporary variable --- +# This generates startID and endID and attaches them to the rowRanges +message("Generating Splice Site IDs...") +rowRanges(splitCounts) <- FRASER:::annotateSpliceSite(rowRanges(splitCounts)) + +# 5. Now save the Annotated SE +message("Saving merged split counts SummarizedExperiment with IDs...") split_counts_dir <- file.path(save_dir, "splitCounts") dir.create(split_counts_dir, recursive = TRUE, showWarnings = FALSE) saveHDF5SummarizedExperiment(splitCounts, dir = split_counts_dir, replace = TRUE) -# Now continue with filtering for annotations -splitCountRanges <- rowRanges(splitCounts) -splitCountRangesNonFilt <- FRASER:::annotateSpliceSite(splitCountRanges) +# 6. 
Extract coordinates for the next pipeline steps +# We use the rowRanges that NOW HAVE the startID/endID columns +splitCountRangesNonFilt <- rowRanges(splitCounts) maxCount <- rowMaxs(assay(splitCounts, "rawCountsJ")) passed <- maxCount >= minExpressionInOneSample From f273223d7b957f57f5231bd4a3c971c07c20af41 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 14:48:42 +1100 Subject: [PATCH 69/89] merging non split cannot calculate psi --- images/fraser/fraser_merge_non_split.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 2519ce2f..8e22c44c 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -80,6 +80,6 @@ saveHDF5SummarizedExperiment( replace = TRUE ) -fds <- calculatePSIValues(fds) + # Also save the updated FDS object (this updates the internal fds-object.RDS) fds <- saveFraserDataSet(fds) From 149615db48c80c2b3054a35fb097b95ca0f933e1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 15:24:18 +1100 Subject: [PATCH 70/89] join counts still cant calculate psi --- images/fraser/fraser_join_counts.R | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index d1ae66a0..80d9c019 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -77,7 +77,13 @@ if(!dir.exists(merged_non_split_dir)){ } nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) -# 4. Add counts to FRASER object +# 4. Annotate the Split Counts +# This is the missing link. It generates the spliceSiteID mapping +# that calculatePSIValues needs later. +message("Annotating splice sites...") +splitCounts_se <- annotateSpliceSites(splitCounts_se) + +# 5. 
Add counts to FRASER object message("Joining assays into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, @@ -85,9 +91,11 @@ fds <- addCountsToFraserDataSet( nonSplitCounts = nonSplitCounts_se ) -# Instead of updateIndices, use this to force the internal ID mapping +# --- Call the check before finishing --- +check_fds_integrity(fds) + # This will populate the 'ss' (splice site) metadata correctly -fds <- calculatePSIValues(fds, types="psi5") +fds <- calculatePSIValues(fds, types="jaccard") # --- Call the check before finishing --- check_fds_integrity(fds) From 34f7991fbdb425be4388bbfc42d13111ecfb5423 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 4 Feb 2026 15:27:04 +1100 Subject: [PATCH 71/89] explicitly annotating non-split counts --- images/fraser/fraser_merge_non_split.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 8e22c44c..f33c3254 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -68,6 +68,11 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache ) +# CRITICAL ADDITION: Annotate the non-split counts object! +# This assigns 'spliceSiteID' to the rowRanges of your non-split object. +message("Annotating non-split splice site IDs...") +non_split_counts <- annotateSpliceSites(non_split_counts) + # 7. 
Final Save # This populates the internal 'nonSplicedReads' slot and the SS map nonSplicedReads(fds) <- non_split_counts From 9ce1a016cc28bbfda2e6842ab842ab0dafca2c4a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 5 Feb 2026 13:33:47 +1100 Subject: [PATCH 72/89] explicitly annotating non-split counts, must refer to fraser function while I do that --- images/fraser/fraser_join_counts.R | 2 +- images/fraser/fraser_merge_non_split.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 80d9c019..296211aa 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -81,7 +81,7 @@ nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) # This is the missing link. It generates the spliceSiteID mapping # that calculatePSIValues needs later. message("Annotating splice sites...") -splitCounts_se <- annotateSpliceSites(splitCounts_se) +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 5. Add counts to FRASER object message("Joining assays into FDS object...") diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index f33c3254..2fae4033 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -71,7 +71,7 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( # CRITICAL ADDITION: Annotate the non-split counts object! # This assigns 'spliceSiteID' to the rowRanges of your non-split object. message("Annotating non-split splice site IDs...") -non_split_counts <- annotateSpliceSites(non_split_counts) +non_split_counts <- FRASER:::annotateSpliceSite(non_split_counts) # 7. 
Final Save # This populates the internal 'nonSplicedReads' slot and the SS map From baab41398cc3092a69340052c651a966c0ed4640 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 5 Feb 2026 13:38:44 +1100 Subject: [PATCH 73/89] Always assume paired end samples --- images/fraser/fraser_count_non_split.R | 2 +- images/fraser/fraser_count_split.R | 2 +- images/fraser/fraser_merge_non_split.R | 2 +- images/fraser/fraser_merge_split.R | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 8d3b5a1d..7caa0c94 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -43,7 +43,7 @@ filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.R # Set strand specificity -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # 5. Run Non-Split Counting # This writes the .h5 or .RDS file into the cache_dir created above diff --git a/images/fraser/fraser_count_split.R b/images/fraser/fraser_count_split.R index b3938303..cf2e8760 100644 --- a/images/fraser/fraser_count_split.R +++ b/images/fraser/fraser_count_split.R @@ -36,7 +36,7 @@ if(args$nthreads > 1){ # 3. Load and Prune IMMEDIATELY fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name) -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # SUBSET FIRST: This is the most critical memory-saving step. # By subsetting here, we drop the metadata of all other samples. diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 2fae4033..3768d467 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -57,7 +57,7 @@ if(length(available_bams) > 0){ } else { stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # --- 6. 
Merge Non-Split Counts --- message("Merging counts using HDF5 files in: ", out_dir) diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index b403066e..791961c4 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -53,7 +53,7 @@ if(length(available_bams) > 0){ } else { stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") } -strandSpecific(fds) <- 0 +strandSpecific(fds) <- 2 # -------------------------------------------- minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache From c6f64b90f4552ec1dd1312b31c5cdbf43af634a5 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Thu, 5 Feb 2026 14:56:09 +1100 Subject: [PATCH 74/89] mini changes for data integrity --- images/fraser/fraser_count_non_split.R | 3 ++- images/fraser/fraser_merge_non_split.R | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 7caa0c94..9b6891af 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -40,7 +40,8 @@ if(args$nthreads > 1){ # 3. 
Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) filtered_coords <- readRDS(args$coords_path) # This is your splice_site_coords.RDS - +# Add this before calling countNonSplicedReads +colData(fds)$bamFile <- args$bam_path # Set strand specificity strandSpecific(fds) <- 2 diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 3768d467..93dd6774 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -39,7 +39,7 @@ anchor_se <- SummarizedExperiment( rowRanges = non_split_count_ranges ) # This creates the se.rds that FRASER's loadHDF5SummarizedExperiment needs -saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "") +saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "rawCountsSS") # 4. Configure Backend options("FRASER.maxSamplesNoHDF5" = 0) @@ -73,6 +73,21 @@ non_split_counts <- getNonSplitReadCountsForAllSamples( message("Annotating non-split splice site IDs...") non_split_counts <- FRASER:::annotateSpliceSite(non_split_counts) +# --- Verification Block --- +# 1. Check if the assay exists +if(!"rawCountsSS" %in% assayNames(non_split_counts)){ + stop("Merge failed: 'rawCountsSS' assay not found in merged object.") +} + +# 2. Check for zeros (Common sign of a path/naming mismatch) +total_counts <- sum(assay(non_split_counts, "rawCountsSS")) +if(total_counts == 0){ + stop("Merge Error: The merged non-split counts matrix is empty (all zeros). + Check if sample IDs in fds match the filenames in the cache.") +} + +message("Merge verified: Found ", total_counts, " total site coverage counts.") + # 7. 
Final Save # This populates the internal 'nonSplicedReads' slot and the SS map nonSplicedReads(fds) <- non_split_counts From 908c59f3f48b687084c2f760b45d5efdd8302666 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Fri, 6 Feb 2026 16:22:29 +1100 Subject: [PATCH 75/89] here we go --- images/fraser/fraser_merge_non_split.R | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 93dd6774..85c3aed9 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -25,8 +25,10 @@ file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE # 2. Load FDS fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) -# 3. Anchor HDF5 Directory -h5_files <- list.files("/io/batch", pattern = "\\.h5$", recursive = TRUE, full.names = TRUE) +# 3. Copy H5 files from cache to the proper output directory +cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts") +h5_files <- list.files(cache_dir, pattern = "\\.h5$", full.names = TRUE) +message("Copying ", length(h5_files), " HDF5 files from cache to output directory...") for(f in h5_files) { file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) } @@ -82,24 +84,21 @@ if(!"rawCountsSS" %in% assayNames(non_split_counts)){ # 2. Check for zeros (Common sign of a path/naming mismatch) total_counts <- sum(assay(non_split_counts, "rawCountsSS")) if(total_counts == 0){ - stop("Merge Error: The merged non-split counts matrix is empty (all zeros). - Check if sample IDs in fds match the filenames in the cache.") + stop("Merge Error: The merged non-split counts matrix is empty (all zeros).") } message("Merge verified: Found ", total_counts, " total site coverage counts.") -# 7. 
Final Save -# This populates the internal 'nonSplicedReads' slot and the SS map -nonSplicedReads(fds) <- non_split_counts - -# Use the HDF5-safe saving method for the counts object specifically -# We save this to a new directory to avoid clobbering the input cache +# 8. Save the merged non-split counts to the correct location for join step +message("Saving merged non-split counts to: ", out_dir) saveHDF5SummarizedExperiment( non_split_counts, - dir = file.path(args$work_dir, "merged_non_split_counts"), + dir = out_dir, replace = TRUE ) - -# Also save the updated FDS object (this updates the internal fds-object.RDS) +# 9. Update FDS object and save +nonSplicedReads(fds) <- non_split_counts fds <- saveFraserDataSet(fds) + +message("Non-split merge complete!") From aaf1dcaf6d8f9d36caefcf39d9fc6d64f25e80f1 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 18:25:00 +1100 Subject: [PATCH 76/89] Updates to merge non-split and join_counts --- images/fraser/fraser_join_counts.R | 65 +++++--- images/fraser/fraser_merge_non_split.R | 207 +++++++++++++++++-------- 2 files changed, 191 insertions(+), 81 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 296211aa..d4d495ca 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -3,6 +3,7 @@ library(argparse) library(FRASER) library(SummarizedExperiment) library(HDF5Array) +library(BiocParallel) parser <- ArgumentParser(description = "Join Split and Non-Split Counts") parser$add_argument("--fds_path", required = TRUE) @@ -11,6 +12,14 @@ parser$add_argument("--work_dir", default = "/io/work") parser$add_argument("--nthreads", type = "integer", default = 1) args <- parser$parse_args() +# Configure FRASER to use HDF5 backend +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +# Setup parallel processing +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + check_fds_integrity <- 
function(fds) { message("\n--- Running FDS Integrity Check ---") @@ -27,7 +36,6 @@ check_fds_integrity <- function(fds) { } # 2. Check for ID alignment (The "by.y" error test) - # Are all junction startIDs present in the Splice Site map? missing_starts <- !all(mcols(fds, type="j")$startID %in% mcols(fds, type="ss")$spliceSiteID) if(missing_starts) { stop("CRITICAL ERROR: Junction startIDs do not match SpliceSiteIDs. Map is broken.") @@ -35,7 +43,6 @@ check_fds_integrity <- function(fds) { message("SUCCESS: Junction IDs are correctly mapped to Splice Sites.") # 3. Check HDF5 Backend connectivity - # Ensure the pointers to the .h5 files are valid and readable tryCatch({ test_val <- as.matrix(counts(fds, type="j")[1, 1]) message("SUCCESS: HDF5 backends are reachable and readable.") @@ -57,50 +64,70 @@ fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # 1. Load the FDS object -message("Loading Fraser Data Set from RDS...") +message("\n[1/5] Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) workingDir(fds) <- saveDir # 2. Load split counts -message("Loading split counts...") +message("\n[2/5] Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ stop(paste("Missing splitCounts directory at:", split_se_dir)) } splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) +message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") + +# Annotate split counts with splice site IDs +message("Annotating split counts with splice site IDs...") +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 3. 
Load merged non-split counts -message("Loading merged non-split counts...") +message("\n[3/5] Loading merged non-split counts...") merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) } nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) +message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", ncol(nonSplitCounts_se), " samples") -# 4. Annotate the Split Counts -# This is the missing link. It generates the spliceSiteID mapping -# that calculatePSIValues needs later. -message("Annotating splice sites...") -splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) +# Verify non-split counts are annotated +if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { + message("Annotating non-split counts with splice site IDs...") + nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) +} -# 5. Add counts to FRASER object -message("Joining assays into FDS object...") +# 4. Add counts to FRASER object +message("\n[4/5] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, nonSplitCounts = nonSplitCounts_se ) -# --- Call the check before finishing --- +message("Counts successfully joined!") +message(" - Split junctions: ", nrow(counts(fds, type = "j"))) +message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) + +# Run integrity check check_fds_integrity(fds) -# This will populate the 'ss' (splice site) metadata correctly -fds <- calculatePSIValues(fds, types="jaccard") +# 5. 
Calculate PSI values +message("\n[5/5] Calculating PSI values...") +fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) -# --- Call the check before finishing --- +message("PSI values calculated successfully!") +message("Available assays: ", paste(assayNames(fds), collapse = ", ")) + +# Final integrity check check_fds_integrity(fds) -# 5. Save final FRASER object -message("Saving final integrated FDS...") + +# 6. Save final FRASER object +message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) -message("FRASER join complete.") +message("\n=== FRASER join complete! ===") +message("FDS object saved in: ", workingDir(fds)) +message("\nNext steps for analysis:") +message(" 1. Filter junctions: fds <- filterExpressionAndVariability(fds)") +message(" 2. Fit model: fds <- FRASER(fds)") +message(" 3. Get results: results(fds)") diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 85c3aed9..68b243d8 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -5,100 +5,183 @@ library(FRASER) library(BiocParallel) library(SummarizedExperiment) library(HDF5Array) +library(rhdf5) parser <- ArgumentParser(description = "Merge Non-Split Counts and Calculate PSI") parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") -parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to g_ranges_non_split_counts.RDS") +parser$add_argument("--filtered_ranges_path", required = TRUE, help = "Path to splice_site_coords.RDS") parser$add_argument("--work_dir", default = "/io/work", help = "Working directory") parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads") args <- parser$parse_args() -# 1. 
Load the ranges first so they are available for all steps -non_split_count_ranges <- readRDS(args$filtered_ranges_path) +# Configure FRASER to use HDF5 backend +options("FRASER.maxSamplesNoHDF5" = 0) +options("FRASER.maxJunctionsNoHDF5" = -1) + +# 1. Setup directories +message("\n[1/6] Setting up directories...") fds_name <- paste0("FRASER_", args$cohort_id) save_dir <- file.path(args$work_dir, "savedObjects", fds_name) out_dir <- file.path(save_dir, "nonSplitCounts") dir.create(out_dir, recursive = TRUE, showWarnings = FALSE) -file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE) -# 2. Load FDS -fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +# Copy FDS file only if source and destination are different +fds_dest <- file.path(save_dir, "fds-object.RDS") +if(normalizePath(args$fds_path, mustWork = FALSE) != normalizePath(fds_dest, mustWork = FALSE)) { + file.copy(args$fds_path, fds_dest, overwrite = TRUE) + message("Copied FDS from: ", args$fds_path) +} else { + message("FDS already in correct location: ", fds_dest) +} -# 3. Copy H5 files from cache to the proper output directory +# 2. Load FDS to get sample information +message("\n[2/6] Loading FraserDataSet...") +fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) +sample_ids <- samples(fds) +n_samples <- length(sample_ids) +message("Samples: ", n_samples) + +# 3. Load the genomic ranges +message("\n[3/6] Loading genomic ranges...") +non_split_ranges <- readRDS(args$filtered_ranges_path) +n_sites <- length(non_split_ranges) +message("Non-split sites: ", n_sites) + +# 4. 
Copy H5 files from cache to the proper output directory +message("\n[4/6] Copying and organizing H5 files...") cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts") h5_files <- list.files(cache_dir, pattern = "\\.h5$", full.names = TRUE) -message("Copying ", length(h5_files), " HDF5 files from cache to output directory...") -for(f in h5_files) { - file.copy(f, file.path(out_dir, basename(f)), overwrite = TRUE) +message("Found ", length(h5_files), " HDF5 files in cache") + +if(length(h5_files) == 0) { + stop("ERROR: No H5 files found in cache directory: ", cache_dir) } -anchor_se <- SummarizedExperiment( - assays = list(rawCountsSS = matrix(0, - nrow = length(non_split_count_ranges), - ncol = length(samples(fds)), - dimnames = list(NULL, samples(fds)))), - rowRanges = non_split_count_ranges -) -# This creates the se.rds that FRASER's loadHDF5SummarizedExperiment needs -saveHDF5SummarizedExperiment(anchor_se, dir = out_dir, replace = TRUE, prefix = "rawCountsSS") +# Copy with standardized naming convention +for(i in seq_along(sample_ids)) { + sid <- sample_ids[i] + src_file <- h5_files[grepl(sid, h5_files)][1] -# 4. Configure Backend -options("FRASER.maxSamplesNoHDF5" = 0) -options("FRASER.maxJunctionsNoHDF5" = -1) -bp <- MulticoreParam(workers = args$nthreads) -register(bp) + if(is.na(src_file)) { + warning("No H5 file found for sample: ", sid) + next + } + dest <- file.path(out_dir, paste0("nonSplicedCounts-", sid, ".h5")) + file.copy(src_file, dest, overwrite = TRUE) + message(" Copied: ", basename(src_file), " -> ", basename(dest)) +} -available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) -if(length(available_bams) > 0){ - ref_bam <- available_bams[1] - message("Validating metadata against reference BAM: ", ref_bam) - colData(fds)$bamFile <- ref_bam - seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) -} else { - stop("CRITICAL ERROR: No BAM files found in /io/batch/input_bams.") +# 5. 
Validate H5 files and build combined count matrix +message("\n[5/6] Building combined count matrix (memory-efficient)...") + +# Read first file to validate dimensions +first_h5 <- file.path(out_dir, paste0("nonSplicedCounts-", sample_ids[1], ".h5")) +if(!file.exists(first_h5)) { + stop("ERROR: First H5 file not found: ", first_h5) } -strandSpecific(fds) <- 2 -# --- 6. Merge Non-Split Counts --- -message("Merging counts using HDF5 files in: ", out_dir) -non_split_counts <- getNonSplitReadCountsForAllSamples( - fds = fds, - splitCountRanges = non_split_count_ranges, - minAnchor = 5, - recount = FALSE # Crucial: FALSE ensures it uses the .h5 files from the cache -) +h5_info <- h5ls(first_h5) +dataset_row <- h5_info[h5_info$name == "nonSplicedCounts", ] +actual_rows <- as.numeric(strsplit(dataset_row$dim, " x ")[[1]][1]) + +if(actual_rows != n_sites) { + stop("ERROR: H5 files have ", actual_rows, " rows but ranges have ", n_sites, + " sites. Mismatch!") +} -# CRITICAL ADDITION: Annotate the non-split counts object! -# This assigns 'spliceSiteID' to the rowRanges of your non-split object. -message("Annotating non-split splice site IDs...") -non_split_counts <- FRASER:::annotateSpliceSite(non_split_counts) +message("Validated: H5 dimensions match genomic ranges (", actual_rows, " sites)") -# --- Verification Block --- -# 1. 
Check if the assay exists -if(!"rawCountsSS" %in% assayNames(non_split_counts)){ - stop("Merge failed: 'rawCountsSS' assay not found in merged object.") +# Create a combined count matrix by reading each sample one at a time +count_matrix <- matrix(0, nrow = n_sites, ncol = n_samples) +colnames(count_matrix) <- sample_ids + +total_counts <- 0 +for(i in seq_along(sample_ids)) { + sid <- sample_ids[i] + h5_file <- file.path(out_dir, paste0("nonSplicedCounts-", sid, ".h5")) + + if(!file.exists(h5_file)) { + warning("H5 file not found for sample ", sid, ", using zeros") + next + } + + message(" Reading [", i, "/", n_samples, "]: ", sid) + + # Read the counts data for this sample only + counts <- h5read(h5_file, "nonSplicedCounts") + if(is.matrix(counts)) { + counts <- counts[, 1] + } + + count_matrix[, i] <- as.vector(counts) + total_counts <- total_counts + sum(counts) } -# 2. Check for zeros (Common sign of a path/naming mismatch) -total_counts <- sum(assay(non_split_counts, "rawCountsSS")) -if(total_counts == 0){ - stop("Merge Error: The merged non-split counts matrix is empty (all zeros).") +message("Combined matrix dimensions: ", nrow(count_matrix), " x ", ncol(count_matrix)) +message("Total counts: ", total_counts) + +if(total_counts == 0) { + stop("ERROR: The merged non-split counts matrix is empty (all zeros).") } -message("Merge verified: Found ", total_counts, " total site coverage counts.") +# 6. Create HDF5-backed SummarizedExperiment +message("\n[6/6] Creating HDF5-backed SummarizedExperiment...") -# 8. Save the merged non-split counts to the correct location for join step -message("Saving merged non-split counts to: ", out_dir) -saveHDF5SummarizedExperiment( - non_split_counts, - dir = out_dir, - replace = TRUE +# Create SE with in-memory matrix +nonSplitCounts_se <- SummarizedExperiment( + assays = list(rawCountsSS = count_matrix), + rowRanges = non_split_ranges, + colData = DataFrame(sampleID = sample_ids, row.names = sample_ids) ) -# 9. 
Update FDS object and save +message("Non-split SE created with dimensions: ", nrow(nonSplitCounts_se), + " x ", ncol(nonSplitCounts_se)) + +# Annotate non-split counts with splice site IDs +message("Annotating non-split counts with splice site IDs...") +nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) + +# Save the SummarizedExperiment - this will automatically create HDF5 backend +message("Saving non-split HDF5 SummarizedExperiment...") +saveHDF5SummarizedExperiment(nonSplitCounts_se, dir = out_dir, replace = TRUE) + +# Reload it to get the proper HDF5-backed version +message("Reloading HDF5-backed version...") +non_split_counts <- loadHDF5SummarizedExperiment(dir = out_dir) + +# Final verification +if(!"rawCountsSS" %in% assayNames(non_split_counts)) { + stop("ERROR: 'rawCountsSS' assay not found in reloaded object.") +} + +message("Verification: Reloaded object has ", sum(assay(non_split_counts, "rawCountsSS")), " total counts") + +# 7. Configure Backend for FDS +message("\nConfiguring FRASER backend...") +bp <- MulticoreParam(workers = args$nthreads) +register(bp) + +# Set BAM file metadata (for validation purposes) +available_bams <- list.files("/io/batch/input_bams", pattern = "\\.bam$", full.names = TRUE) +if(length(available_bams) > 0) { + ref_bam <- available_bams[1] + message("Validating metadata against reference BAM: ", ref_bam) + colData(fds)$bamFile <- ref_bam + seqlevelsStyle(fds) <- seqlevelsStyle(Rsamtools::BamFile(ref_bam)) +} else { + warning("WARNING: No BAM files found in /io/batch/input_bams.") +} +strandSpecific(fds) <- 2 + +# 8. Update FDS object with the merged non-split counts +message("\nUpdating FDS object with merged non-split counts...") nonSplicedReads(fds) <- non_split_counts fds <- saveFraserDataSet(fds) -message("Non-split merge complete!") +message("\n=== Non-split merge complete! 
===") +message("Final stats:") +message(" - Sites: ", nrow(non_split_counts)) +message(" - Samples: ", ncol(non_split_counts)) +message(" - Total counts: ", sum(assay(non_split_counts, "rawCountsSS"))) From b1d55ee48d82b8dd9480dbac6945bd622b537321 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 21:48:11 +1100 Subject: [PATCH 77/89] Updates to merge non-split and join_counts --- images/fraser/fraser_count_non_split.R | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 9b6891af..d51f34bf 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -23,8 +23,9 @@ save_dir <- file.path(args$work_dir, "savedObjects", fds_name) dir.create(save_dir, recursive = TRUE, showWarnings = FALSE) file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS")) -# Create the specific cache directory for non-split counts -cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) +# Create the cache directory WITHOUT the fds_name subdirectory +# This matches what the merge script expects +cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts") dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 @@ -58,12 +59,9 @@ sample_result <- countNonSplicedReads(args$sample_id, recount= TRUE, spliceSiteCoords=filtered_coords) # 6. 
Verification -# Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ -actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) - -# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS -expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) -expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) +# The H5 file should now be at cache_dir/nonSplicedCounts-{sample_id}.h5 +expected_h5 <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) +expected_rds <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) if (file.exists(expected_h5)) { message("Success: Created non-split counts (HDF5) at ", expected_h5) @@ -71,8 +69,8 @@ if (file.exists(expected_h5)) { message("Success: Created non-split counts (RDS) at ", expected_rds) } else { # Debugging: List files in the directory to see what actually happened - if(dir.exists(actual_cache_dir)){ - message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) + if(dir.exists(cache_dir)){ + message("Files found in cache dir: ", paste(list.files(cache_dir), collapse=", ")) } if(!file.exists(paste0(args$bam_path, ".bai"))){ From 85743706cb80185fa9d905cb5e462f0d7e45bf61 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 22:25:34 +1100 Subject: [PATCH 78/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 74 +++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 16 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index d4d495ca..e1f960e9 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -64,12 +64,21 @@ fds_name <- paste0("FRASER_", args$cohort_id) saveDir <- file.path(args$work_dir, "savedObjects", fds_name) # 1. 
Load the FDS object -message("\n[1/5] Loading Fraser Data Set from RDS...") +message("\n[1/7] Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) workingDir(fds) <- saveDir -# 2. Load split counts -message("\n[2/5] Loading split counts...") +# 2. Load the master splice site coordinates that were used to create non-split counts +message("\n[2/7] Loading master splice site coordinates...") +splice_coords_path <- file.path(dirname(dirname(saveDir)), "splice_site_coords.RDS") +if(!file.exists(splice_coords_path)) { + stop("ERROR: splice_site_coords.RDS not found at: ", splice_coords_path) +} +master_splice_coords <- readRDS(splice_coords_path) +message("Master coordinates loaded: ", length(master_splice_coords), " sites") + +# 3. Load split counts +message("\n[3/7] Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ stop(paste("Missing splitCounts directory at:", split_se_dir)) @@ -77,12 +86,16 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts with splice site IDs -message("Annotating split counts with splice site IDs...") -splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) +# Annotate split counts using the master coordinates to ensure consistency +message("Annotating split counts with master splice site IDs...") +# Use FRASER's internal function but with our master coordinates +splitCountRanges <- rowRanges(splitCounts_se) +# Generate IDs based on the genomic positions +splitCountRanges <- FRASER:::annotateSpliceSite(splitCountRanges, master_splice_coords) +rowRanges(splitCounts_se) <- splitCountRanges -# 3. Load merged non-split counts -message("\n[3/5] Loading merged non-split counts...") +# 4. 
Load merged non-split counts +message("\n[4/7] Loading merged non-split counts...") merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) @@ -90,14 +103,43 @@ if(!dir.exists(merged_non_split_dir)){ nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", ncol(nonSplitCounts_se), " samples") -# Verify non-split counts are annotated +# Verify non-split counts match the master coordinates +if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { + stop("ERROR: Non-split counts dimensions (", nrow(nonSplitCounts_se), + ") don't match master coordinates (", length(master_splice_coords), ")") +} + +# Verify non-split counts are annotated - they should already be from the merge step if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("Annotating non-split counts with splice site IDs...") - nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) + message("Annotating non-split counts with master splice site IDs...") + nonSplitRanges <- rowRanges(nonSplitCounts_se) + nonSplitRanges <- FRASER:::annotateSpliceSite(nonSplitRanges, master_splice_coords) + rowRanges(nonSplitCounts_se) <- nonSplitRanges +} + +# 5. 
Verify ID consistency before joining +message("\n[5/7] Verifying ID consistency...") +split_start_ids <- unique(mcols(splitCounts_se)$startID) +split_end_ids <- unique(mcols(splitCounts_se)$endID) +nonsplit_ids <- unique(mcols(nonSplitCounts_se)$spliceSiteID) + +message(" Split startIDs: ", length(split_start_ids)) +message(" Split endIDs: ", length(split_end_ids)) +message(" Non-split spliceSiteIDs: ", length(nonsplit_ids)) + +# Check overlap +missing_start <- sum(!split_start_ids %in% nonsplit_ids) +missing_end <- sum(!split_end_ids %in% nonsplit_ids) + +if(missing_start > 0 || missing_end > 0) { + warning("WARNING: Some split junction IDs are not in non-split IDs:") + warning(" Missing startIDs: ", missing_start) + warning(" Missing endIDs: ", missing_end) + warning(" This may cause issues but attempting to continue...") } -# 4. Add counts to FRASER object -message("\n[4/5] Joining split and non-split counts into FDS object...") +# 6. Add counts to FRASER object +message("\n[6/7] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, @@ -111,8 +153,8 @@ message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) # Run integrity check check_fds_integrity(fds) -# 5. Calculate PSI values -message("\n[5/5] Calculating PSI values...") +# 7. Calculate PSI values +message("\n[7/7] Calculating PSI values...") fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) message("PSI values calculated successfully!") @@ -121,7 +163,7 @@ message("Available assays: ", paste(assayNames(fds), collapse = ", ")) # Final integrity check check_fds_integrity(fds) -# 6. Save final FRASER object +# 8. 
Save final FRASER object message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) From 4c1dd5c8cc18ad83dc8ff220edef3306a9ae8a99 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 23:09:56 +1100 Subject: [PATCH 79/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index e1f960e9..a8efc386 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -86,13 +86,10 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts using the master coordinates to ensure consistency -message("Annotating split counts with master splice site IDs...") -# Use FRASER's internal function but with our master coordinates -splitCountRanges <- rowRanges(splitCounts_se) -# Generate IDs based on the genomic positions -splitCountRanges <- FRASER:::annotateSpliceSite(splitCountRanges, master_splice_coords) -rowRanges(splitCounts_se) <- splitCountRanges +# Annotate split counts with splice site IDs +# NOTE: This must be done AFTER loading and works on the whole object +message("Annotating split counts with splice site IDs...") +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 4. 
Load merged non-split counts message("\n[4/7] Loading merged non-split counts...") @@ -109,12 +106,12 @@ if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { ") don't match master coordinates (", length(master_splice_coords), ")") } -# Verify non-split counts are annotated - they should already be from the merge step +# Non-split counts should already be annotated from the merge step +# Verify the annotation is present if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("Annotating non-split counts with master splice site IDs...") - nonSplitRanges <- rowRanges(nonSplitCounts_se) - nonSplitRanges <- FRASER:::annotateSpliceSite(nonSplitRanges, master_splice_coords) - rowRanges(nonSplitCounts_se) <- nonSplitRanges + message("WARNING: Non-split counts missing spliceSiteID annotation!") + message("Annotating non-split counts with splice site IDs...") + nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) } # 5. Verify ID consistency before joining From 80230c28abe7e331a9351e52e1d36486d3db00a3 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 10:04:08 +1100 Subject: [PATCH 80/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 77 ++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 8 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index a8efc386..069064e6 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -86,10 +86,70 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts with splice site IDs -# NOTE: This must be done AFTER loading and works on the whole object -message("Annotating split counts with splice site IDs...") -splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) +# Annotate split counts with 
splice site IDs using the master coordinates +# This is CRITICAL: we must use the same coordinate reference as the non-split counts +message("Annotating split counts with splice site IDs using master coordinates...") +# Extract the start and end coordinates from each junction +splitRanges <- rowRanges(splitCounts_se) + +# Create a mapping of genomic positions to splice site IDs using master coordinates +# The master coordinates define the universe of valid splice sites +master_coords_df <- data.frame( + seqnames = as.character(seqnames(master_splice_coords)), + pos = start(master_splice_coords), + strand = as.character(strand(master_splice_coords)), + spliceSiteID = mcols(master_splice_coords)$spliceSiteID +) + +# If master coordinates don't have spliceSiteIDs yet, generate them +if(is.null(master_coords_df$spliceSiteID)) { + message("Generating spliceSiteIDs for master coordinates...") + master_coords_df$spliceSiteID <- paste0( + "ss_", master_coords_df$seqnames, "_", + master_coords_df$pos, "_", + master_coords_df$strand + ) + mcols(master_splice_coords)$spliceSiteID <- master_coords_df$spliceSiteID +} + +# Now annotate the split counts by matching their start/end positions to the master coordinates +message("Mapping junction start positions to master coordinates...") +start_df <- data.frame( + seqnames = as.character(seqnames(splitRanges)), + pos = start(splitRanges), + strand = as.character(strand(splitRanges)) +) +start_df$startID <- master_coords_df$spliceSiteID[ + match( + paste0(start_df$seqnames, "_", start_df$pos, "_", start_df$strand), + paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) + ) +] + +message("Mapping junction end positions to master coordinates...") +end_df <- data.frame( + seqnames = as.character(seqnames(splitRanges)), + pos = end(splitRanges), + strand = as.character(strand(splitRanges)) +) +end_df$endID <- master_coords_df$spliceSiteID[ + match( + paste0(end_df$seqnames, "_", end_df$pos, "_", 
end_df$strand), + paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) + ) +] + +# Add the IDs to the splitCounts metadata +mcols(splitCounts_se)$startID <- start_df$startID +mcols(splitCounts_se)$endID <- end_df$endID + +# Count how many junctions have valid IDs +valid_junctions <- sum(!is.na(start_df$startID) & !is.na(end_df$endID)) +message(" Junctions with valid start/end IDs: ", valid_junctions, " / ", nrow(splitCounts_se)) + +if(valid_junctions == 0) { + stop("ERROR: No junctions could be mapped to master coordinates!") +} # 4. Load merged non-split counts message("\n[4/7] Loading merged non-split counts...") @@ -107,11 +167,12 @@ if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { } # Non-split counts should already be annotated from the merge step -# Verify the annotation is present +# If not, annotate them using the master coordinates if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("WARNING: Non-split counts missing spliceSiteID annotation!") - message("Annotating non-split counts with splice site IDs...") - nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) + message("Annotating non-split counts with master coordinate IDs...") + mcols(nonSplitCounts_se)$spliceSiteID <- master_coords_df$spliceSiteID +} else { + message("Non-split counts already annotated with spliceSiteIDs") } # 5. 
Verify ID consistency before joining From e9abd104681e4a8a2a68aad46e78314096dff5f8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 10:28:29 +1100 Subject: [PATCH 81/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 103 +++++------------------------ 1 file changed, 17 insertions(+), 86 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index 069064e6..bc23df5e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -86,70 +86,11 @@ if(!dir.exists(split_se_dir)){ splitCounts_se <- loadHDF5SummarizedExperiment(dir = split_se_dir) message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol(splitCounts_se), " samples") -# Annotate split counts with splice site IDs using the master coordinates -# This is CRITICAL: we must use the same coordinate reference as the non-split counts -message("Annotating split counts with splice site IDs using master coordinates...") -# Extract the start and end coordinates from each junction -splitRanges <- rowRanges(splitCounts_se) - -# Create a mapping of genomic positions to splice site IDs using master coordinates -# The master coordinates define the universe of valid splice sites -master_coords_df <- data.frame( - seqnames = as.character(seqnames(master_splice_coords)), - pos = start(master_splice_coords), - strand = as.character(strand(master_splice_coords)), - spliceSiteID = mcols(master_splice_coords)$spliceSiteID -) - -# If master coordinates don't have spliceSiteIDs yet, generate them -if(is.null(master_coords_df$spliceSiteID)) { - message("Generating spliceSiteIDs for master coordinates...") - master_coords_df$spliceSiteID <- paste0( - "ss_", master_coords_df$seqnames, "_", - master_coords_df$pos, "_", - master_coords_df$strand - ) - mcols(master_splice_coords)$spliceSiteID <- master_coords_df$spliceSiteID -} - -# Now annotate the split counts by matching their start/end positions to the 
master coordinates -message("Mapping junction start positions to master coordinates...") -start_df <- data.frame( - seqnames = as.character(seqnames(splitRanges)), - pos = start(splitRanges), - strand = as.character(strand(splitRanges)) -) -start_df$startID <- master_coords_df$spliceSiteID[ - match( - paste0(start_df$seqnames, "_", start_df$pos, "_", start_df$strand), - paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) - ) -] - -message("Mapping junction end positions to master coordinates...") -end_df <- data.frame( - seqnames = as.character(seqnames(splitRanges)), - pos = end(splitRanges), - strand = as.character(strand(splitRanges)) -) -end_df$endID <- master_coords_df$spliceSiteID[ - match( - paste0(end_df$seqnames, "_", end_df$pos, "_", end_df$strand), - paste0(master_coords_df$seqnames, "_", master_coords_df$pos, "_", master_coords_df$strand) - ) -] - -# Add the IDs to the splitCounts metadata -mcols(splitCounts_se)$startID <- start_df$startID -mcols(splitCounts_se)$endID <- end_df$endID - -# Count how many junctions have valid IDs -valid_junctions <- sum(!is.na(start_df$startID) & !is.na(end_df$endID)) -message(" Junctions with valid start/end IDs: ", valid_junctions, " / ", nrow(splitCounts_se)) - -if(valid_junctions == 0) { - stop("ERROR: No junctions could be mapped to master coordinates!") -} +# Annotate split counts with splice site IDs +# NOTE: annotateSpliceSite() automatically extracts splice sites from junctions +# and generates consistent startID/endID mappings +message("Annotating split counts with splice site IDs...") +splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) # 4. 
Load merged non-split counts message("\n[4/7] Loading merged non-split counts...") @@ -160,40 +101,30 @@ if(!dir.exists(merged_non_split_dir)){ nonSplitCounts_se <- loadHDF5SummarizedExperiment(dir = merged_non_split_dir) message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", ncol(nonSplitCounts_se), " samples") -# Verify non-split counts match the master coordinates -if(nrow(nonSplitCounts_se) != length(master_splice_coords)) { - stop("ERROR: Non-split counts dimensions (", nrow(nonSplitCounts_se), - ") don't match master coordinates (", length(master_splice_coords), ")") -} - -# Non-split counts should already be annotated from the merge step -# If not, annotate them using the master coordinates -if(!"spliceSiteID" %in% colnames(mcols(nonSplitCounts_se))) { - message("Annotating non-split counts with master coordinate IDs...") - mcols(nonSplitCounts_se)$spliceSiteID <- master_coords_df$spliceSiteID -} else { - message("Non-split counts already annotated with spliceSiteIDs") -} +# CRITICAL FIX: Always re-annotate after loading to ensure consistency +# The saved HDF5 object might have stale or incompatible IDs +message("Re-annotating non-split counts with splice site IDs...") +nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) # 5. 
Verify ID consistency before joining message("\n[5/7] Verifying ID consistency...") -split_start_ids <- unique(mcols(splitCounts_se)$startID) -split_end_ids <- unique(mcols(splitCounts_se)$endID) -nonsplit_ids <- unique(mcols(nonSplitCounts_se)$spliceSiteID) +split_start_ids <- na.omit(unique(mcols(splitCounts_se)$startID)) +split_end_ids <- na.omit(unique(mcols(splitCounts_se)$endID)) +nonsplit_ids <- na.omit(unique(mcols(nonSplitCounts_se)$spliceSiteID)) message(" Split startIDs: ", length(split_start_ids)) message(" Split endIDs: ", length(split_end_ids)) message(" Non-split spliceSiteIDs: ", length(nonsplit_ids)) -# Check overlap +# Check overlap - both split start/end should be subsets of nonsplit missing_start <- sum(!split_start_ids %in% nonsplit_ids) missing_end <- sum(!split_end_ids %in% nonsplit_ids) if(missing_start > 0 || missing_end > 0) { - warning("WARNING: Some split junction IDs are not in non-split IDs:") - warning(" Missing startIDs: ", missing_start) - warning(" Missing endIDs: ", missing_end) - warning(" This may cause issues but attempting to continue...") + message("NOTE: Some split junction IDs are not in non-split IDs:") + message(" Missing startIDs: ", missing_start, " out of ", length(split_start_ids)) + message(" Missing endIDs: ", missing_end, " out of ", length(split_end_ids)) + message(" This is expected if junctions reference splice sites not in the filtered set") } # 6. 
Add counts to FRASER object From 6c2ec6365c135786081205a9240726cda3ffd833 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 10:58:26 +1100 Subject: [PATCH 82/89] Updates to join_counts --- images/fraser/fraser_join_counts.R | 72 +++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index bc23df5e..e18c6b2e 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -106,29 +106,59 @@ message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", n message("Re-annotating non-split counts with splice site IDs...") nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) -# 5. Verify ID consistency before joining -message("\n[5/7] Verifying ID consistency...") -split_start_ids <- na.omit(unique(mcols(splitCounts_se)$startID)) -split_end_ids <- na.omit(unique(mcols(splitCounts_se)$endID)) -nonsplit_ids <- na.omit(unique(mcols(nonSplitCounts_se)$spliceSiteID)) - -message(" Split startIDs: ", length(split_start_ids)) -message(" Split endIDs: ", length(split_end_ids)) +# 5. 
Filter split counts to only include junctions that reference the non-split splice sites +message("\n[5/7] Filtering split counts to match non-split coordinate space...") + +# Get the valid splice site IDs from non-split counts +nonsplit_ids <- mcols(nonSplitCounts_se)$spliceSiteID + +# Get split junction IDs +split_start_ids <- mcols(splitCounts_se)$startID +split_end_ids <- mcols(splitCounts_se)$endID + +# Find junctions where BOTH start and end are in the non-split coordinate set +valid_junctions <- (!is.na(split_start_ids) & !is.na(split_end_ids) & + split_start_ids %in% nonsplit_ids & + split_end_ids %in% nonsplit_ids) + +n_valid <- sum(valid_junctions) +n_total <- nrow(splitCounts_se) +message(" Junctions with both start/end in non-split coords: ", n_valid, " / ", n_total) +message(" Filtering out ", n_total - n_valid, " junctions that reference sites outside the coordinate set") + +if(n_valid == 0) { + stop("ERROR: No junctions have both start and end in the non-split coordinate set!") +} + +# Filter the split counts to only valid junctions +splitCounts_se_filtered <- splitCounts_se[valid_junctions, ] +message(" Filtered split counts dimensions: ", nrow(splitCounts_se_filtered), " junctions x ", + ncol(splitCounts_se_filtered), " samples") + +# Update the splitCounts_se to use the filtered version +splitCounts_se <- splitCounts_se_filtered + +# 6. 
Verify ID consistency after filtering +message("\n[6/7] Verifying ID consistency after filtering...") +split_start_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$startID)) +split_end_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$endID)) + +message(" Filtered split startIDs: ", length(split_start_ids_filtered)) +message(" Filtered split endIDs: ", length(split_end_ids_filtered)) message(" Non-split spliceSiteIDs: ", length(nonsplit_ids)) -# Check overlap - both split start/end should be subsets of nonsplit -missing_start <- sum(!split_start_ids %in% nonsplit_ids) -missing_end <- sum(!split_end_ids %in% nonsplit_ids) +# Check overlap - should be 100% now +missing_start <- sum(!split_start_ids_filtered %in% nonsplit_ids) +missing_end <- sum(!split_end_ids_filtered %in% nonsplit_ids) if(missing_start > 0 || missing_end > 0) { - message("NOTE: Some split junction IDs are not in non-split IDs:") - message(" Missing startIDs: ", missing_start, " out of ", length(split_start_ids)) - message(" Missing endIDs: ", missing_end, " out of ", length(split_end_ids)) - message(" This is expected if junctions reference splice sites not in the filtered set") + stop("ERROR: After filtering, still have missing IDs! startIDs: ", missing_start, ", endIDs: ", missing_end) } -# 6. Add counts to FRASER object -message("\n[6/7] Joining split and non-split counts into FDS object...") +message("SUCCESS: All split junction IDs now exist in non-split coordinate set!") + +# 7. Add counts to FRASER object +message("\n[7/7] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, @@ -139,11 +169,9 @@ message("Counts successfully joined!") message(" - Split junctions: ", nrow(counts(fds, type = "j"))) message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) -# Run integrity check -check_fds_integrity(fds) -# 7. Calculate PSI values -message("\n[7/7] Calculating PSI values...") +# 8. 
Calculate PSI values +message("\n[8/8] Calculating PSI values...") fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) message("PSI values calculated successfully!") @@ -152,7 +180,7 @@ message("Available assays: ", paste(assayNames(fds), collapse = ", ")) # Final integrity check check_fds_integrity(fds) -# 8. Save final FRASER object +# 9. Save final FRASER object message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) From aa3320a81399798a8dbf32f0d8b91df37205da3a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 13:37:58 +1100 Subject: [PATCH 83/89] Finally at the analysis step! --- images/fraser/fraser_analysis.R | 35 ++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 7f534135..f7fd9fb0 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -45,13 +45,14 @@ if(! "spliceSiteID" %in% colnames(mcols(fds, type="ss"))){ } } -# --- 3. Filtering --- -# It is critical to filter before fitting to reduce the size of the latent space matrices -fds <- calculatePSIValues(fds, types = "jaccard", BPPARAM = bp) - # Create QC directory dir.create("qc_plots", showWarnings = FALSE) +# --- 3. Calculate Filter Values FIRST (required for plotting) --- +message("Calculating filter expression values...") +fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = FALSE) + +# --- 4. Generate QC Plots --- # DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast set.seed(42) plot_idx <- sample(nrow(fds), min(nrow(fds), 30000)) @@ -62,12 +63,12 @@ png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150) plotFilterExpression(fds_plot_subset, bins = 100) dev.off() -# Apply actual filtering to full dataset -fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = TRUE) +# --- 5. 
Apply Filtering --- +message("Applying filtering based on calculated values...") +fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] -# --- 4. Dimensionality Message --- +# Dimensionality Message raw_dim <- nrow(fds) -fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ] filtered_dim <- nrow(fds_filtered) message(paste0("\n--- Filtering Summary ---")) @@ -75,28 +76,33 @@ message(paste0("Original junctions: ", raw_dim)) message(paste0("Filtered junctions: ", filtered_dim)) message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) -# --- 5. Hyperparameter Optimization --- +# --- 6. Hyperparameter Optimization --- # Optimization must run on the filtered set +message("Optimizing encoding dimension (q)...") opt_q <- bestQ(fds_filtered, type = "jaccard") png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150) plotEncDimSearch(fds_filtered, type = "jaccard") dev.off() -# --- 6. Fitting --- +# --- 7. Fitting --- +message(paste0("Fitting FRASER model with q = ", opt_q, "...")) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# QQ Plot (also uses a subset internally in FRASER, but we'll be explicit) +# QQ Plot +message("Generating QQ plot...") png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) plotQQ(fds_fit, type = "jaccard") dev.off() -# --- 7. Annotation --- +# --- 8. Annotation --- +message("Annotating results with gene information...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 8. Results & Compressed Export --- +# --- 9. Results & Compressed Export --- +message("Extracting results...") res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, @@ -110,6 +116,7 @@ message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -# --- 9. 
Final Save --- +# --- 10. Final Save --- +message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) message("Analysis Complete.") \ No newline at end of file From b7d016c74e86a998e96a3ce6b39da3e7292da89e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 13:53:29 +1100 Subject: [PATCH 84/89] Finally at the analysis step! (Fixing step order) --- images/fraser/fraser_analysis.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index f7fd9fb0..533fa7d5 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -89,19 +89,20 @@ dev.off() message(paste0("Fitting FRASER model with q = ", opt_q, "...")) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# QQ Plot -message("Generating QQ plot...") -png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) -plotQQ(fds_fit, type = "jaccard") -dev.off() - -# --- 8. Annotation --- +# --- 8. Annotation (MUST happen before plotting QQ) --- message("Annotating results with gene information...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 9. Results & Compressed Export --- +# --- 9. QQ Plot (after annotation) --- +message("Generating QQ plot...") +png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) +# plotQQ expects the FDS object and sample/gene arguments, not 'type' +plotQQ(fds_fit, aggregate = TRUE, global = TRUE) +dev.off() + +# --- 10. Results & Compressed Export --- message("Extracting results...") res <- results(fds_fit, padjCutoff = args$pval_cutoff, @@ -116,7 +117,7 @@ message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -# --- 10. Final Save --- +# --- 11. 
Final Save --- message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) message("Analysis Complete.") \ No newline at end of file From da6dae4aa007b31b48c58ffe4473cf727ea7e2e8 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 15:39:20 +1100 Subject: [PATCH 85/89] fingers crossed --- images/fraser/fraser_analysis.R | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index 533fa7d5..aa41ec73 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -2,7 +2,7 @@ library(argparse) library(FRASER) -library(tidyverse) +library(readr) library(TxDb.Hsapiens.UCSC.hg38.knownGene) library(org.Hs.eg.db) library(HDF5Array) @@ -89,25 +89,33 @@ dev.off() message(paste0("Fitting FRASER model with q = ", opt_q, "...")) fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp) -# --- 8. Annotation (MUST happen before plotting QQ) --- +# --- 8. Annotation (MUST happen BEFORE calculating gene-level p-values) --- message("Annotating results with gene information...") fds_fit <- annotateRangesWithTxDb(fds_fit, txdb = TxDb.Hsapiens.UCSC.hg38.knownGene, orgDb = org.Hs.eg.db) -# --- 9. QQ Plot (after annotation) --- -message("Generating QQ plot...") -png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) -# plotQQ expects the FDS object and sample/gene arguments, not 'type' -plotQQ(fds_fit, aggregate = TRUE, global = TRUE) -dev.off() +# --- 9. Calculate gene-level p-values (after annotation) --- +message("Calculating gene-level p-values...") +fds_fit <- calculatePadjValues(fds_fit, type = "jaccard") -# --- 10. Results & Compressed Export --- +# --- 10. 
QQ Plot (after gene-level p-values are calculated) --- +message("Generating QQ plot...") +tryCatch({ + png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150) + plotQQ(fds_fit, aggregate = TRUE, global = TRUE) + dev.off() +}, error = function(e) { + dev.off() # Close any open graphics device + message("Warning: QQ plot generation failed: ", e$message) + message("Skipping QQ plot and continuing with analysis...") +}) + +# --- 11. Results & Compressed Export --- message("Extracting results...") res <- results(fds_fit, padjCutoff = args$pval_cutoff, deltaPsiCutoff = args$delta_psi_cutoff, - zScoreCutoff = args$z_cutoff, minCount = args$min_count) # Extract all results for Jaccard @@ -117,7 +125,7 @@ message("Saving results...") write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv")) write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) -# --- 11. Final Save --- +# --- 12. Final Save --- message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) message("Analysis Complete.") \ No newline at end of file From 53fca54e56208163d5b76379b7dcd6fa54147eac Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 17:18:33 +1100 Subject: [PATCH 86/89] linting --- images/fraser/fraser_analysis.R | 2 +- images/fraser/fraser_count_non_split.R | 2 +- images/fraser/fraser_merge_split.R | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index aa41ec73..c0c721d2 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -128,4 +128,4 @@ write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz")) # --- 12. 
Final Save --- message("Saving final FRASER object...") saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final")) -message("Analysis Complete.") \ No newline at end of file +message("Analysis Complete.") diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index d51f34bf..acc28857 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -77,4 +77,4 @@ if (file.exists(expected_h5)) { stop("BAM Index (.bai) missing. FRASER cannot perform random access.") } stop(paste("Counting failed. Output not found for sample:", args$sample_id)) -} \ No newline at end of file +} diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 791961c4..685f833a 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -88,4 +88,3 @@ spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) - From 39e788563541db5d00fe17122c1d121d89ae76ff Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 16 Feb 2026 13:39:49 +1100 Subject: [PATCH 87/89] reverting the validation check to commit c6f64b90 to match last successful run batch 1125169 --- images/fraser/fraser_count_non_split.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index acc28857..508f59d6 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -59,9 +59,12 @@ sample_result <- countNonSplicedReads(args$sample_id, recount= TRUE, spliceSiteCoords=filtered_coords) # 6. 
Verification -# The H5 file should now be at cache_dir/nonSplicedCounts-{sample_id}.h5 -expected_h5 <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) -expected_rds <- file.path(cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) +# Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ +actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) + +# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS +expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) +expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) if (file.exists(expected_h5)) { message("Success: Created non-split counts (HDF5) at ", expected_h5) @@ -69,8 +72,8 @@ if (file.exists(expected_h5)) { message("Success: Created non-split counts (RDS) at ", expected_rds) } else { # Debugging: List files in the directory to see what actually happened - if(dir.exists(cache_dir)){ - message("Files found in cache dir: ", paste(list.files(cache_dir), collapse=", ")) + if(dir.exists(actual_cache_dir)){ + message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) } if(!file.exists(paste0(args$bam_path, ".bai"))){ From f4d978db1a4dadc280fa6274e4125f69194b8c3d Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 17 Feb 2026 14:56:50 +1100 Subject: [PATCH 88/89] Co-pilot review for all scripts before final run --- images/fraser/Dockerfile | 3 +-- images/fraser/fraser_analysis.R | 3 +-- images/fraser/fraser_count_non_split.R | 17 +++++-------- images/fraser/fraser_init.R | 4 +-- images/fraser/fraser_join_counts.R | 34 ++++++++++---------------- images/fraser/fraser_merge_non_split.R | 26 +++++++++++++++++--- images/fraser/fraser_merge_split.R | 3 +-- 7 files changed, 46 insertions(+), 44 deletions(-) diff --git a/images/fraser/Dockerfile b/images/fraser/Dockerfile index 8e7505df..eb01c846 
100644 --- a/images/fraser/Dockerfile +++ b/images/fraser/Dockerfile @@ -5,7 +5,7 @@ ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH ARG VERSION=2.4.6 RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ - rm -r /var/cache/apt/* && \ + rm -r /var/cache/apt/* && \ wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \ chmod +x /usr/local/bin/micromamba && \ mkdir -p ${MAMBA_ROOT_PREFIX} && \ @@ -20,7 +20,6 @@ RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \ bioconductor-fraser=${VERSION} \ bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \ bioconductor-org.hs.eg.db=3.20.0 && \ - # Install summarizedRT via BiocManager micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \ micromamba run -n base Rscript -e 'BiocManager::install("Rsamtools", update=FALSE, ask=FALSE)' && \ micromamba run -n base Rscript -e 'BiocManager::install("HDF5Array", update=FALSE, ask=FALSE)' && \ diff --git a/images/fraser/fraser_analysis.R b/images/fraser/fraser_analysis.R index c0c721d2..548b8c32 100644 --- a/images/fraser/fraser_analysis.R +++ b/images/fraser/fraser_analysis.R @@ -12,7 +12,6 @@ parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis") parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder") parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID") parser$add_argument("--pval_cutoff", type = "double", default = 0.05) -parser$add_argument("--z_cutoff", type = "double") parser$add_argument("--delta_psi_cutoff", type = "double", default = 0.3) parser$add_argument("--min_count", type = "integer", default = 5) parser$add_argument("--nthreads", type = "integer", default = 1) @@ -74,7 +73,7 @@ filtered_dim <- nrow(fds_filtered) message(paste0("\n--- Filtering Summary ---")) message(paste0("Original junctions: ", raw_dim)) 
message(paste0("Filtered junctions: ", filtered_dim)) -message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) +if (raw_dim > 0) { message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) } # --- 6. Hyperparameter Optimization --- # Optimization must run on the filtered set diff --git a/images/fraser/fraser_count_non_split.R b/images/fraser/fraser_count_non_split.R index 508f59d6..9456df66 100644 --- a/images/fraser/fraser_count_non_split.R +++ b/images/fraser/fraser_count_non_split.R @@ -31,12 +31,6 @@ dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE) # 2. Configure Parallelism and HDF5 options("FRASER.maxSamplesNoHDF5" = 0) options("FRASER.maxJunctionsNoHDF5" = -1) -# Use MulticoreParam if threads > 1, else Serial -if(args$nthreads > 1){ - bpparam <- MulticoreParam(workers = args$nthreads) -} else { - bpparam <- SerialParam() -} # 3. Load Dataset and Coordinates fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) @@ -55,14 +49,12 @@ sample_result <- countNonSplicedReads(args$sample_id, splitCountRanges = NULL, fds = fds, NcpuPerSample = args$nthreads, - minAnchor=5, - recount= TRUE, - spliceSiteCoords=filtered_coords) + minAnchor = 5, + recount = TRUE, + spliceSiteCoords = filtered_coords) # 6. 
Verification # Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/ actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name) - -# FRASER typically names these files "nonSplicedCounts-{sample_id}.h5" or .RDS expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5")) expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS")) @@ -75,6 +67,9 @@ if (file.exists(expected_h5)) { if(dir.exists(actual_cache_dir)){ message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", ")) } + else { + message("Cache directory not found: ", actual_cache_dir) + } if(!file.exists(paste0(args$bam_path, ".bai"))){ stop("BAM Index (.bai) missing. FRASER cannot perform random access.") diff --git a/images/fraser/fraser_init.R b/images/fraser/fraser_init.R index 03be3797..243ce91f 100644 --- a/images/fraser/fraser_init.R +++ b/images/fraser/fraser_init.R @@ -29,7 +29,7 @@ fds <- FraserDataSet( name = fds_name ) -#Setup parallel execution +# Setup parallel execution bp <- MulticoreParam(workers = args$nthreads) register(bp) @@ -38,7 +38,7 @@ register(bp) fds <- saveFraserDataSet(fds) -#Print location for Python to capture if needed +# Print location for Python to capture if needed fds_save_path <- file.path( args$work_dir, "savedObjects", diff --git a/images/fraser/fraser_join_counts.R b/images/fraser/fraser_join_counts.R index e18c6b2e..895d682f 100644 --- a/images/fraser/fraser_join_counts.R +++ b/images/fraser/fraser_join_counts.R @@ -68,17 +68,9 @@ message("\n[1/7] Loading Fraser Data Set from RDS...") fds <- readRDS(args$fds_path) workingDir(fds) <- saveDir -# 2. 
Load the master splice site coordinates that were used to create non-split counts -message("\n[2/7] Loading master splice site coordinates...") -splice_coords_path <- file.path(dirname(dirname(saveDir)), "splice_site_coords.RDS") -if(!file.exists(splice_coords_path)) { - stop("ERROR: splice_site_coords.RDS not found at: ", splice_coords_path) -} -master_splice_coords <- readRDS(splice_coords_path) -message("Master coordinates loaded: ", length(master_splice_coords), " sites") -# 3. Load split counts -message("\n[3/7] Loading split counts...") +# 2. Load split counts +message("\n[2/7] Loading split counts...") split_se_dir <- file.path(saveDir, "splitCounts") if(!dir.exists(split_se_dir)){ stop(paste("Missing splitCounts directory at:", split_se_dir)) @@ -92,8 +84,8 @@ message("Split counts dimensions: ", nrow(splitCounts_se), " junctions x ", ncol message("Annotating split counts with splice site IDs...") splitCounts_se <- FRASER:::annotateSpliceSite(splitCounts_se) -# 4. Load merged non-split counts -message("\n[4/7] Loading merged non-split counts...") +# 3. Load merged non-split counts +message("\n[3/7] Loading merged non-split counts...") merged_non_split_dir <- file.path(saveDir, "nonSplitCounts") if(!dir.exists(merged_non_split_dir)){ stop(paste("Missing merged non-split counts directory at:", merged_non_split_dir)) @@ -106,8 +98,8 @@ message("Non-split counts dimensions: ", nrow(nonSplitCounts_se), " sites x ", n message("Re-annotating non-split counts with splice site IDs...") nonSplitCounts_se <- FRASER:::annotateSpliceSite(nonSplitCounts_se) -# 5. Filter split counts to only include junctions that reference the non-split splice sites -message("\n[5/7] Filtering split counts to match non-split coordinate space...") +# 4. 
Filter split counts to only include junctions that reference the non-split splice sites +message("\n[4/7] Filtering split counts to match non-split coordinate space...") # Get the valid splice site IDs from non-split counts nonsplit_ids <- mcols(nonSplitCounts_se)$spliceSiteID @@ -138,8 +130,8 @@ message(" Filtered split counts dimensions: ", nrow(splitCounts_se_filtered), " # Update the splitCounts_se to use the filtered version splitCounts_se <- splitCounts_se_filtered -# 6. Verify ID consistency after filtering -message("\n[6/7] Verifying ID consistency after filtering...") +# 5. Verify ID consistency after filtering +message("\n[5/7] Verifying ID consistency after filtering...") split_start_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$startID)) split_end_ids_filtered <- na.omit(unique(mcols(splitCounts_se)$endID)) @@ -157,8 +149,8 @@ if(missing_start > 0 || missing_end > 0) { message("SUCCESS: All split junction IDs now exist in non-split coordinate set!") -# 7. Add counts to FRASER object -message("\n[7/7] Joining split and non-split counts into FDS object...") +# 6. Add counts to FRASER object +message("\n[6/7] Joining split and non-split counts into FDS object...") fds <- addCountsToFraserDataSet( fds = fds, splitCounts = splitCounts_se, @@ -170,8 +162,8 @@ message(" - Split junctions: ", nrow(counts(fds, type = "j"))) message(" - Splice sites: ", nrow(counts(fds, type = "ss"))) -# 8. Calculate PSI values -message("\n[8/8] Calculating PSI values...") +# 7. Calculate PSI values +message("\n[7/7] Calculating PSI values...") fds <- calculatePSIValues(fds, types = c("psi3", "psi5", "jaccard")) message("PSI values calculated successfully!") @@ -180,7 +172,7 @@ message("Available assays: ", paste(assayNames(fds), collapse = ", ")) # Final integrity check check_fds_integrity(fds) -# 9. Save final FRASER object +# 8. 
Save final FRASER object message("\nSaving final integrated FDS...") fds <- saveFraserDataSet(fds) diff --git a/images/fraser/fraser_merge_non_split.R b/images/fraser/fraser_merge_non_split.R index 68b243d8..5415c9b6 100644 --- a/images/fraser/fraser_merge_non_split.R +++ b/images/fraser/fraser_merge_non_split.R @@ -61,16 +61,34 @@ if(length(h5_files) == 0) { # Copy with standardized naming convention for(i in seq_along(sample_ids)) { sid <- sample_ids[i] - src_file <- h5_files[grepl(sid, h5_files)][1] - if(is.na(src_file)) { + # 1. Precise matching using word boundaries (\\b) + # This ensures "S1" doesn't match "S10" + match_idx <- grep(paste0("\\b", sid, "\\b"), h5_files) + + # 2. Safety check: Did we find a match? + if(length(match_idx) == 0) { warning("No H5 file found for sample: ", sid) next } + # 3. Extract the first matching file path + src_file <- h5_files[match_idx[1]] + + # 4. Define destination and copy + # Constructing the new filename: nonSplicedCounts-[sid].h5 dest <- file.path(out_dir, paste0("nonSplicedCounts-", sid, ".h5")) - file.copy(src_file, dest, overwrite = TRUE) - message(" Copied: ", basename(src_file), " -> ", basename(dest)) + + success <- file.copy(src_file, dest, overwrite = TRUE) + + # 5. Provide feedback + if(success) { + message("Successfully Copied: ", sid) + message(" From: ", basename(src_file)) + message(" To: ", basename(dest)) + } else { + warning("Failed to copy file for sample: ", sid) + } } # 5. Validate H5 files and build combined count matrix diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 685f833a..667e4059 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -32,7 +32,6 @@ register(bp) # 3. 
Load Dataset # Match the name exactly as defined in the init stage -fds_name <- paste0("FRASER_", args$cohort_id) fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name) @@ -58,6 +57,7 @@ strandSpecific(fds) <- 2 minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ +#TODO: We could add an explicit check here to ensure the expected files are present before calling this function. message("Merging split counts from cache...") splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE) @@ -84,7 +84,6 @@ spliceSiteCoords <- FRASER:::extractSpliceSiteCoordinates(splitCountRanges) # Use absolute paths for saving to match Python 'mv' commands -#This is slightly confusing, but the filtered granges will be used to annotate non_split counts saveRDS(splitCountRangesNonFilt, file.path(args$work_dir, "g_ranges_split_counts.RDS")) saveRDS(splitCountRanges, file.path(args$work_dir, "g_ranges_non_split_counts.RDS")) saveRDS(spliceSiteCoords, file.path(args$work_dir, "splice_site_coords.RDS")) From 1b8375eab1a9cb26cd2da73df33dea422fc14b8a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 17 Feb 2026 17:21:03 +1100 Subject: [PATCH 89/89] Adding a readme --- images/fraser/README_Fraser.txt | 42 ++++++++++++++++++++++++++++++ images/fraser/fraser_merge_split.R | 1 - 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 images/fraser/README_Fraser.txt diff --git a/images/fraser/README_Fraser.txt b/images/fraser/README_Fraser.txt new file mode 100644 index 00000000..85f6c06f --- /dev/null +++ b/images/fraser/README_Fraser.txt @@ -0,0 +1,42 @@ +The tool is called "Fraser" and it is an R package that can be used to analyze RNA-seq data for splicing events. +It is designed to identify and quantify splicing events in a cohort of samples, and can be used to compare +splicing patterns between different groups of samples. 
+ +This image and tool was made to run Cohort-Level analyses on Exome data for the RD team. +When using these scripts and tools, please make sure to have the correct version of R and the necessary packages installed. +They were designed to be used in conjunction with the cpg-flow-rdrnaseq and specifically the fraser stage of that workflow and run in this order + +1. fraser_init.R: +Initializes the FRASER dataset for a cohort. +Inputs: --cohort_id, --sample_map (CSV with sample_id and bam_path), --work_dir, --nthreads +What it does: Reads the sample map, creates a FRASER dataset skeleton, and saves it for downstream steps. + +2. fraser_count_split.R: +Counts split (spliced) reads for a single sample. +Inputs: --fds_path, --bam_path, --cohort_id, --sample_id, --work_dir, --nthreads +What it does: Loads the dataset, subsets to the sample, counts split reads, and saves results to the cache. + +3. fraser_merge_split.R: +Merges split read counts from all samples. +Inputs: --fds_path, --cohort_id, --work_dir, --nthreads +What it does: Loads and merges all split count files, annotates splice sites, and saves the merged data for the cohort. + +4. fraser_count_non_split.R: +Counts non-split (unspliced) reads for a single sample. +Inputs: --fds_path, --bam_path, --cohort_id, --sample_id, --coords_path, --work_dir, --nthreads +What it does: Counts non-split reads at splice site coordinates and saves results to the cache. + +5. fraser_merge_non_split.R: +Merges non-split read counts and calculates PSI values. +Inputs: --fds_path, --cohort_id, --filtered_ranges_path, --work_dir, --nthreads +What it does: Merges non-split counts for all samples, organizes HDF5 files, and prepares data for analysis. + +6. fraser_join_counts.R: +Joins split and non-split counts into a single dataset. +Inputs: --fds_path, --cohort_id, --work_dir, --nthreads +What it does: Checks data integrity, joins split and non-split counts, and ensures all metadata is present for analysis. + +7. 
fraser_analysis.R: +Performs statistical analysis and generates QC plots. +Inputs: --fds_dir, --cohort_id, --pval_cutoff, --delta_psi_cutoff, --min_count, --nthreads +What it does: Loads the final dataset, calculates splicing statistics, applies filters, and generates quality control plots and results. diff --git a/images/fraser/fraser_merge_split.R b/images/fraser/fraser_merge_split.R index 667e4059..13771414 100644 --- a/images/fraser/fraser_merge_split.R +++ b/images/fraser/fraser_merge_split.R @@ -57,7 +57,6 @@ strandSpecific(fds) <- 2 minExpressionInOneSample <- 20 # 4. Merge individual split count RDS files from the cache # FRASER automatically looks in: {work_dir}/cache/splitCounts/ -#TODO: We could add an explicit check here to ensure the expected files are present before calling this function. message("Merging split counts from cache...") splitCounts <- getSplitReadCountsForAllSamples(fds=fds, recount=FALSE)