Merged
90 commits
20d3c59
Update FRASER for test build
MattWellie Jan 6, 2026
85a9651
Maybe this should build? syntax fixes for docker
Johnnyassaf Jan 6, 2026
e9476d3
seperating micromamba unpacking and cleaning image
Johnnyassaf Jan 6, 2026
75b1536
version conflict between fraser R and annotationdbi, if this works il…
Johnnyassaf Jan 6, 2026
87ca809
pinning the successful specific versions
Johnnyassaf Jan 6, 2026
e3960ea
fraser version variable and re-adding space saving measures
Johnnyassaf Jan 6, 2026
59d4d3b
Baking the R scripts into the fraser image
Johnnyassaf Jan 16, 2026
021908f
no need
Johnnyassaf Jan 16, 2026
264c6d5
End of file lint
Johnnyassaf Jan 16, 2026
cb22e31
refactoring file paths
Johnnyassaf Jan 19, 2026
14c1f66
end of file linting
Johnnyassaf Jan 19, 2026
899ec52
argparse is necessary for image
Johnnyassaf Jan 19, 2026
f71ba08
cannot calc before count
Johnnyassaf Jan 20, 2026
eea8c9b
bamData is deprecated
Johnnyassaf Jan 20, 2026
90bd9c4
paralelizing a single sample is confuzling
Johnnyassaf Jan 20, 2026
bc44e65
parallel limit
Johnnyassaf Jan 21, 2026
b4fcd7a
parallel limit
Johnnyassaf Jan 21, 2026
11d35a9
no parallel
Johnnyassaf Jan 21, 2026
afb514c
force serial
Johnnyassaf Jan 21, 2026
f8587a7
making it even leaner
Johnnyassaf Jan 21, 2026
885788d
fix
Johnnyassaf Jan 21, 2026
83d0dd9
fix
Johnnyassaf Jan 21, 2026
d01466b
adding necessary R libraries
Johnnyassaf Jan 21, 2026
ee25ec0
adding coordinate checking precaution
Johnnyassaf Jan 21, 2026
c7a8819
library has been renamed
Johnnyassaf Jan 21, 2026
efd22a8
hail file caching fix
Johnnyassaf Jan 22, 2026
c60b579
whoops
Johnnyassaf Jan 22, 2026
a098436
fraser_merge_split.R fix
Johnnyassaf Jan 22, 2026
c2910ed
fraser_merge_split.R fix
Johnnyassaf Jan 22, 2026
62e2fa2
fraser_merge_split.R fix
Johnnyassaf Jan 22, 2026
999f930
fraser_merge_split.R fix
Johnnyassaf Jan 22, 2026
ade8bec
good mornin, lets get crackin
Johnnyassaf Jan 23, 2026
c094b08
this should be it
Johnnyassaf Jan 23, 2026
ee3b46c
count non_split
Johnnyassaf Jan 23, 2026
9abdee7
Kiss,ONLY VULNERABILITY HERE IS bampaths
Johnnyassaf Jan 23, 2026
b746910
I was the true vulnerability here
Johnnyassaf Jan 23, 2026
8be7cff
the output checking didnt check the write place
Johnnyassaf Jan 23, 2026
a3b1589
merge_non_split
Johnnyassaf Jan 27, 2026
1de3135
merge_non_split
Johnnyassaf Jan 27, 2026
57ef4af
how did that get in there?
Johnnyassaf Jan 27, 2026
cac6886
name change
Johnnyassaf Jan 28, 2026
8aae29f
lets start again
Johnnyassaf Jan 29, 2026
ec6da39
fixing paths
Johnnyassaf Jan 29, 2026
c2a7284
metadata file was missing?
Johnnyassaf Jan 29, 2026
127cb74
paths
Johnnyassaf Jan 29, 2026
379900a
make a better simmarized exp
Johnnyassaf Jan 29, 2026
f32506e
whoops
Johnnyassaf Jan 29, 2026
f162368
handshake fix
Johnnyassaf Jan 29, 2026
7c09fd8
save fix
Johnnyassaf Jan 29, 2026
9b5e7b1
Add join
Johnnyassaf Jan 29, 2026
98972c7
Add join
Johnnyassaf Jan 29, 2026
87097f0
fix file path handling in join
Johnnyassaf Jan 29, 2026
15f83ae
fix file path handling in join
Johnnyassaf Jan 29, 2026
8eb6413
fix file path handling in join
Johnnyassaf Jan 30, 2026
a0632e0
fix file path handling in join
Johnnyassaf Jan 30, 2026
0c832c3
fix file path handling in join
Johnnyassaf Jan 30, 2026
d5b51ef
fix file path handling in join
Johnnyassaf Jan 30, 2026
20a8af2
fix file path handling in join
Johnnyassaf Jan 30, 2026
1f418f4
fix file path handling in join
Johnnyassaf Jan 30, 2026
0e7bdb2
fix file path handling in join
Johnnyassaf Jan 30, 2026
3c02536
fix file path handling in join
Johnnyassaf Jan 30, 2026
b7b7bd5
fix file path handling in join
Johnnyassaf Jan 30, 2026
d574eaf
fixing the analysis script
Johnnyassaf Feb 2, 2026
58c4462
Analysis steps are out of order?
Johnnyassaf Feb 2, 2026
696a1f7
they were not, putting it back
Johnnyassaf Feb 3, 2026
f6674ad
adds rescue block
Johnnyassaf Feb 4, 2026
cc80fc9
rescuing the rescue block
Johnnyassaf Feb 4, 2026
7365090
unrescuing the rescue block
Johnnyassaf Feb 4, 2026
f273223
merging non split cannot calculate psi
Johnnyassaf Feb 4, 2026
149615d
join counts still cant calculate psi
Johnnyassaf Feb 4, 2026
34f7991
explicitly annotating non-split counts
Johnnyassaf Feb 4, 2026
9ce1a01
explicitly annotating non-split counts, must refer to fraser function…
Johnnyassaf Feb 5, 2026
baab413
Always assume paired end samples
Johnnyassaf Feb 5, 2026
c6f64b9
mini changes for data integrity
Johnnyassaf Feb 5, 2026
908c59f
here we go
Johnnyassaf Feb 6, 2026
aaf1dca
Updates to merge non-split and join_counts
Johnnyassaf Feb 10, 2026
b1d55ee
Updates to merge non-split and join_counts
Johnnyassaf Feb 10, 2026
8574370
Updates to join_counts
Johnnyassaf Feb 10, 2026
4c1dd5c
Updates to join_counts
Johnnyassaf Feb 10, 2026
80230c2
Updates to join_counts
Johnnyassaf Feb 10, 2026
e9abd10
Updates to join_counts
Johnnyassaf Feb 10, 2026
6c2ec63
Updates to join_counts
Johnnyassaf Feb 10, 2026
aa3320a
Finally at the analysis step!
Johnnyassaf Feb 11, 2026
b7d016c
Finally at the analysis step! (Fixing step order)
Johnnyassaf Feb 11, 2026
da6dae4
fingers crossed
Johnnyassaf Feb 11, 2026
53fca54
linting
Johnnyassaf Feb 11, 2026
39e7885
reverting the validation check to commit c6f64b90 to match last succe…
Johnnyassaf Feb 16, 2026
f4d978d
Co-pilot review for all scripts before final run
Johnnyassaf Feb 17, 2026
1b8375e
Adding a readme
Johnnyassaf Feb 17, 2026
8c498fc
Merge branch 'main' into update-fraser-version
MattWellie Feb 18, 2026
38 changes: 30 additions & 8 deletions images/fraser/Dockerfile
@@ -2,14 +2,36 @@ FROM debian:bookworm-slim

ENV MAMBA_ROOT_PREFIX /root/micromamba
ENV PATH $MAMBA_ROOT_PREFIX/bin:$PATH
ARG VERSION=${VERSION:-1.12.1}
ARG VERSION=2.4.6

RUN apt-get update && apt-get install -y git wget bash bzip2 zip && \
rm -r /var/lib/apt/lists/* && \
rm -r /var/cache/apt/* && \
wget -qO- https://api.anaconda.org/download/conda-forge/micromamba/0.8.2/linux-64/micromamba-0.8.2-he9b6cbd_0.tar.bz2 | tar -xvj -C /usr/local bin/micromamba && \
mkdir ${MAMBA_ROOT_PREFIX} && \
micromamba install -y --prefix ${MAMBA_ROOT_PREFIX} -c bioconda -c conda-forge \
python=3.10 r-base=4.3.1 r-reshape2=1.4.4 r-tidyverse=2.0.0 \
bioconductor-fraser=${VERSION} bioconductor-txdb.hsapiens.ucsc.hg38.knowngene bioconductor-org.hs.eg.db && \
rm -r /root/micromamba/pkgs
wget -qO- https://micro.mamba.pm/api/micromamba/linux-64/latest | tar -xvj -C /usr/local/bin --strip-components=1 bin/micromamba && \
chmod +x /usr/local/bin/micromamba && \
mkdir -p ${MAMBA_ROOT_PREFIX} && \
micromamba install -y --root-prefix ${MAMBA_ROOT_PREFIX} --prefix ${MAMBA_ROOT_PREFIX} \
-c bioconda -c conda-forge \
python=3.10.19 \
r-base=4.4.3 \
r-reshape2=1.4.5 \
r-tidyverse=2.0.0 \
r-argparse=2.2.3 \
r-biocmanager \
bioconductor-fraser=${VERSION} \
bioconductor-txdb.hsapiens.ucsc.hg38.knowngene=3.20.0 \
bioconductor-org.hs.eg.db=3.20.0 && \
micromamba run -n base Rscript -e 'BiocManager::install("SummarizedExperiment", update=FALSE, ask=FALSE)' && \
micromamba run -n base Rscript -e 'BiocManager::install("Rsamtools", update=FALSE, ask=FALSE)' && \
micromamba run -n base Rscript -e 'BiocManager::install("HDF5Array", update=FALSE, ask=FALSE)' && \
micromamba clean --all --yes && \
rm -rf /var/lib/apt/lists/*

RUN mkdir /RDrnaseq

COPY fraser_count_split.R RDrnaseq/
COPY fraser_analysis.R RDrnaseq/
COPY fraser_count_non_split.R RDrnaseq/
COPY fraser_init.R RDrnaseq/
COPY fraser_merge_non_split.R RDrnaseq/
COPY fraser_merge_split.R RDrnaseq/
COPY fraser_join_counts.R RDrnaseq/
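Building the image locally can be sketched as follows (the image tag is a placeholder; `VERSION` defaults to 2.4.6 as set by the `ARG` in the Dockerfile, so the `--build-arg` is only needed to override it):

```shell
# Hypothetical local build; the tag name is a placeholder.
docker build -t fraser-rdrnaseq:2.4.6 --build-arg VERSION=2.4.6 images/fraser/
```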
42 changes: 42 additions & 0 deletions images/fraser/README_Fraser.txt
@@ -0,0 +1,42 @@
FRASER is an R package for analysing RNA-seq data for aberrant splicing events. It identifies and
quantifies splicing events across a cohort of samples, and can be used to compare splicing patterns
between different groups of samples.

This image and its tools were made to run cohort-level analyses on exome data for the RD team.
Before using these scripts, make sure the correct version of R and the necessary packages are installed.
They were designed to be used with the cpg-flow-rdrnaseq workflow, specifically its fraser stage, and should be run in this order:

1. fraser_init.R:
Initializes the FRASER dataset for a cohort.
Inputs: --cohort_id, --sample_map (CSV with sample_id and bam_path), --work_dir, --nthreads
What it does: Reads the sample map, creates a FRASER dataset skeleton, and saves it for downstream steps.
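For illustration, a sample map in the shape fraser_init.R consumes can be generated like this (a minimal Python sketch; the column names follow the README above, while the sample IDs and BAM paths are placeholders):

```python
import csv

# Build a minimal sample_map CSV with the two columns described in the README.
# Sample IDs and BAM paths below are placeholders, not real data.
rows = [
    {"sample_id": "SAMPLE_A", "bam_path": "/io/bams/SAMPLE_A.bam"},
    {"sample_id": "SAMPLE_B", "bam_path": "/io/bams/SAMPLE_B.bam"},
]

with open("sample_map.csv", "w", newline="") as fh:
    writer = csv.DictWriter(fh, fieldnames=["sample_id", "bam_path"])
    writer.writeheader()
    writer.writerows(rows)
```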

2. fraser_count_split.R:
Counts split (spliced) reads for a single sample.
Inputs: --fds_path, --bam_path, --cohort_id, --sample_id, --work_dir, --nthreads
What it does: Loads the dataset, subsets to the sample, counts split reads, and saves results to the cache.

3. fraser_merge_split.R:
Merges split read counts from all samples.
Inputs: --fds_path, --cohort_id, --work_dir, --nthreads
What it does: Loads and merges all split count files, annotates splice sites, and saves the merged data for the cohort.

4. fraser_count_non_split.R:
Counts non-split (unspliced) reads for a single sample.
Inputs: --fds_path, --bam_path, --cohort_id, --sample_id, --coords_path, --work_dir, --nthreads
What it does: Counts non-split reads at splice site coordinates and saves results to the cache.

5. fraser_merge_non_split.R:
Merges non-split read counts and calculates PSI values.
Inputs: --fds_path, --cohort_id, --filtered_ranges_path, --work_dir, --nthreads
What it does: Merges non-split counts for all samples, organizes HDF5 files, and prepares data for analysis.

6. fraser_join_counts.R:
Joins split and non-split counts into a single dataset.
Inputs: --fds_path, --cohort_id, --work_dir, --nthreads
What it does: Checks data integrity, joins split and non-split counts, and ensures all metadata is present for analysis.

7. fraser_analysis.R:
Performs statistical analysis and generates QC plots.
Inputs: --fds_dir, --cohort_id, --pval_cutoff, --delta_psi_cutoff, --min_count, --nthreads
What it does: Loads the final dataset, calculates splicing statistics, applies filters, and generates quality control plots and results.
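The seven stages above can be sketched as a sequence of Rscript calls (a hypothetical invocation sketch: the cohort ID, file paths, and thread counts are placeholders, the filtered-ranges filename is an assumption, and in the real workflow the per-sample stages 2 and 4 run once per BAM):

```shell
# All values are placeholders; stages 2 and 4 are repeated for each sample.
WORK=/io/work
COHORT=COH1
Rscript /RDrnaseq/fraser_init.R --cohort_id $COHORT --sample_map sample_map.csv \
    --work_dir $WORK --nthreads 4
Rscript /RDrnaseq/fraser_count_split.R --fds_path fds-object.RDS --bam_path S1.bam \
    --cohort_id $COHORT --sample_id S1 --work_dir $WORK --nthreads 4
Rscript /RDrnaseq/fraser_merge_split.R --fds_path fds-object.RDS \
    --cohort_id $COHORT --work_dir $WORK --nthreads 4
Rscript /RDrnaseq/fraser_count_non_split.R --fds_path fds-object.RDS --bam_path S1.bam \
    --cohort_id $COHORT --sample_id S1 --coords_path splice_site_coords.RDS \
    --work_dir $WORK --nthreads 4
Rscript /RDrnaseq/fraser_merge_non_split.R --fds_path fds-object.RDS --cohort_id $COHORT \
    --filtered_ranges_path filtered_ranges.RDS --work_dir $WORK --nthreads 4
Rscript /RDrnaseq/fraser_join_counts.R --fds_path fds-object.RDS \
    --cohort_id $COHORT --work_dir $WORK --nthreads 4
Rscript /RDrnaseq/fraser_analysis.R --fds_dir $WORK --cohort_id $COHORT \
    --pval_cutoff 0.05 --delta_psi_cutoff 0.3 --min_count 5 --nthreads 4
```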
130 changes: 130 additions & 0 deletions images/fraser/fraser_analysis.R
@@ -0,0 +1,130 @@
#!/usr/bin/env Rscript

library(argparse)
library(FRASER)
library(BiocParallel) # needed for MulticoreParam/register below
library(readr)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
library(HDF5Array)
library(ggplot2)

parser <- ArgumentParser(description = "FRASER 2.0 Statistical Analysis")
parser$add_argument("--fds_dir", required = TRUE, help = "Base directory containing the output folder")
parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID")
parser$add_argument("--pval_cutoff", type = "double", default = 0.05)
parser$add_argument("--delta_psi_cutoff", type = "double", default = 0.3)
parser$add_argument("--min_count", type = "integer", default = 5)
parser$add_argument("--nthreads", type = "integer", default = 1)
args <- parser$parse_args()

# --- 1. Parallelization Setup ---
# Use the user-provided nthreads for all BiocParallel operations
bp <- MulticoreParam(workers = args$nthreads)
register(bp)

# --- 2. Load the Dataset ---
options(DelayedArray.block.size = 1e9)
fds_name <- paste0("FRASER_", args$cohort_id)
message(paste0("Loading Fraser Data Set: ", fds_name))
fds <- loadFraserDataSet(dir = file.path(args$fds_dir, fds_name), name = fds_name)

# REPAIR: If spliceSiteID is missing, FRASER 2.0 cannot calculate Jaccard/PSI
if(! "spliceSiteID" %in% colnames(mcols(fds, type="ss"))){
message("Manually injecting Splice Site IDs...")

# This internal FRASER call generates the mapping without needing the full constructor
# It populates the 'spliceSiteCoords' slot which calculatePSIValues needs
fds <- FRASER:::annotateSpliceSite(fds)

# We also need to ensure the Jaccard-specific metadata is initialized
# Often, this is what's missing when the merge.data.table fails
if(is.null(mcols(fds, type="j")$startID)){
# This maps junctions to the spliceSiteIDs we just generated
fds <- FRASER:::updateIndices(fds)
}
}

# Create QC directory
dir.create("qc_plots", showWarnings = FALSE)

# --- 3. Calculate Filter Values FIRST (required for plotting) ---
message("Calculating filter expression values...")
fds <- filterExpressionAndVariability(fds, minDeltaPsi = 0.0, filter = FALSE)

# --- 4. Generate QC Plots ---
# DOWNSAMPLING FOR PLOTS: Use 30,000 random junctions for QC to keep it fast
set.seed(42)
plot_idx <- sample(nrow(fds), min(nrow(fds), 30000))
fds_plot_subset <- fds[plot_idx, ]

message("Generating QC plots using downsampled subset...")
png("qc_plots/filter_expression.png", width = 1200, height = 1200, res = 150)
plotFilterExpression(fds_plot_subset, bins = 100)
dev.off()

# --- 5. Apply Filtering ---
message("Applying filtering based on calculated values...")
fds_filtered <- fds[mcols(fds, type = "j")[, "passed"], ]

# Dimensionality Message
raw_dim <- nrow(fds)
filtered_dim <- nrow(fds_filtered)

message(paste0("\n--- Filtering Summary ---"))
message(paste0("Original junctions: ", raw_dim))
message(paste0("Filtered junctions: ", filtered_dim))
if (raw_dim > 0) { message(paste0("Reduction: ", round((1 - (filtered_dim / raw_dim)) * 100, 2), "%")) }

# --- 6. Hyperparameter Optimization ---
# Optimization must run on the filtered set
message("Optimizing encoding dimension (q)...")
opt_q <- bestQ(fds_filtered, type = "jaccard")

png("qc_plots/best_q_optimization.png", width = 1200, height = 1200, res = 150)
plotEncDimSearch(fds_filtered, type = "jaccard")
dev.off()

# --- 7. Fitting ---
message(paste0("Fitting FRASER model with q = ", opt_q, "..."))
fds_fit <- FRASER(fds_filtered, q = opt_q, type = "jaccard", BPPARAM = bp)

# --- 8. Annotation (MUST happen BEFORE calculating gene-level p-values) ---
message("Annotating results with gene information...")
fds_fit <- annotateRangesWithTxDb(fds_fit,
txdb = TxDb.Hsapiens.UCSC.hg38.knownGene,
orgDb = org.Hs.eg.db)

# --- 9. Calculate gene-level p-values (after annotation) ---
message("Calculating gene-level p-values...")
fds_fit <- calculatePadjValues(fds_fit, type = "jaccard")

# --- 10. QQ Plot (after gene-level p-values are calculated) ---
message("Generating QQ plot...")
tryCatch({
png("qc_plots/qq_plot.png", width = 1200, height = 1200, res = 150)
plotQQ(fds_fit, aggregate = TRUE, global = TRUE)
dev.off()
}, error = function(e) {
dev.off() # Close any open graphics device
message("Warning: QQ plot generation failed: ", e$message)
message("Skipping QQ plot and continuing with analysis...")
})

# --- 11. Results & Compressed Export ---
message("Extracting results...")
res <- results(fds_fit,
padjCutoff = args$pval_cutoff,
deltaPsiCutoff = args$delta_psi_cutoff,
minCount = args$min_count)

# Extract all results for Jaccard
res_all <- results(fds_fit, padjCutoff = 1, deltaPsiCutoff = 0, minCount = 0)

message("Saving results...")
write_csv(as.data.frame(res), paste0(args$cohort_id, ".significant.csv"))
write_csv(as.data.frame(res_all), paste0(args$cohort_id, ".all_results.csv.gz"))

# --- 12. Final Save ---
message("Saving final FRASER object...")
saveFraserDataSet(fds_fit, dir = getwd(), name = paste0(args$cohort_id, "_final"))
message("Analysis Complete.")
78 changes: 78 additions & 0 deletions images/fraser/fraser_count_non_split.R
@@ -0,0 +1,78 @@
#!/usr/bin/env Rscript

# Set memory limit - increased slightly to allow for HDF5 overhead
Sys.setenv("R_MAX_VSIZE" = "16Gb")

library(argparse)
library(FRASER)
library(BiocParallel)

parser <- ArgumentParser(description = "Count Non-Split Reads for a Single Sample")
parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file")
parser$add_argument("--bam_path", required = TRUE, help = "Path to the localized BAM file")
parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID")
parser$add_argument("--sample_id", required = TRUE, help = "Sample ID")
parser$add_argument("--coords_path", required = TRUE, help = "Path to splice_site_coords.RDS")
parser$add_argument("--work_dir", default = "/io/work", help = "Working directory")
parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads")
args <- parser$parse_args()

# 1. Reconstruct Directory Structure
fds_name <- paste0("FRASER_", args$cohort_id)
save_dir <- file.path(args$work_dir, "savedObjects", fds_name)
dir.create(save_dir, recursive = TRUE, showWarnings = FALSE)
file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"))

# Create the cache directory WITHOUT the fds_name subdirectory
# This matches what the merge script expects
cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts")
dir.create(cache_dir, recursive = TRUE, showWarnings = FALSE)

# 2. Force HDF5-backed storage
options("FRASER.maxSamplesNoHDF5" = 0)
options("FRASER.maxJunctionsNoHDF5" = -1)

# 3. Load Dataset and Coordinates
fds <- loadFraserDataSet(dir = args$work_dir, name = fds_name)
filtered_coords <- readRDS(args$coords_path) # the splice_site_coords.RDS produced upstream
# The BAM path must be set before countNonSplicedReads is called
colData(fds)$bamFile <- args$bam_path

# 4. Set strand specificity
strandSpecific(fds) <- 2

# 5. Run Non-Split Counting
# This writes the .h5 or .RDS file into the cache_dir created above
message(paste("Counting non-split reads for sample:", args$sample_id))

sample_result <- countNonSplicedReads(args$sample_id,
splitCountRanges = NULL,
fds = fds,
NcpuPerSample = args$nthreads,
minAnchor = 5,
recount = TRUE,
spliceSiteCoords = filtered_coords)
# 6. Verification
# Define the actual subdirectory FRASER uses: cache/nonSplicedCounts/FRASER_{cohort_id}/
actual_cache_dir <- file.path(args$work_dir, "cache", "nonSplicedCounts", fds_name)
expected_h5 <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".h5"))
expected_rds <- file.path(actual_cache_dir, paste0("nonSplicedCounts-", args$sample_id, ".RDS"))

if (file.exists(expected_h5)) {
message("Success: Created non-split counts (HDF5) at ", expected_h5)
} else if (file.exists(expected_rds)) {
message("Success: Created non-split counts (RDS) at ", expected_rds)
} else {
# Debugging: List files in the directory to see what actually happened
if(dir.exists(actual_cache_dir)){
message("Files found in cache dir: ", paste(list.files(actual_cache_dir), collapse=", "))
} else {
message("Cache directory not found: ", actual_cache_dir)
}

if(!file.exists(paste0(args$bam_path, ".bai"))){
stop("BAM Index (.bai) missing. FRASER cannot perform random access.")
}
stop(paste("Counting failed. Output not found for sample:", args$sample_id))
}
74 changes: 74 additions & 0 deletions images/fraser/fraser_count_split.R
@@ -0,0 +1,74 @@
#!/usr/bin/env Rscript

# Set memory limit - increased slightly to allow for HDF5 overhead
Sys.setenv("R_MAX_VSIZE" = "16Gb")

library(argparse)
library(FRASER)
library(BiocParallel)

parser <- ArgumentParser(description = "Count Split Reads for a Single Sample")
parser$add_argument("--fds_path", required = TRUE, help = "Path to localized FDS RDS file")
parser$add_argument("--bam_path", required = TRUE, help = "Path to the specific BAM file for this sample")
parser$add_argument("--cohort_id", required = TRUE, help = "Cohort ID")
parser$add_argument("--sample_id", required = TRUE, help = "Sample ID to count")
parser$add_argument("--work_dir", default = "/io/work", help = "Working directory")
parser$add_argument("--nthreads", type = "integer", default = 1, help = "Number of threads")
args <- parser$parse_args()

# 1. Reconstruct the FRASER directory structure so loadFraserDataSet works
# FRASER expects: {work_dir}/savedObjects/FRASER_{cohort_id}/fds-object.RDS
fds_dir_name <- paste0("FRASER_", args$cohort_id)
save_dir <- file.path(args$work_dir, "savedObjects", fds_dir_name)
dir.create(save_dir, recursive = TRUE, showWarnings = FALSE)
file.copy(args$fds_path, file.path(save_dir, "fds-object.RDS"), overwrite = TRUE)

# 2. Force HDF5 and Configure Parallelism
options("FRASER.maxSamplesNoHDF5" = 0)
options("FRASER.maxJunctionsNoHDF5" = -1)
# Use MulticoreParam if threads > 1, else Serial
if(args$nthreads > 1){
bpparam <- MulticoreParam(workers = args$nthreads)
} else {
bpparam <- SerialParam()
}

# 3. Load and Prune IMMEDIATELY
fds <- loadFraserDataSet(dir = args$work_dir, name = fds_dir_name)

strandSpecific(fds) <- 2

# SUBSET FIRST: This is the most critical memory-saving step.
# By subsetting here, we drop the metadata of all other samples.
fds <- fds[, fds$sampleID == args$sample_id]

# Validate the BAM path - Ensure the R script sees what Hail localized
if(!file.exists(args$bam_path)){
stop(paste("BAM file not found at:", args$bam_path))
}
colData(fds)$bamFile <- args$bam_path

# 4. Count Split Reads
message(paste("Starting split read counting for sample:", args$sample_id))

# In FRASER 2.0, we use getSplitReadCountsForAllSamples with recount=TRUE.
# This writes an RDS file to the cache which we will harvest in the merge step.
fds <- getSplitReadCountsForAllSamples(
fds,
recount = TRUE,
BPPARAM = bpparam
)

# 5. Verification
# FRASER saves individual counts to: cache/splitCounts/splitCounts-{sample_id}.RDS
expected_out <- file.path(args$work_dir, "cache", "splitCounts", paste0("splitCounts-", args$sample_id, ".RDS"))

if (file.exists(expected_out)) {
message("Success: Created split counts at ", expected_out)
} else {
# Check for common Bioinformatic failures
if(!file.exists(paste0(args$bam_path, ".bai"))){
stop("BAM Index (.bai) missing. FRASER cannot perform random access counting.")
}
stop("Counting failed. The RDS file was not generated in the cache.")
}
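The verification blocks in the two counting scripts hard-code FRASER's cache layout, which is easy to get wrong (several commits in this PR fix exactly that). The path shapes they assume can be sketched as small helpers (the helper functions themselves are hypothetical; the layouts are the ones the scripts above check):

```python
from pathlib import Path

def split_count_path(work_dir: str, sample_id: str) -> Path:
    # fraser_count_split.R checks: {work_dir}/cache/splitCounts/splitCounts-{sample_id}.RDS
    return Path(work_dir) / "cache" / "splitCounts" / f"splitCounts-{sample_id}.RDS"

def non_split_count_path(work_dir: str, cohort_id: str, sample_id: str, ext: str = "h5") -> Path:
    # fraser_count_non_split.R checks:
    # {work_dir}/cache/nonSplicedCounts/FRASER_{cohort_id}/nonSplicedCounts-{sample_id}.{h5|RDS}
    return (Path(work_dir) / "cache" / "nonSplicedCounts"
            / f"FRASER_{cohort_id}" / f"nonSplicedCounts-{sample_id}.{ext}")
```

Note the asymmetry: split counts land directly under cache/splitCounts/, while non-split counts gain an extra FRASER_{cohort_id} subdirectory, which is why the non-split script creates one cache directory but verifies against another.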