diff --git a/DESCRIPTION b/DESCRIPTION index b0d5336..85bd736 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,46 +1,49 @@ Package: ramr Title: Detection of Rare Aberrantly Methylated Regions in Array and NGS Data -Version: 1.15.0 +Version: 1.15.1 Authors@R: - person(given = "Oleksii", - family = "Nikolaienko", - role = c("aut", "cre"), - email = "oleksii.nikolaienko@gmail.com", - comment = c(ORCID = "0000-0002-5910-4934")) + person(given = "Oleksii", + family = "Nikolaienko", + role = c("aut", "cre"), + email = "oleksii.nikolaienko@gmail.com", + comment = c(ORCID = "0000-0002-5910-4934")) Comment: Maintainer: Oleksii Nikolaienko -Description: ramr is an R package for detection of low-frequency aberrant - methylation events in large data sets obtained by methylation profiling - using array or high-throughput bisulfite sequencing. In addition, package - provides functions to visualize found aberrantly methylated regions (AMRs), - to generate sets of all possible regions to be used as reference sets for - enrichment analysis, and to generate biologically relevant test data sets - for performance evaluation of AMR/DMR search algorithms. +Description: ramr is an R package for detection of epimutations (i.e., + infrequent aberrant DNA methylation events) + in large data sets obtained by methylation profiling using array or + high-throughput methylation sequencing. In addition, package + provides functions to visualize found aberrantly methylated regions (AMRs), + to generate sets of all possible regions to be used as reference sets for + enrichment analysis, and to generate biologically relevant test data sets + for performance evaluation of AMR/DMR search algorithms. Depends: - R (>= 4.1), - GenomicRanges, - parallel, - doParallel, - foreach, - doRNG, - methods + R (>= 4.1) Imports: - IRanges, - BiocGenerics, - ggplot2, - reshape2, - EnvStats, - ExtDist, - matrixStats, - S4Vectors + parallel, + doParallel, + foreach, + doRNG, + methods, + data.table, + matrixStats, + GenomicRanges, + IRanges, + BiocGenerics, + EnvStats, + ExtDist, + S4Vectors, + gamlss, + gamlss.dist Suggests: - RUnit, - knitr, - rmarkdown, - gridExtra, - annotatr, - LOLA, - org.Hs.eg.db, - TxDb.Hsapiens.UCSC.hg19.knownGene + RUnit, + knitr, + rmarkdown, + ggplot2, + gridExtra, + annotatr, + LOLA, + org.Hs.eg.db, + TxDb.Hsapiens.UCSC.hg19.knownGene License: Artistic-2.0 URL: https://github.com/BBCG/ramr BugReports: https://github.com/BBCG/ramr/issues diff --git a/NAMESPACE b/NAMESPACE index 7ed7a51..03e3881 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,7 +5,6 @@ export(getUniverse) export(plotAMR) export(simulateAMR) export(simulateData) -import(ggplot2) importFrom(BiocGenerics,relist) importFrom(EnvStats,ebeta) importFrom(ExtDist,eBeta) @@ -15,11 +14,16 @@ importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,granges) importFrom(GenomicRanges,mcols) importFrom(GenomicRanges,reduce) -importFrom(IRanges,`%outside%`) +importFrom(IRanges,subsetByOverlaps) importFrom(S4Vectors,queryHits) +importFrom(data.table,as.data.table) +importFrom(data.table,melt.data.table) importFrom(doParallel,registerDoParallel) importFrom(doRNG,"%dorng%") importFrom(foreach,foreach) +importFrom(gamlss,gamlss) +importFrom(gamlss,gamlss.control) +importFrom(gamlss.dist,pBEINF) importFrom(matrixStats,rowIQRs) importFrom(matrixStats,rowMedians) importFrom(methods,as) @@ -27,7 +31,6 @@ importFrom(methods,is) importFrom(parallel,detectCores) importFrom(parallel,makeCluster) importFrom(parallel,stopCluster) -importFrom(reshape2,melt) importFrom(stats,median) importFrom(stats,na.omit) importFrom(stats,pbeta) diff --git a/R/getAMR.R b/R/getAMR.R index d1cb0c4..9bc23a1 100644 --- a/R/getAMR.R +++ b/R/getAMR.R @@ -7,12 +7,16 @@ #' @details #' In the provided data set, `getAMR` compares methylation beta values of each #' sample with other samples to identify rare long-range methylation -#' aberrations. For `ramr.method=="IQR"`: for every genomic location (CpG) in +#' aberrations (epimutations). +#' For `ramr.method=="IQR"`: for every genomic location (CpG) in #' `data.ranges` the IQR-normalized deviation from the median value is #' calculated, and all CpGs with such normalized deviation not smaller than the -#' `iqr.cutoff` are retained. For `ramr.method=="*beta"`: parameters of beta -#' distribution are estimated by means of `EnvStats::ebeta` or `ExtDist::eBeta` -#' functions, and then used to calculate the probability values, followed by the +#' `iqr.cutoff` are retained. For +#' `ramr.method %in% c("beta", "wbeta", "beinf")`: parameters of beta +#' distribution are estimated by means of `EnvStats::ebeta` (beta distribution), +#' `ExtDist::eBeta` (weighted beta destribution), or `gamlss.dist::BEINF` (zero +#' and one inflated beta distribution) functions, respectively. These +#' parameters are then used to calculate the probability values, followed by the #' filtering when all CpGs with p-values not greater than `qval.cutoff` are #' retained. Another filtering is then performed to exclude all CpGs within #' `exclude.range`. Next, the retained (significant) CpGs are merged within @@ -26,8 +30,9 @@ #' columns) are included in the analysis. #' @param ramr.method A character scalar: when ramr.method is "IQR" (the #' default), the filtering based on interquantile range is used (`iqr.cutoff` -#' value is then used as a threshold). When "beta" or "wbeta" - filtering based -#' on fitting non-weighted (`EnvStats::ebeta`) or weighted (`ExtDist::eBeta`) +#' value is then used as a threshold). When "beta", "wbeta" or "beinf" - +#' filtering based on fitting non-weighted (`EnvStats::ebeta`), weighted +#' (`ExtDist::eBeta`) or zero-and-one inflated (`gamlss.dist::BEINF`) #' beta distributions, respectively, is used, and `pval.cutoff` or `qval.cutoff` #' (if not `NULL`) is used as a threshold. For "wbeta", weights directly #' correlate with bin contents (number of values per bin) and inversly - with @@ -82,20 +87,10 @@ #' data(ramr) #' getAMR(ramr.data, ramr.samples, ramr.method="beta", #' min.cpgs=5, merge.window=1000, qval.cutoff=1e-3, cores=2) -#' @importFrom parallel detectCores makeCluster stopCluster -#' @importFrom doParallel registerDoParallel -#' @importFrom GenomicRanges mcols reduce -#' @importFrom EnvStats ebeta -#' @importFrom ExtDist eBeta pBeta -#' @importFrom matrixStats rowMedians rowIQRs -#' @importFrom foreach foreach -#' @importFrom doRNG %dorng% -#' @importFrom methods as is -#' @importFrom stats median pbeta #' @export getAMR <- function (data.ranges, data.samples=NULL, - ramr.method="IQR", + ramr.method=c("IQR", "beta", "wbeta", "beinf"), iqr.cutoff=5, pval.cutoff=5e-2, qval.cutoff=NULL, @@ -115,6 +110,7 @@ getAMR <- function (data.ranges, stop("'data.ranges' metadata must include 'data.samples'") if (length(data.samples)<3) stop("at least three 'data.samples' must be provided") + ramr.method <- match.arg(ramr.method) ##################################################################################### @@ -134,7 +130,7 @@ getAMR <- function (data.ranges, x.median <- stats::median(x, na.rm=TRUE) x[is.na(x)] <- x.median # weight directly correlates with bin contents (number of values per bin) - # and inversly - with the distance from the median value, thus narrowing + # and inversely - with the distance from the median value, thus narrowing # the estimated distribution and emphasizing outliers c <- cut(x, c(0:100)/100) b <- table(c) @@ -146,6 +142,30 @@ getAMR <- function (data.ranges, }) return(t(chunk.filt)) } + inv.logit <- function (x) exp(x)/(1+exp(x)) + getPValues.beinf <- function (data.chunk, ...) { + chunk.filt <- apply(data.chunk, 1, function (x) { + x.median <- stats::median(x, na.rm=TRUE) + x[is.na(x)] <- x.median + beinf.fit <- gamlss::gamlss( + as.numeric(x)~1, sigma.formula=~1, nu.formula=~1, tau.formula=~1, + family=gamlss.dist::BEINF(mu.link="logit", sigma.link="logit", + nu.link="log", tau.link="log"), + control=gamlss::gamlss.control(trace=FALSE), ... + ) + pvals <- gamlss.dist::pBEINF( + q=x, mu=inv.logit(beinf.fit$mu.coefficients), + sigma=inv.logit(beinf.fit$sigma.coefficients), + nu=exp(beinf.fit$nu.coefficients), + tau=exp(beinf.fit$tau.coefficients), + lower.tail=TRUE, log.p=FALSE + ) + pvals[x>x.median] <- 1 - pvals[x>x.median] + return(pvals) + }) + return(t(chunk.filt)) + } + ##################################################################################### @@ -158,28 +178,30 @@ getAMR <- function (data.ranges, universe <- getUniverse(data.ranges, merge.window=merge.window, min.cpgs=min.cpgs, min.width=min.width) universe.cpgs <- unlist(universe$revmap) - betas <- as.matrix(mcols(data.ranges)[universe.cpgs,data.samples,drop=FALSE]) + betas <- as.matrix(mcols(data.ranges)[universe.cpgs, data.samples, drop=FALSE]) if (is.null(qval.cutoff)) qval.cutoff <- pval.cutoff/ncol(betas) - chunks <- split(seq_len(nrow(betas)), if (cores>1) cut(seq_len(nrow(betas)),cores) else 1) - medians <- foreach (chunk=chunks, .combine=c) %dorng% matrixStats::rowMedians(betas[chunk,], na.rm=TRUE) + chunks <- split(seq_len(nrow(betas)), if (cores>1) cut(seq_len(nrow(betas)), cores) else 1) + medians <- foreach (chunk=chunks, .combine=c) %dorng% matrixStats::rowMedians(betas[chunk, ], na.rm=TRUE) if (ramr.method=="IQR") { - iqrs <- foreach (chunk=chunks, .combine=c) %dorng% matrixStats::rowIQRs(betas[chunk,], na.rm=TRUE) + iqrs <- foreach (chunk=chunks, .combine=c) %dorng% matrixStats::rowIQRs(betas[chunk, ], na.rm=TRUE) betas.filtered <- (betas-medians)/iqrs betas.filtered[abs(betas.filtered)mle>>>fitdistrplus::fitdist) - betas.filtered <- foreach (chunk=chunks) %dorng% getPValues.beta(betas[chunk,], ...) + betas.filtered <- foreach (chunk=chunks) %dorng% getPValues.beta(betas[chunk, ], ...) betas.filtered <- do.call(rbind, betas.filtered) betas.filtered[betas.filtered>=qval.cutoff] <- NA } else if (ramr.method=="wbeta") { - betas.filtered <- foreach (chunk=chunks) %dorng% getPValues.wbeta(betas[chunk,], ...) + betas.filtered <- foreach (chunk=chunks) %dorng% getPValues.wbeta(betas[chunk, ], ...) + betas.filtered <- do.call(rbind, betas.filtered) + betas.filtered[betas.filtered>=qval.cutoff] <- NA + } else if (ramr.method=="beinf") { + betas.filtered <- foreach (chunk=chunks) %dorng% getPValues.beinf(betas[chunk, ], ...) betas.filtered <- do.call(rbind, betas.filtered) betas.filtered[betas.filtered>=qval.cutoff] <- NA - } else { - stop("unknown 'ramr.method'") } if (!is.null(exclude.range)) @@ -201,9 +223,7 @@ getAMR <- function (data.ranges, ranges$xiqr <- vapply(ranges$revmap, function (revmap) { mean(betas.filtered[not.na[revmap],column,drop=FALSE], na.rm=TRUE) }, numeric(1)) - } - - if (ramr.method=="beta" | ramr.method=="wbeta") { + } else { ranges$pval <- vapply(ranges$revmap, function (revmap) { return( 10**mean(log10(betas.filtered[not.na[revmap],column] + .Machine$double.xmin), na.rm=TRUE) ) }, numeric(1)) diff --git a/R/getUniverse.R b/R/getUniverse.R index 2b34f40..e15ef22 100644 --- a/R/getUniverse.R +++ b/R/getUniverse.R @@ -40,8 +40,6 @@ #' runLOLA(amrs, universe, hg19.extdb, cores=1, redefineUserSets=TRUE) #' } #' } -#' @importFrom GenomicRanges reduce -#' @importFrom methods is #' @export getUniverse <- function (data.ranges, merge.window=300, diff --git a/R/globals.R b/R/globals.R deleted file mode 100644 index d1130f7..0000000 --- a/R/globals.R +++ /dev/null @@ -1 +0,0 @@ -utils::globalVariables(c("chunk", "column", "ncpg", "width")) diff --git a/R/internal.R b/R/internal.R new file mode 100644 index 0000000..5335701 --- /dev/null +++ b/R/internal.R @@ -0,0 +1,49 @@ +#' @importFrom BiocGenerics relist +#' @importFrom data.table as.data.table melt.data.table +#' @importFrom doParallel registerDoParallel +#' @importFrom doRNG %dorng% +#' @importFrom EnvStats ebeta +#' @importFrom ExtDist eBeta pBeta +#' @importFrom foreach foreach +#' @importFrom gamlss gamlss gamlss.control +#' @importFrom gamlss.dist pBEINF +#' @importFrom GenomicRanges mcols `mcols<-` granges reduce findOverlaps +#' @importFrom IRanges subsetByOverlaps +#' @importFrom matrixStats rowMedians rowIQRs +#' @importFrom methods as is +#' @importFrom parallel detectCores makeCluster stopCluster +#' @importFrom S4Vectors queryHits +#' @importFrom stats median na.omit rbeta pbeta +#' @importFrom utils head tail +## #' @importFrom Rcpp sourceCpp +## #' @useDynLib ramr, .registration=TRUE + + +# internal globals, constants and helper functions +# + +################################################################################ +# Globals, unload +################################################################################ + +utils::globalVariables(c( + "chunk", "column", "ncpg", "width", "..data.samples", ":=", "alpha", "color", + "size", "start" +)) + +# .onUnload <- function (libpath) {library.dynam.unload("ramr", libpath)} + +################################################################################ +# Constants +################################################################################ + +# descr: ... + +# + +################################################################################ +# Functions: ... +################################################################################ + +# descr: ... +# value: ... diff --git a/R/plotAMR.R b/R/plotAMR.R index c20a96c..e3a72ad 100644 --- a/R/plotAMR.R +++ b/R/plotAMR.R @@ -23,6 +23,8 @@ #' `amr.ranges` object. #' @param title An optional title for the plot. If NULL (the default), plot #' title is set to a genomic location of particular AMR. +#' @param labs Optional axis labels for the plot. Default: c("genomic position", +#' "beta value"). #' @param window An optional integer constant to expand genomic ranges of the #' `amr.ranges` object (the default: 300). #' @return The output is a list of `ggplot` objects. @@ -37,21 +39,17 @@ #' library(gridExtra) #' do.call("grid.arrange", #' c(plotAMR(ramr.data, ramr.samples, ramr.tp.nonunique), ncol=2)) -#' @importFrom BiocGenerics relist -#' @importFrom GenomicRanges reduce findOverlaps mcols -#' @import ggplot2 -#' @importFrom matrixStats rowMedians -#' @importFrom reshape2 melt -#' @importFrom S4Vectors queryHits -#' @importFrom methods as is #' @export plotAMR <- function (data.ranges, data.samples=NULL, amr.ranges, highlight=NULL, title=NULL, + labs=c("genomic position", "beta value"), window=300) { + if (!requireNamespace("ggplot2", quietly=TRUE)) stop("ggplot2 is required for plotting. Please install") + if (is.null(data.samples)) data.samples <- colnames(GenomicRanges::mcols(data.ranges)) amr.ranges.reduced <- GenomicRanges::reduce(amr.ranges, min.gapwidth=window, with.revmap=TRUE) @@ -60,33 +58,42 @@ plotAMR <- function (data.ranges, for (i in seq_along(amr.ranges.relisted)) { plot.ranges <- unlist(amr.ranges.relisted[i]) - revmap.rows <- unique(unlist(plot.ranges$revmap)) + # revmap.rows <- unique(unlist(plot.ranges$revmap)) data.hits <- unique(S4Vectors::queryHits(GenomicRanges::findOverlaps(data.ranges, plot.ranges, maxgap=window, ignore.strand=TRUE))) if (length(data.hits)>0) { - plot.data <- data.frame(data.ranges[data.hits, data.samples], check.names=FALSE, stringsAsFactors=FALSE) - colnames(plot.data) <- c("seqnames", "start", "end", "width", "strand", data.samples) - plot.data$median <- matrixStats::rowMedians(as.matrix(plot.data[,data.samples]), na.rm=TRUE) + plot.data <- data.table::as.data.table(data.ranges[data.hits, data.samples]) + plot.data$median <- apply(plot.data[, ..data.samples], 1, median, na.rm=TRUE) colorify <- c("median", if (is.null(highlight)) unique(plot.ranges$sample), highlight) - plot.data.melt <- reshape2::melt(plot.data, id.vars=c("seqnames","start","end","width","strand"), + plot.data.melt <- data.table::melt.data.table(plot.data, id.vars=c("seqnames","start","end","width","strand"), variable.name="sample", value.name="beta") - plot.data.melt <- cbind(plot.data.melt, list(alpha=0.5,color=factor("lightgrey",levels=c("lightgrey",colorify)))) - - plot.data.melt[plot.data.melt$sample %in% colorify, "alpha"] <- 0.9 - for (sample.name in colorify) - plot.data.melt[plot.data.melt$sample==sample.name,"color"] <- sample.name - - gene.plot <- ggplot(plot.data.melt, aes_string(x="start", y="beta", group="sample", color="color", alpha="alpha")) + - geom_line(size=0.5) + - geom_point(size=1) + - scale_x_continuous(name="position") + - scale_y_continuous(name="beta value", limits=c(0, 1), breaks=c(0, 0.25, 0.5, 0.75, 1)) + - scale_color_discrete(name="samples", limits=colorify) + - scale_alpha_continuous(guide="none") + - theme(legend.text=element_text(size=8), - axis.text.x=element_text(size=8, angle=0), - axis.text.y=element_text(size=8)) + - ggtitle(if (is.null(title)) as.character(reduce(plot.ranges)), title) + plot.data.melt[, `:=` ( + size=0, + alpha=0.5, + color=factor("lightgrey",levels=c("lightgrey", colorify)) + )] + + plot.data.melt[sample %in% colorify, `:=` (alpha=0.9, color=sample)] + for (j in seq_along(plot.ranges)) { + plot.data.melt[sample==plot.ranges$sample[j] & start %in% GenomicRanges::start( data.ranges[unlist(plot.ranges[j]$revmap)] ), + size:=1] + } + + gene.plot <- ggplot2::ggplot(plot.data.melt, ggplot2::aes(x=start, y=beta, group=sample, color=color, alpha=alpha)) + + ggplot2::geom_line(linewidth=0.5) + + ggplot2::geom_point(mapping=ggplot2::aes(size=size)) + + ggplot2::scale_x_continuous(name=labs[1]) + + ggplot2::scale_y_continuous(name=labs[2], limits=c(0, 1), breaks=c(0, 0.25, 0.5, 0.75, 1)) + + ggplot2::scale_color_discrete(name="samples", limits=colorify) + + ggplot2::scale_alpha_continuous(guide="none") + + ggplot2::scale_size_identity(guide="none") + + ggplot2::theme_light() + + ggplot2::theme(legend.text=ggplot2::element_text(size=8), + axis.text.x=ggplot2::element_text(size=8, angle=0), + axis.text.y=ggplot2::element_text(size=8)) + + ggplot2::ggtitle( + if (is.null(title)) as.character(GenomicRanges::reduce(plot.ranges)) else title + ) plot.list[length(plot.list)+1] <- list(gene.plot) } diff --git a/R/simulateAMR.R b/R/simulateAMR.R index aa9dfbf..039eb00 100644 --- a/R/simulateAMR.R +++ b/R/simulateAMR.R @@ -74,9 +74,6 @@ #' amrs.nonunique <- #' simulateAMR(ramr.data, nsamples=3, exclude.ranges=amrs.unique, #' samples.per.region=2, min.cpgs=5, merge.window=1000) -#' @importFrom methods is -#' @importFrom utils head tail -#' @importFrom IRanges `%outside%` #' @export simulateAMR <- function (template.ranges, nsamples, @@ -111,7 +108,7 @@ simulateAMR <- function (template.ranges, universe.ranges <- getUniverse(template.ranges, merge.window=merge.window, min.cpgs=min.cpgs, min.width=min.width) universe.ranges <- subset(universe.ranges, `ncpg`<=max.cpgs) if (methods::is(exclude.ranges,"GRanges")) - universe.ranges <- subset(universe.ranges, universe.ranges %outside% exclude.ranges) + universe.ranges <- IRanges::subsetByOverlaps(universe.ranges, exclude.ranges, invert=TRUE) if (is.null(sample.names)) sample.names <- paste0("sample", seq_len(nsamples)) diff --git a/R/simulateData.R b/R/simulateData.R index 37d3f72..8c5e8fa 100644 --- a/R/simulateData.R +++ b/R/simulateData.R @@ -82,14 +82,6 @@ #' noisy.data <- #' simulateData(ramr.data, nsamples=10, amr.ranges=c(amrs,noise), cores=2) #' plotAMR(noisy.data, amr.ranges=amrs[1]) -#' @importFrom parallel detectCores makeCluster stopCluster -#' @importFrom doParallel registerDoParallel -#' @importFrom GenomicRanges mcols `mcols<-` granges -#' @importFrom EnvStats ebeta -#' @importFrom foreach foreach -#' @importFrom doRNG %dorng% -#' @importFrom methods is -#' @importFrom stats median na.omit rbeta #' @export simulateData <- function (template.ranges, nsamples, diff --git a/README.md b/README.md index c00c93e..30b9165 100644 --- a/README.md +++ b/README.md @@ -18,11 +18,12 @@ This readme contains condensed info on *`ramr`* usage. For more, please check fu ## Current Features * Identification of aberrantly methylated regions (AMRs) + - filtering by interquartile range (IQR) + - filtering by fitting non-weighted, weighted, or one-and-zero inflated beta distributions * AMR visualization * Generation of reference sets for third-party analyses (e.g. enrichment) * Generation of test data sets for performance evaluation of algorithms for search of differentially (DMR) or aberrantly (AMR) methylated regions - ------- ## Installation diff --git a/inst/unitTests/test_getAMR.R b/inst/unitTests/test_getAMR.R index 055cb09..e4104b1 100644 --- a/inst/unitTests/test_getAMR.R +++ b/inst/unitTests/test_getAMR.R @@ -1,5 +1,6 @@ test_getAMR <- function () { data(ramr) + RUnit::checkException( getAMR(c()) ) @@ -12,20 +13,39 @@ test_getAMR <- function () { RUnit::checkException( getAMR(ramr.data, ramr.samples, ramr.method="zzz") ) + + amr.iqr.1 <- getAMR(ramr.data, ramr.method="IQR", min.cpgs=5, merge.window=10000, iqr.cutoff=5, cores=1) + amr.iqr.2 <- getAMR(ramr.data, ramr.method="IQR", min.cpgs=5, merge.window=10000, iqr.cutoff=5, cores=2) + RUnit::checkIdentical( + amr.iqr.1, + amr.iqr.2 + ) + RUnit::checkEquals( + c(sum(GenomicRanges::countOverlaps(amr.iqr.2, ramr.tp.unique)), sum(GenomicRanges::countOverlaps(amr.iqr.2, ramr.tp.nonunique))), + c(6, 45) + ) + + amr.beta <- getAMR(ramr.data, ramr.samples, ramr.method="beta", min.cpgs=5, merge.window=10000, qval.cutoff=1e-3, cores=2) RUnit::checkEquals( - length( getAMR(ramr.data, ramr.method="IQR", min.cpgs=5, merge.window=10000, iqr.cutoff=5, cores=1) ), - length( c(ramr.tp.unique,ramr.tp.nonunique) ) + c(sum(GenomicRanges::countOverlaps(amr.beta, ramr.tp.unique)), sum(GenomicRanges::countOverlaps(amr.beta, ramr.tp.nonunique))), + c(6, 45) ) + + amr.wbeta <- getAMR(ramr.data, ramr.samples, ramr.method="wbeta", min.cpgs=5, merge.window=10000, qval.cutoff=1e-4, cores=2) RUnit::checkEquals( - length( getAMR(ramr.data, ramr.samples, ramr.method="beta", min.cpgs=5, merge.window=10000, qval.cutoff=1e-3, cores=2) ), - length( c(ramr.tp.unique,ramr.tp.nonunique) ) + c(sum(GenomicRanges::countOverlaps(amr.wbeta, ramr.tp.unique)), sum(GenomicRanges::countOverlaps(amr.wbeta, ramr.tp.nonunique))), + c(6, 45) ) + + amr.iqr.3 <- getAMR(ramr.data, ramr.samples, ramr.method="IQR", min.cpgs=5, merge.window=10000, cores=2, exclude.range=c(0.1,0.9)) RUnit::checkEquals( - length( getAMR(ramr.data, ramr.samples, ramr.method="wbeta", min.cpgs=5, merge.window=10000, qval.cutoff=1e-4, cores=2) ), - length( c(ramr.tp.unique,ramr.tp.nonunique) ) + c(sum(GenomicRanges::countOverlaps(amr.iqr.3, ramr.tp.unique)), sum(GenomicRanges::countOverlaps(amr.iqr.3, ramr.tp.nonunique))), + c(2, 18) ) + + amr.beinf <- getAMR(ramr.data, ramr.samples, ramr.method="beinf", min.cpgs=5, merge.window=10000, qval.cutoff=1e-3, cores=2) RUnit::checkEquals( - length( getAMR(ramr.data, ramr.samples, ramr.method="IQR", min.cpgs=5, merge.window=10000, qval.cutoff=1e-3, cores=2, exclude.range=c(0.1,0.9)) ), - 8 + c(sum(GenomicRanges::countOverlaps(amr.beinf, ramr.tp.unique)), sum(GenomicRanges::countOverlaps(amr.beinf, ramr.tp.nonunique))), + c(6, 45) ) } diff --git a/inst/unitTests/test_getUniverse.R b/inst/unitTests/test_getUniverse.R index b8eab00..7cc3218 100644 --- a/inst/unitTests/test_getUniverse.R +++ b/inst/unitTests/test_getUniverse.R @@ -8,6 +8,6 @@ test_getUniverse <- function () { length(ramr.data) ) RUnit::checkTrue( - all( width( getUniverse(ramr.data, min.cpgs=10, merge.window=1000) ) > 1000 ) + all( GenomicRanges::width( getUniverse(ramr.data, min.cpgs=10, merge.window=1000) ) > 1000 ) ) } diff --git a/inst/unitTests/test_plotAMR.R b/inst/unitTests/test_plotAMR.R index 4f410f9..0feebb7 100644 --- a/inst/unitTests/test_plotAMR.R +++ b/inst/unitTests/test_plotAMR.R @@ -2,10 +2,10 @@ test_plotAMR <- function () { data(ramr) RUnit::checkEquals( length(plotAMR(ramr.data, ramr.samples, ramr.tp.nonunique)), - length(reduce(ramr.tp.nonunique)) + length(GenomicRanges::reduce(ramr.tp.nonunique)) ) RUnit::checkEquals( length(plotAMR(ramr.data, NULL, ramr.tp.unique)), - length(reduce(ramr.tp.unique)) + length(GenomicRanges::reduce(ramr.tp.unique)) ) } diff --git a/inst/unitTests/test_simulateAMR.R b/inst/unitTests/test_simulateAMR.R index 2212e1f..92d21ea 100644 --- a/inst/unitTests/test_simulateAMR.R +++ b/inst/unitTests/test_simulateAMR.R @@ -18,10 +18,12 @@ test_simulateAMR <- function () { unique <- simulateAMR(ramr.data, nsamples=29, regions.per.sample=2, samples.per.region=1) nonunique <- simulateAMR(ramr.data, nsamples=20, exclude.ranges=unique, merge.window=1000, regions.per.sample=5, samples.per.region=3) noise <- simulateAMR(ramr.data, nsamples=100, exclude.ranges=c(unique,nonunique), merge.window=1, min.cpgs=1, max.cpgs=1, regions.per.sample=1000, samples.per.region=1) - RUnit::checkTrue( - all(unique %outside% nonunique) + RUnit::checkEquals( + sum(GenomicRanges::countOverlaps(unique, nonunique)), + 0 ) - RUnit::checkTrue( - all(noise %outside% c(unique,nonunique)) + RUnit::checkEquals( + sum(GenomicRanges::countOverlaps(noise, c(unique,nonunique))), + 0 ) } diff --git a/man/getAMR.Rd b/man/getAMR.Rd index c5b7f3c..f12f5f6 100644 --- a/man/getAMR.Rd +++ b/man/getAMR.Rd @@ -7,7 +7,7 @@ getAMR( data.ranges, data.samples = NULL, - ramr.method = "IQR", + ramr.method = c("IQR", "beta", "wbeta", "beinf"), iqr.cutoff = 5, pval.cutoff = 0.05, qval.cutoff = NULL, @@ -30,8 +30,9 @@ columns) are included in the analysis.} \item{ramr.method}{A character scalar: when ramr.method is "IQR" (the default), the filtering based on interquantile range is used (`iqr.cutoff` -value is then used as a threshold). When "beta" or "wbeta" - filtering based -on fitting non-weighted (`EnvStats::ebeta`) or weighted (`ExtDist::eBeta`) +value is then used as a threshold). When "beta", "wbeta" or "beinf" - +filtering based on fitting non-weighted (`EnvStats::ebeta`), weighted +(`ExtDist::eBeta`) or zero-and-one inflated (`gamlss.dist::BEINF`) beta distributions, respectively, is used, and `pval.cutoff` or `qval.cutoff` (if not `NULL`) is used as a threshold. For "wbeta", weights directly correlate with bin contents (number of values per bin) and inversly - with @@ -98,12 +99,16 @@ regions (AMRs) for all samples in a data set. \details{ In the provided data set, `getAMR` compares methylation beta values of each sample with other samples to identify rare long-range methylation -aberrations. For `ramr.method=="IQR"`: for every genomic location (CpG) in +aberrations (epimutations). +For `ramr.method=="IQR"`: for every genomic location (CpG) in `data.ranges` the IQR-normalized deviation from the median value is calculated, and all CpGs with such normalized deviation not smaller than the -`iqr.cutoff` are retained. For `ramr.method=="*beta"`: parameters of beta -distribution are estimated by means of `EnvStats::ebeta` or `ExtDist::eBeta` -functions, and then used to calculate the probability values, followed by the +`iqr.cutoff` are retained. For +`ramr.method %in% c("beta", "wbeta", "beinf")`: parameters of beta +distribution are estimated by means of `EnvStats::ebeta` (beta distribution), +`ExtDist::eBeta` (weighted beta destribution), or `gamlss.dist::BEINF` (zero +and one inflated beta distribution) functions, respectively. These +parameters are then used to calculate the probability values, followed by the filtering when all CpGs with p-values not greater than `qval.cutoff` are retained. Another filtering is then performed to exclude all CpGs within `exclude.range`. Next, the retained (significant) CpGs are merged within diff --git a/man/plotAMR.Rd b/man/plotAMR.Rd index b982dfb..428e551 100644 --- a/man/plotAMR.Rd +++ b/man/plotAMR.Rd @@ -10,6 +10,7 @@ plotAMR( amr.ranges, highlight = NULL, title = NULL, + labs = c("genomic position", "beta value"), window = 300 ) } @@ -31,6 +32,9 @@ default), will contain sample IDs from the `sample` metadata column of \item{title}{An optional title for the plot. If NULL (the default), plot title is set to a genomic location of particular AMR.} +\item{labs}{Optional axis labels for the plot. Default: c("genomic position", +"beta value").} + \item{window}{An optional integer constant to expand genomic ranges of the `amr.ranges` object (the default: 300).} } diff --git a/vignettes/ramr.Rmd b/vignettes/ramr.Rmd index 8cb4331..b4a3eae 100644 --- a/vignettes/ramr.Rmd +++ b/vignettes/ramr.Rmd @@ -24,7 +24,7 @@ options(width=100) # Introduction -*`ramr`* is an R package for detection of low-frequency aberrant methylation events in large data sets +*`ramr`* is an R package for detection of low-frequency aberrant methylation events (epimutations) in large data sets obtained by methylation profiling using array or high-throughput bisulfite sequencing. In addition, package provides functions to visualize found aberrantly methylated regions (AMRs), to generate sets of all possible regions to be used as reference sets for enrichment analysis, and to generate biologically relevant test data sets for @@ -62,6 +62,8 @@ GRanges object with 383788 ranges and 845 metadata columns: *`ramr`* package is supplied with a sample data, which was simulated using GSE51032 data set as described in the *`ramr`* reference paper. Sample data set *`ramr.data`* contains beta values for 10000 CpGs and 100 samples (*`ramr.samples`*), and carries 6 unique (*`ramr.tp.unique`*) and 15 non-unique (*`ramr.tp.nonunique`*) true positive AMRs containing at least 10 CpGs with their beta values increased/decreased by 0.5 ```{r, fig.width=10, fig.height=4, out.width="100%", out.height="100%"} +library(GenomicRanges) +library(ggplot2) library(ramr) data(ramr)