From 494058396bbaecdb3f79b7e7deda71913f1d8adb Mon Sep 17 00:00:00 2001 From: Michael Piechotta Date: Wed, 14 Oct 2020 01:00:46 +0200 Subject: [PATCH] Fixes --- NAMESPACE | 13 +- R/arrest-rate.R | 23 +- R/base-count.R | 11 +- R/{bc-ratio.R => base-ratio.R} | 4 +- R/base_sub.R | 21 +- R/common.R | 57 +-- R/cov.R | 4 - R/helper-filter.R | 34 +- R/helper-robust.R | 119 ++++-- R/helper-sub.R | 2 +- R/helper.R | 74 ++-- R/io-guess.R | 4 +- R/io.R | 69 ++-- R/non-ref-ratio.R | 2 +- R/plot-ecdf-allele-count.R | 18 - R/plot-ecdf-column.R | 90 ----- R/sub-ratio.R | 2 +- R/write-result.R | 1 + man/JACUSA2helper.Rd | 53 +-- man/apply_cond.Rd | 20 - man/apply_repl.Rd | 20 - man/base_call.Rd | 17 + man/base_count.Rd | 4 +- man/{bc_ratio.Rd => base_ratio.Rd} | 10 +- man/base_sub.Rd | 8 +- man/coverage.Rd | 19 + man/filter_all_artefacts.Rd | 23 -- man/filter_artefact.Rd | 10 +- man/gather_repl.Rd | 21 ++ man/lapply_cond.Rd | 23 ++ man/lapply_repl.Rd | 23 ++ man/mapply_repl.Rd | 23 ++ man/non_ref_ratio.Rd | 2 +- man/plot_ecdf_allele_count.Rd | 31 -- man/plot_ecdf_column.Rd | 42 --- man/read_results.Rd | 4 +- man/robust.Rd | 4 +- man/sub_ratio.Rd | 2 +- .../{test-bc-ratio.R => test-base-ratio.R} | 6 +- .../testthat/{test-cov.R => test-coverage.R} | 16 +- tests/testthat/test-filter-by-filter-info.R | 7 - tests/testthat/test-guess-cond-count.R | 16 +- tests/testthat/test-guess-file-type.R | 8 +- tests/testthat/test-io.R | 4 +- vignettes/JACUSA2helper-meta-condition.bib | 37 -- vignettes/JACUSA2helper-meta-condtions.Rmd | 168 +++++---- vignettes/JACUSA2helper.Rmd | 348 ++++++++++-------- 47 files changed, 710 insertions(+), 807 deletions(-) rename R/{bc-ratio.R => base-ratio.R} (91%) delete mode 100644 R/cov.R delete mode 100644 R/plot-ecdf-allele-count.R delete mode 100644 R/plot-ecdf-column.R delete mode 100644 man/apply_cond.Rd delete mode 100644 man/apply_repl.Rd create mode 100644 man/base_call.Rd rename man/{bc_ratio.Rd => base_ratio.Rd} (77%) create mode 100644 man/coverage.Rd delete mode 100644 man/filter_all_artefacts.Rd create mode 100644 man/gather_repl.Rd create mode 100644 man/lapply_cond.Rd create mode 100644 man/lapply_repl.Rd create mode 100644 man/mapply_repl.Rd delete mode 100644 man/plot_ecdf_allele_count.Rd delete mode 100644 man/plot_ecdf_column.Rd rename tests/testthat/{test-bc-ratio.R => test-base-ratio.R} (79%) rename tests/testthat/{test-cov.R => test-coverage.R} (60%) delete mode 100644 vignettes/JACUSA2helper-meta-condition.bib diff --git a/NAMESPACE b/NAMESPACE index 5d2819e..8f56ce2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,20 +3,21 @@ export(All) export(Any) export(add_arrest_rate) -export(apply_cond) -export(apply_repl) export(arrest_rate) +export(base_call) export(base_count) +export(base_ratio) export(base_sub) -export(bc_ratio) export(clean_read_sub) -export(filter_all_artefacts) +export(coverage) export(filter_artefact) +export(gather_repl) +export(lapply_cond) +export(lapply_repl) +export(mapply_repl) export(mask_sub) export(merge_sub) export(non_ref_ratio) -export(plot_ecdf_allele_count) -export(plot_ecdf_column) export(read_result) export(read_results) export(robust) diff --git a/R/arrest-rate.R b/R/arrest-rate.R index 9e8d050..c60b656 100644 --- a/R/arrest-rate.R +++ b/R/arrest-rate.R @@ -8,27 +8,10 @@ #' #' @export add_arrest_rate <- function(result, cores = 1) { - arrest_cov <- .cov(result[[.ARREST_HELPER_COL]]) - through_cov <- .cov(result[[.THROUGH_HELPER_COL]]) - - - result[[.ARREST_RATE_COL]] <- lapply(result[[.ARREST_HELPER_COL]], function(x) { - 
tidyr::as_tibble(lapply(x, function(y) { - rowSums(y) - })) - }) %>% c() %>% tidyr::as_tibble() + arrest_cov <- coverage(result[[.ARREST_COL]]) + through_cov <- coverage(result[[.THROUGH_COL]]) - # FIXME make it more robust - cond_count <- names(result[[.THROUGH_HELPER_COL]]) %>% length() - for (cond in paste0("cond", 1:cond_count)) { - cond_through <- result[[.THROUGH_HELPER_COL]][[cond]] - for (repl in names(cond_through)) { - arrest_cov <- result[[.ARREST_RATE_COL]][[cond]][[repl]] - through_cov <- rowSums(cond_through[[repl]]) - - result[[.ARREST_RATE_COL]][[cond]][[repl]] <- arrest_rate(arrest_cov, through_cov) - } - } + result[[.ARREST_RATE_COL]] <- mapply_repl(arrest_rate, arrest_cov, through_cov) result } diff --git a/R/base-count.R b/R/base-count.R index 3ea745c..6f452fe 100644 --- a/R/base-count.R +++ b/R/base-count.R @@ -10,7 +10,7 @@ #' } #' #' @importFrom magrittr %>% -#' @param bc vector of strings with observed base calls. +#' @param bases vector of base calls or tibble of base call counts. #' @param ref vector of strings with reference bases #' @return numeric vector of total number of bases. #' @examples @@ -21,12 +21,15 @@ #' ref <- c("A", "A", "T") #' str(base_count(bc, ref)) #' @export -base_count <- function(bc, ref = NULL) { +base_count <- function(bases, ref = NULL) { + if (! is.vector(bases)) { + bases <- base_call(bases) + } if (! is.null(ref)) { - bc <- paste0(bc, ref) + bases <- paste0(bases, ref) } - strsplit(bc, "") %>% + strsplit(bases, "") %>% lapply(unique) %>% # remove duplicates lapply(length) %>% # how many bases? unlist() diff --git a/R/bc-ratio.R b/R/base-ratio.R similarity index 91% rename from R/bc-ratio.R rename to R/base-ratio.R index c259384..aa54258 100644 --- a/R/bc-ratio.R +++ b/R/base-ratio.R @@ -7,9 +7,9 @@ #' @return data frame of base call rations \code{bases}. #' @examples #' bases <- matrix(c(5, 0, 5, 0, 1, 1, 1, 1), byrow = TRUE, ncol = 4) -#' ratio <- bc_ratio(bases) +#' ratio <- base_ratio(bases) #' @export -bc_ratio <- function(bases) { +base_ratio <- function(bases) { total <- rowSums(bases) m <- matrix( diff --git a/R/base_sub.R b/R/base_sub.R index 897cebd..482f69e 100644 --- a/R/base_sub.R +++ b/R/base_sub.R @@ -2,30 +2,33 @@ #' #' Calculates and formats base changes for \code{ref} and \code{bc}. #' All elements in \code{ref} must be all length 1. -#' +#' #' @param ref vector of reference bases. -#' @param bc vector of base calls. +#' @param bases vector of base calls or tibble of base call counts. #' @return vector of formatted base call substitutions. #' @examples #' ref <- c("A", "A", "C", "A") -#' bc <- c("AG", "G", "C", "G") -#' base_sub(ref, bc) +#' bases <- c("AG", "G", "C", "G") +#' base_sub(ref, bases) #' @export -base_sub <- function(ref, bc) { +base_sub <- function(ref, bases) { if (all(nchar(ref) != 1)) { stop("All ref elements must be nchar() == 1") } + if (! is.vector(bases)) { + bases <- base_call(bases) + } # remove ref in bc # goal: A->G instead of A->AG - bc <- mapply(function(r, o) { + bases <- mapply(function(r, o) { return(gsub(r, "", o)) - }, ref, bc) + }, ref, bases) # add nice separator - sub <- paste(ref, sep = .SUB_SEP, bc) + sub <- paste(ref, sep = .SUB_SEP, bases) # add nice info when there is no change - sub[ref == bc | bc == ""] <- .SUB_NO_CHANGE + sub[ref == bases | bases == ""] <- .SUB_NO_CHANGE sub } diff --git a/R/common.R b/R/common.R index bf2359f..3433080 100644 --- a/R/common.R +++ b/R/common.R @@ -1,14 +1,7 @@ #' JACUSA2helper: A package for post-processing JACUSA2 result files. 
 #'
 #' @section Description:
-#' A package that provides the following categories of functions to post-process result files of JACUSA2:
-#' \describe{
-#'   \item{read/write}{Read and write JACUSA2 result files, e.g.: \code{read_result()}.}
-#'   \item{add}{Adds some field to an existing JACUSA2 result object and return the modified object, e.g.: \code{base_sub()}.}
-#'   \item{filter}{Will remove sites from a result object with some filtering criteria, e.g.: \code{filter_by_coverage()}}
-#'   \item{plot}{Plots certain characteristics of a JACUSA2 result object.}
-#'   \item{other}{TODO}
-#' }
+#' A package that provides functions to post-process result files of JACUSA2.
 #'
 #' The following methods from JACUSA2 are supported:
 #' \describe{
@@ -28,16 +21,21 @@
 #' The central data structure in JACUSA2helper is the JACUSA2 result object that follows the
 #' tidy data approach to feature easy interaction with dplyr and ggplot2.
 #' A JACUSA2 result object can be created via \code{result <- read_result("jacusa2.out")} and is
-#' currently represented as a tibble. Furthermore, JACUSA2helper supports the analysis of several related
-#' JACUSA2 result files via \code{results <- read_results(files, meta)} where \code{meta_conditions} is a
-#' vector of character strings that provides a descriptive name for each file in \code{files}.
+#' currently represented as a tibble. Special structured columns exist that
+#' hold condition and replicate related data such as coverage, bases, and arrest rate.
+#' Furthermore, JACUSA2helper supports the analysis of several related JACUSA2
+#' result files via \code{results <- read_results(files, meta_cond)} where
+#' \code{meta_cond} is a vector of character strings that provides a descriptive
+#' name for each file in \code{files}.
 #'
-#' Check \code{vignette(TODO)} for a general introduction and \code{vignette(TODO meta conditions)} for details about meta conditions.
+#' Check \code{vignette("JACUSA2helper", "JACUSA2helper")} for a general
+#' introduction and \code{vignette("JACUSA2helper-meta-condtions", "JACUSA2helper")} for details about
+#' meta conditions.
 #'
 #' @section read/write functions:
 #' See:
 #' \describe{
-#'   \item{read_result}{Reads and unpacks a JACUSA2 result file and creates a result object.}
+#'   \item{read_result}{Reads and unpacks a JACUSA2 result file.}
 #'   \item{read_results}{Allows to combine multiple result files and distinguish them with meta conditions.}
 #   \item{write_result}{This will pack result object and write its contents back to a file.}
 #'   \item{write_bedGraph}{Writes a vector of values as bedGraph file.}
@@ -46,28 +44,35 @@
 #' @section Helper functions:
 #' See:
 #' \describe{
-#'   \item{arrest_rate}{Adds arrest rate to JACUSA2 result object.}
-#'   \item{base_count}{TODO}
-#'   \item{base_sub}{Adds base substitution column to JACUSA2 result object.}
-#'   \item{non_ref2bc_ratio}{Adds non reference base ratio to JACUSA2 result object.}
-#'   \item{sub_ratio}{Adds base substitution ratio for all bases to a JACUSA2 result object.}
+#'   \item{arrest_rate}{Calculates the arrest rate from base call counts (arrest, through).}
+#'   \item{base_count}{Calculates the number of observed base calls.}
+#'   \item{base_sub}{Calculates base substitutions.}
+#'   \item{base_ratio}{Calculates base call ratios.}
+#'   \item{non_ref_ratio}{Calculates the non-reference base call ratio.}
+#'   \item{sub_ratio}{Calculates the base substitution ratio for all bases.}
+#'
+# \item{merge_sub}
+# \item{mask_sub}
 #' }
 #'
 #' @section filter functions:
-#' This function set enables filtering by read coverage or
-#' enforcing a minimal number of variant base calls per sample.
+#' This function set enables filtering by read coverage or enforcing a minimal
+#' number of variant base calls per sample.
 #'
 #' See:
 #' \describe{
-#'   \item{robust}{Retains sites that contain an arrest event in all replicates in at least one condition.}
-#'   \item{All}{TODO}
-#'   \item{Any}{TODO}
+#'   \item{robust}{Retains sites with a robust feature, i.e. the feature is observed in all replicates of at least one condition.}
+#'   \item{filter_artefact}{Removes sites that have been marked by a feature/artefact filter.}
+#'   \item{All}{Helper function that merges logical columns with AND.}
+#'   \item{Any}{Helper function that merges logical columns with OR.}
+#'   \item{lapply_cond}{lapply wrapper that applies a function to all conditions.}
+#'   \item{lapply_repl}{lapply wrapper that applies a function to all replicates.}
+#'   \item{mapply_repl}{mapply wrapper that applies a function to all replicates.}
 #' }
 #'
-#' # TODO
-#' read_sub
+# # TODO
+# read_sub
 #'
 #' @docType package
 #' @name JACUSA2helper
 NULL
-
diff --git a/R/cov.R b/R/cov.R
deleted file mode 100644
index f9febdc..0000000
--- a/R/cov.R
+++ /dev/null
@@ -1,4 +0,0 @@
-#
-.cov <- function(bases, cores = 1) {
-  apply_repl(bases, rowSums, cores)
-}
diff --git a/R/helper-filter.R b/R/helper-filter.R
index 21d76cd..9592a06 100644
--- a/R/helper-filter.R
+++ b/R/helper-filter.R
@@ -3,38 +3,24 @@
 #' Removes sites that sites that have been marked by feature/artefact filter.
 #'
 #' @param filter vector of strings that contains artefact filter information.
-#' @param artefacts vector of characters that correspond to feature/artefact filters to be filtered.
+#' @param artefacts vector of characters that correspond to feature/artefact filters to be filtered; default: NULL filters sites marked by any artefact filter.
 #' @return vector of logical.
#' @examples #' data(rdd) #' # remove sites that are marked by artefact filter "D" -#' filtered <- filter_by(rdd, filter_artefact(filter, c("D"))) -#' str(filtered) +#' dim(rdd) +#' dim(dplyr::filter(rdd, filter_artefact(filter, c("D")))) +#' # remove all sites that are marked by some artefact filter +#' dim(dplyr::filter(rdd, filter_artefact(filter))) #' @export -filter_artefact <- function(filter, artefacts) { - if (nchar(artefacts) == 0) { - stop("artefacts cannot be 0") +filter_artefact <- function(filter, artefacts = NULL) { + if (is.null(artefacts)) { + artefacts <- c(paste0('\\', .EMPTY)) } grepl(paste0(artefacts, collapse = "|"), filter) } -#' Filters all sites with an artefact -#' -#' Removes sites have been marked by any feature/artefact filter. -#' -#' @param filter vector of strings that contains artefact filter information. -#' @return vector of logical. -#' @examples -#' data(rdd) -#' # remove sites that are marked by artefact filter "D" -#' filtered <- filter_by(rdd, filter_all_artefacts(filter)) -#' str(filtered) -#' @export -filter_all_artefacts <- function(filter) { - filter_artefact(filter, c(paste0('\\', .EMPTY))) -} - #' Merge tibbles with columns holding logical values with AND to a vector. #' #' Each column holding values of logicals will be merged row-wise with AND. @@ -42,7 +28,7 @@ filter_all_artefacts <- function(filter) { #' @return logical tibble with columns merged with AND. #' @export All <- function(d) { - Reduce("&", tidyr::as_tibble(d)) + Reduce("&", tidyr::as_tibble(d)) %>%tidyr::as_tibble() } @@ -53,5 +39,5 @@ All <- function(d) { #' @return logical tibble with columns merged with OR. #' @export Any <- function(d) { - Reduce("|", tidyr::as_tibble(d)) + Reduce("|", tidyr::as_tibble(d)) %>% tidyr::as_tibble() } diff --git a/R/helper-robust.R b/R/helper-robust.R index bb8bc32..fe5786c 100644 --- a/R/helper-robust.R +++ b/R/helper-robust.R @@ -1,70 +1,115 @@ - - - #' Check if data is robust #' #' Tests if all values that have been observed (>0) in all conditions (op = |) # can observed in one of the conditions (op = &) #' Supports 1 or 2 conditions. #' -#' @param data tibble of numeric. +#' @param x tibble of numeric. #' @return logical vector indicating robust rows. #' @export -robust <- function(data) { - not_null <- apply_repl(df, function(x) {x > 0}) +robust <- function(x) { + not_null <- lapply_repl(x, function(x) {x > 0}) mask_any <- .mask_any(not_null) mask_all <- .mask_all(not_null) mask <- mask_any & mask_all | ! mask_any - n <- length(names(mask)) - + n <- dim(mask)[2] + rowSums(mask) == n } #' @noRd -.mask_any <- function(df) { - apply_cond(df, function(cond) { - Reduce("|", cond) - }) %>% Reduce("|", .) %>% tidyr::as_tibble() +.mask_any <- function(x) { + lapply_cond(x, function(cond) { + Any(cond) + }) %>% Any() %>% tidyr::as_tibble() } #' @noRd -.mask_all <- function(df) { - apply_cond(df, function(cond) { - Reduce("&", cond) - }) %>% Reduce("|", .) %>% tidyr::as_tibble() +.mask_all <- function(x) { + lapply_cond(x, function(cond) { + All(cond) + }) %>% Any() %>% tidyr::as_tibble() } -#' Apply FUN to all tibbles of conditions +#' Apply f to all conditions #' -#' TODO -#' @param X TODO -#' @param FUN TODO -#' @param cores TODO -#' @param ... TODO +#' Wrapper for lapply +#' @param x tibble of data per condition +#' @param f function to apply to each condition data +#' @param ... parameters for f +#' @param cores numer of cores to use +#' @return tibble #' @export -apply_cond <- function(X, FUN, cores = 1, ...) 
{ +lapply_cond <- function(x, f, ..., cores = 1) { parallel::mclapply( - X, FUN, ..., - mc.cores = min(names(X), cores), mc.preschedule = FALSE - ) %>% lapply(., tidyr::as_tibble) %>% tidyr::as_tibble() + x, f, ..., + mc.cores = min(names(x), cores), + mc.preschedule = FALSE, + mc.allow.recursive = FALSE + ) %>% + .as() %>% + tidyr::as_tibble() } -#' Apply FUN to all replicates +#' Apply f to all replicates #' -#' TODO -#' @param X TODO -#' @param FUN TODO -#' @param cores TODO -#' @param ... TODO +#' Wrapper for lapply +#' @param x tibble of data per condition +#' @param f function to apply to each replicate data +#' @param ... parameters for f +#' @param cores numer of cores to use +#' @return tibble #' @export -apply_repl <- function(X, FUN, cores = 1, ...) { - # lapply(., tidyr::as_tibble) %>% +lapply_repl <- function(x, f, ..., cores = 1) { + .helper <- function(y) { + lapply(y, f, ...) %>% + lapply(.as) %>% + tidyr::as_tibble() + } + + # TODO use lapply_cond parallel::mclapply( - X, function(cond) { - lapply(cond, FUN, ...) %>% tidyr::as_tibble() - }, mc.cores = min(names(X), cores), mc.preschedule = FALSE + x, .helper, + mc.cores = min(names(x), cores), + mc.preschedule = FALSE, + mc.allow.recursive = FALSE + ) %>% tidyr::as_tibble() +} + +# make everything a tibble if it has a 2nd dimension +#' @noRd +.as <- function(y) { + if (is.vector(y)) { + return(y) + } + return(tidyr::as_tibble(y)) +} + +#' Apply f to all replicates +#' +#' Wrapper for mapply. +#' @param f function to apply to each replicate data +#' @param ... see mapply +#' @param MoreArgs see mapply +#' @param cores numer of cores to use +#' @return tibble +#' @export +mapply_repl <- function(f, ..., MoreArgs = NULL, cores = 1) { + .helper <- function(...) { + dots <- list(...) + mapply(f, ..., MoreArgs = MoreArgs, SIMPLIFY = FALSE) %>% + lapply(.as) %>% + tidyr::as_tibble() + } + + parallel::mcmapply( + .helper, + ..., + SIMPLIFY = FALSE, + mc.cores = cores, + mc.preschedule = FALSE ) %>% tidyr::as_tibble() } diff --git a/R/helper-sub.R b/R/helper-sub.R index 988dce3..3f008e6 100644 --- a/R/helper-sub.R +++ b/R/helper-sub.R @@ -96,5 +96,5 @@ mask_sub <- function(subs, keep) { #' clean_read_sub(subs) #' @export clean_read_sub <- function(subs) { - i <- gsub("2", .SUB_SEP, subs) + gsub("2", .SUB_SEP, subs) } diff --git a/R/helper.R b/R/helper.R index 53b3b38..f402425 100644 --- a/R/helper.R +++ b/R/helper.R @@ -23,10 +23,8 @@ # convenience: description data fields .CALL_PILEUP_COL <- "bases" -.ARREST_DATA_COL <- "arrest" -.ARREST_HELPER_COL <- "arrest" -.THROUGH_DATA_COL <- "through" -.THROUGH_HELPER_COL <- "through" +.ARREST_COL <- "arrest" +.THROUGH_COL <- "through" .LRT_ARREST_POS_COL <- "arrest_pos" @@ -42,42 +40,42 @@ .META_COND_COL <- "meta_cond" .COV <- "cov" -.BC <- "bc" -.SUB <- "sub" -.SUB_RATIO <- "sub_ratio" -.NON_REF_RATIO <- "non_ref_ratio" -# get unique base calls from a vector of base calls -.get_unique_base_calls <- function(bcs) { - base_calls <- bcs %>% - paste0() %>% - strsplit("") %>% - unlist() %>% - unique() - - base_calls +#' Calculate coverage for structured column. +#' +#' Calculate coverage for structured column. 
+#' +#' @param bases structured column of bases +#' @param cores number of cores to use +#' @return structure column with coverages +#' +#' @export +coverage <- function(bases, cores = 1) { + lapply_repl(bases, rowSums, cores) } -# apply boolean operator "&","|" on all columns -.get_mask <- function(mat, op) { - mat <- t(apply(mat, 1, function(x) { x > 0 })) - if (op == "all") { - mat <- t(apply(mat, 2, all)) - } else if (op == "any") { - mat <- t(apply(mat, 2, any)) - } else { - stop("Unknown op: ", op) +#' Extract values from a structured column. +#' +#' Extract values from a structured column. +#' +#' @param id vector that unique identifies each row +#' @param x structured column +#' @param meta_cond vector of meta conditions +#' @return extracted column +#' +#' @export +gather_repl <- function(id, x, meta_cond = NULL) { + r <- list() + for (cond in names(x)) { + for (repl in names(x[[cond]])) { + df <- tidyr::tibble(id = id, value = x[[cond]][[repl]]) + if (! is.null(meta_cond)) { + df[["meta_cond"]] <- meta_cond + } + df[["cond"]] <- cond + df[["repl"]] <- repl + r[[length(r) + 1]] <- df + } } - - mat + dplyr::bind_rows(r) } - -# get number of alleles from a vector of base calls -.get_alleles <- function(bcs) { - allele_count <- length(.get_unique_base_calls(bcs)) - - allele_count -} - - - diff --git a/R/io-guess.R b/R/io-guess.R index baf1345..1d6d40a 100644 --- a/R/io-guess.R +++ b/R/io-guess.R @@ -10,7 +10,7 @@ cond_repl <- .extract_cond_repl(header_names, prefix) cond_count <- .find_cond_count(cond_repl) } else if (type %in% c(.RT_ARREST, .LRT_ARREST)) { - cond_counts <- lapply(c(.ARREST_DATA_COL, .THROUGH_DATA_COL), function(x) { + cond_counts <- lapply(c(.ARREST_COL, .THROUGH_COL), function(x) { cond_repl <- .extract_cond_repl(header_names, x) return(.find_cond_count(cond_repl)) }) @@ -70,7 +70,7 @@ type <- .UNKNOWN_METHOD if (length(grep(.LRT_ARREST_POS_COL, line)) > 0) { # lrt-arrest type <- .LRT_ARREST - } else if(length(grep(paste0("\t", .ARREST_DATA_COL), line)) > 0) { # rt-arrest + } else if(length(grep(paste0("\t", .ARREST_COL), line)) > 0) { # rt-arrest type <- .RT_ARREST } else if (length(grep(paste0("\t", .CALL_PILEUP_COL), line)) > 0) { # call-pileup type <- .CALL_PILEUP diff --git a/R/io.R b/R/io.R index 47bb5d9..ce71276 100644 --- a/R/io.R +++ b/R/io.R @@ -87,10 +87,13 @@ read_result <- function(file, cond_desc = c(), unpack = FALSE, progress = TRUE, } attr(result, .ATTR_HEADER) <- jacusa_header result <- sticky::sticky(result) + id <- info <- filter <- ref <- NULL + result <- result %>% + dplyr::select(id, dplyr::everything()) %>% + dplyr::select(-c(info, filter, ref), c(info, filter, ref)) + - rownames(result) <- .id(result) - - result + result } @@ -109,6 +112,7 @@ read_result <- function(file, cond_desc = c(), unpack = FALSE, progress = TRUE, regex <- paste0("(", .SUB_TAG_COL, "|read_sub)=([^;]+)") if (any(stringr::str_detect(data[[.INFO_COL]], regex))) { sub_tag <- stringr::str_match(data[[.INFO_COL]], regex)[, 3] + sub_tag <- clean_read_sub(sub_tag) return(paste0(coord, ":", sub_tag)) } @@ -125,45 +129,24 @@ read_result <- function(file, cond_desc = c(), unpack = FALSE, progress = TRUE, if(type == .CALL_PILEUP) { cols <- c(.CALL_PILEUP_COL) } else if (type %in% c(.RT_ARREST, .LRT_ARREST)) { - cols <- c(.ARREST_DATA_COL, .THROUGH_DATA_COL) + cols <- c(.ARREST_COL, .THROUGH_COL) } else { stop("Unknown type: ", type) } result <- .unpack_cols(data, cols, cond_count, .BASES, cores) # process arrest and through columns if (type %in% c(.RT_ARREST, .LRT_ARREST)) { - 
if (.ARREST_HELPER_COL != .ARREST_DATA_COL) { - result[[.ARREST_HELPER_COL]] <- result[[.ARREST_DATA_COL]] - result[[.ARREST_DATA_COL]] <- NULL - } - if (.THROUGH_HELPER_COL != .THROUGH_DATA_COL) { - result[[.THROUGH_HELPER_COL]] <- result[[.THROUGH_DATA_COL]] - result[[.THROUGH_DATA_COL]] <- NULL - } - - result[[.CALL_PILEUP_COL]] <- result[[.ARREST_HELPER_COL]] - for (cond in paste0("cond", 1:cond_count)) { - cond_through <- result[[.THROUGH_HELPER_COL]][[cond]] - for (repl in names(cond_through)) { - result[[.CALL_PILEUP_COL]][[cond]][[repl]] <- - result[[.CALL_PILEUP_COL]][[cond]][[repl]] + cond_through[[repl]] - } - } + result[[.CALL_PILEUP_COL]] <- mapply_repl( + Reduce, + result[[.ARREST_COL]], result[[.THROUGH_COL]], + MoreArgs=list(f="+"), cores = cores + ) result <- add_arrest_rate(result) } - # FIXME make faster - cond_bases <- parallel::mclapply(result[[.CALL_PILEUP_COL]], function(x) { - return(Reduce('+', x)) - }, mc.cores = min(cond_count, cores), mc.preschedule = FALSE) - total_bases <- Reduce('+', cond_bases) - # add coverage - result[[.COV]] <- .cov(result[[.CALL_PILEUP_COL]]) - - # add base call - result[[.BC]] <- .base_call(total_bases) - + result[[.COV]] <- coverage(result[[.CALL_PILEUP_COL]]) + # unpack info field if (unpack) { result <- .unpack_info(result, cond_count, cores) @@ -172,8 +155,14 @@ read_result <- function(file, cond_desc = c(), unpack = FALSE, progress = TRUE, result } -# -.base_call <-function(bases) { +#' Observed base calls. +#' +#' Observed base calls. +#' +#' @param bases tibble of base call counts +#' @return vector of observed base calls +#' @export +base_call <-function(bases) { apply(bases > 0, 1, function(x) { paste0(names(which(x)), collapse= "") } @@ -302,17 +291,17 @@ read_result <- function(file, cond_desc = c(), unpack = FALSE, progress = TRUE, #' #' @param files Vector of files. #' @param meta_conds Vector of string vectors that explain files. -# FIXME @param cond_descs Vector of strings that represent names/descriptions for conditions. +#' @param cond_descs Vector of strings that represent names/descriptions for conditions. #' @param unpack Boolean indicates if info column should be processed. #' @param cores Integer defines how many cores to use. 
#' #' @export -read_results <- function(files, meta_conds, unpack = FALSE, cores = 1) { +read_results <- function(files, meta_conds, cond_descs, unpack = FALSE, cores = 1) { stopifnot(length(files) == length(meta_conds)) # read all files - l <- parallel::mcmapply(function(file, meta_cond) { - result <- read_result(file, unpack) + l <- parallel::mcmapply(function(file, meta_cond, cond_desc) { + result <- read_result(file, cond_desc, unpack) type <- attr(result, .ATTR_TYPE) attr(result, .ATTR_TYPE) <- NULL @@ -320,7 +309,7 @@ read_results <- function(files, meta_conds, unpack = FALSE, cores = 1) { result[[.META_COND_COL]] <- meta_cond return(list(result, type)) - }, files, meta_conds, mc.cores = min(length(files), cores), SIMPLIFY = FALSE) + }, files, meta_conds, cond_descs, mc.cores = min(length(files), cores), SIMPLIFY = FALSE) types <- lapply(l, "[[", 2) %>% unlist(use.names = FALSE) @@ -330,7 +319,7 @@ read_results <- function(files, meta_conds, unpack = FALSE, cores = 1) { results$meta_cond <- as.factor(results$meta_cond) attr(results, .ATTR_TYPE) <- types - # FIXME attr(results, .ATTR_COND_DESC) <- cond_descs + attr(results, .ATTR_COND_DESC) <- cond_descs results } diff --git a/R/non-ref-ratio.R b/R/non-ref-ratio.R index 9b0d821..bd9cb90 100644 --- a/R/non-ref-ratio.R +++ b/R/non-ref-ratio.R @@ -13,7 +13,7 @@ #' 0, 1, 1, 1, #' 1, 1, 1, 1 #' ) -#' str(non_ref_ratio(ref, bases)) +#' non_ref_ratio(ref, bases) #' @export non_ref_ratio <- function(ref, bases) { stopifnot(length(ref) == nrow(bases)) diff --git a/R/plot-ecdf-allele-count.R b/R/plot-ecdf-allele-count.R deleted file mode 100644 index 1d8f88f..0000000 --- a/R/plot-ecdf-allele-count.R +++ /dev/null @@ -1,18 +0,0 @@ -#' Plot cumulative allele count. -#' -#' Plots cumulative distribution of allele counts for single and multiple JACUSA2 result objects. -#' @seealso \code{plot_ecdf_column()} -#' -#' @param result object created by \code{read_result()} or \code{read_results()}. -#' @param data_desc vector of characters strings providing details about the sample. -#' @param allele_count_column character sting specifing the name of the column. -#' @param ... parameters forwared to plot_ecdf_column. -#' @return ggplot2 plot object. -#' -#' @export -plot_ecdf_allele_count <- function(result, data_desc, allele_count_column = "allele_count", ...) { - p <- plot_ecdf_column(result, allele_count_column, data_desc, ...) + - ggplot2::xlab("Allele count") - - p -} diff --git a/R/plot-ecdf-column.R b/R/plot-ecdf-column.R deleted file mode 100644 index 157c3fd..0000000 --- a/R/plot-ecdf-column.R +++ /dev/null @@ -1,90 +0,0 @@ -#' Generic plot for cumulative data (EXPERIMENTAL) -#' -#' Creates a generic plot of a \code{column} from a JACUSA2 \code{result} object and display -#' verbose data descripton \code{data_desc}. -#' -#' The user must make sure that dimensions (rows and length) of: -#' \code{result} and \code{data_desc} match and that \code{column} exists -#' in \code{result}. Furthermore, the user is responsible for correct filtering and grouping of the -#' \code{result} object. -#' \code{data_desc} is used to provide a descriptive name for each observation. -#' -#' @importFrom magrittr %>% -#' @param result object created by \code{read_result()} or \code{read_results()}. -#' @param column character string to be used for plotting. -#' @param data_desc vector of characters strings providing details about the sample. -#' @param main_group character string to be used to define colour. 
-#' @param sub_group character string to be used to define linetype. -#' @param name character string defines name of the legend. -#' @return ggplot2 plot object. -#' -#' @export -plot_ecdf_column <- function(result, column, data_desc, main_group, sub_group = NULL, name = "Data description") { - if (nrow(result) != length(data_desc)) { - stop("'data_desc' does not match number of entries of 'result'") - } - - result[["data_desc"]] <- NULL - result <- tidyr::add_column(result, data_desc) - - groups <- c(main_group) - if (! is.null(sub_group)) { - groups <- c(groups, sub_group) - } - result$group <- do.call( - interaction, - list(result[groups]) - ) - - columns <- c(groups, "group", "data_desc") - meta_desc <- dplyr::distinct( - result, !!!rlang::syms(columns) - ) - - limits <- as.vector(meta_desc[["group"]]) - labels <- meta_desc[["data_desc"]] - - column <- group <- NULL - p <- NULL - if (is.null(sub_group)) { - p <- ggplot2::ggplot( - result, - ggplot2::aes( - x = !!rlang::sym(column), - colour = group - ) - ) - } else { - p <- ggplot2::ggplot( - result, - ggplot2::aes( - x = !!rlang::sym(column), - colour = group, - linetype = group - ) - ) - } - - p <- p + ggplot2::scale_colour_manual( - name = name, - labels = labels, - limits = limits, - values = factor(meta_desc[[main_group]]) %>% as.integer() - ) - - if (! is.null(sub_group)) { - p <- p + ggplot2::scale_linetype_manual( - name = name, - labels = labels, - limits = limits, - values = factor(meta_desc[[sub_group]]) %>% as.integer() - ) - } - - p <- p + - ggplot2::xlab(column) + - ggplot2::ylab("Density") + - ggplot2::stat_ecdf(geom = "step") - - p -} diff --git a/R/sub-ratio.R b/R/sub-ratio.R index ff64550..49c44dc 100644 --- a/R/sub-ratio.R +++ b/R/sub-ratio.R @@ -7,7 +7,7 @@ #' #' @param ref vector of reference bases. #' @param bases matrix of observed base call counts. -#' @param bc vector strings that represents observed base calls. Default: "bases". +#' @param bc vector strings that represent observed base calls. Default: "bases". #' @return vector of base callsubstitution ratios. #' @examples #' ref <- c("A", "A", "T") diff --git a/R/write-result.R b/R/write-result.R index 30cfcdf..72e5dde 100644 --- a/R/write-result.R +++ b/R/write-result.R @@ -12,6 +12,7 @@ # @param file character string naming a file for writing # @param txt character string to be added to the JACUSA2 header. # @export +#' @noRd write_result <- function(result, file, txt = "") { if (.META_COND_COL %in% names(result)) { stop("result contains meta condition column: ", .META_COND_COL, diff --git a/man/JACUSA2helper.Rd b/man/JACUSA2helper.Rd index e4d98b7..d82683d 100644 --- a/man/JACUSA2helper.Rd +++ b/man/JACUSA2helper.Rd @@ -6,14 +6,7 @@ \title{JACUSA2helper: A package for post-processing JACUSA2 result files.} \section{Description}{ -A package that provides the following categories of functions to post-process result files of JACUSA2: -\describe{ - \item{read/write}{Read and write JACUSA2 result files, e.g.: \code{read_result()}.} - \item{add}{Adds some field to an existing JACUSA2 result object and return the modified object, e.g.: \code{base_sub()}.} - \item{filter}{Will remove sites from a result object with some filtering criteria, e.g.: \code{filter_by_coverage()}} - \item{plot}{Plots certain characteristics of a JACUSA2 result object.} - \item{other}{TODO} -} +A package that provides functions to post-process result files of JACUSA2. 
The following methods from JACUSA2 are supported: \describe{ @@ -33,18 +26,23 @@ provided library type option "-P" UNSTRANDED|FR_FIRSTSTRAND|RF_SECOND_STRAND". The central data structure in JACUSA2helper is the JACUSA2 result object that follows the tidy data approach to feature easy interaction with dplyr and ggplot2. A JACUSA2 result object can be created via \code{result <- read_result("jacusa2.out")} and is -currently represented as a tibble. Furthermore, JACUSA2helper supports the analysis of several related -JACUSA2 result files via \code{results <- read_results(files, meta)} where \code{meta_conditions} is a -vector of character strings that provides a descriptive name for each file in \code{files}. +currently represented as a tibble. Special structured columns exist that +hold condition andn replicate related data such as: coverage, bases. arrest rate. +Furthermore, JACUSA2helper supports the analysis of several related JACUSA2 +result files via \code{results <- read_results(files, meta_cond)} where +\code{meta_cond} is a vector of character strings that provides a descriptive +name for each file in \code{files}. -Check \code{vignette(TODO)} for a general introduction and \code{vignette(TODO meta conditions)} for details about meta conditions. +Check \code{vignette("JACUSA2helper", "JACUSA2helper")} for a general +introduction and \code{"JACUSA2helper", "JACUSA2helper"} for details about +meta conditions. } \section{read/write functions}{ See: \describe{ - \item{read_result}{Reads and unpacks a JACUSA2 result file and creates a result object.} + \item{read_result}{Reads and unpacks a JACUSA2 result file.} \item{read_results}{Allows to combine multiple result files and distinguish them with meta conditions.} \item{write_bedGraph}{Writes a vector of values as bedGraph file.} } @@ -54,27 +52,30 @@ See: See: \describe{ - \item{arrest_rate}{Adds arrest rate to JACUSA2 result object.} - \item{base_count}{TODO} - \item{base_sub}{Adds base substitution column to JACUSA2 result object.} - \item{non_ref2bc_ratio}{Adds non reference base ratio to JACUSA2 result object.} - \item{sub_ratio}{Adds base substitution ratio for all bases to a JACUSA2 result object.} + \item{arrest_rate}{Calculates arrest rate from base call counts (arest, through).} + \item{base_count}{Calculates the bumber of observed base calls.} + \item{base_sub}{Calculates base substitution.} + \item{base_ratio}{Calculates base call ratios.} + \item{non_ref_ratio}{Calculates non reference base ratio to JACUSA2 result object.} + \item{sub_ratio}{Calculates base substitution ratio for all bases to a JACUSA2 result object.} + } } \section{filter functions}{ -This function set enables filtering by read coverage or -enforcing a minimal number of variant base calls per sample. +This function set enables filtering by read coverage or enforcing a minimal +number of variant base calls per sample. See: \describe{ - \item{robust}{Retains sites that contain an arrest event in all replicates in at least one condition.} - \item{All}{TODO} - \item{Any}{TODO} + \item{robust}{Retains sites that are robust in one feature. 
The feature can be observed in all replicates of at least one condition.} + \item{filter_artefact}{Remove sites that have been marked as an artefact.} + \item{All}{Helper function} + \item{Any}{Helper function} + \item{lapply_cond}{lapply wrapper - applies function to all conditions.} + \item{lapply_repl}{lapply wrapper - applies function to all replicates.} + \item{mapply_repl}{mapply wrapper - applies function to all replicates.} } - -# TODO -read_sub } diff --git a/man/apply_cond.Rd b/man/apply_cond.Rd deleted file mode 100644 index 6a07fe4..0000000 --- a/man/apply_cond.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper-robust.R -\name{apply_cond} -\alias{apply_cond} -\title{Apply FUN to all tibbles of conditions} -\usage{ -apply_cond(X, FUN, cores = 1, ...) -} -\arguments{ -\item{X}{TODO} - -\item{FUN}{TODO} - -\item{cores}{TODO} - -\item{...}{TODO} -} -\description{ -TODO -} diff --git a/man/apply_repl.Rd b/man/apply_repl.Rd deleted file mode 100644 index 1e55c88..0000000 --- a/man/apply_repl.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper-robust.R -\name{apply_repl} -\alias{apply_repl} -\title{Apply FUN to all replicates} -\usage{ -apply_repl(X, FUN, cores = 1, ...) -} -\arguments{ -\item{X}{TODO} - -\item{FUN}{TODO} - -\item{cores}{TODO} - -\item{...}{TODO} -} -\description{ -TODO -} diff --git a/man/base_call.Rd b/man/base_call.Rd new file mode 100644 index 0000000..e419dc2 --- /dev/null +++ b/man/base_call.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{base_call} +\alias{base_call} +\title{Observed base calls.} +\usage{ +base_call(bases) +} +\arguments{ +\item{bases}{tibble of base call counts} +} +\value{ +vector of observed base calls +} +\description{ +Observed base calls. +} diff --git a/man/base_count.Rd b/man/base_count.Rd index 8716cfc..26d6806 100644 --- a/man/base_count.Rd +++ b/man/base_count.Rd @@ -4,10 +4,10 @@ \alias{base_count} \title{Retrieves the total number of observed unique bases.} \usage{ -base_count(bc, ref = NULL) +base_count(bases, ref = NULL) } \arguments{ -\item{bc}{vector of strings with observed base calls.} +\item{bases}{vector of base calls or tibble of base call counts.} \item{ref}{vector of strings with reference bases} } diff --git a/man/bc_ratio.Rd b/man/base_ratio.Rd similarity index 77% rename from man/bc_ratio.Rd rename to man/base_ratio.Rd index 7fa2725..a2c73ad 100644 --- a/man/bc_ratio.Rd +++ b/man/base_ratio.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/bc-ratio.R -\name{bc_ratio} -\alias{bc_ratio} +% Please edit documentation in R/base-ratio.R +\name{base_ratio} +\alias{base_ratio} \title{Calculate base call ratios for base counts.} \usage{ -bc_ratio(bases) +base_ratio(bases) } \arguments{ \item{bases}{nx4 matrix or data frame of base call counts.} @@ -18,5 +18,5 @@ Calculates base call ratios for base counts. 
} \examples{ bases <- matrix(c(5, 0, 5, 0, 1, 1, 1, 1), byrow = TRUE, ncol = 4) -ratio <- bc_ratio(bases) +ratio <- base_ratio(bases) } diff --git a/man/base_sub.Rd b/man/base_sub.Rd index 8299ffc..0458030 100644 --- a/man/base_sub.Rd +++ b/man/base_sub.Rd @@ -4,12 +4,12 @@ \alias{base_sub} \title{Calculates reference base to base call (base substitution).} \usage{ -base_sub(ref, bc) +base_sub(ref, bases) } \arguments{ \item{ref}{vector of reference bases.} -\item{bc}{vector of base calls.} +\item{bases}{vector of base calls or tibble of base call counts.} } \value{ vector of formatted base call substitutions. @@ -20,6 +20,6 @@ All elements in \code{ref} must be all length 1. } \examples{ ref <- c("A", "A", "C", "A") -bc <- c("AG", "G", "C", "G") -base_sub(ref, bc) +bases <- c("AG", "G", "C", "G") +base_sub(ref, bases) } diff --git a/man/coverage.Rd b/man/coverage.Rd new file mode 100644 index 0000000..c73cd37 --- /dev/null +++ b/man/coverage.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper.R +\name{coverage} +\alias{coverage} +\title{Calculate coverage for structured column.} +\usage{ +coverage(bases, cores = 1) +} +\arguments{ +\item{bases}{structured column of bases} + +\item{cores}{number of cores to use} +} +\value{ +structure column with coverages +} +\description{ +Calculate coverage for structured column. +} diff --git a/man/filter_all_artefacts.Rd b/man/filter_all_artefacts.Rd deleted file mode 100644 index 08d3759..0000000 --- a/man/filter_all_artefacts.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/helper-filter.R -\name{filter_all_artefacts} -\alias{filter_all_artefacts} -\title{Filters all sites with an artefact} -\usage{ -filter_all_artefacts(filter) -} -\arguments{ -\item{filter}{vector of strings that contains artefact filter information.} -} -\value{ -vector of logical. -} -\description{ -Removes sites have been marked by any feature/artefact filter. -} -\examples{ -data(rdd) -# remove sites that are marked by artefact filter "D" -filtered <- filter_by(rdd, filter_all_artefacts(filter)) -str(filtered) -} diff --git a/man/filter_artefact.Rd b/man/filter_artefact.Rd index 89b0526..180a4d6 100644 --- a/man/filter_artefact.Rd +++ b/man/filter_artefact.Rd @@ -4,12 +4,12 @@ \alias{filter_artefact} \title{Filters sites by artefacts.} \usage{ -filter_artefact(filter, artefacts) +filter_artefact(filter, artefacts = NULL) } \arguments{ \item{filter}{vector of strings that contains artefact filter information.} -\item{artefacts}{vector of characters that correspond to feature/artefact filters to be filtered.} +\item{artefacts}{vector of characters that correspond to feature/artefact filters to be filtered, default: NULL - Filter all.} } \value{ vector of logical. @@ -20,6 +20,8 @@ Removes sites that sites that have been marked by feature/artefact filter. 
\examples{ data(rdd) # remove sites that are marked by artefact filter "D" -filtered <- filter_by(rdd, filter_artefact(filter, c("D"))) -str(filtered) +dim(rdd) +dim(dplyr::filter(rdd, filter_artefact(filter, c("D")))) +# remove all sites that are marked by some artefact filter +dim(dplyr::filter(rdd, filter_artefact(filter))) } diff --git a/man/gather_repl.Rd b/man/gather_repl.Rd new file mode 100644 index 0000000..f6c5d3f --- /dev/null +++ b/man/gather_repl.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper.R +\name{gather_repl} +\alias{gather_repl} +\title{Extract values from a structured column.} +\usage{ +gather_repl(id, x, meta_cond = NULL) +} +\arguments{ +\item{id}{vector that unique identifies each row} + +\item{x}{structured column} + +\item{meta_cond}{vector of meta conditions} +} +\value{ +extracted column +} +\description{ +Extract values from a structured column. +} diff --git a/man/lapply_cond.Rd b/man/lapply_cond.Rd new file mode 100644 index 0000000..6d4831b --- /dev/null +++ b/man/lapply_cond.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper-robust.R +\name{lapply_cond} +\alias{lapply_cond} +\title{Apply f to all conditions} +\usage{ +lapply_cond(x, f, ..., cores = 1) +} +\arguments{ +\item{x}{tibble of data per condition} + +\item{f}{function to apply to each condition data} + +\item{...}{parameters for f} + +\item{cores}{numer of cores to use} +} +\value{ +tibble +} +\description{ +Wrapper for lapply +} diff --git a/man/lapply_repl.Rd b/man/lapply_repl.Rd new file mode 100644 index 0000000..05996ed --- /dev/null +++ b/man/lapply_repl.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper-robust.R +\name{lapply_repl} +\alias{lapply_repl} +\title{Apply f to all replicates} +\usage{ +lapply_repl(x, f, ..., cores = 1) +} +\arguments{ +\item{x}{tibble of data per condition} + +\item{f}{function to apply to each replicate data} + +\item{...}{parameters for f} + +\item{cores}{numer of cores to use} +} +\value{ +tibble +} +\description{ +Wrapper for lapply +} diff --git a/man/mapply_repl.Rd b/man/mapply_repl.Rd new file mode 100644 index 0000000..c68a1b8 --- /dev/null +++ b/man/mapply_repl.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/helper-robust.R +\name{mapply_repl} +\alias{mapply_repl} +\title{Apply f to all replicates} +\usage{ +mapply_repl(f, ..., MoreArgs = NULL, cores = 1) +} +\arguments{ +\item{f}{function to apply to each replicate data} + +\item{...}{see mapply} + +\item{MoreArgs}{see mapply} + +\item{cores}{numer of cores to use} +} +\value{ +tibble +} +\description{ +Wrapper for mapply. +} diff --git a/man/non_ref_ratio.Rd b/man/non_ref_ratio.Rd index daedc46..5f584b4 100644 --- a/man/non_ref_ratio.Rd +++ b/man/non_ref_ratio.Rd @@ -25,5 +25,5 @@ bases <- tidyr::tribble( 0, 1, 1, 1, 1, 1, 1, 1 ) -str(non_ref_ratio(ref, bases)) +non_ref_ratio(ref, bases) } diff --git a/man/plot_ecdf_allele_count.Rd b/man/plot_ecdf_allele_count.Rd deleted file mode 100644 index 7e6bee2..0000000 --- a/man/plot_ecdf_allele_count.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot-ecdf-allele-count.R -\name{plot_ecdf_allele_count} -\alias{plot_ecdf_allele_count} -\title{Plot cumulative allele count.} -\usage{ -plot_ecdf_allele_count( - result, - data_desc, - allele_count_column = "allele_count", - ... 
-) -} -\arguments{ -\item{result}{object created by \code{read_result()} or \code{read_results()}.} - -\item{data_desc}{vector of characters strings providing details about the sample.} - -\item{allele_count_column}{character sting specifing the name of the column.} - -\item{...}{parameters forwared to plot_ecdf_column.} -} -\value{ -ggplot2 plot object. -} -\description{ -Plots cumulative distribution of allele counts for single and multiple JACUSA2 result objects. -} -\seealso{ -\code{plot_ecdf_column()} -} diff --git a/man/plot_ecdf_column.Rd b/man/plot_ecdf_column.Rd deleted file mode 100644 index 82625a8..0000000 --- a/man/plot_ecdf_column.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/plot-ecdf-column.R -\name{plot_ecdf_column} -\alias{plot_ecdf_column} -\title{Generic plot for cumulative data (EXPERIMENTAL)} -\usage{ -plot_ecdf_column( - result, - column, - data_desc, - main_group, - sub_group = NULL, - name = "Data description" -) -} -\arguments{ -\item{result}{object created by \code{read_result()} or \code{read_results()}.} - -\item{column}{character string to be used for plotting.} - -\item{data_desc}{vector of characters strings providing details about the sample.} - -\item{main_group}{character string to be used to define colour.} - -\item{sub_group}{character string to be used to define linetype.} - -\item{name}{character string defines name of the legend.} -} -\value{ -ggplot2 plot object. -} -\description{ -Creates a generic plot of a \code{column} from a JACUSA2 \code{result} object and display -verbose data descripton \code{data_desc}. -} -\details{ -The user must make sure that dimensions (rows and length) of: -\code{result} and \code{data_desc} match and that \code{column} exists -in \code{result}. Furthermore, the user is responsible for correct filtering and grouping of the -\code{result} object. -\code{data_desc} is used to provide a descriptive name for each observation. -} diff --git a/man/read_results.Rd b/man/read_results.Rd index 8238c52..eddff57 100644 --- a/man/read_results.Rd +++ b/man/read_results.Rd @@ -4,13 +4,15 @@ \alias{read_results} \title{Read multiple related JACUSA2 results} \usage{ -read_results(files, meta_conds, unpack = FALSE, cores = 1) +read_results(files, meta_conds, cond_descs, unpack = FALSE, cores = 1) } \arguments{ \item{files}{Vector of files.} \item{meta_conds}{Vector of string vectors that explain files.} +\item{cond_descs}{Vector of strings that represent names/descriptions for conditions.} + \item{unpack}{Boolean indicates if info column should be processed.} \item{cores}{Integer defines how many cores to use.} diff --git a/man/robust.Rd b/man/robust.Rd index d3d585f..fac1c5a 100644 --- a/man/robust.Rd +++ b/man/robust.Rd @@ -4,10 +4,10 @@ \alias{robust} \title{Check if data is robust} \usage{ -robust(data) +robust(x) } \arguments{ -\item{data}{tibble of numeric.} +\item{x}{tibble of numeric.} } \value{ logical vector indicating robust rows. diff --git a/man/sub_ratio.Rd b/man/sub_ratio.Rd index c4907d7..0fdcf47 100644 --- a/man/sub_ratio.Rd +++ b/man/sub_ratio.Rd @@ -11,7 +11,7 @@ sub_ratio(ref, bases, bc = NULL) \item{bases}{matrix of observed base call counts.} -\item{bc}{vector strings that represents observed base calls. Default: "bases".} +\item{bc}{vector strings that represent observed base calls. Default: "bases".} } \value{ vector of base callsubstitution ratios. 
diff --git a/tests/testthat/test-bc-ratio.R b/tests/testthat/test-base-ratio.R similarity index 79% rename from tests/testthat/test-bc-ratio.R rename to tests/testthat/test-base-ratio.R index 882b5c2..fb6421e 100644 --- a/tests/testthat/test-bc-ratio.R +++ b/tests/testthat/test-base-ratio.R @@ -1,6 +1,6 @@ -context("bc_ratio") +context("base_ratio") -test_that("bc_ratio works as expected", { +test_that("base_ratio works as expected", { bases <- tidyr::tribble( ~A, ~C, ~G, ~T, 1, 0, 0, 0, @@ -20,7 +20,7 @@ test_that("bc_ratio works as expected", { ) expect_equal( - bc_ratio(bases), + base_ratio(bases), expected ) }) diff --git a/tests/testthat/test-cov.R b/tests/testthat/test-coverage.R similarity index 60% rename from tests/testthat/test-cov.R rename to tests/testthat/test-coverage.R index 126663d..bc6d162 100644 --- a/tests/testthat/test-cov.R +++ b/tests/testthat/test-coverage.R @@ -1,7 +1,7 @@ context("coverage") -test_that(".cov works as expected", { - bases <- tidyr::tibble( +test_that("coverageworks as expected", { + cond1 <- tidyr::tibble( bases1 = tidyr::tribble( ~A, ~C, ~G, ~T, 1, 0, 0, 0, @@ -21,12 +21,16 @@ test_that(".cov works as expected", { ) expected <- tidyr::tibble( - bases1 = c(1, 1, 1, 1, 4), - bases2 = c(1, 1, 1, 1, 4), + cond1=tidyr::tibble( + bases1 = c(1, 1, 1, 1, 4), + bases2 = c(1, 1, 1, 1, 4), + ) ) + actual <- tidyr::tibble(cond1=cond1) + expect_equal( - .cov(bases), - expected + as.matrix(coverage(actual)), + as.matrix(expected) ) }) diff --git a/tests/testthat/test-filter-by-filter-info.R b/tests/testthat/test-filter-by-filter-info.R index 14faa36..af9fd7d 100644 --- a/tests/testthat/test-filter-by-filter-info.R +++ b/tests/testthat/test-filter-by-filter-info.R @@ -1,12 +1,5 @@ context("helper_filter") -test_that("filter_artefact fails on missing option", { - expect_error( - filter_artefact(c(.EMPTY, .EMPTY, .EMPTY), ""), - "artefacts cannot be 0" - ) -}) - test_that("filter_artefact works as expected", { expect_equal( filter_artefact(c(.EMPTY, .EMPTY, .EMPTY), "D"), diff --git a/tests/testthat/test-guess-cond-count.R b/tests/testthat/test-guess-cond-count.R index 00a5334..0d9eae8 100644 --- a/tests/testthat/test-guess-cond-count.R +++ b/tests/testthat/test-guess-cond-count.R @@ -37,10 +37,10 @@ test_that(".guess_cond_count works as expected on rt-arrest", { .guess_cond_count( .RT_ARREST, c( - create_header_names(.ARREST_DATA_COL, 1, 3), - create_header_names(.THROUGH_DATA_COL, 1, 3), - create_header_names(.ARREST_DATA_COL, 2, 3), - create_header_names(.THROUGH_DATA_COL, 2, 3) + create_header_names(.ARREST_COL, 1, 3), + create_header_names(.THROUGH_COL, 1, 3), + create_header_names(.ARREST_COL, 2, 3), + create_header_names(.THROUGH_COL, 2, 3) ) ), 2 @@ -52,10 +52,10 @@ test_that(".guess_cond_count works as expected on lrt-arrest", { .guess_cond_count( .LRT_ARREST, c( - create_header_names(.ARREST_DATA_COL, 1, 3), - create_header_names(.THROUGH_DATA_COL, 1, 3), - create_header_names(.ARREST_DATA_COL, 2, 3), - create_header_names(.THROUGH_DATA_COL, 2, 3) + create_header_names(.ARREST_COL, 1, 3), + create_header_names(.THROUGH_COL, 1, 3), + create_header_names(.ARREST_COL, 2, 3), + create_header_names(.THROUGH_COL, 2, 3) ) ), 2 diff --git a/tests/testthat/test-guess-file-type.R b/tests/testthat/test-guess-file-type.R index ab8b797..97a69d8 100644 --- a/tests/testthat/test-guess-file-type.R +++ b/tests/testthat/test-guess-file-type.R @@ -50,8 +50,8 @@ create_condition <-function(condition, replicates, arrest, through) { test_that(".guess_file_type recognizes 
rt-arrest", { line <- paste( TEST_HEADER_COORD, - create_condition(1, 3, .ARREST_DATA_COL, .THROUGH_DATA_COL), - create_condition(2, 3, .ARREST_DATA_COL, .THROUGH_DATA_COL), + create_condition(1, 3, .ARREST_COL, .THROUGH_COL), + create_condition(2, 3, .ARREST_COL, .THROUGH_COL), TEST_HEADER_INFO, sep = "\t", collapse = "" ) @@ -62,8 +62,8 @@ test_that(".guess_file_type recognizes lrt-arrest", { line <- paste( TEST_HEADER_COORD, .LRT_ARREST_POS_COL, - create_condition(1, 3, .ARREST_DATA_COL, .THROUGH_DATA_COL), - create_condition(2, 3, .ARREST_DATA_COL, .THROUGH_DATA_COL), + create_condition(1, 3, .ARREST_COL, .THROUGH_COL), + create_condition(2, 3, .ARREST_COL, .THROUGH_COL), TEST_HEADER_INFO, sep = "\t", collapse = "" ) diff --git a/tests/testthat/test-io.R b/tests/testthat/test-io.R index 22597e4..45035ef 100644 --- a/tests/testthat/test-io.R +++ b/tests/testthat/test-io.R @@ -40,7 +40,7 @@ test_that(".fill_empty works as expected", { }) -test_that(".base_call works as expected", { +test_that("base_call works as expected", { bases <- tidyr::tribble( ~A, ~C, ~G, ~T, 1, 0, 0, 0, @@ -59,7 +59,7 @@ test_that(".base_call works as expected", { ) expect_equal( - .base_call(bases), + base_call(bases), expected ) }) \ No newline at end of file diff --git a/vignettes/JACUSA2helper-meta-condition.bib b/vignettes/JACUSA2helper-meta-condition.bib deleted file mode 100644 index 2c77bab..0000000 --- a/vignettes/JACUSA2helper-meta-condition.bib +++ /dev/null @@ -1,37 +0,0 @@ -@article{PieknaPrzybylska2007, - doi = {10.1093/nar/gkm855}, - url = {https://doi.org/10.1093/nar/gkm855}, - year = {2007}, - month = dec, - publisher = {Oxford University Press ({OUP})}, - volume = {36}, - number = {Database}, - pages = {D178--D183}, - author = {D. Piekna-Przybylska and W. A. Decatur and M. J. Fournier}, - title = {The 3D {rRNA} modification maps database: with interactive tools for ribosome analysis}, - journal = {Nucleic Acids Research} -} -@Article{Piechotta2017, - author = {Michael Piechotta and Emanuel Wyler and Uwe Ohler and Markus Landthaler and Christoph Dieterich}, - title = {{JACUSA}: site-specific identification of {RNA} editing events from replicate sequencing data}, - journal = {{BMC} Bioinformatics}, - year = {2017}, - volume = {18}, - number = {1}, - month = {jan}, - doi = {10.1186/s12859-016-1432-8}, - publisher = {Springer Nature}, -} -@article{Zhou2018, - doi = {10.1080/15476286.2018.1462654}, - url = {https://doi.org/10.1080/15476286.2018.1462654}, - year = {2018}, - month = may, - publisher = {Informa {UK} Limited}, - volume = {15}, - number = {7}, - pages = {892--900}, - author = {Katherine I. Zhou and Wesley C. Clark and David W. Pan and Matthew J. Eckwahl and Qing Dai and Tao Pan}, - title = {Pseudouridines have context-dependent mutation and stop rates in high-throughput sequencing}, - journal = {{RNA} Biology} -} \ No newline at end of file diff --git a/vignettes/JACUSA2helper-meta-condtions.Rmd b/vignettes/JACUSA2helper-meta-condtions.Rmd index 8a22a49..a6993cd 100644 --- a/vignettes/JACUSA2helper-meta-condtions.Rmd +++ b/vignettes/JACUSA2helper-meta-condtions.Rmd @@ -45,7 +45,7 @@ unique(Zhou2018$meta_cond) When manipulating a multi `results` object created by `read_results()`, it is crucial to distinguish the following files in an analysis pipeline: * `results %>% ... other functions()` -* `results %>% dplyr:group_by(results, meta_condition) %>% ... other functions()` +* `results %>% dplyr:group_by(results, meta_cond) %>% ... 
other functions()` The first statement will apply any subsequent functions to ALL sites regardless of the meta condition while the last statement will apply to sites of EACH meta condition! @@ -74,7 +74,7 @@ Filter by coverage regardless of the meta condition. ```{r} filtered_all <- Zhou2018 %>% - dplyr::filter(All(cov >= 20000)) %>% + dplyr::filter(All(cov$cond1 >= 20000) & All(cov$cond2 >= 20000)) %>% dplyr::group_by(contig, meta_cond) %>% dplyr::count() # count in a group filtered_all @@ -85,96 +85,107 @@ Filter by coverage of each meta condition. ```{r} filtered_meta <- Zhou2018 %>% dplyr::group_by(meta_cond) %>% - dplyr::filter(All(cov >= 20000)) %>% + dplyr::filter(All(cov$cond1 >= 20000) & All(cov$cond2 >= 20000)) %>% dplyr::group_by(contig, meta_cond) %>% dplyr::count() # count in a group filtered_meta ``` # Plot - -First, we add a description `cond_desc` of the conditions to the result object. The data sets of `Zhou2018` have been layout out in such a way that condition 1 and 2 correspond to carbodiimide (+CMC) treatment and control (-CMC), respectively. +First, we add a description `data_desc` of the conditions to the result object. +The data sets of `Zhou2018` have been layout out in such a way that condition 1 +and 2 correspond to carbodiimide (+CMC) treatment and control (-CMC), respectively. ```{r} data(Zhou2018) -result <- Zhou2018 %>% - dplyr::filter(strand == "+") - -# create nice label -#result$cond_desc <- "" -#result$cond_desc[result$cond == 1] <- "+CMC" -#result$cond_desc[result$cond == 2] <- "-CMC" -#result$group <- do.call(interaction, result[c("cond", "repl")]) +result <- gather_repl(Zhou2018$id, Zhou2018$arrest_rate) %>% + dplyr::inner_join(Zhou2018 %>% dplyr::select(id, pvalue, meta_cond), by = "id") %>% + dplyr::mutate( + data_desc = dplyr::case_when( + cond == "cond1" ~ "+CMC", + cond == "cond2" ~ "-CMC", + ) + ) + +# grouping enables nice legends in plot +group <- do.call(interaction, result[c("cond", "repl")]) +result$group <- group # relate group={cond(ition), repl(icat)}, and nice label -#meta_desc <- dplyr::distinct(result, cond, repl, group, cond_desc) -# map group values to nice labels (cond_desc) -#limits <- as.vector(meta_desc[["group"]]) -#labels <- meta_desc[["cond_desc"]] +meta_desc <- dplyr::distinct(result, cond, repl, group, data_desc) +# map group values to nice labels (data_desc) +limits <- as.vector(meta_desc[["group"]]) +labels <- meta_desc[["data_desc"]] ``` -Next, we define a ggplot2 object that allows to merge legend for different scales. Check [combine legends]{https://ozh2014.wordpress.com/2017/06/26/combine-related-legends-into-one-in-ggplot2/} for details. -In brief, we use `colour` to represent cond(ition) and `linetype` to represent repl(icate) an relate their possible combinations to descriptive `labels`. +Next, we define a ggplot2 object that allows to merge legend for different scales. +Check [combine legends]{https://ozh2014.wordpress.com/2017/06/26/combine-related-legends-into-one-in-ggplot2/} for details. +In brief, we use `colour` to represent cond(ition) and `linetype` to represent +repl(icate) an relate their possible combinations to descriptive `labels`. 
```{r}
-#colour_scale <- ggplot2::scale_colour_manual(
-# name = "Treatment",
-# labels = labels,
-# limits = limits,
-# values = factor(meta_desc[["cond"]]) %>% as.integer()
-# )
-#linetype_scale <- ggplot2::scale_linetype_manual(
-# name = "Treatment",
-# labels = labels,
-# limits = limits,
-# values = factor(meta_desc[["repl"]]) %>% as.integer()
-# )
+colour_scale <- ggplot2::scale_colour_manual(
+  name = "Treatment",
+  labels = labels,
+  limits = limits,
+  values = factor(meta_desc[["cond"]]) %>% as.integer()
+)
+linetype_scale <- ggplot2::scale_linetype_manual(
+  name = "Treatment",
+  labels = labels,
+  limits = limits,
+  values = factor(meta_desc[["repl"]]) %>% as.integer()
+)
 ```
 
 ## Pvalue distribution
-
 Plot of the empirical cumulative distributions of pvalues for all meta conditions.
 
 ```{r}
-#result %>%
-# dplyr::distinct(contig, start, end, strand, meta_cond, pvalue) %>%
-# ggplot2::ggplot(ggplot2::aes(x = pvalue, colour = meta_cond)) +
-# ggplot2::labs(colour = "Meta condition") +
-# ggplot2::ylab("Density") +
-# ggplot2::stat_ecdf(geom = "step") +
-# ggplot2::xlab("pvalue") +
-# ggplot2::theme(legend.position = "bottom")
+result %>%
+  ggplot2::ggplot(ggplot2::aes(x = pvalue, colour = meta_cond)) +
+  ggplot2::labs(colour = "Meta condition") +
+  ggplot2::ylab("Density") +
+  ggplot2::stat_ecdf(geom = "step") +
+  ggplot2::xlab("pvalue") +
+  ggplot2::theme(legend.position = "bottom")
 ```
 
-## Empirical cumulative arrest rate distribution
+## Empirical cumulative arrest rate distribution
 ```{r}
-#result %>%
-# ggplot2::ggplot(ggplot2::aes(x = arrest_rate, colour = group, linetype = group)) +
-# ggplot2::stat_ecdf(geom = "step") +
-# colour_scale +
-# linetype_scale +
-# ggplot2::xlab("Density") +
-# ggplot2::xlab("Arrest rate") +
-# ggplot2::theme(legend.position = "bottom") +
-# ggplot2::facet_grid(meta_cond ~ .)
+result %>%
+  ggplot2::ggplot(ggplot2::aes(x = value, colour = group, linetype = group)) +
+  ggplot2::stat_ecdf(geom = "step") +
+  colour_scale +
+  linetype_scale +
+  ggplot2::ylab("Density") +
+  ggplot2::xlab("Arrest rate") +
+  ggplot2::theme(legend.position = "bottom") +
+  ggplot2::facet_grid(meta_cond ~ .)
 ```
 
 ## Compare coverage
-
 ```{r}
-#result %>%
-# ggplot2::ggplot(ggplot2::aes(x = cov, colour = group, linetype = group)) +
-# ggplot2::stat_ecdf(geom = "step") +
-# colour_scale +
-# linetype_scale +
-# ggplot2::scale_x_log10(
-# breaks = scales::trans_breaks("log10", function(x) 10^x),
-# labels = scales::trans_format("log10", scales::math_format(10^.x))
-# ) +
-# ggplot2::xlab("Read Coverage") +
-# ggplot2::theme(legend.position = "bottom") +
-# ggplot2::facet_grid(meta_cond ~ .)
+gather_repl(Zhou2018$id, Zhou2018$cov) %>%
+  dplyr::inner_join(dplyr::select(Zhou2018, id, meta_cond), by = "id") %>%
+  dplyr::mutate(
+    data_desc = dplyr::case_when(
+      cond == "cond1" ~ "+CMC",
+      cond == "cond2" ~ "-CMC"
+    ),
+    # group is required by colour_scale/linetype_scale defined above
+    group = interaction(cond, repl)
+  ) %>%
+  ggplot2::ggplot(ggplot2::aes(x = value, colour = group, linetype = group)) +
+  ggplot2::stat_ecdf(geom = "step") +
+  colour_scale +
+  linetype_scale +
+  ggplot2::scale_x_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  ) +
+  ggplot2::xlab("Read Coverage") +
+  ggplot2::theme(legend.position = "bottom") +
+  ggplot2::facet_grid(meta_cond ~ .)
 ```
 
 
@@ -183,35 +194,34 @@ We plot the arrest rate for all three meta conditions and +CMC and -CMC treatment.
```{r}
-result %>%
+Zhou2018 %>%
   dplyr::filter(pvalue <= 0.00001) %>%
   dplyr::group_by(meta_cond) %>%
-  dplyr::filter_by(All(cov >= 20000)) %>%
-  dplyr::ungroup() %>%
-  dplyr::select(contig, start, end, strand, meta_cond, arrest_rate) %>%
-#  tidyr::spread(cond, arrest_rate) %>%
-#  ggplot2::ggplot(ggplot2::aes(x = arrest_rate1, y = arrest_rate2, colour = meta_cond)) +
-#  ggplot2::geom_point() +
-#  ggplot2::ylab("arrest rate -CMC") +
-#  ggplot2::xlab("arrest rate +CMC") +
-#  ggplot2::facet_grid(. ~ meta_cond) +
-#  ggplot2::geom_abline(linetype = "dashed", colour = "gray") +
-#  ggplot2::labs(colour = "Meta condition") +
-#  ggplot2::theme(legend.position = "bottom")
+  dplyr::filter(All(cov$cond1 >= 20000) & All(cov$cond2 >= 20000)) %>%
+  ggplot2::ggplot(ggplot2::aes(x = arrest_rate$cond1$rep1, y = arrest_rate$cond2$rep1, colour = meta_cond)) +
+  ggplot2::geom_point() +
+  ggplot2::ylab("arrest rate -CMC") +
+  ggplot2::xlab("arrest rate +CMC") +
+  ggplot2::facet_grid(. ~ meta_cond) +
+  ggplot2::geom_abline(linetype = "dashed", colour = "gray") +
+  ggplot2::labs(colour = "Meta condition") +
+  ggplot2::theme(legend.position = "bottom")
 ```
 
-We plot the arrest rate delta of +CMC and -CMC treatment for all three meta conditions and add Pseudouridine sites (black points) from [3Dmodmap]{https://people.biochem.umass.edu/fournierlab/3dmodmap/main.php}.
+We plot the arrest rate delta of +CMC and -CMC treatment for all three meta
+conditions and add pseudouridine sites (black points) from
+[3Dmodmap](https://people.biochem.umass.edu/fournierlab/3dmodmap/main.php).
 
 ```{r}
 data(modmap)
 mod <- modmap %>% dplyr::filter(rrna == "NR_003286_RNA18SN5", base == "PseudoU")
 
-result %>%
+Zhou2018 %>%
   dplyr::filter(contig == "NR_003286_RNA18SN5") %>%
   dplyr::filter(pvalue <= 0.00001) %>%
   dplyr::group_by(meta_cond) %>%
-  dplyr::filter(All(cov >= 20000)) %>%
+  dplyr::filter(All(cov$cond1 >= 20000) & All(cov$cond2 >= 20000)) %>%
   dplyr::mutate(arrest_rate_delta = arrest_rate$cond1$rep1 - arrest_rate$cond2$rep1) %>%
   ggplot2::ggplot(ggplot2::aes(x = end, y = arrest_rate_delta, fill = meta_cond)) +
   ggplot2::geom_bar(stat = "identity") +
diff --git a/vignettes/JACUSA2helper.Rmd b/vignettes/JACUSA2helper.Rmd
index 080d835..1382380 100644
--- a/vignettes/JACUSA2helper.Rmd
+++ b/vignettes/JACUSA2helper.Rmd
@@ -23,101 +23,186 @@ output file(s) with [JACUSA2helper](https://github.com/dieterich-lab/JACUSA2help
 consists of:
 
 * Read JACUSA2 output file(s) into result object.
-* Add optional data, depending on the result file, e.g.: `base_sub()` for base substitutions.
-* Filter result object by some criteria, e.g.: `dplyr::filter(All(cov >= 10))` and
-  use `All()` or `Any` on combined data (conditions and replicates).
+* Add optional data, e.g.: `base_sub()` - add base substitutions.
+* Filter result object by some criteria, e.g.: `dplyr::filter(All(cov$cond1 >= 10))` -
+  retain sites with read coverage >= 10, and use `All()` or `Any()` on structured
+  columns (conditions and replicates).
* Plot filtered result object.
 
-JACUSA2helper supports the analysis of the following methods from JACUSA2: *call-{1,2}*, *pileup*, *rt-arrest*, and *lrt-arrest*(experimental).
-Check the [JACUSA2 manual](https://github.com/dieterich-lab/JACUSA2/manual/manual.pdf) for more details.
+JACUSA2helper supports the analysis of the following methods from JACUSA2:
+*call-{1,2}*, *pileup*, *rt-arrest*, and *lrt-arrest* (experimental).
+Check the [JACUSA2 manual](https://github.com/dieterich-lab/JACUSA2/manual/manual.pdf)
+for more details.
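+
+The following is a minimal sketch of this workflow. It is not evaluated
+(`eval=FALSE`) and assumes a two-condition run written to a file named
+`JACUSA2.out`; the file name, the thresholds, and the replicate picked for
+`base_sub()` are placeholders for illustration only:
+
+```{r, eval=FALSE}
+library(JACUSA2helper)
+library(magrittr) # provides %>%
+
+# read JACUSA2 output into a result object
+result <- read_result("JACUSA2.out")
+
+# retain robust, sufficiently covered sites and add base substitutions
+filtered <- result %>%
+  dplyr::filter(All(cov$cond1 >= 10) & All(cov$cond2 >= 10)) %>%
+  dplyr::filter(robust(bases)) %>%
+  dplyr::mutate(sub = base_sub(ref, bases$cond2$rep1))
+
+# plot the distribution of base substitutions
+ggplot2::ggplot(filtered, ggplot2::aes(x = sub)) +
+  ggplot2::geom_bar() +
+  ggplot2::xlab("Base substitution")
+```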
-The main data structure is the result object that is implemented by following the tidy data approach from @Wickham2014 to feature easy interaction with [dplyr](https://github.com/tidyverse/dplyr) and [ggplot2](https://github.com/tidyverse/ggplot2).
+The main data structure is the result object that is implemented by following
+the tidy data approach from @Wickham2014 and enables easy interaction with
+[dplyr](https://github.com/tidyverse/dplyr) and [ggplot2](https://github.com/tidyverse/ggplot2).
 
-In the following, we will focus on single pairwise comparisons of two conditions of one output file.
-If you want to analyze multiple files with similar pairwise comparisons, read `vignette("JACUSA2helper-meta-conditions")`.
+In the following, we will focus on single pairwise comparisons of two conditions
+of one output file. If you want to analyze multiple files with similar pairwise
+comparisons, read `vignette("JACUSA2helper-meta-conditions")`.
 
-## Multi line site
-
-While previously in JACUSA*1*, a site could be uniquely identified as one line in the output by the coordinate columns: 'contig', 'start', 'end', and 'strand', JACUSA2 features more complex data structures to store new features such as arrest positions of arrest events, read/base stratification and INDEL counts. Therefore, sites can now cover multiple lines of output - check Section "Introduction" of [JACUSA2 manual](https://github.com/dieterich-lab/JACUSA2/manual/manual.pdf) for details.
+## Multi-line sites and new features
+While previously in JACUSA*1*, a site could be uniquely identified as one line
+in the output by the coordinate columns: 'contig', 'start', 'end', and 'strand',
+JACUSA*2* features more complex data structures to store new features such as
+arrest positions of arrest events, variant tagging, and INDEL counts.
+Therefore, sites can now cover multiple lines of output - check Section
+"Introduction" of the [JACUSA2 manual](https://github.com/dieterich-lab/JACUSA2/manual/manual.pdf)
+for details.
 
 ## Variant calling - Data: rdd (RNA-DNA-differences)
+To introduce the basic verbs for manipulating JACUSA2 output, we'll use
+`JACUSA2helper::rdd`. This data set contains a subset from @Piechotta2017 and is
+documented in `?rdd`. In brief, the sample data set consists of 10,000 sites of
+1x DNA and 2x stranded RNA sequencing libraries that can be used to define
+RNA-DNA-differences (RDDs).
+
+```{r}
+library(JACUSA2helper)
+data(rdd)
+```
 
-To introduce the basic verbs for manipulating JAUCSA2 output, we'll use `JACUSA2helper:rdd`. This data set contains a subset from @Piechotta2017 and is documented in `?rdd`. In brief, the sample data set consists of 10.000 sites of 1x DNA and 2x stranded RNA sequencing libraries that allow to define RNA-DNA-differences (RDDs).
+Note that `rdd` is a tibble - check [tibble](http://tibble.tidyverse.org) for
+more details.
+
+Use dplyr to manipulate the result object `rdd`. For details, check
+[dplyr](https://github.com/tidyverse/dplyr) and read `vignette("dplyr")`.
+
+### Structured columns
+JACUSA2 features structured columns (nested tibbles) where condition- and
+replicate-specific data is stored, e.g.: `bases`, `cov`, etc.
 
 ```{r}
-library(JACUSA2helper)
-dplyr::arrange(rdd, contig, start, strand)
+rdd[c("id", "bases", "cov")]
 ```
 
-Note that `rdd` is a tibble - check [tibble](http://tibble.tidyverse.org) for more details.
+Check the structure of a column with:
+```{r}
+names(rdd$cov)
+```
+To access specific coverage information for condition 2, use:
+```{r}
+rdd$cov$cond2
+```
+In summary, all structured/nested columns hold:
+* data of a condition "cond*I*" on the first level, and
+* data of a replicate "rep*J*" on the second level.
 
-Use dplyr to manipulate the result object `rdd`. Check details on [dplyr](https://github.com/tidyverse/dplyr) and read `vignette("dplyr")`.
+Coverage information for condition 2 and replicate 1 can be accessed with:
+```{r}
+rdd$cov$cond2$rep1
+```
 
-Once the result object is available, you can start to manipulate it. In a first step, we will retain sites with score >= 2.
+#### Wrappers for `lapply()` and `mapply()`
+For accessing and manipulating structured columns there are JACUSA2-specific
+wrappers for `lapply()` and `mapply()`:
+* `lapply_cond(x, f)` operates on condition data (list of replicate data),
+* `lapply_repl(x, f)` operates on replicate data of each condition, and
+* `mapply_repl(f, ...)` operates on replicate data and takes multiple inputs.
+
+To get the total coverage for each condition, use:
+```{r}
+lapply_cond(rdd$cov, rowSums)
+```
+
+The following gives you the observed base calls per replicate:
+```{r}
+base_calls <- lapply_repl(rdd$bases, base_call)
+base_calls$cond2
+```
+
+### Filtering with `dplyr::filter()`
+dplyr and `%>%` from magrittr allow you to formulate compact analysis pipelines.
+
+In a first step, we will retain sites with score >= 2.
 ```{r}
 # before filtering
 dim(rdd)
 result <- dplyr::filter(rdd, score >= 2)
+print("After filtering")
 # after filtering
 dim(result)
 ```
 
-dplyr and `%>%` from magrittr allow to formulate compact analysis pipelines. In the following, we will:
+In order to formulate complicated statements with `dplyr::filter()` featuring
+arbitrary combinations of conditions and/or replicates, use the following
+convenience functions:
+* `All()` and
+* `Any()`.
+
+The following statement retains sites where all replicates of condition 1 have
+coverage >= 50 and at least one replicate of condition 2 has at least 10 reads:
+```{r}
+rdd %>%
+  dplyr::filter(All(cov$cond1 >= 50) & Any(cov$cond2 >= 10)) %>%
+  dplyr::select(id, cov)
+```
+
+In the following, we will:
 
 * remove sites with score < 2,
 * retain sites with coverage >= 10 for all replicates,
-* remove sites with > 2 observed bases (including reference base),
-* apply a filter that retains only robust sites,
-* use the observed base calls from one condition to redefine the reference (useful in DNA vs. RND comparisons),
-* add a field that shows base substitution (here RNA editing sites ) per data sample.
+* remove sites with > 2 observed bases (excluding reference base),
+* apply a filter that retains only robust sites (RNA editing must be present in
+  all/both replicates - see TODO),
+* finally, compute the base substitution (here RNA editing sites against the FASTA
+  reference) for the pooled RNA replicates.
```{r}
-# filter data
 filtered <- rdd %>%
   dplyr::filter(score >= 2) %>%
-  dplyr::filter(All(cov$cond1 >= 10) & All(cov$cond2 > 10))
-  dplyr::filter(base_count(bc, ref) <= 2) %>%
-  dplyr::filter(robust(bases)) %>%
-  dplyr::mutate(base_sub = base_sub(bc$cond1, bc$cond2)) # add base substitutions
+  dplyr::filter(All(cov$cond1 >= 10) & All(cov$cond2 >= 10)) %>%
+  dplyr::filter(base_count(bc) <= 2) %>%
+  dplyr::filter(robust(bases))
+
+# sum base call counts of condition 2 / RNA replicates
+rna_bases <- Reduce("+", filtered$bases$cond2)
+
+# we don't need lapply_repl, because we don't operate on all replicates from all
+# conditions - only condition 2 / RNA
+ref2rna <- base_sub(filtered$ref, rna_bases)
+table(ref2rna)
+
 ```
 
-Finally, we will add a summary of base substitutions for a site by using `merge_sub()`:
+Instead of using DNA information from the reference FASTA sequence via column "ref",
+we could use the actual data from condition 1 / DNA.
+
+TODO Robust requires observations (here base calls) to be present in all replicates of at least one condition.
 
 ```{r}
-# summarise data
-#summary_site_sub <- filtered %>%
-#  group_by_site() %>% # following function will be applied on rows group by site(s)
-#  dplyr::summarise(site_sub = merge_sub(sub)) # adds new column site_sub
-#summary_site_sub
+cond1_ref <- base_call(filtered$bases$cond1$rep1)
+dna2rna <- base_sub(cond1_ref, rna_bases)
+table(dna2rna)
 ```
 
-There is more than one way to arrive at this summary of the data. In another approach `dplry::summarise(site_sub = merge_sub(sub))` could be replaced with `dplyr::mutate(site_sub = merge_sub(sub)) %>% dplyr::distinct(contig, start, end, strand, site_sub)`.
+Using the two different sources of DNA information, `ref2rna` and `dna2rna`,
+we can conclude that there are no polymorphic positions in `filtered` because
+`ref2rna` and `dna2rna` are identical.
 
 ### Plot base substitutions
-
 We can now plot the site specific distribution of base substitutions using [ggplot2](https://github.com/tidyverse/ggplot2) with:
 
 ```{r}
-#summary_site_sub %>%
-#  ggplot2::ggplot(ggplot2::aes(x = site_sub, fill = site_sub)) +
-#  ggplot2::geom_bar() +
-#  ggplot2::xlab("Base substitution") +
-#  ggplot2::scale_fill_discrete(name = "Base substitution") +
-#  ggplot2::theme(
-#    legend.position = "bottom",
-#    axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)
-#  )
+tidyr::tibble(base_sub = dna2rna) %>% # ggplot requires a data frame
+  ggplot2::ggplot(ggplot2::aes(x = base_sub, fill = base_sub)) +
+  ggplot2::geom_bar() +
+  ggplot2::xlab("Base substitution") +
+  ggplot2::scale_fill_discrete(name = "Base substitution") +
+  ggplot2::theme(
+    legend.position = "bottom",
+    axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1)
+  )
 ```
 
 ### Plot coverage distribution
-To compare the coverage distribution of each data sample, we create a simple plot of the empirical coverage distribution, where we map cond(ition) to `colour` and repl(icate) to `linetype`.
+To compare the coverage distribution of each data sample, we create a simple plot of the empirical coverage distribution, where we map cond(ition) to `colour` and rep(licate) to `linetype`.
```{r}
-result %>%
-  ggplot2::ggplot(ggplot2::aes(x = cov, colour = cond, linetype = repl)) +
+gather_repl(result$id, result$cov) %>%
+  ggplot2::ggplot(ggplot2::aes(x = value, colour = cond, linetype = repl)) +
   ggplot2::stat_ecdf(geom = "step")
 ```
 
@@ -131,16 +216,18 @@ In order to improve the preliminary plot, we need to modify the following aspect
 
 First, a data description is added to the result object:
 
-# TODO
 ```{r}
-data(rdd)
-result <- rdd
-# create nice data description
-# DNA RNA
-# data_desc for cond and repl in result:
-dplyr::distinct(result, cond, repl, data_desc)
+result <- gather_repl(rdd$id, rdd$cov) %>%
+  dplyr::mutate(
+    data_desc = dplyr::case_when(
+      cond == "cond1" ~ "DNA",
+      cond == "cond2" & repl == "rep1" ~ "RNA 1",
+      cond == "cond2" & repl == "rep2" ~ "RNA 2"
+    )
+  )
 ```
+
 Supplementary data `limits` and `labels` are created to enable sleek legend by relating cond(ition) and repl(icate) to data description. They will be mapped to `colour` and `linetype`.
 
@@ -159,7 +246,7 @@ Finally, we combine the previous snippets and arrive at the final plot:
 
 ```{r}
 name <- "Data description"
 result %>%
-  ggplot2::ggplot(ggplot2::aes(x = cov, colour = group, linetype = group)) +
+  ggplot2::ggplot(ggplot2::aes(x = value, colour = group, linetype = group)) +
   # map values for cond to colour and use nice labels
   ggplot2::scale_colour_manual(
     name = name,
@@ -183,26 +270,35 @@ result %>%
     labels = scales::trans_format("log10", scales::math_format(10^.x))
   ) +
   ggplot2::xlab("Read Coverage") +
-  ggplot2::theme(legend.position = "bottom")
+  ggplot2::theme(legend.position = "bottom")
 ```
 
-From the plot, we can deduce that RNA samples have similar coverage distribution and are higher covered than the DNA data sample. This code serves as a blueprint for other plots, e.g.:
+From the plot, we can deduce that the RNA samples have similar coverage distributions
+and are covered more deeply than the DNA sample. This code serves as a blueprint
+for other plots, e.g.:
 
 * empirical cumulative distribution of arrest rate or
 * number of observed bases.
 
 ## Arrest events
-@Zhou2018 map RNA modification of pseudouridine ($\Psi$) by chemically modifying
-pseudouridines with carbodiimide (+CMC) and detecting arrest events that are induced by reverse
-transcription stops in high-throughput sequencing under 3 different conditions: HIVRT, SIIIRTMn, and
-SIIIRTMg. The result of JACUSA2 are available by using `data()`. Read [JACUSA2 manual] for details on how data has been processed from the original publication by @Zhou2018. In brief, the data has been filtered to contain the following rRNAs: RNA18SN5 and RNA28SN5.
+In @Zhou2018, the authors map the RNA modification pseudouridine ($\Psi$) by
+chemically modifying pseudouridines with carbodiimide (+CMC) and detecting
+arrest events that are induced by reverse transcription stops in high-throughput
+sequencing under 3 different conditions: HIVRT, SIIIRTMn, and SIIIRTMg.
+The results of JACUSA2 are available via `data()`.
+Read the [JACUSA2 manual](https://github.com/dieterich-lab/JACUSA2/manual/manual.pdf)
+for details on how the data has been processed from the
+original publication by @Zhou2018. In brief, the data has been filtered to
+contain the following rRNAs: RNA18SN5 and RNA28SN5.
 
-In the following, we will be looking at the results for `data(HIVRT)`. By default JACUSA2 output from *rt-arrest* will contain the following columns:
+In the following, we will be looking at the results for `data(HIVRT)`.
+By default JACUSA2 output from *rt-arrest* will contain the following
+structured/nested columns:
 
-* `arrest` and `through` base count columns.
-* `bases` = `arrest` + `through`. Total base counts.
-* TODO Columns prefixed with `bc_`, `cov_` will hold observed bases and coverage for the respective base counts, respectively.
+* `arrest` and `through`: arrest and read-through base count columns.
+* `bases` = `arrest` + `through`: total base counts.
+* `arrest_rate`: arrest rate calculated from `arrest` and `through`.
+* `cov` holds coverage for total base counts.
 
 ```{r}
 data(HIVRT)
@@ -214,87 +310,55 @@ We investigate the strand specific coverage distribution of reads and discover t
 The other mappings are probably artefacts. But this discussion is beyond the scope of this vignette.
 
 ```{r}
-HIVRT %>%
-  ggplot2::ggplot(ggplot2::aes(x = cov, colour = strand)) +
-  ggplot2::geom_density() + ggplot2::scale_x_log10(
-    breaks = scales::trans_breaks("log10", function(x) 10^x),
-    labels = scales::trans_format("log10", scales::math_format(10^.x))
-  ) +
-  ggplot2::xlab("Read Coverage") +
-  ggplot2::theme(legend.position = "bottom") +
-  ggplot2::facet_grid(contig ~ .)
+gather_repl(HIVRT$id, HIVRT$cov) %>% # extract coverages
+  dplyr::inner_join(dplyr::select(HIVRT, id, contig, strand), by = "id") %>% # add contig and strand
+  ggplot2::ggplot(ggplot2::aes(x = value, colour = strand)) +
+  ggplot2::geom_density() + ggplot2::scale_x_log10(
+    breaks = scales::trans_breaks("log10", function(x) 10^x),
+    labels = scales::trans_format("log10", scales::math_format(10^.x))
+  ) +
+  ggplot2::xlab("Read Coverage") +
+  ggplot2::theme(legend.position = "bottom") +
+  ggplot2::facet_grid(contig ~ .)
 ```
 
 We filter by pvalue, coverage, strand, and only retain robust arrest events:
 
 ```{r}
-result <- HIVRT
-# result$cond_desc <- ""
-# TODO
-#result$cond_desc[result$cond == 1] <- "+CMC"
-#result$cond_desc[result$cond == 2] <- "-CMC"
-# TODO
-filtered <- result %>%
+dim(HIVRT)
+filtered <- HIVRT %>%
   dplyr::filter(strand == "+") %>%
-  rt-arrest has pvalue instead of score
-  dplyr::filter(pvalue = 0.00001) %>%
-  dplyr::filter(All(cov$cond1 >= 100) & All(cov$cond21 >= 100)) %>%
-  dplyr::filter(robust(arrest)) %>%
+  dplyr::filter(pvalue <= 0.00001) %>%
+  dplyr::filter(All(cov$cond1 >= 100) & All(cov$cond2 >= 100)) %>%
+  dplyr::filter(robust(arrest))
 dim(filtered)
 ```
-
-We use the `peak()` function to transform the filtered result object and plot the arrest rate from each condition in a scatter plot.
+We plot the arrest rate of each condition in a scatter plot by accessing the
+structured `arrest_rate` column directly.
 
 ```{r}
-# TODO
-#peak(filtered, peak_cols = c("arrest_rate")) %>%
-#  ggplot2::ggplot(ggplot2::aes(x = arrest_rate11, y = arrest_rate21)) +
-#  ggplot2::geom_point() +
-#  ggplot2::facet_grid(. ~ contig) +
-#  ggplot2::geom_abline(colour = "red") +
-#  ggplot2::xlab("+CMC arrest rate") +
-#  ggplot2::ylab("-CMC arrest rate")
+filtered %>%
+  ggplot2::ggplot(ggplot2::aes(x = arrest_rate$cond1$rep1, y = arrest_rate$cond2$rep1)) +
+  ggplot2::geom_point() +
+  ggplot2::facet_grid(. ~ contig) +
+  ggplot2::geom_abline(colour = "red") +
+  ggplot2::xlab("+CMC arrest rate") +
+  ggplot2::ylab("-CMC arrest rate")
 ```
 
 From the above plot, we can deduce that the arrest rate is higher in the +CMC condition for both rRNAs when compared against -CMC. If you want to look simultaneously at all conditions (HIVRT, SIIIRTMn, and SIIIRTMg), checkout `vignette("JACUSA2helper-meta-conditions")`.
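+
+The `arrest_rate` column can also be recomputed from the `arrest` and `through`
+counts. The following sketch is left unevaluated and assumes that the arrest
+rate is defined as arrest coverage divided by total coverage and that
+`mapply_repl()` returns the same nested cond/repl structure (check
+`?arrest_rate` and `?mapply_repl`):
+
+```{r, eval=FALSE}
+# per-replicate coverages derived from the structured count columns
+arrest_cov <- lapply_repl(HIVRT$arrest, rowSums)
+through_cov <- lapply_repl(HIVRT$through, rowSums)
+
+# combine both inputs replicate-wise
+manual_rate <- mapply_repl(function(a, t) a / (a + t), arrest_cov, through_cov)
+
+# compare with the precomputed column for condition 1, replicate 1
+all.equal(manual_rate$cond1$rep1, HIVRT$arrest_rate$cond1$rep1)
+```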
-### Plot base substitutions - -# TODO -```{r} -#filtered %>% -# add_bc_ratio() %>% -# add_non_ref_ratio() %>% -# dplyr::select(contig, start, end, strand, cond_desc, cond, repl, bc_ratio, ref, non_ref_ratio) %>% -# group_by_site() %>% -# filter_by(any(non_ref_ratio >= 0.05)) %>% -# dplyr::ungroup() %>% -# dplyr::mutate(bc_max = base_call_non_ref(ref, bc_ratio), sub = base_sub(ref, bc_max)) %>% -# ggplot2::ggplot(ggplot2::aes(x = sub, fill = sub)) + -# ggplot2::geom_bar() + -# ggplot2::xlab("Base substitution") + -# ggplot2::scale_fill_discrete(name = "Base substitution") + -# ggplot2::theme( -# legend.position = "bottom", -# axis.text.x = ggplot2::element_text(angle = 90, vjust = 0.5, hjust=1) -# ) + -# ggplot2::facet_grid(cond_desc ~ .) -``` - ## Stranded data - When working with stranded RNA-Seq data, inverting base calls is not necessary because JACUSA2 will automatically invert Single End (SE) and Paired End (PE) depending on the provided library type option "-P" UNSTRANDED|FR_FIRSTSTRAND|RF_SECONDSTRAND". # Input/Output - -There are two functions to read JACUSA2 output `read_result(file)` and `read_results(files, meta_conditions)` which is explained in `vignette("JACUSA2helper-meta-conditions")`. +There are two functions to read JACUSA2 output `read_result(file)` and `read_results(files, meta_conds)` which is explained in `vignette("JACUSA2helper-meta-conditions")`. Use `result <- read_result("JACUSA2.out")` to read and create a JACUSA2 result object from a JACUSA2 output file `JACUSA2.out`. By default, variables stored in the **info** column will be processed and unpacked making them available for further manipulation. - Summary of input/output methods: @@ -303,52 +367,24 @@ Summary of input/output methods: * `write_bedGraph()` Writes a vector of values as bedGraph file. # General function layout - In the following the core functions of JACUSA2helper are presented. Check the respective help page, e.g.: `?bc_ratio` to get more details. `arrest_rate` Calculate arrest rate from arrest and through reads. -`bc` Calculates observed bases from base counts. In most cases this will be called automatically in `read_result()` -`bc_ratio` Calculates a ratio matrix from a base count matrix. +`base_call` Calculates observed bases from base counts. +`base_ratio` Calculates a ratio matrix from a base count matrix. `coverage` Calculates coverage from base counts. In most cases this will be called automatically in `read_result()` `non_ref_ratio` Calculates non reference base ratio for base counts of some base type. `base_sub` Calculates base substitution from reference and base counts, e.g.: A->G. `sub_ratio` Calculates base substitution ratio from reference and observed bases, e.g.: 25% A->G. -All of the core functions mentioned above have wrappers and additional function with the following naming name scheme. -For the core function `bc()` which calculates observed base, there exist the following functions: - -* `add_bc(result, ...)` Calls `bc()` and adds the outcome to the `result` object to the column specified by `bc_col()`. -* `bc_col(base_type = "bases")` Defines the field to be used for the provided `base_type`. Supported values for `base_type` are: -*bases*, *arrest*, or *through*. *bases* is the default value. -* `get_bc(result, ...)` Retrieves the value stored under `bc_col()` in the `result` object. - -For all core function *F*, there exist the following functions: `add_F()`, `get_F()`, and `F_col()`. The exceptions to this rule are: `coverage()` and `base_sub()`. 
In order to maintain short columns names and prevent from overwriting R functions, core JACUSA2 function have the long form and `add_()`, `get_()`, `_col()` the short form (cov, and sub).
-
 # Filter result object
+* `robust()` retains sites where the observation (e.g. base calls) is present in all replicates of at least one condition.
+* `filter_artefact()` removes sites that have been marked as artefacts.
+* `All()`, `Any()` combine conditions on structured columns within `dplyr::filter()`.
 
-This function set enables filtering of sites. Make sure to use the correct parameters in `group_by_site()`.
-
-* `filter_by_robust_arrest_events()` Arrest event specific filter that will remove non robust results. Uses `filter_by()` internally.
-* `filter_by_robust_variants()` Variant specific filter that will remove non robust results. Uses `filter_by()` internally.
-
-# Peak - change data layout
-
-Sometimes it is useful and desired to have observations of some variable(s) in columns. This enables a direct comparison.
-Use `peak(result, peak_cols)` to transform a result object and transform variables defined in `peak_cols` to show them side-by-side.
-In the following, we will transform `rdd` result object to show coverage information.
-
-```{r}
-# TODO
-#data(rdd)
-#peak(rdd, peak_cols = "cov")
-```
-
-`peak()` uses "contig", "start", "end", "strand" columns to identify each site. Change parameter `id_cols` to change this.
-
+# Variant tagging (Base Stratification)
 
 ## References