Merged old, broken, and new

dieterich-lab · Oct 12, 2020 · bf7e574 · bf7e574
1 parent e047091
commit bf7e574
Show file tree

Hide file tree

Showing 127 changed files with 1,167 additions and 3,541 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,9 +1,9 @@
 Package: JACUSA2helper
 Type: Package
 Title: Post-processing for JACUSA2 output
-Version: 1.98-3
-Depends: R (>= 2.10)
-Date: 2019-11-10
+Version: 1.99-1
+Depends: R (>= 3.10)
+Date: 2020-10-11
 Author: Michael Piechotta
 Maintainer: <michael.piechotta@gmail.com>
 Description: This package enables post-processing of JACUSA2 output: read, filter, plot, write.
@@ -22,10 +22,9 @@ Imports:
   rlang, 
   magrittr,
   ggplot2, 
-  GGally, 
   parallel,
   scales,
-  tibble, 
+  sticky,
   methods, 
-  stringr
+  stringr 
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,59 +1,25 @@
 # Generated by roxygen2: do not edit by hand
 
+export(All)
+export(Any)
 export(add_arrest_rate)
-export(add_bc)
-export(add_bc_ratio)
-export(add_cov)
-export(add_non_ref_ratio)
-export(add_sub)
-export(add_sub_ratio)
-export(arrest_col)
+export(apply_cond)
+export(apply_repl)
 export(arrest_rate)
-export(arrest_rate_col)
-export(base_call)
-export(base_call_non_ref)
 export(base_count)
 export(base_sub)
-export(bc_col)
 export(bc_ratio)
-export(bc_ratio_col)
 export(clean_read_sub)
-export(copy_attr)
-export(cov_col)
-export(coverage)
-export(extract_ref)
 export(filter_all_artefacts)
 export(filter_artefact)
-export(filter_by)
-export(filter_by_max_score)
-export(filter_by_min_score)
-export(filter_by_robust_arrest_events)
-export(filter_by_robust_variants)
-export(get_arrest_rate)
-export(get_bc)
-export(get_bc_ratio)
-export(get_cov)
-export(get_non_ref_ratio)
-export(get_sub)
-export(get_sub_ratio)
-export(group_by_site)
 export(mask_sub)
 export(merge_sub)
 export(non_ref_ratio)
-export(non_ref_ratio_col)
-export(peak)
 export(plot_ecdf_allele_count)
 export(plot_ecdf_column)
-export(plot_pairs_matrix)
 export(read_result)
 export(read_results)
-export(redefine_ref)
-export(robust_arrest_events)
-export(sub_col)
+export(robust)
 export(sub_ratio)
-export(sub_ratio_col)
-export(through_col)
 export(write_bedGraph)
 importFrom(magrittr,"%>%")
-importFrom(rlang,syms)
-importFrom(tibble,add_column)
diff --git a/R/arrest-rate.R b/R/arrest-rate.R
@@ -1,36 +1,36 @@
 #' Add arrest rate.
 #' 
-#' Adds arrest rate calculated from \code{arrest[_suffix]} and \code{through[_suffix]} columns.
+#' Calculates and adds arrest rate from \code{arrest} and \code{through} columns.
 #' 
 #' @param result object created by \code{read_result()} or \code{read_results()}.
-#' @param suffix string added to specifiy base columns to calculate arrest rate. Default: NULL
-#' @return result object with arrest rate field \code{arrest_rate[_suffix]} added.
+#' @param cores Integer defines how many cores to use.
+#' @return result object with arrest rate field \code{arrest_rate} added.
+#'
 #' @export
-add_arrest_rate <- function(result, suffix = NULL) {
-  check_column_exists(result, arrest_col(suffix))
-  check_column_exists(result, through_col(suffix))
+add_arrest_rate <- function(result, cores = 1) {
+  arrest_cov <- .cov(result[[.ARREST_HELPER_COL]])
+  through_cov <- .cov(result[[.THROUGH_HELPER_COL]])
 
-  result[[arrest_rate_col(suffix)]] <- arrest_rate(
-    get_cov(result, arrest_col(suffix)),
-    get_cov(result, through_col(suffix))
-  )
 
-  result
-}
+  result[[.ARREST_RATE_COL]] <- lapply(result[[.ARREST_HELPER_COL]], function(x) {
+    tidyr::as_tibble(lapply(x, function(y) {
+      rowSums(y)
+    }))
+  }) %>% c() %>% tidyr::as_tibble()
 
-#' Retrieve arrest rate.
-#' 
-#' Retrieves arrest rate from \code{arrest[_suffix]} and \code{through[_suffix]} columns.
-#' 
-#' @param result object created by \code{read_result()} or \code{read_results()}.
-#' @param suffix string added to specifiy base columns to retrieve arrest rate. Default: NULL
-#' @return numeric vector with arrest rates for base counts with \code{suffix}.
-#' @examples
-#' data(HIVRT)
-#' str(get_arrest_rate(HIVRT))
-#' @export
-get_arrest_rate <- function(result, suffix = NULL) {
-  result[[arrest_rate_col(suffix)]]
+  # FIXME make it more robust
+  cond_count <- names(result[[.THROUGH_HELPER_COL]]) %>% length()
+  for (cond in paste0("cond", 1:cond_count)) {
+    cond_through <- result[[.THROUGH_HELPER_COL]][[cond]]
+    for (repl in names(cond_through)) {
+      arrest_cov <- result[[.ARREST_RATE_COL]][[cond]][[repl]]
+      through_cov <- rowSums(cond_through[[repl]])
+      
+      result[[.ARREST_RATE_COL]][[cond]][[repl]] <- arrest_rate(arrest_cov, through_cov)
+    }
+  }
+
+  result
 }
 
 #' Calculate arrest rate.
@@ -48,42 +48,3 @@ get_arrest_rate <- function(result, suffix = NULL) {
 arrest_rate <- function(arrest_cov, through_cov) {
   arrest_cov / (arrest_cov + through_cov)
 }
-
-#' Column name for arrest rate for \code{suffix}.
-#' 
-#' Column name for arrest ratio for \code{suffix}. 
-#' \code{suffix} will be used to create the column name:
-#' \code{arrest_rate[_suffix]}.
-#' 
-#' @param suffix string to specifiy base columns to retrieve arrest rate. Default: NULL
-#' @return string the represents the column name for arrest rate with  marked \code{suffix}.
-#' @export
-arrest_rate_col <- function(suffix = NULL) {
-  process_col(ARREST_RATE, suffix)
-}
-
-#' Column name for arrest base counts for \code{suffix}.
-#' 
-#' Column name for arrest base counts for \code{suffix}. 
-#' \code{suffix} will be used to create the column name:
-#' \code{arrest[_suffix]}".
-#' 
-#' @param suffix string to specifiy base columns to retrieve arrest bases Default: NULL
-#' @return string the represents the column name for arrest bases marked with \code{suffix}.
-#' @export
-arrest_col <- function(suffix = NULL) {
-  process_col(ARREST_COLUMN, suffix)
-}
-
-#' Column name for through base counts for \code{suffix}.
-#' 
-#' Column name for through base counts for \code{suffix}. 
-#' \code{suffix} will be used to create the column name:
-#' \code{through[_suffix]}".
-#' 
-#' @param suffix string to specifiy base columns to retrieve through bases Default: NULL
-#' @return string the represents the column name for through bases marked with \code{suffix}.
-#' @export
-through_col <- function(suffix = NULL) {
-  process_col(THROUGH_COLUMN, suffix)
-}
diff --git a/R/base-count.R b/R/base-count.R
@@ -1,6 +1,6 @@
 #' Retrieves the total number of observed unique bases.
 #'
-#' Retrieves the total number of observed unique bases for \code{base_type}.
+#' Retrieves the total number of observed unique bases.
 #' If \code{ref} is provided, reference base will be considered for counting, e.g.:
 #' At some site only base 'A' was observed but the reference was 'T'.
 #' \code{base_count} will be:

diff --git a/R/base_sub.R b/R/base_sub.R
@@ -0,0 +1,31 @@
+#' Calculates reference base to base call (base substitution).
+#' 
+#' Calculates and formats base changes for \code{ref} and \code{bc}.
+#' All elements in \code{ref} must be all length 1.
+#' 
+#' @param ref vector of reference bases.
+#' @param bc vector of base calls.
+#' @return vector of formatted base call substitutions.
+#' @examples
+#' ref <- c("A", "A", "C", "A")
+#' bc <- c("AG", "G", "C", "G")
+#' base_sub(ref, bc)
+#' @export
+base_sub <- function(ref, bc) {
+  if (all(nchar(ref) != 1)) {
+    stop("All ref elements must be nchar() == 1")
+  }
+
+  # remove ref in bc
+  # goal: A->G instead of A->AG
+  bc <- mapply(function(r, o) {
+    return(gsub(r, "", o))
+  }, ref, bc)
+
+  # add nice separator
+  sub <- paste(ref, sep = .SUB_SEP, bc)
+  # add nice info when there is no change
+  sub[ref == bc | bc == ""] <- .SUB_NO_CHANGE
+
+  sub
+}
diff --git a/R/bases.R b/R/bases.R
diff --git a/R/bc-ratio.R b/R/bc-ratio.R
@@ -1,52 +1,3 @@
-#' Adds base call ratio.
-#' 
-#' Calculates and adds base call ratio for all possible bases.
-#' For a given \code{base_type}, calculates a base ratio vector \code{bc_ratio[_base_type]} 
-#' for all possible bases (A, C, G, and T).
-#' 
-#' @param result created by \code{read_result()} or \code{read_results()}.
-#' @param base_type string specifying base column. Default: bases.
-#' @return result object with base call ratio added.
-#' @examples
-#' data(rdd)
-#' # use default bases to define ratio
-#' result <- add_bc_ratio(rdd)
-#' str(subset(result, select = grep("bc_ratio", names(result))))
-#' 
-#' data(HIVRT)
-#' # use arrest bases
-#' result <- add_bc_ratio(HIVRT, "arrest")
-#' subset(result, select = grep("bc_ratio_arrest", names(result)))
-#' @export 
-add_bc_ratio <- function(result, base_type = "bases") {
-  check_column_exists(result, base_type)
-  result[[bc_ratio_col(base_type)]] <- bc_ratio(result[[base_type]])
-
-  result
-}
-
-#' Retrive base call ratio from a result object.
-#'
-#' Extracts base call ratio for some \code{base_type} from a JACUSA2 result object.
-#'
-#' @param result created by \code{read_result()} or \code{read_results()}.
-#' @param base_type string specifying base column. Default: bases.
-#' @return numeric tibble of base call ratios for \code{base_type}.
-#' @examples
-#' data(rdd)
-#' # use default bases to define ratio
-#' ratio <- get_bc_ratio(rdd)
-#' str(ratio)
-#' 
-#' data(HIVRT)
-#' # use arrest bases to define ratio
-#' ratio <- get_bc_ratio(rdd, "arrest")
-#' str(ratio)
-#' @export
-get_bc_ratio <- function(result, base_type = "bases") {
-  result[[bc_ratio_col(base_type)]]
-}
-
 #' Calculate base call ratios for base counts.
 #' 
 #' Calculates base call ratios for base counts. 
@@ -64,28 +15,14 @@ bc_ratio <- function(bases) {
   m <- matrix(
     0.0, 
     nrow = nrow(bases), 
-    ncol = length(BASES)
+    ncol = length(.BASES)
   ) 
-  colnames(m) <- BASES
-  df <- tibble::as_tibble(m)
+  colnames(m) <- .BASES
+  df <- tidyr::as_tibble(m)
 
   # calculate ratio only for sites with cov > 0 -> x / cov
   non_zero_i <- total > 0
   df[non_zero_i, ] <- bases[non_zero_i, ] / total[non_zero_i]
 
   df
 }
-
-#' Column name for base call ratios for \code{base_type}.
-#' 
-#' Column name for base call ratios for \code{base_type}. 
-#' \code{base_type} will be used to create the column name:
-#' \code{bc_ratio[_base_type]}. Using \code{base_type = "bases"} 
-#' will result in the field "bc_ratio".
-#' 
-#' @param base_type string specifiying base column. Default: bases.
-#' @return string that represents the column name for base call ratios for \code{base_type}.
-#' @export
-bc_ratio_col <- function(base_type = "bases") {
-  process_col(BC_RATIO, base_type)
-}