Working CBRW implementation

- Working function resides in cbrw.R - Updated documentation - Improved performance of Wb generation slightly - Made all dplyr functions specify namespace explicitly - Removed full import of tidyr in favor of only `spread()` - Removed `plot_cbrw()` for now, as it was non-functional - fill arugment in matrix.R#61 replaced with numeric instead of int which was being coerced anyway TODO: More documentation
beansrowning · Mar 19, 2019 · 08e8e02 · 08e8e02
1 parent d9ae52b
commit 08e8e02
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 48 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -12,3 +12,4 @@ License: file LICENSE
 Encoding: UTF-8
 LazyData: true
 RoxygenNote: 6.1.1
+Suggests: testthat
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(biased_trans_matrix)
+export(cbrw)
 import(dplyr)
 import(rlang)
-import(tidyr)
+importFrom(tidyr,spread)
diff --git a/R/cbrw.R b/R/cbrw.R
@@ -0,0 +1,43 @@
+#' @title Coupled Biased Random Walks  
+#'  TODO
+#' @param data a data.frame containing catgorical data
+#' @value the input data frame with an additional \emph{score} variable representing
+#'        relative outlier-ness of the observation
+#' @export
+cbrw <- function(data) {
+  # TODO: defensive programming for type, id vars, high-dimensionality
+
+  # Set up and compute the biased transistion matrix
+  # returns a list with Wb, nodes, and edges
+  computed <- biased_trans_matrix(data, all_data = TRUE)
+
+  # Run random walk on Wb
+  pi_t <- random_walk(computed$Wb)
+
+  # Compute feature relevance and use that to compute
+  # the value score for each feature value
+  computed$nodes <- computed$nodes %>%
+    dplyr::mutate(
+      value_score = c(pi_t)
+   ) %>%
+   dplyr::group_by(feature) %>%
+   dplyr::mutate(
+     rel = sum(value_score)
+   ) %>%
+   dplyr::ungroup() %>%
+   dplyr::mutate(total_score = rel * value_score)
+
+  # Using the integer tibble as an index, create a new tibble
+  # with the value scores in place of each feature value int
+  # then take the rowsum to compute the observation score
+  obs_scores <- computed$data %>%
+    dplyr::mutate_all(dplyr::funs(computed$nodes[["total_score"]][.])) %>%
+    rowSums
+
+  # Append the observation scores and return
+  data <- data %>%
+    dplyr::mutate(score = obs_scores)
+
+  return(data)
+
+}
diff --git a/R/matrix.R b/R/matrix.R
@@ -10,7 +10,7 @@
 #'    \item \strong{Wb} the biased transition matrix
 #' }
 #' @import dplyr
-#' @import tidyr
+#' @importFrom tidyr spread
 #' @import rlang
 #' @export
 biased_trans_matrix <- function(data, all_data = FALSE) {
@@ -58,12 +58,16 @@ biased_trans_matrix <- function(data, all_data = FALSE) {
   # division
   A <- bind_rows(nodes, edges) %>%
     dplyr::select(u, v, p) %>%
-    tidyr::spread(v, p, fill = 0L) %>%
+    tidyr::spread(v, p, fill = 0) %>%
     dplyr::select(-u) %>%
     as.matrix()
 
   # Compute A(u,v)
-  A[lower.tri(A)] <- t(A)[lower.tri(A)]
+  # HACK: A + t(A) and halfing the diagonal
+  # is both faster and less memory intensive
+  # than A[lower.tri(A)] <- t(A)[lower.tri(A)]
+  A <- A + t(A)
+  diag(A) <- diag(A) / 2
   A <- A / diag(A)
   A <- t(A)
   diag(A) <- 0
@@ -80,54 +84,38 @@ biased_trans_matrix <- function(data, all_data = FALSE) {
 
   # If we want additional data, tidy things up a bit
   nodes <- nodes %>%
-    mutate(name = feature_values[u]) %>%
-    select(feature, name, freq, p, intra_dev, is_mode)
+    dplyr::mutate(name = feature_values[u]) %>%
+    dplyr::select(feature, name, freq, p, intra_dev, is_mode)
 
   out <- list(
     nodes = nodes,
     edges = edges,
-    Wb = Wb
+    Wb = Wb,
+    data = converted[[1]]
   )
 
   return(out)
 }
 
-# TODO: Make a nodes + Wb -> tbl_graph and viz
-# function
-plot_cbrw <- function(...) {
-  # TODO
-  hollow_edges <- bind_rows(
-    select(edges, u, v, group),
-    select(edges, v = u, u = v, group)
-  )
+# Internal Random-walk function used to calculate the stationary probabilities
+# pi*
+random_walk <- function(trans_matrix, alpha = 0.95, err_tol = 0.001, max_iter = 100) {
+
+  n <- dim(trans_matrix)[1L]
+  dampen_vec <- rep(1, n) * ( (1 - alpha) / n )
 
-  new_edges <- Wb %>%
-    as.data.frame() %>%
-    setNames(seq_along(.)) %>%
-    mutate(u = row_number()) %>%
-    gather("v", "weight", -u) %>%
-    mutate(v = as.integer(v)) %>%
-    right_join(hollow_edges, by = c("u", "v")) %>%
-    rename(from = u, to = v)
-
-  graph_out <- tbl_graph(
-    nodes = rename(nodes, from = u, to = v),
-    edges = new_edges
-  )
+  pi_t <- (1 / n) * rep(1, n)
 
-  graph_plot <- graph_out %>%
-    ggraph("drl") +
-    geom_edge_arc(
-      arrow = arrow(length = unit(4, "mm")),
-      aes(edge_width = (weight / max(weight)) * 0.01),
-      alpha = 0.4
-    ) +
-    geom_node_point(
-      aes(size = intra_dev),
-      color = "red"
-    ) +
-    geom_node_text(
-      aes(label = feature_values[to]),
-      vjust = -0.6
-    )
-}
+  for (i in seq_len(max_iter)) {
+    pi_next <- dampen_vec + (alpha * t(trans_matrix) %*% pi_t)
+    err <- norm(pi_t - pi_next, type = "I")
+
+    pi_t <- pi_next
+
+    if (err <= err_tol) {
+      break
+    }
+  }
+
+  return(pi_t)
+}
diff --git a/R/util.R b/R/util.R
@@ -9,14 +9,15 @@
 categorical_to_int <- function(data) {
   # Convert all observations into a
   # unique value which can be summarized into
-  # a single vector, V
+  # a single vector of feature values, V
+  # HACK: There has got to be a better way to do this
   data <- Map(
-    paste,
+    f = paste,
     names(data),
     data,
     sep = "=="
   ) %>%
-    as_tibble()
+    dplyr::as_tibble()
 
   # Calculate V, the set of unique feature values
   # within the dataset
@@ -34,4 +35,45 @@ categorical_to_int <- function(data) {
       ),
     all_fact
   )
-}
+}
+
+# TODO: Make a nodes + Wb -> tbl_graph and viz
+# Just scrap code which could become a func eventually
+# function
+plot_cbrw <- function(...) {
+  # TODO
+  hollow_edges <- bind_rows(
+    select(edges, u, v, group),
+    select(edges, v = u, u = v, group)
+  )
+
+  new_edges <- Wb %>%
+    as.data.frame() %>%
+    setNames(seq_along(.)) %>%
+    mutate(u = row_number()) %>%
+    gather("v", "weight", -u) %>%
+    mutate(v = as.integer(v)) %>%
+    right_join(hollow_edges, by = c("u", "v")) %>%
+    rename(from = u, to = v)
+
+  graph_out <- tbl_graph(
+    nodes = rename(nodes, from = u, to = v),
+    edges = new_edges
+  )
+
+  graph_plot <- graph_out %>%
+    ggraph("drl") +
+    geom_edge_arc(
+      arrow = arrow(length = unit(4, "mm")),
+      aes(edge_width = (weight / max(weight)) * 0.01),
+      alpha = 0.4
+    ) +
+    geom_node_point(
+      aes(size = intra_dev),
+      color = "red"
+    ) +
+    geom_node_text(
+      aes(label = feature_values[to]),
+      vjust = -0.6
+    )
+}