Skip to content

Commit

Permalink
Working CBRW implementation
Browse files Browse the repository at this point in the history
- Working function resides in cbrw.R
- Updated documentation
- Improved performance of Wb generation slightly
- Made all dplyr functions specify namespace explicitly
- Removed full import of tidyr in favor of only `spread()`
- Removed `plot_cbrw()` for now, as it was non-functional
- fill arugment in matrix.R#61 replaced with numeric instead of int which was being coerced anyway

TODO: More documentation
  • Loading branch information
beansrowning committed Mar 19, 2019
1 parent d9ae52b commit 08e8e02
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 48 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ License: file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.1.1
Suggests: testthat
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(biased_trans_matrix)
export(cbrw)
import(dplyr)
import(rlang)
import(tidyr)
importFrom(tidyr,spread)
43 changes: 43 additions & 0 deletions R/cbrw.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#' @title Coupled Biased Random Walks
#' TODO
#' @param data a data.frame containing catgorical data
#' @value the input data frame with an additional \emph{score} variable representing
#' relative outlier-ness of the observation
#' @export
cbrw <- function(data) {
# TODO: defensive programming for type, id vars, high-dimensionality

# Set up and compute the biased transistion matrix
# returns a list with Wb, nodes, and edges
computed <- biased_trans_matrix(data, all_data = TRUE)

# Run random walk on Wb
pi_t <- random_walk(computed$Wb)

# Compute feature relevance and use that to compute
# the value score for each feature value
computed$nodes <- computed$nodes %>%
dplyr::mutate(
value_score = c(pi_t)
) %>%
dplyr::group_by(feature) %>%
dplyr::mutate(
rel = sum(value_score)
) %>%
dplyr::ungroup() %>%
dplyr::mutate(total_score = rel * value_score)

# Using the integer tibble as an index, create a new tibble
# with the value scores in place of each feature value int
# then take the rowsum to compute the observation score
obs_scores <- computed$data %>%
dplyr::mutate_all(dplyr::funs(computed$nodes[["total_score"]][.])) %>%
rowSums

# Append the observation scores and return
data <- data %>%
dplyr::mutate(score = obs_scores)

return(data)

}
74 changes: 31 additions & 43 deletions R/matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#' \item \strong{Wb} the biased transition matrix
#' }
#' @import dplyr
#' @import tidyr
#' @importFrom tidyr spread
#' @import rlang
#' @export
biased_trans_matrix <- function(data, all_data = FALSE) {
Expand Down Expand Up @@ -58,12 +58,16 @@ biased_trans_matrix <- function(data, all_data = FALSE) {
# division
A <- bind_rows(nodes, edges) %>%
dplyr::select(u, v, p) %>%
tidyr::spread(v, p, fill = 0L) %>%
tidyr::spread(v, p, fill = 0) %>%
dplyr::select(-u) %>%
as.matrix()

# Compute A(u,v)
A[lower.tri(A)] <- t(A)[lower.tri(A)]
# HACK: A + t(A) and halfing the diagonal
# is both faster and less memory intensive
# than A[lower.tri(A)] <- t(A)[lower.tri(A)]
A <- A + t(A)
diag(A) <- diag(A) / 2
A <- A / diag(A)
A <- t(A)
diag(A) <- 0
Expand All @@ -80,54 +84,38 @@ biased_trans_matrix <- function(data, all_data = FALSE) {

# If we want additional data, tidy things up a bit
nodes <- nodes %>%
mutate(name = feature_values[u]) %>%
select(feature, name, freq, p, intra_dev, is_mode)
dplyr::mutate(name = feature_values[u]) %>%
dplyr::select(feature, name, freq, p, intra_dev, is_mode)

out <- list(
nodes = nodes,
edges = edges,
Wb = Wb
Wb = Wb,
data = converted[[1]]
)

return(out)
}

# TODO: Make a nodes + Wb -> tbl_graph and viz
# function
plot_cbrw <- function(...) {
# TODO
hollow_edges <- bind_rows(
select(edges, u, v, group),
select(edges, v = u, u = v, group)
)
# Internal Random-walk function used to calculate the stationary probabilities
# pi*
random_walk <- function(trans_matrix, alpha = 0.95, err_tol = 0.001, max_iter = 100) {

n <- dim(trans_matrix)[1L]
dampen_vec <- rep(1, n) * ( (1 - alpha) / n )

new_edges <- Wb %>%
as.data.frame() %>%
setNames(seq_along(.)) %>%
mutate(u = row_number()) %>%
gather("v", "weight", -u) %>%
mutate(v = as.integer(v)) %>%
right_join(hollow_edges, by = c("u", "v")) %>%
rename(from = u, to = v)

graph_out <- tbl_graph(
nodes = rename(nodes, from = u, to = v),
edges = new_edges
)
pi_t <- (1 / n) * rep(1, n)

graph_plot <- graph_out %>%
ggraph("drl") +
geom_edge_arc(
arrow = arrow(length = unit(4, "mm")),
aes(edge_width = (weight / max(weight)) * 0.01),
alpha = 0.4
) +
geom_node_point(
aes(size = intra_dev),
color = "red"
) +
geom_node_text(
aes(label = feature_values[to]),
vjust = -0.6
)
}
for (i in seq_len(max_iter)) {
pi_next <- dampen_vec + (alpha * t(trans_matrix) %*% pi_t)
err <- norm(pi_t - pi_next, type = "I")

pi_t <- pi_next

if (err <= err_tol) {
break
}
}

return(pi_t)
}
50 changes: 46 additions & 4 deletions R/util.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,15 @@
categorical_to_int <- function(data) {
# Convert all observations into a
# unique value which can be summarized into
# a single vector, V
# a single vector of feature values, V
# HACK: There has got to be a better way to do this
data <- Map(
paste,
f = paste,
names(data),
data,
sep = "=="
) %>%
as_tibble()
dplyr::as_tibble()

# Calculate V, the set of unique feature values
# within the dataset
Expand All @@ -34,4 +35,45 @@ categorical_to_int <- function(data) {
),
all_fact
)
}
}

# TODO: Make a nodes + Wb -> tbl_graph and viz
# Just scrap code which could become a func eventually
# function
plot_cbrw <- function(...) {
# TODO
hollow_edges <- bind_rows(
select(edges, u, v, group),
select(edges, v = u, u = v, group)
)

new_edges <- Wb %>%
as.data.frame() %>%
setNames(seq_along(.)) %>%
mutate(u = row_number()) %>%
gather("v", "weight", -u) %>%
mutate(v = as.integer(v)) %>%
right_join(hollow_edges, by = c("u", "v")) %>%
rename(from = u, to = v)

graph_out <- tbl_graph(
nodes = rename(nodes, from = u, to = v),
edges = new_edges
)

graph_plot <- graph_out %>%
ggraph("drl") +
geom_edge_arc(
arrow = arrow(length = unit(4, "mm")),
aes(edge_width = (weight / max(weight)) * 0.01),
alpha = 0.4
) +
geom_node_point(
aes(size = intra_dev),
color = "red"
) +
geom_node_text(
aes(label = feature_values[to]),
vjust = -0.6
)
}

0 comments on commit 08e8e02

Please sign in to comment.