Skip to content

Commit

Permalink
Updates: Added directOutlyingness function for univariate dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristianGoueguel committed Jun 23, 2024
1 parent 4123d8f commit dc9a7b7
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(biweight_midvariance)
export(biweight_scale)
export(center)
export(correlation)
export(directOutlyingness)
export(direct_orthogonal)
export(direct_osc)
export(epo)
Expand Down
125 changes: 125 additions & 0 deletions R/directOutlyingness.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#' @title Directional Outlyingness for Skewed Distribution
#'
#' @author Christian L. Goueguel
#'
#' @description
#' This function computes the directional outlyingness of a numeric vector, as
#' proposed by Rousseeuw *et al.* (2017).
#'
#' @details
#' Directional outlyingness takes the potential skewness of the underlying
#' distribution into account, while attaining a smaller computation time and bias.
#' The main idea is to split the sample into two half samples, and then to apply
#' a robust scale estimator to each of them.
#'
#' @references
#' - Rousseeuw, P.J., Raymaekers, J., Hubert, M., (2018).
#' A Measure of Directional Outlyingness With Applications to Image Data and Video.
#' Journal of Computational and Graphical Statistics, 27(2):345–359.
#'
#' @param x A numeric vector
#' @param cutoff.quantile A numeric value between 0 and 1 specifying the quantile for outlier detection (default: 0.995).
#' @param rmZeroes A logical value. If `TRUE`, removes values close to zero (default: `FALSE`).
#' @param maxRatio A numeric value greater than 2. If provided, constrains the ratio between positive and negative scales (default: `NULL`).
#' @param precScale A numeric value specifying the precision scale for near-zero comparisons (default: 1e-10).
#'
#' @return A tibble with columns:
#' \item{data}{The original input data}
#' \item{score}{The calculated outlyingness score}
#' \item{outlier}{Logical; TRUE if the point is identified as potantial outlier}
#'
#' @export directOutlyingness
#'
#' @examples
#' vec <- c(1, 5, 3, 9, 2, 6, 4, 8, 7, 1e3)
#' directOutlyingness(vec)
#'
directOutlyingness <- function(x, cutoff.quantile = 0.995, rmZeroes = FALSE, maxRatio = NULL, precScale = 1e-10) {

x <- x[!is.na(x)]
med <- stats::median(x)
xc <- x - med
n <- length(xc)
h <- n %/% 2
xa <- xc[xc > 0]
xb <- xc[xc < 0]
xa <- c(rep(0, (n - h - length(xa))), xa)
xb <- c(rep(0, (n - h - length(xb))), abs(xb))

if (rmZeroes){
xa <- xa[xa > precScale]
xb <- xb[xb > precScale]
}

if (!is.null(maxRatio)) {
if (maxRatio < 2) {
stop("maxRatio must be at least 2")
} else {
sall <- scale1StepM(x = xc, precScale = precScale)
sa <- min(c(max(sa, sall / maxRatio, na.rm = TRUE), sall * maxRatio), na.rm = TRUE)
sb <- min(c(max(sb, sall / maxRatio, na.rm = TRUE), sall * maxRatio), na.rm = TRUE)
}
} else {
sa <- scale1StepM(x = xa, precScale = precScale)
sb <- scale1StepM(x = xb, precScale = precScale)
}

res <- dplyr::if_else(x >= med, (x - med) / sa, (med - x) / sb)
cutoff <- computeCutoff(res, cutoff.quantile)

tbl <- tibble::tibble(
data = x,
score = res,
outlier = dplyr::if_else(res > cutoff, TRUE, FALSE)
) %>%
dplyr::arrange(dplyr::desc(score))

return(tbl)
}

rhoHuber <- function(x, c = 2.1){
rho <- (x / c)^2
rho[rho > 1] <- 1
r <- 1.54^2 * rho
return(r)
}

loc1StepM <- function(x, c1 = 3, precScale) {
x <- x[!is.na(x)]
medx <- stats::median(x)
ax <- abs(x - medx)
denom <- c1 * stats::median(ax)
mu <- if (denom > precScale) {
ax = ax/denom
w = 1 - ax * ax
w = ((abs(w) + w)/2)^2
sum(x * w ) / sum(w)
}
else {
medx
}
return(mu)
}

scale1StepM <- function(x, precScale) {
x <- x[!is.na(x)]
n <- length(x)
if (n == 0) {
return(0.0)
} else {
sigma0 <- 1.4826 * stats::median(abs(x))
if(sigma0 < precScale) {
return(0.0)
} else {
rho <- rhoHuber(x / sigma0)
return(sigma0 * sqrt(sum(rho) * 2 / n))
}
}
}


computeCutoff <- function(outl, quant) {
Ltemp <- log(0.1 + outl)
cutoff <- exp(stats::qnorm(quant) * stats::mad(Ltemp) + stats::median(Ltemp)) - 0.1
return(cutoff)
}
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ reference:
contents:
- zscore
- iqrMethod
- directOutlyingness

- title: Multivariate filtering

Expand Down
56 changes: 56 additions & 0 deletions man/directOutlyingness.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit dc9a7b7

Please sign in to comment.