Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristianGoueguel committed Jun 25, 2024
1 parent 9ca3e5a commit 1c2e1eb
Show file tree
Hide file tree
Showing 5 changed files with 300 additions and 2 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ export(normalize)
export(o2pls)
export(opls)
export(osc)
export(outlierplot)
export(pareto)
export(pds)
export(peakfit)
Expand Down
15 changes: 15 additions & 0 deletions R/directOutlyingness.R
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,21 @@
#' directOutlyingness(vec)
#'
directOutlyingness <- function(x, cutoff.quantile = 0.995, rmZeroes = FALSE, maxRatio = NULL, precScale = 1e-10) {
if (missing(x)) {
stop("Missing 'x' argument.")
}
if (!is.numeric(x)) {
stop("The input 'x' must be a numeric vector.")
}
if (!is.numeric(cutoff.quantile) || cutoff.quantile < 0 || cutoff.quantile > 1) {
stop("The input 'cutoff.quantile' must be a numeric value between 0 and 1.")
}
if (!is.logical(rmZeroes)) {
stop("'rmZeroes' must be of type boolean (TRUE or FALSE)")
}
if (!is.numeric(precScale) || precScale < 0) {
stop("'precScale' must be a positive numeric value.")
}

x <- x[!is.na(x)]
med <- stats::median(x)
Expand Down
209 changes: 209 additions & 0 deletions R/outlierplot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,209 @@
#' @title Univariate representation of multivariate outliers
#'
#' @author Christian L. Goueguel
#'
#' @description
#' This function creates a visual representation of multivariate outliers using
#' a univariate plot.It uses robust covariance estimation methods to identify
#' outliers and provides options for displaying the results through various
#' plotting styles.
#'
#' @details
#' The function uses the Minimum Covariance Determinant (MCD) method to compute robust estimates
#' of location and scatter. It then applies an adaptive reweighting step to further improve
#' the outlier detection. The results are visualized using `ggplot2`, with options to highlight
#' outliers and/or color-code points based on their Mahalanobis distances.
#'
#' @param x A matrix or data frame.
#' @param quan A numeric value, between 0.5 and 1, that specifies the amount of observations which are used for MCD estimations. Default is 0.5.
#' @param alpha A numeric value specifying the amount of observations used for calculating the adjusted quantile. Default is 0.025.
#' @param show.outlier A logical value, if `TRUE` (default), outliers are highlighted in the plot.
#' @param show.mahal A logical value, if `FALSE` (default), robust Mahalanobis distances are not color-coded in the plot.
#'
#' @return Depending on the combination of `show.outlier` and `show.mahal`:
#' - A `ggplot` object with outliers highlighted (if `show.outlier = TRUE`)
#' - A `ggplot` object with Mahalanobis distances color-coded (if `show.mahal = TRUE`)
#' - A `ggplot` object combining both outlier highlighting and Mahalanobis distance color-coding
#' (if both `show.outlier` and `show.mahal` are `TRUE`)
#' - A tibble containing standardized scores, outlier flags, and Mahalanobis distances
#' (if both `show.outlier` and `show.mahal` are `FALSE`)
#'
#' @export outlierplot
#'
#' @examples
#' # Load the glass dataset from the chemometrics package
#' data(glass, package = "chemometrics")
#'
#' # Basic usage with default parameters
#' outlierplot(glass)
#'
#' # Adjust the proportion of observations used for MCD estimation
#' outlierplot(glass, quan = 0.75)
#'
#' # Show Mahalanobis distances instead of outlier highlighting
#' outlierplot(glass, show.outlier = FALSE, show.mahal = TRUE)
#'
#' # Combine outlier highlighting and Mahalanobis distance color-coding
#' outlierplot(glass, show.outlier = TRUE, show.mahal = TRUE)
#'
#' # Return data frame instead of plot
#' result_df <- outlierplot(glass, show.outlier = FALSE, show.mahal = FALSE)
#' head(result_df)
#'
outlierplot <- function(x, quan = 1/2, alpha = 0.025, show.outlier = TRUE, show.mahal = FALSE) {
if (!is.matrix(x) && !is.data.frame(x) && !tibble::is_tibble(x)) {
stop("'x' must be matrix or data.frame")
}
if (ncol(x) < 2) {
stop("'x' must be at least two-dimensional")
}
if (!is.numeric(quan) || quan < 0.5 || quan > 1) {
stop("'quan' must be a numeric value between 0.5 and 1")
}

if (is.data.frame(x) || tibble::is_tibble(x)) {
x <- as.matrix(x)
}

rob <- robustbase::covMcd(x, alpha = quan)
xarw <- adapt_reweight(x, rob$center, rob$cov, alpha = alpha)

if (xarw$cn != Inf) {
alpha <- sqrt(c(xarw$cn, stats::qchisq(c(0.75, 0.5, 0.25), ncol(x))))
} else {
alpha <- sqrt(stats::qchisq(c(0.975, 0.75, 0.5, 0.25), ncol(x)))
}

dist <- stats::mahalanobis(x, center = rob$center, cov = rob$cov)
sx <- matrix(NA, nrow = nrow(x), ncol = ncol(x))

for (i in 1:ncol(x)) {
sx[,i] <- (x[,i] - xarw$m[i]) / sqrt(xarw$c[i,i])
}
r <- range(sx)
out <- sqrt(dist) > min(sqrt(xarw$cn), sqrt(stats::qchisq(0.975, dim(x)[2])))

s_df <- as.data.frame(sx)
colnames(s_df) <- colnames(x)

s_df <- s_df %>%
tibble::as_tibble() %>%
dplyr::mutate(
outlier = out,
mahalanobis = sqrt(dist),
x = runif(nrow(x), min = -1, max = 1)
)

df_long <- s_df %>%
tidyr::pivot_longer(
cols = !c(outlier, mahalanobis, x),
names_to = "variable",
values_to = "score"
)

p <- ggplot2::ggplot(df_long) +
ggplot2::aes(x = x, y = score) +
ggplot2::geom_hline(yintercept = 0, linewidth = 0.1, linetype = "dotdash") +
ggplot2::ylim(r[1], r[2]) +
ggplot2::facet_grid(cols = ggplot2::vars(variable), scales = "fixed", space = "fixed") +
ggplot2::theme_light() +
ggplot2::theme(
axis.title.x = ggplot2::element_blank(),
axis.title.y = ggplot2::element_blank(),
axis.ticks.x = ggplot2::element_blank(),
axis.text.x = ggplot2::element_blank(),
strip.text = ggplot2::element_text(size = 5),
panel.grid = ggplot2::element_blank(),
legend.position = "bottom"
)

p_outl <- p +
ggplot2::geom_point(ggplot2::aes(color = outlier, shape = outlier)) +
ggplot2::scale_shape(solid = TRUE) +
ggplot2::scale_color_manual(values = c("#efb61f", "#011f4b"))

p_mahal <- p +
ggplot2::geom_point(ggplot2::aes(color = mahalanobis)) +
ggplot2::scale_color_gradient2(
limits = c(round(r[1], 0), round(r[2], 0)),
breaks = c(round(r[1], 0), 0, round(r[2], 0)),
low = "red",
mid = "green",
high = "blue",
midpoint = 0) +
ggplot2::guides(color = ggplot2::guide_colorbar())

p_all <- p_mahal + ggplot2::aes(shape = outlier)

if (show.outlier == TRUE && show.mahal == FALSE) {
return(p_outl)
}

if (show.outlier == FALSE && show.mahal == TRUE) {
return(p_mahal)
}

if (show.outlier == TRUE && show.mahal == TRUE) {
return(p_all)
}

if (show.outlier == FALSE && show.mahal == FALSE) {
return(s_df %>% dplyr::select(-x))
}

}


adapt_reweight <- function(x, m0, c0, alpha, pcrit){
n <- nrow(x)
p <- ncol(x)

if (missing(pcrit)){
if (p <= 10) pcrit <- (0.24 - 0.003 * p) / sqrt(n)
if (p > 10) pcrit <- (0.252 - 0.0018 * p) / sqrt(n)
}

if (missing(alpha)) {
delta <- stats::qchisq(0.975, p)
} else {
delta <- stats::qchisq(1 - alpha, p)
}

d2 <- stats::mahalanobis(x, m0, c0)
d2ord <- sort(d2)
dif <- stats::pchisq(d2ord, p) - (0.5:n) / n
i <- (d2ord >= delta) & (dif > 0)

if (sum(i) == 0) {
alfan <- 0
} else {
alfan <- max(dif[i])
}

if (alfan < pcrit) {
alfan <- 0
}
if (alfan > 0) {
cn <- max(d2ord[n - ceiling(n * alfan)], delta)
} else {
cn <- Inf
}

w <- d2 < cn

if(sum(w) == 0) {
m <- m0
c <- c0
} else {
m <- apply(x[w,], 2, mean)
c1 <- as.matrix(x - tcrossprod(rep(1, n), m))
c <- crossprod(c1 * w, c1) / sum(w)
}

list(
m = m,
c = c,
cn = cn,
w = w
)
}
5 changes: 3 additions & 2 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ reference:
contents:
- zscore
- iqrMethod
- outlierplot
- adjusted_boxplot
- directOutlyingness
- generalized_boxplot

- title: Multivariate filtering

Expand Down Expand Up @@ -51,8 +54,6 @@ reference:
contents:
- plotfit
- plotSpec
- adjusted_boxplot
- generalized_boxplot

- title: Smoothing
desc: >
Expand Down
72 changes: 72 additions & 0 deletions man/outlierplot.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1c2e1eb

Please sign in to comment.