Merge pull request #35 from meantrix/review_joss

Remarks regarding JOSS paper
meantrix · Mar 4, 2025 · fa7c6da · fa7c6da
2 parents fa51ed6 + 409c6fb
commit fa7c6da
Show file tree

Hide file tree

Showing 26 changed files with 419 additions and 381 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: corrp
 Type: Package
 Title: Compute Correlations Type Analysis in Parallel
-Version: 0.5.0
+Version: 0.6.0
 Authors@R: 
     c(
         person("Igor", "Dornelles Schoeller Siciliani", email = "igor@meantrix.com", role = c("aut", "cre")),
@@ -17,7 +17,7 @@ Description: Compute multiple types of correlation analyses, including
     the average correlation clustering algorithm and distance 
     correlation t-test.
 Imports: 
-    Rcpp (>= 1.0.13-1),
+    Rcpp (>= 1.0.13),
     RcppArmadillo (>= 14.2.2-1),
     corrplot (>= 0.95),
     lsr (>= 0.5.2),

diff --git a/NEWS.md b/NEWS.md
@@ -3,11 +3,18 @@
 
 ## 0.6.0
 
+Dedicated version for the publication of the corrp package in JOSS.
+
 - Add `VignetteBuilder: knitr` to DESCRIPTION
 - Add useful error messages for required parameters.
 - Fix C++ `Astar` method.
 - Run benchmarks, expand the paper to include statements on resource-intensive options, and incorporate an enhanced version of `energy::dcorT.test`. Also, change the data used in the paper.
-
+- Update paper:
+  - Give a more detailed explanation of the `ACCA` algorithm.
+  - Strengthened the statement of need.
+  - Make a map of correlation → R method.
+  - Provide a brief remark on the symmetry of the correlation matrix.
+- Update `README.md` in accordance with the changes in the paper and functions.
 
 ### Methods Added
 
@@ -21,9 +28,11 @@
 
 ### Documentation
 
-- Enhanced the documentation for `corrp` by including examples, refining the pair type section with additional details and references, and providing a more comprehensive explanation of the output format and its interpretation.
+- Enhanced the documentation for `corrp` and `corr_fun` by including examples, refining the pair type section with additional details and references, and providing a more comprehensive explanation of the output format and its interpretation.
 - Improved the documentation for `corr_rm` by adding examples and providing a clearer explanation of the `c` parameter.
+- Improved the documentation for `acca` by adding examples and providing a more detailed explanation in the description.
 - Added examples of usage in the documentation for: `acca`, `best_acca`, `corrp`, `corr_rm`, `corr_matrix`, `corr_fun`, `ptest`, `sil_acca`.
+- Fix grammar and ensure package style cohesion.
 
 
 ## 0.5.0

diff --git a/R/acca.R b/R/acca.R
@@ -3,47 +3,61 @@
 #' @importFrom RcppArmadillo armadillo_version
 #' @importFrom corrplot corrplot
 
+
 #' @title Average correlation clustering algorithm
 #'
-#' @description A C++ implementation of the ACCA method
-#' that works directly with the correlation
-#' matrix derived from the \code{\link{corr_matrix}} function.
-#' In this sense, this implementation differs from the original,
-#' it works with mixed data and several correlation methods.
+#' @description A C++ implementation of the Average Correlation Clustering Algorithm (ACCA) \url{https://www.sciencedirect.com/science/article/pii/S1532046410000158}, originally developed for genetic studies using Pearson correlation as a similarity measure. Unlike traditional clustering methods that rely on distance metrics such as Euclidean or Mahalanobis distance, ACCA groups data based on correlation patterns.\cr
+#' This implementation works directly with the correlation matrix derived from the \code{\link{corr_matrix}} function and supports mixed data types along with various correlation methods.\cr
+#' ACCA is an unsupervised clustering method, meaning it identifies patterns without predefined labels. Similar to k-means, it requires defining the `k` parameter, which controls the number of clusters.
 #'
 #' @param m  \[\code{matrix(1)}]\cr correlation matrix from
 #' \code{\link{corr_matrix}} or a distance matrix.
 #' @param k \[\code{integer(1)}]\cr number of clusters considered.
 #' @param maxrep \[\code{integer(1)}]\cr maximum number of
 #' iterations without change in the clusters.
 #' @param maxiter \[\code{integer(1)}]\cr maximum number of iterations.
-#' @param ... Additional arguments .
+#' @param ... Not used. Included for S3 method consistency.
 #'
 #' @return \[\code{acca_list(k)}]\cr A list with the
 #' final result of the clustering method.
-#'  That is, the name of the variables belonging to each cluster k.
+#'  That is, each element of the list holds the names of the variables belonging to one of the k clusters.
 #'
 #' @author Igor D.S. Siciliani, Paulo H. dos Santos
 #'
-#' @keywords correlation , acca
+#' @keywords correlation, acca
 #'
 #' @references
 #' Bhattacharya, Anindya, and Rajat K. De.
 #' "Average correlation clustering algorithm (ACCA) for grouping of co-regulated
 #' genes with similar pattern of variation in their expression values."
 #' Journal of Biomedical Informatics 43.4 (2010): 560-568.
-#' 
+#'
 #' @examples
 #'
+#' # Clustering a correlation matrix with 3 clusters
+#' x <- corrp::corrp(iris)
+#' m <- corrp::corr_matrix(x)
+#' result <- corrp::acca(m, k = 3)
+#' print(result)
+#'
+#' # Clustering with 5 clusters and increasing the maximum number of iterations
 #' x <- corrp::corrp(iris)
 #' m <- corrp::corr_matrix(x)
-#' corrp::acca(m, 2)
+#' result <- corrp::acca(m, k = 5, maxiter = 200)
+#' print(result)
+#'
+#'
+#' # Adjusting the maximum number of iterations without change in clusters
+#' x <- corrp::corrp(iris)
+#' m <- corrp::corr_matrix(x)
+#' result <- corrp::acca(m, k = 2, maxrep = 50)
+#' print(result)
 #'
 #' @export
 #'
 acca <- function(m, k, ...) {
   assert_required_argument(m, "The 'm' argument must be a cmatrix object, which is the output from corr_matrix function, or it must be a matrix.")
-  assert_required_argument(m, "The 'k' argument must be the number of number of clusters considered.")
+  assert_required_argument(k, "The 'k' argument must be the number of clusters considered.")
   UseMethod("acca", m)
 }
 
@@ -68,11 +82,9 @@ acca.cmatrix <- function(m, k, maxrep = 2L, maxiter = 100L, ...) {
 #' @rdname acca
 acca.matrix <- function(m, k, maxrep = 2L, maxiter = 100L, ...) {
   warning(
-    "m is not an object of the 'cmatrix'
-    class some results may go wrong."
+    "m is not an object of the 'cmatrix' class, so some results may be incorrect."
   )
 
-
   k <- as.integer(k)
   maxrep <- as.integer(maxrep)
   maxiter <- as.integer(maxiter)

diff --git a/R/best_acca.R b/R/best_acca.R
@@ -2,7 +2,7 @@
 #'
 #' @description Determining the optimal number of
 #' clusters in the ACCA clustering using the
-#' average silhouette aproach.
+#' average silhouette approach.
 #'
 #' @param m  \[\code{matrix(1)}]\cr correlation matrix
 #' from \code{\link{corr_matrix}}.
@@ -12,7 +12,7 @@
 #' without change in the clusters in the ACCA method.
 #' @param maxiter \[\code{integer(1)}]\cr maximum number
 #' of iterations in the ACCA method.
-#' @param ... Additional arguments.
+#' @param ... Not used. Included for S3 method consistency.
 #'
 #' @return \[\code{list(3)}]\cr A list with:
 #' average silhouette per k `$silhouette.ave`;
@@ -22,7 +22,7 @@
 #'
 #' @author Igor D.S. Siciliani, Paulo H. dos Santos
 #'
-#' @keywords silhouette , acca , optimal , k
+#' @keywords silhouette, acca, optimal, k
 #'
 #' @references
 #' Leonard Kaufman; Peter J. Rousseeuw (1990).
@@ -68,11 +68,9 @@ best_acca.cmatrix <- function(m, mink, maxk, maxrep = 2L, maxiter = 100L, ...) {
 #' @rdname best_acca
 best_acca.matrix <- function(m, mink, maxk, maxrep = 2L, maxiter = 100L, ...) {
   warning(
-    "m is not an object of the 'cmatrix'
-    class some results may go wrong."
+    "m is not an object of the 'cmatrix' class, so some results may be incorrect."
   )
 
-
   mink <- as.integer(mink)
   maxk <- as.integer(maxk)
   maxrep <- as.integer(maxrep)

diff --git a/R/corr_fun.R b/R/corr_fun.R
@@ -1,50 +1,45 @@
-#' @title Compute Correlation type analysis with Statistical Significance
+#' @title Compute Correlation Type Analysis with Statistical Significance
 #'
-#' @description Compute correlation type analysis
-#' on two mixed classes columns of a given dataframe.
-#'   The dataframe is allowed to have columns of these four classes: integer,
-#'   numeric, factor and character. The character column is considered as
-#'   categorical variable.
+#' @description Performs correlation type analysis
+#' on two mixed-class columns of a given dataframe.
+#'   The dataframe can contain columns of four types: integer,
+#'   numeric, factor, and character. The character column is considered as
+#'   a categorical variable.
 #'
 #' @name corr_fun
 #'
 #' @inheritSection corrp Pair Types
 #'
 #'
 #' @return list with all statistical results.\cr
-#' - All statistical tests are controlled by the confidence internal of
-#'   p.value param. If the statistical tests do not 
-#' obtain a significance greater/less
-#'   than p.value the value of variable `isig` will be `FALSE`.\cr
-#' - There is no statistical significance test 
-#' for the pps algorithm. By default `isig` is TRUE.\cr
-#' - If any errors occur during operations by 
-#' default the association measure(`infer.value`) will be `NA`.
+#' All statistical tests are controlled by the confidence interval of the `p.value` parameter. If the statistical tests do not obtain a significance greater/less than `p.value`, the value of the variable `isig` will be `FALSE`.\cr
+#' If any errors occur during operations, the association measure (`infer.value`) will be `NA`.\cr
+#' The result `data` and `index` will have \eqn{N^2} rows, where N is the number of variables of the input data.\cr
+#' By default there is no statistical significance test for the PPS algorithm. In this case `isig` is NA; you can enable the test by setting `ptest = TRUE` in `pps.args`.\cr
+#' All the `*.args` arguments can modify the parameters (`p.value`, `comp`, `alternative`, `num.s`, `rk`, `ptest`) of the method named in their prefix.
 #'
 #'
 #' @inheritParams corrp
-#' @param nx \[\code{character(1)}]\cr first variable column name: independent/predictor variable. 
+#' @param nx \[\code{character(1)}]\cr first variable column name: independent/predictor variable.
 #' @param ny \[\code{character(1)}]\cr second variable column name: dependent/target variable.
 #'
 #'
 #' @author Igor D.S. Siciliani, Paulo H. dos Santos
 #'
-#' @keywords correlation , power predictive score , linear model , distance correlation ,
-#' mic , point biserial , pearson , cramer'sV
+#' @keywords correlation, power predictive score, linear model, distance correlation, mic, point biserial, pearson, cramer'sV
 #'
 #' @references
-#' KS Srikanth,sidekicks,cor2, 2020.
+#' KS Srikanth, sidekicks, cor2, 2020.
 #' URL \url{https://github.com/talegari/sidekicks/}.
 #'
-#'
-#' Paul van der Laken, ppsr,2021.
+#' Paul van der Laken, ppsr, 2021.
 #' URL \url{https://github.com/paulvanderlaken/ppsr}.
 #'
 #' @examples
-#' 
+#'
 #' # since both `nx` and `ny` columns are numerical the method type is defined by `cor.nn`
 #' corr_fun(iris, nx = "Sepal.Length", ny = "Sepal.Width", cor.nn = "dcor")
-#' 
+#'
 #' @export
 corr_fun <- function(df,
                      nx,
@@ -64,16 +59,10 @@ corr_fun <- function(df,
                      mic.args = list(),
                      pps.args = list(ptest = FALSE),
                      cramersV.args = list(),
-                     uncoef.args = list(),
-                     ...) {
-
-
-  assert_required_argument(df, 
-    "The 'df' argument must be a data.frame containing the data to analyze.")
-  assert_required_argument(nx, 
-    "The 'nx' argument must be a character vector specifying a column name from 'df' for the independent variable(s).")
-  assert_required_argument(ny, 
-    "The 'ny' argument must be a character string specifying a column name from 'df' for the dependent variable.")
+                     uncoef.args = list()) {
+  assert_required_argument(df, "The 'df' argument must be a data.frame containing the data to analyze.")
+  assert_required_argument(nx, "The 'nx' argument must be a character vector specifying a column name from 'df' for the independent variable(s).")
+  assert_required_argument(ny, "The 'ny' argument must be a character string specifying a column name from 'df' for the dependent variable.")
 
   alternative <- match.arg(alternative)
   cor.nn <- match.arg(cor.nn)
@@ -191,7 +180,7 @@ corr_fun <- function(df,
   }
 
   if (inherits(r, "try-error")) {
-    msg <- ""   
+    msg <- ""
     if (verbose) {
       warnings(cat(
         "ERROR: some operations produces Nas values.", "\n",

diff --git a/R/corr_matrix.R b/R/corr_matrix.R
@@ -1,23 +1,26 @@
-#' @title Create Correlation Matrix from corrp inferences
+#' @title Create Correlation Matrix from corrp Inferences
 #'
-#' @description Through the results obtained from corrp function
-#' create a correlation matrix.
+#' @description Using the results obtained from the corrp function,
+#' this function creates a correlation matrix.
 #'
-#' @param c \[\code{clist(1)}]\cr output from the \code{\link{corrp}} function.
-#' @param col \[\code{character(1)}]\cr choose the column to be used in the correlation matrix.
-#' @param isig \[\code{logical(1)}]\cr values that are not statistically significant will
-#' be represented by NA or FALSE in the correlation matrix.
-#' @param ... Additional arguments (TODO).
+#' @param c \[\code{clist(1)}]\cr Output from the \code{\link{corrp}} function.
+#' @param col \[\code{character(1)}]\cr Specifies the column to be used in the correlation matrix.
+#' @param isig \[\code{logical(1)}]\cr Determines whether values that are not statistically significant
+#' should be represented by NA or FALSE in the correlation matrix.
+#' @param ... Not used. Included for S3 method consistency.
 #'
 #' @author Igor D.S. Siciliani, Paulo H. dos Santos
 #'
-#' @keywords correlation matrix , corrp
+#' @keywords correlation matrix, corrp
 #'
 #' @examples
 #'
 #' iris_cor <- corrp(iris)
 #' iris_m <- corr_matrix(iris_cor, isig = FALSE)
-#' corrplot::corrplot(iris_m)
+#' if (require("corrplot")) {
+#'   corrplot(iris_m) # You can visualize the matrix using corrplot
+#' }
+#'
 #' @export
 corr_matrix <- function(c, ...) {
   assert_required_argument(c, "The 'c' argument must be a clist object, which is the output from corrp.")
@@ -27,18 +30,18 @@ corr_matrix <- function(c, ...) {
 #' @export
 #' @rdname corr_matrix
 corr_matrix.default <- function(c, col = c("infer.value", "stat.value", "isig"), isig = TRUE, ...) {
-  warning("it is not an object of the 'clist' class some results may go wrong.")
+  warning("The provided object is not of class 'clist'; some results may be incorrect.")
 
-  .corr_matrix(c = c, col = col, isig = isig, ...)
+  .corr_matrix(c = c, col = col, isig = isig)
 }
 
 #' @export
 #' @rdname corr_matrix
 corr_matrix.clist <- function(c, col = c("infer.value", "stat.value", "isig"), isig = TRUE, ...) {
-  .corr_matrix(c = c, col = col, isig = isig, ...)
+  .corr_matrix(c = c, col = col, isig = isig)
 }
 
-.corr_matrix <- function(c, col = c("infer.value", "stat.value", "isig"), isig = TRUE, ...) {
+.corr_matrix <- function(c, col = c("infer.value", "stat.value", "isig"), isig = TRUE) {
   checkmate::assert_names(names(c), identical.to = c("data", "index"))
   checkmate::assert_logical(isig, len = 1)
   stopifnot(all(unique(c$index$i) == unique(c$index$j)))