From 20c68d35824d2ffb032deaa147776832fafcd12f Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Thu, 18 Dec 2025 14:42:27 +0100 Subject: [PATCH 01/11] feat: add recognized source control domain detection --- R/data_source_control.R | 149 ++++++++++++++++++++++++++++++++++++++++ R/options.R | 17 ++++- 2 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 R/data_source_control.R diff --git a/R/data_source_control.R b/R/data_source_control.R new file mode 100644 index 0000000..4e7615d --- /dev/null +++ b/R/data_source_control.R @@ -0,0 +1,149 @@ +#' Source Control Metrics +#' +#' Metrics for assessing whether a package has source code on a recognized +#' hosting platform based on an allow-list of known domains. +#' +#' @section Metrics: +#' +#' \describe{ +#' \item{`recognized_source_url`}{The inferred URL of the package's source +#' control repository, extracted from the URL and BugReports fields in the +#' DESCRIPTION file and matched against an allow-list of recognized domains.} +#' \item{`has_recognized_source`}{A logical indicating whether the package has +#' a source code repository on a recognized hosting platform from the +#' allow-list.} +#' } +#' +#' @section Allow-List Approach: +#' +#' These metrics use an allow-list of recognized source control hosting domains +#' to identify source repositories. This means: +#' \itemize{ +#' \item Only URLs matching domains in the allow-list are recognized +#' \item Packages may have source control not detected if hosted elsewhere +#' \item The allow-list is customizable to include additional domains +#' \item Results indicate recognized hosting, not definitive source control presence +#' } +#' +#' @section Customization: +#' +#' The recognized domains can be customized. For details on the default +#' recognized domains and how to customize them, see [`options`]. 
+#' +#' This is useful for: +#' \itemize{ +#' \item Adding self-hosted GitLab, Gitea, or Forgejo instances +#' \item Including internal enterprise git hosting services +#' \item Supporting additional federated git providers +#' } +#' +#' @examples +#' \dontrun{ +#' # Check if a package has source on a recognized platform +#' p <- pkg("ggplot2") +#' p$has_recognized_source # TRUE +#' p$recognized_source_url # "https://github.com/tidyverse/ggplot2" +#' +#' # Customize to add self-hosted GitLab to the allow-list +#' # Save the original value to restore later +#' old_domains <- opt_set("source_control_domains", c( +#' "github.com", +#' "gitlab.com", +#' "git.mycompany.com" +#' )) +#' +#' # Or extend the defaults +#' default_domains <- opt("source_control_domains") +#' old_domains <- opt_set("source_control_domains", c( +#' default_domains, +#' "git.mycompany.com", +#' "forge.myorg.net" +#' )) +#' +#' # Restore original value +#' opt_set("source_control_domains", old_domains) +#' } +#' +#' @seealso [`options`] for details on the `source_control_domains` option +#' @name source_control_metrics +#' @include impl_data.R +NULL + +# Source control URL extraction and inference +impl_data( + "recognized_source_url", + class = class_character, + title = "Recognized Source Control URL", + description = paste( + "The URL of the package's source control repository on a recognized", + "hosting platform, determined by matching against an allow-list of", + "known domains. Extracted from the URL and BugReports fields in the", + "DESCRIPTION file. The allow-list can be customized; see ?options for", + "details." + ), + function(pkg, resource, field, ...) 
{ + # Get URLs from DESCRIPTION file + desc_urls <- tryCatch( + pkg$desc$get_urls(), + error = function(e) character(0) + ) + + # Get BugReports field if available + bug_reports <- tryCatch( + pkg$desc$get_field("BugReports"), + error = function(e) character(0) + ) + + # Combine all URLs to check, filtering out empty strings and NAs + all_urls <- c(desc_urls, bug_reports) + all_urls <- all_urls[!is.na(all_urls) & nzchar(all_urls)] + + if (length(all_urls) == 0) { + return(NA_character_) + } + + # Get recognized source control domains from options + source_control_domains <- opt("source_control_domains") + + # Try to find a URL matching known source control domains + # We use case-insensitive matching for domains + for (url in all_urls) { + for (domain in source_control_domains) { + if (grepl(domain, url, ignore.case = TRUE)) { + return(url) + } + } + } + + # No known source control URL found + NA_character_ + } +) + +impl_data( + "has_recognized_source", + class = class_logical, + metric = TRUE, + tags = c("best practice"), + permissions = c(), + title = "Has Recognized Source Repository", + description = paste( + "Indicates whether the package has a source code repository on a", + "recognized hosting platform from an allow-list of known domains.", + "Inferred from the URL and BugReports fields in the DESCRIPTION file.", + "See ?options for customizing the allow-list." + ), + function(pkg, resource, field, ...) { + !is.na(pkg$recognized_source_url) + } +) + +# Mock implementation for random packages +impl_data( + "has_recognized_source", + for_resource = mock_resource, + function(pkg, resource, field, ...) { + # Simulate realistic distribution - most packages have recognized source + runif(1) > 0.2 + } +) diff --git a/R/options.R b/R/options.R index b38c869..c59bbd4 100644 --- a/R/options.R +++ b/R/options.R @@ -26,5 +26,20 @@ define_options( "Silences console output during evaluation. 
This applies when pulling package resources (such as download and installation output) and executing code (for example, running `R CMD check`)", - quiet = TRUE + quiet = TRUE, + + fmt("Recognized source control hosting domains used when inferring whether a + package has a source code repository on a recognized hosting platform. + Customize this to add additional git hosting services (e.g., self-hosted + GitLab instances or other federated git providers)."), + source_control_domains = c( + "github.com", + "gitlab.com", + "bitbucket.org", + "r-forge.r-project.org", + "codeberg.org", + "sr.ht", # sourcehut + "gitea.com", + "git.sr.ht" + ) ) From 8d03dec91cec4070bb76655ee92451f3a758daea Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Thu, 18 Dec 2025 14:44:04 +0100 Subject: [PATCH 02/11] docs: update documentation --- DESCRIPTION | 3 +- man/metrics.Rd | 7 +++ man/options.Rd | 10 +++++ man/options_params.Rd | 5 +++ man/source_control_metrics.Rd | 80 +++++++++++++++++++++++++++++++++++ 5 files changed, 104 insertions(+), 1 deletion(-) create mode 100644 man/source_control_metrics.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 0fb2f50..d0c2e1c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -84,6 +84,7 @@ Collate: 'data_desc.R' 'data_downloads_total.R' 'data_r_cmd_check.R' + 'data_source_control.R' 'data_vignettes.R' 'data_web_html.R' 'generic_metric_coerce.R' @@ -96,7 +97,7 @@ Collate: 'zzz.R' Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 3.5) LazyData: true diff --git a/man/metrics.Rd b/man/metrics.Rd index 0541b5d..96f0252 100644 --- a/man/metrics.Rd +++ b/man/metrics.Rd @@ -61,6 +61,13 @@ For access to \emph{all} the internally calculated data, pass \code{all = TRUE}. 
\Sexpr[stage=render,results=rd]{if (!is.na(match("network", getOption("val.meter.permissions")))) "\\\\ifelse{html}{\\\\figure{badge-req-network-x-flat-square-green.svg}{options: alt = \\"[network]\\"}}{\\\\strong{[req::network]}}" else "\\\\ifelse{html}{\\\\figure{badge-req-network-x-flat-square-red.svg}{options: alt = \\"[network]\\"}}{\\\\strong{[req::network]}}"} \Sexpr[stage=install,results=rd]{if (numeric_version(paste0(R.version$major, ".", R.version$minor)) < "4.5.0") { "\\\\ifelse{html}{\\\\figure{badge-adoption-x-flat-square-blue.svg}{options: alt = \\"[adoption]\\"}}{\\\\strong{[adoption]}}\\n\\\\ifelse{html}{\\\\figure{badge-transient-x-flat-square-blue.svg}{options: alt = \\"[transient]\\"}}{\\\\strong{[transient]}}\\n\\\\ifelse{html}{\\\\figure{badge-version--independent-x-flat-square-blue.svg}{options: alt = \\"[version-independent]\\"}}{\\\\strong{[version-independent]}}" } else { "\\\\link[val.meter:tags]{\\\\ifelse{html}{\\\\figure{badge-adoption-x-flat-square-blue.svg}{options: alt = \\"[adoption]\\"}}{\\\\strong{[adoption]}}}\\n\\\\link[val.meter:tags]{\\\\ifelse{html}{\\\\figure{badge-transient-x-flat-square-blue.svg}{options: alt = \\"[transient]\\"}}{\\\\strong{[transient]}}}\\n\\\\link[val.meter:tags]{\\\\ifelse{html}{\\\\figure{badge-version--independent-x-flat-square-blue.svg}{options: alt = \\"[version-independent]\\"}}{\\\\strong{[version-independent]}}}" }} +} + \subsection{Has Recognized Source Repository}{ +\code{} Indicates whether the package has a source code repository on a recognized hosting platform from an allow-list of known domains. Inferred from the URL and BugReports fields in the DESCRIPTION file. See ?options for customizing the allow-list. 
+ + + +\Sexpr[stage=install,results=rd]{if (numeric_version(paste0(R.version$major, ".", R.version$minor)) < "4.5.0") { "\\\\ifelse{html}{\\\\figure{badge-best_practice-x-flat-square-blue.svg}{options: alt = \\"[best practice]\\"}}{\\\\strong{[best practice]}}" } else { "\\\\link[val.meter:tags]{\\\\ifelse{html}{\\\\figure{badge-best_practice-x-flat-square-blue.svg}{options: alt = \\"[best practice]\\"}}{\\\\strong{[best practice]}}}" }} } \subsection{Dependency Count}{ \code{} the number of required dependencies diff --git a/man/options.Rd b/man/options.Rd index 9436013..b861199 100644 --- a/man/options.Rd +++ b/man/options.Rd @@ -51,6 +51,16 @@ resources (such as download and installation output) and executing code \item{envvar: }{R_VAL_METER_QUIET (evaluated if possible, raw string otherwise)} }} +\item{source_control_domains}{\describe{ +Recognized source control hosting domains used when inferring whether a +package has a source code repository on a recognized hosting platform. +Customize this to add additional git hosting services (e.g., self-hosted +GitLab instances or other federated git providers).\item{default: }{\preformatted{c("github.com", "gitlab.com", "bitbucket.org", "r-forge.r-project.org", + "codeberg.org", "sr.ht", "gitea.com", "git.sr.ht")}} +\item{option: }{val.meter.source_control_domains} +\item{envvar: }{R_VAL_METER_SOURCE_CONTROL_DOMAINS (evaluated if possible, raw string otherwise)} +}} + } } diff --git a/man/options_params.Rd b/man/options_params.Rd index e3a9d6f..5e588b2 100644 --- a/man/options_params.Rd +++ b/man/options_params.Rd @@ -18,6 +18,11 @@ calculating metrics. (Defaults to \code{policy()}, overwritable using option 'va \item{logs}{Logging directory where artifacts will be stored. Defaults to a temporary directory. 
(Defaults to \code{ns_tmp_root()}, overwritable using option 'val.meter.logs' or environment variable 'R_VAL_METER_LOGS')} + +\item{source_control_domains}{Recognized source control hosting domains used when inferring whether a +package has a source code repository on a recognized hosting platform. +Customize this to add additional git hosting services (e.g., self-hosted +GitLab instances or other federated git providers). (Defaults to \verb{c("github.com", "gitlab.com", "bitbucket.org", "r-forge.r-project.org", ; "codeberg.org", "sr.ht", "gitea.com", "git.sr.ht")}, overwritable using option 'val.meter.source_control_domains' or environment variable 'R_VAL_METER_SOURCE_CONTROL_DOMAINS')} } \description{ Options As Parameters diff --git a/man/source_control_metrics.Rd b/man/source_control_metrics.Rd new file mode 100644 index 0000000..4288d74 --- /dev/null +++ b/man/source_control_metrics.Rd @@ -0,0 +1,80 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_source_control.R +\name{source_control_metrics} +\alias{source_control_metrics} +\title{Source Control Metrics} +\description{ +Metrics for assessing whether a package has source code on a recognized +hosting platform based on an allow-list of known domains. +} +\section{Metrics}{ + + +\describe{ +\item{\code{recognized_source_url}}{The inferred URL of the package's source +control repository, extracted from the URL and BugReports fields in the +DESCRIPTION file and matched against an allow-list of recognized domains.} +\item{\code{has_recognized_source}}{A logical indicating whether the package has +a source code repository on a recognized hosting platform from the +allow-list.} +} +} + +\section{Allow-List Approach}{ + + +These metrics use an allow-list of recognized source control hosting domains +to identify source repositories. 
This means: +\itemize{ +\item Only URLs matching domains in the allow-list are recognized +\item Packages may have source control not detected if hosted elsewhere +\item The allow-list is customizable to include additional domains +\item Results indicate recognized hosting, not definitive source control presence +} +} + +\section{Customization}{ + + +The recognized domains can be customized. For details on the default +recognized domains and how to customize them, see \code{\link{options}}. + +This is useful for: +\itemize{ +\item Adding self-hosted GitLab, Gitea, or Forgejo instances +\item Including internal enterprise git hosting services +\item Supporting additional federated git providers +} +} + +\examples{ +\dontrun{ +# Check if a package has source on a recognized platform +p <- pkg("ggplot2") +p$has_recognized_source # TRUE +p$recognized_source_url # "https://github.com/tidyverse/ggplot2" + +# Customize to add self-hosted GitLab to the allow-list +# Save the original value to restore later +old_domains <- opt_set("source_control_domains", c( + "github.com", + "gitlab.com", + "git.mycompany.com" +)) + +# Or extend the defaults +default_domains <- opt("source_control_domains") +old_domains <- opt_set("source_control_domains", c( + default_domains, + "git.mycompany.com", + "forge.myorg.net" +)) + +# Restore original value +opt_set("source_control_domains", old_domains) +} + +} +\seealso{ +\code{\link{options}} for details on the \code{source_control_domains} option +} From 6cba12ad4bd4ca80f5beefcaecdfeb1d39ecd543 Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Thu, 18 Dec 2025 15:38:44 +0100 Subject: [PATCH 03/11] style: fix linting --- R/data_source_control.R | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/R/data_source_control.R b/R/data_source_control.R index 4e7615d..662ddd9 100644 --- a/R/data_source_control.R +++ b/R/data_source_control.R @@ -8,7 +8,8 @@ #' \describe{ #' \item{`recognized_source_url`}{The 
inferred URL of the package's source #' control repository, extracted from the URL and BugReports fields in the -#' DESCRIPTION file and matched against an allow-list of recognized domains.} +#' DESCRIPTION file and matched against an allow-list of recognized +#' domains.} #' \item{`has_recognized_source`}{A logical indicating whether the package has #' a source code repository on a recognized hosting platform from the #' allow-list.} @@ -22,7 +23,8 @@ #' \item Only URLs matching domains in the allow-list are recognized #' \item Packages may have source control not detected if hosted elsewhere #' \item The allow-list is customizable to include additional domains -#' \item Results indicate recognized hosting, not definitive source control presence +#' \item Results indicate recognized hosting, not definitive source control +#' presence #' } #' #' @section Customization: @@ -51,7 +53,7 @@ #' "gitlab.com", #' "git.mycompany.com" #' )) -#' +#' #' # Or extend the defaults #' default_domains <- opt("source_control_domains") #' old_domains <- opt_set("source_control_domains", c( @@ -59,7 +61,7 @@ #' "git.mycompany.com", #' "forge.myorg.net" #' )) -#' +#' #' # Restore original value #' opt_set("source_control_domains", old_domains) #' } @@ -87,24 +89,24 @@ impl_data( pkg$desc$get_urls(), error = function(e) character(0) ) - + # Get BugReports field if available bug_reports <- tryCatch( pkg$desc$get_field("BugReports"), error = function(e) character(0) ) - + # Combine all URLs to check, filtering out empty strings and NAs all_urls <- c(desc_urls, bug_reports) all_urls <- all_urls[!is.na(all_urls) & nzchar(all_urls)] - + if (length(all_urls) == 0) { return(NA_character_) } - + # Get recognized source control domains from options source_control_domains <- opt("source_control_domains") - + # Try to find a URL matching known source control domains # We use case-insensitive matching for domains for (url in all_urls) { @@ -114,7 +116,7 @@ impl_data( } } } - + # No known source 
control URL found NA_character_ } From 4adcd1a73ebc6a4dfec5eb917ef3b32dd5e7ee24 Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Thu, 18 Dec 2025 15:39:24 +0100 Subject: [PATCH 04/11] test: add basic tests for data source control metric --- tests/testthat/test-data_source_control.R | 253 ++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 tests/testthat/test-data_source_control.R diff --git a/tests/testthat/test-data_source_control.R b/tests/testthat/test-data_source_control.R new file mode 100644 index 0000000..57d8fd4 --- /dev/null +++ b/tests/testthat/test-data_source_control.R @@ -0,0 +1,253 @@ +describe("source control metrics with real packages", { + # Test with real packages that have known source control URLs + it("recognizes ggplot2 on GitHub", { + skip_if_offline() + skip_on_cran() + + p <- pkg("ggplot2") + expect_true(p$has_recognized_source) + expect_true(grepl( + "github.com", + p$recognized_source_url, + ignore.case = TRUE + )) + }) + + it("recognizes dplyr on GitHub", { + skip_if_offline() + skip_on_cran() + + p <- pkg("dplyr") + expect_true(p$has_recognized_source) + expect_true(grepl( + "github.com", + p$recognized_source_url, + ignore.case = TRUE + )) + }) +}) + +describe("source control metrics implementation details", { + # Test the actual implementation logic with mocked desc objects + it("extracts URLs from DESCRIPTION and matches against domains", { + # Create a temporary DESCRIPTION file + tmp_dir <- withr::local_tempdir() + desc_file <- file.path(tmp_dir, "DESCRIPTION") + + writeLines(c( + "Package: testpkg", + "Version: 1.0.0", + "Title: Test Package", + "Description: A test package.", + "URL: https://github.com/user/testpkg", + "BugReports: https://github.com/user/testpkg/issues" + ), desc_file) + + d <- desc::desc(file = desc_file) + + # Get URLs and check matching logic + urls <- d$get_urls() + domains <- opt("source_control_domains") + + expect_true(length(urls) > 0) + expect_true(any(sapply(urls, 
function(url) { + any(sapply(domains, function(domain) { + grepl(domain, url, ignore.case = TRUE) + })) + }))) + }) + + it("returns NA when no URLs match recognized domains", { + tmp_dir <- withr::local_tempdir() + desc_file <- file.path(tmp_dir, "DESCRIPTION") + + writeLines(c( + "Package: testpkg", + "Version: 1.0.0", + "Title: Test Package", + "Description: A test package.", + "URL: https://example.com/testpkg" + ), desc_file) + + d <- desc::desc(file = desc_file) + urls <- d$get_urls() + domains <- opt("source_control_domains") + + # None should match + matches <- any(sapply(urls, function(url) { + any(sapply(domains, function(domain) { + grepl(domain, url, ignore.case = TRUE) + })) + })) + + expect_false(matches) + }) + + it("is case-insensitive when matching domains", { + tmp_dir <- withr::local_tempdir() + desc_file <- file.path(tmp_dir, "DESCRIPTION") + + writeLines(c( + "Package: testpkg", + "Version: 1.0.0", + "Title: Test Package", + "Description: A test package.", + "URL: https://GitHub.COM/user/testpkg" + ), desc_file) + + d <- desc::desc(file = desc_file) + urls <- d$get_urls() + + # Should match despite different case + matched <- any(sapply(urls, function(url) { + grepl("github.com", url, ignore.case = TRUE) + })) + + expect_true(matched) + }) + + it("handles BugReports field", { + tmp_dir <- withr::local_tempdir() + desc_file <- file.path(tmp_dir, "DESCRIPTION") + + writeLines(c( + "Package: testpkg", + "Version: 1.0.0", + "Title: Test Package", + "Description: A test package.", + "BugReports: https://github.com/user/testpkg/issues" + ), desc_file) + + d <- desc::desc(file = desc_file) + + # BugReports should be available + bug_reports <- tryCatch( + d$get_field("BugReports"), + error = function(e) character(0) + ) + + expect_true(length(bug_reports) > 0) + expect_true(grepl("github.com", bug_reports, ignore.case = TRUE)) + }) +}) + +describe("source control option customization", { + it("respects custom source_control_domains option", { + 
skip_if_offline() + skip_on_cran() + + # Save original and set custom domains + old_domains <- opt_set("source_control_domains", c("custom.org")) + on.exit(opt_set("source_control_domains", old_domains)) + + # Create test DESCRIPTION with custom domain + tmp_dir <- withr::local_tempdir() + desc_file <- file.path(tmp_dir, "DESCRIPTION") + + writeLines(c( + "Package: testpkg", + "Version: 1.0.0", + "Title: Test Package", + "Description: A test package.", + "URL: https://custom.org/testpkg" + ), desc_file) + + d <- desc::desc(file = desc_file) + urls <- d$get_urls() + domains <- opt("source_control_domains") + + # Should match custom domain + expect_equal(domains, "custom.org") + expect_true(any(sapply(urls, function(url) { + grepl("custom.org", url, ignore.case = TRUE) + }))) + }) +}) + +describe("source control metrics metadata", { + it("has_recognized_source is registered as a metric", { + info <- pkg_data_info("has_recognized_source") + expect_true(info@metric) + }) + + it("has_recognized_source has 'best practice' tag", { + info <- pkg_data_info("has_recognized_source") + expect_true("best practice" %in% info@tags) + }) + + it("recognized_source_url is not a metric", { + info <- pkg_data_info("recognized_source_url") + expect_false(info@metric) + }) + + it("metrics have non-empty titles", { + info_metric <- pkg_data_info("has_recognized_source") + info_data <- pkg_data_info("recognized_source_url") + + expect_true(length(info_metric@title) > 0) + expect_true(nchar(info_metric@title) > 0) + expect_true(length(info_data@title) > 0) + expect_true(nchar(info_data@title) > 0) + }) + + it("metrics have non-empty descriptions", { + info_metric <- pkg_data_info("has_recognized_source") + info_data <- pkg_data_info("recognized_source_url") + + expect_true(length(info_metric@description) > 0) + expect_true(length(info_data@description) > 0) + }) +}) + +describe("source control option integration", { + it("default domains include major platforms", { + domains <- 
opt("source_control_domains") + + expect_true("github.com" %in% domains) + expect_true("gitlab.com" %in% domains) + expect_true("bitbucket.org" %in% domains) + expect_true(length(domains) >= 5) + }) + + it("can extend default domains", { + defaults <- opt("source_control_domains") + old_domains <- opt_set("source_control_domains", c( + defaults, + "git.company.com" + )) + on.exit(opt_set("source_control_domains", old_domains)) + + extended <- opt("source_control_domains") + expect_true("git.company.com" %in% extended) + expect_true("github.com" %in% extended) + expect_equal(length(extended), length(defaults) + 1) + }) + + it("can replace domains entirely", { + old_domains <- opt_set("source_control_domains", c("custom.org")) + on.exit(opt_set("source_control_domains", old_domains)) + + domains <- opt("source_control_domains") + expect_equal(domains, "custom.org") + expect_equal(length(domains), 1) + }) +}) + +describe("source control mock implementation", { + it("random packages have realistic source control distribution", { + # Generate multiple random packages and check distribution + n <- 100 + results <- replicate(n, { + p <- random_pkg() + p$has_recognized_source + }) + + # Should be logical + expect_type(results, "logical") + + # Most packages should have recognized source (around 80%) + prop_with_source <- mean(results, na.rm = TRUE) + expect_true(prop_with_source > 0.5) + expect_true(prop_with_source < 1.0) + }) +}) From 2a101b6d4428d9c386e23ba40f93a06ca900fd23 Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Thu, 18 Dec 2025 15:45:34 +0100 Subject: [PATCH 05/11] docs: standardize docs for the package to follow other implementations --- R/data_source_control.R | 71 ------------------------------- man/source_control_metrics.Rd | 80 ----------------------------------- 2 files changed, 151 deletions(-) delete mode 100644 man/source_control_metrics.Rd diff --git a/R/data_source_control.R b/R/data_source_control.R index 662ddd9..a0756d7 100644 --- 
a/R/data_source_control.R +++ b/R/data_source_control.R @@ -1,75 +1,4 @@ -#' Source Control Metrics -#' -#' Metrics for assessing whether a package has source code on a recognized -#' hosting platform based on an allow-list of known domains. -#' -#' @section Metrics: -#' -#' \describe{ -#' \item{`recognized_source_url`}{The inferred URL of the package's source -#' control repository, extracted from the URL and BugReports fields in the -#' DESCRIPTION file and matched against an allow-list of recognized -#' domains.} -#' \item{`has_recognized_source`}{A logical indicating whether the package has -#' a source code repository on a recognized hosting platform from the -#' allow-list.} -#' } -#' -#' @section Allow-List Approach: -#' -#' These metrics use an allow-list of recognized source control hosting domains -#' to identify source repositories. This means: -#' \itemize{ -#' \item Only URLs matching domains in the allow-list are recognized -#' \item Packages may have source control not detected if hosted elsewhere -#' \item The allow-list is customizable to include additional domains -#' \item Results indicate recognized hosting, not definitive source control -#' presence -#' } -#' -#' @section Customization: -#' -#' The recognized domains can be customized. For details on the default -#' recognized domains and how to customize them, see [`options`]. 
-#' -#' This is useful for: -#' \itemize{ -#' \item Adding self-hosted GitLab, Gitea, or Forgejo instances -#' \item Including internal enterprise git hosting services -#' \item Supporting additional federated git providers -#' } -#' -#' @examples -#' \dontrun{ -#' # Check if a package has source on a recognized platform -#' p <- pkg("ggplot2") -#' p$has_recognized_source # TRUE -#' p$recognized_source_url # "https://github.com/tidyverse/ggplot2" -#' -#' # Customize to add self-hosted GitLab to the allow-list -#' # Save the original value to restore later -#' old_domains <- opt_set("source_control_domains", c( -#' "github.com", -#' "gitlab.com", -#' "git.mycompany.com" -#' )) -#' -#' # Or extend the defaults -#' default_domains <- opt("source_control_domains") -#' old_domains <- opt_set("source_control_domains", c( -#' default_domains, -#' "git.mycompany.com", -#' "forge.myorg.net" -#' )) -#' -#' # Restore original value -#' opt_set("source_control_domains", old_domains) -#' } -#' -#' @seealso [`options`] for details on the `source_control_domains` option -#' @name source_control_metrics #' @include impl_data.R -NULL # Source control URL extraction and inference impl_data( diff --git a/man/source_control_metrics.Rd b/man/source_control_metrics.Rd deleted file mode 100644 index 4288d74..0000000 --- a/man/source_control_metrics.Rd +++ /dev/null @@ -1,80 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/data_source_control.R -\name{source_control_metrics} -\alias{source_control_metrics} -\title{Source Control Metrics} -\description{ -Metrics for assessing whether a package has source code on a recognized -hosting platform based on an allow-list of known domains. 
-} -\section{Metrics}{ - - -\describe{ -\item{\code{recognized_source_url}}{The inferred URL of the package's source -control repository, extracted from the URL and BugReports fields in the -DESCRIPTION file and matched against an allow-list of recognized domains.} -\item{\code{has_recognized_source}}{A logical indicating whether the package has -a source code repository on a recognized hosting platform from the -allow-list.} -} -} - -\section{Allow-List Approach}{ - - -These metrics use an allow-list of recognized source control hosting domains -to identify source repositories. This means: -\itemize{ -\item Only URLs matching domains in the allow-list are recognized -\item Packages may have source control not detected if hosted elsewhere -\item The allow-list is customizable to include additional domains -\item Results indicate recognized hosting, not definitive source control presence -} -} - -\section{Customization}{ - - -The recognized domains can be customized. For details on the default -recognized domains and how to customize them, see \code{\link{options}}. 
- -This is useful for: -\itemize{ -\item Adding self-hosted GitLab, Gitea, or Forgejo instances -\item Including internal enterprise git hosting services -\item Supporting additional federated git providers -} -} - -\examples{ -\dontrun{ -# Check if a package has source on a recognized platform -p <- pkg("ggplot2") -p$has_recognized_source # TRUE -p$recognized_source_url # "https://github.com/tidyverse/ggplot2" - -# Customize to add self-hosted GitLab to the allow-list -# Save the original value to restore later -old_domains <- opt_set("source_control_domains", c( - "github.com", - "gitlab.com", - "git.mycompany.com" -)) - -# Or extend the defaults -default_domains <- opt("source_control_domains") -old_domains <- opt_set("source_control_domains", c( - default_domains, - "git.mycompany.com", - "forge.myorg.net" -)) - -# Restore original value -opt_set("source_control_domains", old_domains) -} - -} -\seealso{ -\code{\link{options}} for details on the \code{source_control_domains} option -} From 667bdeea81acda6211e20e2ea3bea5776308878e Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Thu, 18 Dec 2025 15:57:17 +0100 Subject: [PATCH 06/11] fix: adjust package to pass checks --- DESCRIPTION | 3 ++- tests/testthat/test-data_source_control.R | 29 ----------------------- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d0c2e1c..b893589 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,8 @@ Suggests: knitr, rmarkdown, rosv, - testthat (>= 3.0.0) + testthat (>= 3.0.0), + withr Config/Needs/vignettes: knitr, rmarkdown, diff --git a/tests/testthat/test-data_source_control.R b/tests/testthat/test-data_source_control.R index 57d8fd4..04b3b59 100644 --- a/tests/testthat/test-data_source_control.R +++ b/tests/testthat/test-data_source_control.R @@ -1,32 +1,3 @@ -describe("source control metrics with real packages", { - # Test with real packages that have known source control URLs - it("recognizes ggplot2 on GitHub", { - 
skip_if_offline() - skip_on_cran() - - p <- pkg("ggplot2") - expect_true(p$has_recognized_source) - expect_true(grepl( - "github.com", - p$recognized_source_url, - ignore.case = TRUE - )) - }) - - it("recognizes dplyr on GitHub", { - skip_if_offline() - skip_on_cran() - - p <- pkg("dplyr") - expect_true(p$has_recognized_source) - expect_true(grepl( - "github.com", - p$recognized_source_url, - ignore.case = TRUE - )) - }) -}) - describe("source control metrics implementation details", { # Test the actual implementation logic with mocked desc objects it("extracts URLs from DESCRIPTION and matches against domains", { From b17dd806fac6732fdc0d831e08725a0f8d9e621d Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Wed, 7 Jan 2026 16:51:44 +0100 Subject: [PATCH 07/11] feat: update description of data documentation The data documentation uses Rd syntax now --- R/data_source_control.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/R/data_source_control.R b/R/data_source_control.R index a0756d7..d2b944b 100644 --- a/R/data_source_control.R +++ b/R/data_source_control.R @@ -6,11 +6,11 @@ impl_data( class = class_character, title = "Recognized Source Control URL", description = paste( - "The URL of the package's source control repository on a recognized", - "hosting platform, determined by matching against an allow-list of", - "known domains. Extracted from the URL and BugReports fields in the", - "DESCRIPTION file. The allow-list can be customized; see ?options for", - "details." + "The \\acronym{URL} of the package's source control repository on a", + "recognized hosting platform, determined by matching against an", + "allow-list of known domains. Extracted from the \\acronym{URL} and", + "\\code{BugReports} fields in the \\code{DESCRIPTION} file. The", + "allow-list can be customized; see \\code{?options} for details." ), function(pkg, resource, field, ...) 
{ # Get URLs from DESCRIPTION file @@ -61,8 +61,9 @@ impl_data( description = paste( "Indicates whether the package has a source code repository on a", "recognized hosting platform from an allow-list of known domains.", - "Inferred from the URL and BugReports fields in the DESCRIPTION file.", - "See ?options for customizing the allow-list." + "Inferred from the \\acronym{URL} and \\code{BugReports} fields in the", + "\\code{DESCRIPTION} file. See \\code{?options} for customizing the", + "allow-list." ), function(pkg, resource, field, ...) { !is.na(pkg$recognized_source_url) From 51eb942a960fee8b6c362a46971773d6093c83d2 Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Wed, 7 Jan 2026 16:55:54 +0100 Subject: [PATCH 08/11] feat: change structure of return value of recognized_source_url Instead of returning a length 1 char vector which possibly can be NA, we are returning a character vector that can be of any length and whose elements are never NA. If the length of the vector is 0, this indicates that no recognized source URL was found. 
--- R/data_source_control.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/data_source_control.R b/R/data_source_control.R index d2b944b..31ca79a 100644 --- a/R/data_source_control.R +++ b/R/data_source_control.R @@ -30,24 +30,24 @@ impl_data( all_urls <- all_urls[!is.na(all_urls) & nzchar(all_urls)] if (length(all_urls) == 0) { - return(NA_character_) + return(character(0)) } # Get recognized source control domains from options source_control_domains <- opt("source_control_domains") - # Try to find a URL matching known source control domains - # We use case-insensitive matching for domains - for (url in all_urls) { - for (domain in source_control_domains) { - if (grepl(domain, url, ignore.case = TRUE)) { - return(url) - } - } - } + # Find all URLs matching known source control domains + # We use case-insensitive matching by converting to lowercase + # and fixed = TRUE to avoid regex special character issues (e.g., '.') + is_src_url <- vapply( + tolower(source_control_domains), + grepl, + logical(length(all_urls)), + x = tolower(all_urls), + fixed = TRUE + ) - # No known source control URL found - NA_character_ + all_urls[rowSums(is_src_url) > 0] } ) @@ -66,7 +66,7 @@ impl_data( "allow-list." ), function(pkg, resource, field, ...) 
{ - !is.na(pkg$recognized_source_url) + length(pkg$recognized_source_url) > 0L } ) From 5e9f1b301f6c4f4b23f359368f18f6d4bc56c92f Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Wed, 7 Jan 2026 16:56:31 +0100 Subject: [PATCH 09/11] refactor: rename file to follow data name --- R/{data_source_control.R => data_recognized_source.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{data_source_control.R => data_recognized_source.R} (100%) diff --git a/R/data_source_control.R b/R/data_recognized_source.R similarity index 100% rename from R/data_source_control.R rename to R/data_recognized_source.R From fe8d30c21da182987b86d230d31c813722165cd5 Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Wed, 7 Jan 2026 16:58:22 +0100 Subject: [PATCH 10/11] test: update tests to reflect recent changes --- ...test-data_source_control.R => test-data_recognized_source.R} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename tests/testthat/{test-data_source_control.R => test-data_recognized_source.R} (98%) diff --git a/tests/testthat/test-data_source_control.R b/tests/testthat/test-data_recognized_source.R similarity index 98% rename from tests/testthat/test-data_source_control.R rename to tests/testthat/test-data_recognized_source.R index 04b3b59..a03c31c 100644 --- a/tests/testthat/test-data_source_control.R +++ b/tests/testthat/test-data_recognized_source.R @@ -28,7 +28,7 @@ describe("source control metrics implementation details", { }))) }) - it("returns NA when no URLs match recognized domains", { + it("returns empty vector when no URLs match recognized domains", { tmp_dir <- withr::local_tempdir() desc_file <- file.path(tmp_dir, "DESCRIPTION") From e5f4beadd5d53bd1f883d3e7617cad8eae19acc6 Mon Sep 17 00:00:00 2001 From: Dominik Rafacz Date: Wed, 7 Jan 2026 16:58:38 +0100 Subject: [PATCH 11/11] docs: update description and readme --- DESCRIPTION | 2 +- man/metrics.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION 
b/DESCRIPTION index b893589..d7f673c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -85,7 +85,7 @@ Collate: 'data_desc.R' 'data_downloads_total.R' 'data_r_cmd_check.R' - 'data_source_control.R' + 'data_recognized_source.R' 'data_vignettes.R' 'data_web_html.R' 'generic_metric_coerce.R' diff --git a/man/metrics.Rd b/man/metrics.Rd index 96f0252..95cb6eb 100644 --- a/man/metrics.Rd +++ b/man/metrics.Rd @@ -63,7 +63,7 @@ For access to \emph{all} the internally calculated data, pass \code{all = TRUE}. \Sexpr[stage=install,results=rd]{if (numeric_version(paste0(R.version$major, ".", R.version$minor)) < "4.5.0") { "\\\\ifelse{html}{\\\\figure{badge-adoption-x-flat-square-blue.svg}{options: alt = \\"[adoption]\\"}}{\\\\strong{[adoption]}}\\n\\\\ifelse{html}{\\\\figure{badge-transient-x-flat-square-blue.svg}{options: alt = \\"[transient]\\"}}{\\\\strong{[transient]}}\\n\\\\ifelse{html}{\\\\figure{badge-version--independent-x-flat-square-blue.svg}{options: alt = \\"[version-independent]\\"}}{\\\\strong{[version-independent]}}" } else { "\\\\link[val.meter:tags]{\\\\ifelse{html}{\\\\figure{badge-adoption-x-flat-square-blue.svg}{options: alt = \\"[adoption]\\"}}{\\\\strong{[adoption]}}}\\n\\\\link[val.meter:tags]{\\\\ifelse{html}{\\\\figure{badge-transient-x-flat-square-blue.svg}{options: alt = \\"[transient]\\"}}{\\\\strong{[transient]}}}\\n\\\\link[val.meter:tags]{\\\\ifelse{html}{\\\\figure{badge-version--independent-x-flat-square-blue.svg}{options: alt = \\"[version-independent]\\"}}{\\\\strong{[version-independent]}}}" }} } \subsection{Has Recognized Source Repository}{ -\code{} Indicates whether the package has a source code repository on a recognized hosting platform from an allow-list of known domains. Inferred from the URL and BugReports fields in the DESCRIPTION file. See ?options for customizing the allow-list. +\code{} Indicates whether the package has a source code repository on a recognized hosting platform from an allow-list of known domains. 
Inferred from the \acronym{URL} and \code{BugReports} fields in the \code{DESCRIPTION} file. See \code{?options} for customizing the allow-list.