Merge pull request #75 from oxford-pharmacoepi/concept_cohort_set

concept function
OHDSI · Apr 11, 2024 · bcf69bf · bcf69bf
2 parents 8f05e80 + 834e456
commit bcf69bf
Show file tree

Hide file tree

Showing 15 changed files with 567 additions and 109 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -8,3 +8,4 @@
 ^_pkgdown\.yml$
 ^doc$
 ^Meta$
+^data-raw$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,7 +9,9 @@ Authors@R: c(
     person("Marta", "Alcalde", , "marta.alcaldeherraiz@ndorms.ox.ac.uk", 
     role = c("aut"), comment = c(ORCID = "0000-0000-0000-0000")),
     person("Yuchen", "Guo", email = "yuchen.guo@ndorms.ox.ac.uk", 
-    role = c("aut"), comment = c(ORCID = "0000-0002-0847-4855"))
+    role = c("aut"), comment = c(ORCID = "0000-0002-0847-4855")),
+    person("Nuria", "Mercade-Besora", , "nuria.mercadebesora@ndorms.ox.ac.uk", 
+    role = c("ctb"), comment = c(ORCID = "0009-0006-7948-3747"))
     )
 Description: This package aims to provide functionalities to manipulate
     and evaluate cohorts in data mapped to the Observational Medical 
@@ -47,3 +49,5 @@ Config/testthat/parallel: true
 VignetteBuilder: knitr
 Remotes:
     darwin-eu-dev/PatientProfiles
+Depends: 
+    R (>= 4.1)
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,11 +1,13 @@
 # Generated by roxygen2: do not edit by hand
 
 export(attrition)
+export(cohortCodelist)
 export(cohortCount)
+export(conceptCohort)
 export(generateIntersectCohortSet)
-export(generateMatchedCohortSet)
 export(getIdentifier)
 export(joinOverlap)
+export(matchCohort)
 export(requireAge)
 export(requireCohortIntersectFlag)
 export(requireDemographics)
@@ -19,6 +21,7 @@ export(splitOverlap)
 export(trimToDateRange)
 importFrom(magrittr,"%>%")
 importFrom(omopgenerics,attrition)
+importFrom(omopgenerics,cohortCodelist)
 importFrom(omopgenerics,cohortCount)
 importFrom(omopgenerics,settings)
 importFrom(rlang,":=")

diff --git a/R/conceptCohort.R b/R/conceptCohort.R
@@ -0,0 +1,185 @@
+#' Generate a cohort based on a concept set. The current supported domains are:
+#'
+#' @param cdm A cdm_reference object.
+#' @param conceptSet A conceptSet, can either be a list of concepts, a codelist
+#' or a conceptSetExpression (TO DO).
+#' @param name Name of the cohort in the cdm object.
+#'
+#' @export
+#'
+#' @return A cohort_table object.
+#'
+conceptCohort <- function(cdm,
+                          conceptSet,
+                          name) {
+  # initial input validation
+  cdm <- validateCdm(cdm)
+  name <- validateName(name)
+  if (length(conceptSet) == 0) {
+    cli::cli_inform(c("i" = "Empty codelist provided, returning empty cohort"))
+    cdm <- omopgenerics::emptyCohortTable(cdm = cdm, name = name)
+    return(cdm[[name]])
+  }
+  conceptSet <- validateConceptSet(conceptSet)
+
+  # create concept set tibble
+  cohortSet <- dplyr::tibble("cohort_name" = names(conceptSet)) |>
+    dplyr::mutate("cohort_definition_id" = dplyr::row_number())
+  cohortCodelist <- lapply(conceptSet, dplyr::as_tibble) |>
+    dplyr::bind_rows(.id = "cohort_name") |>
+    dplyr::inner_join(cohortSet, by = "cohort_name") |>
+    dplyr::select("cohort_definition_id", "concept_id"  = "value", "codelist_name" = "cohort_name") |>
+    dplyr::mutate("type" = "index event") |>
+    addDomains(cdm)
+
+  ud <- cohortCodelist |>
+    dplyr::group_by(.data$domain_id) |>
+    dplyr::tally() |>
+    dplyr::collect() |>
+    dplyr::filter(!.data$domain_id %in% domainsData$domain_id)
+  for (k in seq_len(nrow(ud))) {
+    cli::cli_inform(c(
+      "x" = "Domain {.strong {ud$domain_id[k]}} ({ud$n[k]} concept{?s}) excluded because it is not supported."
+    ))
+  }
+
+  cohortCodelist <- cohortCodelist |>
+    dplyr::filter(.data$domain_id %in% !!domainsData$domain_id) |>
+    dplyr::compute()
+
+  domains <- cohortCodelist |>
+    dplyr::select("domain_id") |>
+    dplyr::distinct() |>
+    dplyr::pull()
+
+  cohorts <- list()
+  for (k in seq_along(domains)) {
+    domain <- domains[k]
+    table <- domainsData$table[domainsData$domain_id == domain]
+    concept <- domainsData$concept[domainsData$domain_id == domain]
+    start <- domainsData$start[domainsData$domain_id == domain]
+    end <- domainsData$end[domainsData$domain_id == domain]
+    n <- cohortCodelist |>
+      dplyr::filter(.data$domain_id %in% .env$domain) |>
+      dplyr::tally() |>
+      dplyr::pull()
+    if (table %in% names(cdm)) {
+      cli::cli_inform(c(
+        "i" = "Subsetting table {.strong {table}} using {n} concept{?s} with domain: {.strong {domain}}."
+      ))
+      cohorts[[k]] <- cdm[[table]] |>
+        dplyr::select(
+          "subject_id" = "person_id",
+          "concept_id" = dplyr::all_of(concept),
+          "cohort_start_date" = dplyr::all_of(start),
+          "cohort_end_date" = dplyr::all_of(end)
+        ) |>
+        dplyr::inner_join(
+          cohortCodelist |>
+            dplyr::filter(.data$domain_id %in% .env$domain) |>
+            dplyr::select("concept_id", "cohort_definition_id"),
+          by = "concept_id"
+        )
+    } else {
+      cli::cli_inform(c(
+        "x" = "Domain {.strong {domain}} ({n} concept{?s}) excluded because table {table} is not present in the cdm."
+      ))
+    }
+  }
+
+  if (length(cohorts) == 0) {
+    cli::cli_inform(c("i" = "No table could be subsetted, returning empty cohort."))
+    cdm <- omopgenerics::emptyCohortTable(cdm = cdm, name = name)
+    cdm[[name]] <- cdm[[name]] |>
+      omopgenerics::newCohortTable(
+        cohortSetRef = cohortSet,
+        cohortAttritionRef = NULL,
+        cohortCodelistRef = NULL
+      )
+    return(cdm[[name]])
+  }
+
+  cli::cli_inform(c("i" = "Subsetting tables."))
+  cohort <- cohorts[[1]]
+  if (length(cohorts) > 1) {
+    for (k in 2:length(cohorts)) {
+      cohort <- cohort |> dplyr::union_all(cohorts[[k]])
+    }
+  }
+  cohort <- cohort |>
+    dplyr::compute(name = name, temporary = FALSE)
+
+  cli::cli_inform(c("i" = "Collapsing records."))
+  # assign to cdm so we keep class, to be removed when https://github.com/darwin-eu-dev/omopgenerics/issues/256
+  cdm[[name]] <- cohort |>
+    collapseGap(gap = 0)
+  cohort <- cdm[[name]] |>
+    dplyr::compute(name = name, temporary = FALSE)
+
+  cli::cli_inform(c("i" = "Creating cohort attributes."))
+  cdm[[name]] <- cohort |>
+    omopgenerics::newCohortTable(
+      cohortSetRef = cohortSet,
+      cohortAttritionRef = NULL,
+      cohortCodelistRef = cohortCodelist |>
+        dplyr::select(-"domain_id") |>
+        dplyr::collect()
+    )
+
+  cli::cli_inform(c("v" = "Cohort {.strong {name}} created."))
+
+  return(cdm[[name]])
+}
+
+addDomains <- function(cohortCodelist, cdm) {
+  # insert table as temporary
+  tmpName <- omopgenerics::uniqueTableName()
+  cdm <- omopgenerics::insertTable(
+    cdm = cdm, name = tmpName, table = cohortCodelist
+  )
+  cdm[[tmpName]] <- cdm[[tmpName]] |> dplyr::compute()
+
+  cohortCodelist <- cdm[["concept"]] |>
+    dplyr::select("concept_id", "domain_id") |>
+    dplyr::right_join(cdm[[tmpName]], by = "concept_id") |>
+    dplyr::mutate("domain_id" = tolower(.data$domain_id)) |>
+    dplyr::compute()
+
+  omopgenerics::dropTable(cdm = cdm, name = tmpName)
+
+  return(cohortCodelist)
+}
+collapseGap <- function(cohort, gap) {
+  start <- cohort |>
+    dplyr::select(
+      "cohort_definition_id", "subject_id", "date" = "cohort_start_date"
+    ) |>
+    dplyr::mutate("date_id" = -1)
+  end <- cohort |>
+    dplyr::select(
+      "cohort_definition_id", "subject_id", "date" = "cohort_end_date"
+    ) |>
+    dplyr::mutate("date_id" = 1)
+  start |>
+    dplyr::union_all(end) |>
+    dplyr::group_by(.data$cohort_definition_id, .data$subject_id) |>
+    dplyr::arrange(.data$date, .data$date_id) |>
+    dplyr::mutate("cum_id" = cumsum(.data$date_id)) |>
+    dplyr::filter(
+      .data$cum_id == 0 | (.data$cum_id == -1 & .data$date_id == -1)
+    ) |>
+    dplyr::mutate(
+      "name" = dplyr::if_else(
+        .data$date_id == -1, "cohort_start_date", "cohort_end_date"
+      ),
+      "era_id" = dplyr::if_else(.data$date_id == -1, 1, 0)
+    ) |>
+    dplyr::mutate("era_id" = cumsum(as.numeric(.data$era_id))) |>
+    dplyr::ungroup() |>
+    dplyr::arrange() |>
+    dplyr::select(
+      "cohort_definition_id", "subject_id", "era_id", "name", "date"
+    ) |>
+    tidyr::pivot_wider(names_from = "name", values_from = "date") |>
+    dplyr::select(-"era_id")
+}
diff --git a/R/generateMatchedCohortSet.R → R/matchCohort.R b/R/generateMatchedCohortSet.R → R/matchCohort.R
@@ -22,21 +22,21 @@
 #' library(dplyr)
 #' cdm <- mockDrugUtilisation(numberIndividuals = 100)
 #' cdm <- cdm %>%
-#'   generateMatchedCohortSet(name = "new_matched_cohort",
+#'   matchCohort(name = "new_matched_cohort",
 #'                            targetCohortName = "cohort1",
 #'                            targetCohortId = c(1,2),
 #'                            matchSex = TRUE,
 #'                            matchYearOfBirth = TRUE,
 #'                            ratio = 2)
 #' cdm$new_matched_cohort
 #'
-generateMatchedCohortSet <- function(cdm,
-                                     name,
-                                     targetCohortName,
-                                     targetCohortId = NULL,
-                                     matchSex = TRUE,
-                                     matchYearOfBirth = TRUE,
-                                     ratio = 1){
+matchCohort <- function(cdm,
+                        name,
+                        targetCohortName,
+                        targetCohortId = NULL,
+                        matchSex = TRUE,
+                        matchYearOfBirth = TRUE,
+                        ratio = 1){
   cli::cli_inform("Starting matching")
 
   # validate initial input

diff --git a/R/reexports-omopgenerics.R b/R/reexports-omopgenerics.R
@@ -9,3 +9,7 @@ omopgenerics::settings
 #' @importFrom omopgenerics attrition
 #' @export
 omopgenerics::attrition
+
+#' @importFrom omopgenerics cohortCodelist
+#' @export
+omopgenerics::cohortCodelist
diff --git a/R/sysdata.rda b/R/sysdata.rda
diff --git a/R/validateFunctions.R b/R/validateFunctions.R
@@ -2,6 +2,14 @@ validateCDM <- function(cdm) {
   if (!isTRUE(inherits(cdm, "cdm_reference"))) {
     cli::cli_abort("cohort must be part of a cdm reference")
   }
+  return(invisible(cdm))
+}
+
+validateCdm <- function(cdm) {
+  if (!isTRUE(inherits(cdm, "cdm_reference"))) {
+    cli::cli_abort("cdm must be a cdm_reference object.")
+  }
+  return(invisible(cdm))
 }
 
 validateCohortTable <- function(cohort) {
@@ -46,3 +54,23 @@ validateDateRange<-function(dateRange){
   }
   return(invisible(dateRange))
 }
+
+validateName <- function(name) {
+  em <- c(
+    "x" = "{name} it is not a valida value for name.",
+    "i" = "It must be:",
+    "*" = "lowercase character vector of length 1",
+    "*" = "NA or NULL values are not allowed"
+  )
+  if (!is.character(name) | length(name) != 1 | is.na(name)) {
+    cli::cli_abort(em)
+  }
+  if (tolower(name) != name){
+    cli::cli_abort(em)
+  }
+  return(invisible(name))
+}
+
+validateConceptSet <- function(conceptSet) {
+  omopgenerics::newCodelist(conceptSet)
+}
diff --git a/data-raw/domainsData.R b/data-raw/domainsData.R
@@ -0,0 +1,14 @@
+## code to prepare `DATASET` dataset goes here
+
+domainsData <- dplyr::tribble(
+  ~"domain_id",   ~"table",              ~"concept",               ~"start",                     ~"end",
+  "drug",        "drug_exposure",        "drug_concept_id",        "drug_exposure_start_date",   "drug_exposure_end_date",
+  "condition",   "condition_occurrence", "condition_concept_id",   "condition_start_date",       "condition_end_date",
+  "procedure",   "procedure_occurrence", "procedure_concept_id",   "procedure_date",             "procedure_date",
+  "observation", "observation",          "observation_concept_id", "observation_date",           "observation_date",
+  "measurement", "measurement",          "measurement_concept_id", "measurement_date",           "measurement_date",
+  "visit",       "visit_occurrence",     "visit_concept_id",       "visit_start_date",           "visit_end_date",
+  "device",      "device_exposure",      "device_concept_id",      "device_exposure_start_date", "device_exposure_end_date"
+)
+
+usethis::use_data(domainsData, internal = TRUE, overwrite = TRUE)
diff --git a/man/CohortConstructor-package.Rd b/man/CohortConstructor-package.Rd
diff --git a/man/conceptCohort.Rd b/man/conceptCohort.Rd
diff --git a/man/generateMatchedCohortSet.Rd → man/matchCohort.Rd b/man/generateMatchedCohortSet.Rd → man/matchCohort.Rd
diff --git a/man/reexports.Rd b/man/reexports.Rd