diff --git a/R/intersectCohorts.R b/R/intersectCohorts.R index fba18fbd..3ae6d025 100644 --- a/R/intersectCohorts.R +++ b/R/intersectCohorts.R @@ -10,7 +10,7 @@ #' @inheritParams cohortIdSubsetDoc #' @inheritParams gapDoc #' @inheritParams nameDoc -#' @param mutuallyExclusive Whether the generated cohorts are mutually +#' @param returnNonOverlappingCohorts Whether the generated cohorts are mutually #' exclusive or not. #' @param keepOriginalCohorts If TRUE the original cohorts and the newly #' created intersection cohort will be returned. If FALSE only the new cohort @@ -37,7 +37,7 @@ intersectCohorts <- function(cohort, cohortId = NULL, gap = 0, - mutuallyExclusive = FALSE, + returnNonOverlappingCohorts = FALSE, keepOriginalCohorts = FALSE, name = tableName(cohort)) { # checks @@ -47,38 +47,39 @@ intersectCohorts <- function(cohort, validateCDM(cdm) ids <- omopgenerics::settings(cohort)$cohort_definition_id cohortId <- validateCohortId(cohortId, settings(cohort)) - assertNumeric(gap, - integerish = TRUE, - min = 0, - length = 1) - assertLogical(mutuallyExclusive, length = 1) + assertNumeric(gap, integerish = TRUE, min = 0, length = 1) + assertLogical(returnNonOverlappingCohorts, length = 1) assertLogical(keepOriginalCohorts, length = 1) if (length(cohortId) < 2) { cli::cli_abort("Settings of cohort table must contain at least two cohorts.") } - # generate cohort + uniquePrefix <- omopgenerics::tmpPrefix() + if (keepOriginalCohorts) { + originalNm <- omopgenerics::uniqueTableName(prefix = uniquePrefix) + originalCohorts <- subsetCohorts( + cohort = cohort, + cohortId = cohortId, + name = originalNm + ) + } - tblName <- omopgenerics::uniqueTableName() + # get intersections between cohorts + tblName <- omopgenerics::uniqueTableName(prefix = uniquePrefix) cohortOut <- cohort %>% dplyr::filter(.data$cohort_definition_id %in% .env$cohortId) %>% dplyr::select(-"cohort_definition_id") %>% - splitOverlap(by = "subject_id", - name = tblName, - tmp = paste0(tblName, "_tmp_")) - CDMConnector::dropTable(cdm, - name = dplyr::starts_with(paste0(tblName, "_tmp_"))) - - cohortOut <- cohortOut |> + splitOverlap(by = "subject_id", name = tblName, tmp = paste0(tblName)) |> PatientProfiles::addCohortIntersectFlag( targetCohortTable = omopgenerics::tableName(cohort), targetCohortId = cohortId, window = c(0, 0), - nameStyle = "{cohort_name}" + nameStyle = "{cohort_name}", + name = tblName ) - # create cohort_definition_id + # create intersect cohort set cohortNames <- omopgenerics::settings(cohort) %>% dplyr::filter(.data$cohort_definition_id %in% .env$cohortId) %>% dplyr::pull("cohort_name") @@ -86,158 +87,125 @@ intersectCohorts <- function(cohort, names(x) <- cohortNames cohSet <- expand.grid(x) %>% dplyr::as_tibble() %>% - dplyr::filter(dplyr::if_any(dplyr::everything(), ~ . != 0)) %>% - addNames() %>% - dplyr::mutate(cohort_definition_id = as.integer(dplyr::row_number())) - - if (!mutuallyExclusive) { - dic <- cohSet %>% - dplyr::mutate(cohort_definition_id = as.integer(dplyr::row_number())) %>% - dplyr::select("cohort_name", "cohort_definition_id") - cohSet <- cohSet %>% - dplyr::select(-"cohort_name", -"cohort_definition_id") %>% - notMutuallyEclusiveCohortSet() %>% - dplyr::inner_join(dic, by = "cohort_definition_id") - } - - individualId <- cohSet %>% - dplyr::rowwise() %>% - dplyr::mutate(sum = sum(dplyr::c_across(-dplyr::all_of( - c("cohort_definition_id", "cohort_name") - )), na.rm = TRUE)) %>% - dplyr::filter(.data$sum == 1) %>% - dplyr::pull("cohort_definition_id") - if (keepOriginalCohorts) { + dplyr::filter(dplyr::if_any(dplyr::everything(), ~ . != 0)) |> + addNames() |> + dplyr::mutate(cohort_definition_id = as.integer(dplyr::row_number())) |> + dplyr::rowwise() |> + dplyr::mutate( + sum = sum(dplyr::c_across(-dplyr::all_of( + c("cohort_definition_id", "cohort_name") + )), na.rm = TRUE), + gap = gap + ) |> + dplyr::ungroup() |> + dplyr::left_join( + settings(cohort) |> + dplyr::select("id_in" = "cohort_definition_id", "cohort_name"), + by = "cohort_name" + ) + # filter to cohorts of interest + if (returnNonOverlappingCohorts) { + cohSet <- cohSet |> + dplyr::filter(.data$sum == 1 | .data$sum == length(.env$cohortId)) |> + dplyr::mutate( + cohort_name = dplyr::if_else(.data$sum == 1, paste0("only_in_", .data$cohort_name), .data$cohort_name), + non_overlapping = dplyr::if_else(.data$sum == 1, TRUE, NA) + ) + } else { cohSet <- cohSet |> - dplyr::filter(!.data$cohort_definition_id %in% .env$individualId) %>% - dplyr::group_by(.data$cohort_name) %>% - dplyr::mutate(cohort_definition_id = dplyr::cur_group_id()) %>% - dplyr::ungroup() + dplyr::filter(.data$sum == length(.env$cohortId)) } + # reset cohort ids + cohSet <- cohSet |> + dplyr::arrange(dplyr::desc(.data$sum)) |> + dplyr::mutate(cohort_definition_id = as.integer(dplyr::row_number())) - ## intersect cohort - tempName <- omopgenerics::uniqueTableName() + # intersect cohort + setName <- omopgenerics::uniqueTableName(prefix = uniquePrefix) cdm <- omopgenerics::insertTable( cdm = cdm, - name = tempName, - table = cohSet %>% - dplyr::left_join( - settings(cohort) %>% - dplyr::select("id_in" = "cohort_definition_id", "cohort_name"), - by = "cohort_name" - ) + name = setName, + table = cohSet ) - - if (!keepOriginalCohorts && !mutuallyExclusive && gap < 1) { - nameComputing <- omopgenerics::uniqueTableName() - # if not mutually exclusive --> cohorts in = individual cohorts out: - # cohort in cannot be recover after splitting (if joinOverlap with gap = 1 is - # done we might be joining different input entries) - cohortOut <- cohortOut %>% - dplyr::inner_join(cdm[[tempName]] |> - dplyr::filter(!.data$cohort_definition_id %in% .env$individualId), - by = cohortNames) %>% - dplyr::select("cohort_definition_id", - "subject_id", - "cohort_start_date", - "cohort_end_date") %>% - dplyr::union_all( - cohort %>% - dplyr::rename("id_in" = "cohort_definition_id") %>% - dplyr::inner_join( - cdm[[tempName]] %>% - dplyr::filter(.data$cohort_definition_id %in% .env$individualId), - by = "id_in" - ) %>% - dplyr::select( - "cohort_definition_id", - "subject_id", - "cohort_start_date", - "cohort_end_date" - ) |> - dplyr::compute(name = nameComputing, temporary = FALSE) - ) %>% - dplyr::compute(name = name, temporary = FALSE) - cdm <- omopgenerics::dropTable(cdm, name = nameComputing) - } else { - cohortOut <- cohortOut %>% - dplyr::inner_join(cdm[[tempName]], by = cohortNames) %>% - dplyr::select("cohort_definition_id", - "subject_id", - "cohort_start_date", - "cohort_end_date") %>% - dplyr::compute(name = name, temporary = FALSE) - } - cdm <- omopgenerics::dropTable(cdm = cdm, name = tempName) - + cohortOut <- cohortOut %>% + dplyr::inner_join(cdm[[setName]], by = cohortNames) %>% + dplyr::select("cohort_definition_id", "subject_id", + "cohort_start_date", "cohort_end_date") %>% + dplyr::compute(name = name, temporary = FALSE) if (cohortOut |> dplyr::tally() |> dplyr::pull("n") > 0) { cohortOut <- cohortOut %>% dplyr::compute(name = name, temporary = FALSE) |> joinOverlap(name = name, gap = gap) } - if (!mutuallyExclusive) { - cohSet <- cohSet %>% - dplyr::group_by(.data$cohort_definition_id, .data$cohort_name) %>% - dplyr::mutate(dplyr::across( - dplyr::everything(), - ~ dplyr::if_else(dplyr::n_distinct(.x) == 1, 1, 0) - )) %>% - dplyr::ungroup() %>% - dplyr::distinct() - } - - # attrition + # attributes counts <- cohortOut |> dplyr::group_by(.data$cohort_definition_id) |> dplyr::summarise( - number_records = as.integer(dplyr::n()), - number_subjects = as.integer(dplyr::n_distinct(.data$subject_id)) + number_records = dplyr::n() |> as.integer(), + number_subjects = dplyr::n_distinct(.data$subject_id) |> as.integer() ) |> dplyr::collect() |> - dplyr::right_join(cohSet |> - dplyr::select("cohort_definition_id"), by = "cohort_definition_id") |> - dplyr::mutate(dplyr::across( - dplyr::starts_with("number"), - ~ dplyr::if_else(is.na(.x), 0L, as.integer(.x)) - )) - cohAtt <- intersectCohortAttrition(cohort, cohSet, counts, keepOriginalCohorts, mutuallyExclusive) - - # concept codelists - codelist <- attr(cohort, "cohort_codelist") - codelist <- cohSet |> - dplyr::select(dplyr::all_of(c(cohortNames, "cohort_definition_id"))) |> - tidyr::pivot_longer(cols = dplyr::all_of(cohortNames), names_to = "cohort_name") |> - dplyr::filter(.data$value == 1) |> - dplyr::select("cohort_definition_id", "cohort_name") |> - dplyr::inner_join( - settings(cohort) |> - dplyr::inner_join(codelist, copy = TRUE, by = "cohort_definition_id") |> - dplyr::select(-"cohort_definition_id"), - by = "cohort_name", - relationship = "many-to-many" - ) |> - dplyr::select("cohort_definition_id", - "codelist_name", - "concept_id", - "type") - - cohSet <- cohSet %>% - dplyr::mutate("mutually_exclusive" = mutuallyExclusive, "gap" = gap) %>% - dplyr::relocate(c("cohort_definition_id", "cohort_name")) - + dplyr::arrange(.data$cohort_definition_id) + countsInt <- counts |> dplyr::filter(.data$cohort_definition_id == 1) + intersectCodelist <- attr(cohort, "cohort_codelist") |> + dplyr::filter(.data$cohort_definition_id %in% .env$cohortId) |> + dplyr::mutate(cohort_definition_id = 1L) + intersectAttrition <- dplyr::tibble( + cohort_definition_id = 1L, number_records = countsInt$number_records[1], + number_subjects = countsInt$number_subjects[1], reason_id = 1L, + reason = "Initial qualifying events", excluded_records = 0L, + excluded_subjects = 0L + ) |> + dplyr::mutate(dplyr::across(dplyr::starts_with("number_"), ~dplyr::if_else(is.na(.x), 0L, .x))) + + if (returnNonOverlappingCohorts) { + intersectCodelist <- intersectCodelist |> + dplyr::union_all( + attr(cohort, "cohort_codelist") |> + dplyr::rename("id_in" = "cohort_definition_id") |> + dplyr::inner_join( + cdm[[setName]] |> dplyr::select("cohort_definition_id", "id_in"), + by = "id_in" + ) |> + dplyr::select(!"id_in") + ) + intersectAttrition <- intersectAttrition |> + dplyr::union_all( + attrition(cohort) |> + dplyr::rename("id_in" = "cohort_definition_id") |> + dplyr::inner_join( + cohSet |> dplyr::select("cohort_definition_id", "id_in"), + by = "id_in" + ) |> + dplyr::select(!"id_in") |> + addAttritionReason( + counts = counts |> dplyr::filter(.data$cohort_definition_id != 1), + reason = "Trim to non overlapping entries" + ) + ) + } + cohSet <- cohSet |> + dplyr::select(dplyr::any_of(c( + "cohort_definition_id", "cohort_name", "gap", "non_overlapping", cohortNames + ))) - cohortOut <- omopgenerics::newCohortTable( + # intersect cohort + cdm[[name]] <- omopgenerics::newCohortTable( table = cohortOut, cohortSetRef = cohSet, - cohortAttritionRef = cohAtt, - cohortCodelistRef = codelist, - .softValidation = TRUE + cohortAttritionRef = intersectAttrition, + cohortCodelistRef = intersectCodelist, + .softValidation = FALSE ) - CDMConnector::dropTable(cdm, name = tblName) + if (keepOriginalCohorts) { + cdm <- bind(cdm[[name]], originalCohorts, name = name) + } - return(cohortOut) + CDMConnector::dropTable(cdm, name = dplyr::starts_with(uniquePrefix)) + + return(cdm[[name]]) } #' To split overlapping periods in non overlapping period. @@ -413,7 +381,7 @@ joinOverlap <- function(cohort, return(x) } -#' Join all periods into single periods. +#' Join all periods into single periods (joinOverlap with gap = Inf). #' #' @param x Table in the cdm. #' @param startDate Column that indicates the start of periods. @@ -508,81 +476,9 @@ addNames <- function(cs) { } return(cs) } -notMutuallyEclusiveCohortSet <- function(cs) { - logic <- cs %>% - dplyr::mutate(cohort_definition_id = dplyr::row_number()) %>% - tidyr::pivot_longer(!"cohort_definition_id") %>% - dplyr::filter(.data$value == 1) - cohset <- list() - for (k in logic$cohort_definition_id) { - logi <- logic %>% - dplyr::filter(.data$cohort_definition_id == .env$k) %>% - tidyr::pivot_wider() - cohset[[k]] <- cs %>% - dplyr::inner_join(logi, by = colnames(logi)[colnames(logi) != "cohort_definition_id"]) - } - cs <- dplyr::bind_rows(cohset) - return(cs) -} - -intersectCohortAttrition <- function(cohort, - cohortSet, - counts, - keepOriginalCohorts, - mutuallyExclusive) { - # attrition - # intersect cohorts - intersectId <- cohortSet |> - dplyr::rowwise() %>% - dplyr::mutate(sum = sum(dplyr::c_across(-dplyr::all_of( - c("cohort_definition_id", "cohort_name") - )), na.rm = TRUE)) %>% - dplyr::filter(.data$sum > 1) |> - dplyr::pull("cohort_definition_id") - cohAtt <- counts |> - dplyr::filter(.data$cohort_definition_id %in% .env$intersectId) |> - dplyr::mutate( - "reason_id" = 1L, - "reason" = "Initial qualifying events", - "excluded_records" = 0L, - "excluded_subjects" = 0L - ) - if (!keepOriginalCohorts) { - # individual cohorts - individualId <- cohortSet$cohort_definition_id[!cohortSet$cohort_definition_id %in% intersectId] - cohAtt <- cohAtt |> - dplyr::union_all( - cohortSet |> - dplyr::inner_join( - omopgenerics::attrition(cohort) |> - dplyr::inner_join( - omopgenerics::settings(cohort) |> - dplyr::select("cohort_definition_id", "cohort_name"), - by = "cohort_definition_id" - ) |> - dplyr::select(-"cohort_definition_id"), - by = "cohort_name" - ) |> - dplyr::select(dplyr::all_of( - omopgenerics::cohortColumns("cohort_attrition") - )) - ) - if (mutuallyExclusive) { - cohAtt <- cohAtt %>% - addAttritionReason(counts = counts, - ids = individualId, - reason = "Mutually exclusive cohorts") - } - } - cohAtt <- cohAtt |> - dplyr::select(dplyr::all_of(omopgenerics::cohortColumns("cohort_attrition"))) |> - dplyr::arrange(.data$cohort_definition_id, .data$reason_id) - return(cohAtt) -} -getPriorCohortCount <- function(attr, ids) { +getPriorCohortCount <- function(attr) { attr |> - dplyr::filter(.data$cohort_definition_id %in% ids) |> dplyr::group_by(.data$cohort_definition_id) |> dplyr::filter(.data$reason_id == max(.data$reason_id)) |> dplyr::summarise( @@ -592,19 +488,24 @@ getPriorCohortCount <- function(attr, ids) { ) } -addAttritionReason <- function(att, counts, ids, reason) { +addAttritionReason <- function(att, counts, reason) { + counts <- att |> + dplyr::distinct(.data$cohort_definition_id) |> + dplyr::left_join(counts, by = "cohort_definition_id") |> + dplyr::mutate( + dplyr::across(dplyr::starts_with("number_"), ~dplyr::if_else(is.na(.x), 0L, .x)) + ) dplyr::bind_rows( att |> dplyr::select(dplyr::all_of( omopgenerics::cohortColumns("cohort_attrition") )), counts |> - dplyr::filter(.data$cohort_definition_id %in% ids) |> dplyr::mutate(dplyr::across( dplyr::all_of(c("number_records", "number_subjects")), ~ dplyr::if_else(is.na(.x), as.integer(0), as.integer(.x)) )) |> - dplyr::inner_join(att |> getPriorCohortCount(ids), by = "cohort_definition_id") |> + dplyr::inner_join(att |> getPriorCohortCount(), by = "cohort_definition_id") |> dplyr::mutate( "excluded_records" = .data$previous_number_records - .data$number_records, "excluded_subjects" = .data$previous_number_subjects - .data$number_subjects @@ -612,7 +513,6 @@ addAttritionReason <- function(att, counts, ids, reason) { dplyr::inner_join( att |> dplyr::filter( - .data$cohort_definition_id %in% ids, .data$reason_id == max(.data$reason_id) ) |> dplyr::select("cohort_definition_id", "reason_id") |> diff --git a/man/intersectCohorts.Rd b/man/intersectCohorts.Rd index e797e94b..bf420fd7 100644 --- a/man/intersectCohorts.Rd +++ b/man/intersectCohorts.Rd @@ -9,7 +9,7 @@ intersectCohorts( cohort, cohortId = NULL, gap = 0, - mutuallyExclusive = FALSE, + returnNonOverlappingCohorts = FALSE, keepOriginalCohorts = FALSE, name = tableName(cohort) ) @@ -24,7 +24,7 @@ removed from the cohort set.} \item{gap}{Number of days between two subsequent cohort entries to be merged in a single cohort record.} -\item{mutuallyExclusive}{Whether the generated cohorts are mutually +\item{returnNonOverlappingCohorts}{Whether the generated cohorts are mutually exclusive or not.} \item{keepOriginalCohorts}{If TRUE the original cohorts and the newly diff --git a/tests/testthat/test-intersectCohorts.R b/tests/testthat/test-intersectCohorts.R index 9ab0e932..7ad864d9 100644 --- a/tests/testthat/test-intersectCohorts.R +++ b/tests/testthat/test-intersectCohorts.R @@ -126,16 +126,16 @@ test_that("intersectCohorts", { omock::mockCohort(name = c("cohort1"), numberCohorts = 2) cdm <- cdm_local |> copyCdm() - # mutually exclusive + # returnNonOverlappingCohorts expect_no_error(cdm$cohort2 <- intersectCohorts( cohort = cdm$cohort1, name = "cohort2", - mutuallyExclusive = TRUE + returnNonOverlappingCohorts = TRUE )) - expect_true(all(omopgenerics::settings(cdm$cohort2)$mutually_exclusive == TRUE)) + expect_equal(omopgenerics::settings(cdm$cohort2)$non_overlapping,c(NA, TRUE, TRUE)) expect_true(cdm$cohort2 %>% dplyr::tally() %>% dplyr::pull() == 10) expect_true(all( CDMConnector::cohortCount(cdm$cohort2) %>% - dplyr::arrange(.data$cohort_definition_id) %>% + dplyr::arrange(.data$number_records) %>% dplyr::pull("number_records") == c(1, 4, 5) )) expect_true(nrow(omopgenerics::settings(cdm$cohort2)) == 3) @@ -147,55 +147,59 @@ test_that("intersectCohorts", { "2003-06-15", "2005-11-24", "2015-03-05", "2015-03-25", "2015-04-15") )) expect_true(all( - omopgenerics::attrition(cdm$cohort2)$reason == - c("Initial qualifying events", "Mutually exclusive cohorts", - "Initial qualifying events", "Mutually exclusive cohorts", - "Initial qualifying events") + omopgenerics::attrition(cdm$cohort2)$reason |> sort() == + c("Initial qualifying events", "Initial qualifying events", + "Initial qualifying events", "Trim to non overlapping entries", + "Trim to non overlapping entries") )) expect_true(all( - omopgenerics::attrition(cdm$cohort2)$reason_id == c(1, 2, 1, 2, 1) + omopgenerics::attrition(cdm$cohort2)$reason_id |> sort() == c(1, 1, 1, 2, 2) )) expect_true(all( - omopgenerics::attrition(cdm$cohort2)$excluded_records == c(0, 3, 0, 0, 0) + omopgenerics::attrition(cdm$cohort2)$excluded_records |> sort() == c(0, 0, 0, 0, 3) )) - # not mutually exclusive and gap + # not overlap, keep original and gap expect_no_error(cdm$cohort3 <- intersectCohorts( cohort = cdm$cohort1, name = "cohort3", - mutuallyExclusive = FALSE, gap = 1 + returnNonOverlappingCohorts = FALSE, gap = 1, + keepOriginalCohorts = TRUE )) - expect_true(all(omopgenerics::settings(cdm$cohort3)$mutually_exclusive == FALSE)) - expect_true(cdm$cohort3 %>% dplyr::tally() %>% dplyr::pull() == 7) + expect_false("non_overlapping" %in% colnames(settings(cdm$cohort3))) + expect_true(cdm$cohort3 %>% dplyr::tally() %>% dplyr::pull() == 10) + expect_equal(collectCohort(cdm$cohort1, 1), collectCohort(cdm$cohort3, 2)) + expect_equal(collectCohort(cdm$cohort1, 2), collectCohort(cdm$cohort3, 3)) expect_true(all( omopgenerics::cohortCount(cdm$cohort3) %>% - dplyr::arrange(.data$cohort_definition_id) %>% - dplyr::pull("number_records") == c(4, 4, 2) + dplyr::pull("number_records") |> sort() == c(2, 4, 4) )) expect_true(all( omopgenerics::cohortCount(cdm$cohort3) %>% - dplyr::arrange(.data$cohort_definition_id) %>% - dplyr::pull("number_subjects") == c(3, 2, 2) + dplyr::pull("number_subjects") |> + sort() == c(2, 2, 3) )) expect_true(nrow(omopgenerics::settings(cdm$cohort3)) == 3) expect_true(all( cdm$cohort3 %>% dplyr::pull("cohort_start_date") %>% sort() == - c("1997-10-22", "2000-06-23", "2001-03-30", "2001-03-30", "2015-03-05", "2015-03-25", "2015-03-25") + c("1997-10-22", "2000-06-23", "2001-03-30", "2001-03-30", "2001-07-16", + "2001-12-04", "2003-06-15", "2015-03-05", "2015-03-25", "2015-03-25") )) expect_true(all( cdm$cohort3 %>% dplyr::pull("cohort_end_date") %>% sort() == - c("1999-05-28", "2005-11-23", "2005-11-23", "2006-09-27", "2015-04-14", "2015-04-14", "2015-07-06") + c("1999-05-28", "2001-07-15", "2001-12-03", "2003-06-14", "2005-11-23", + "2005-11-23", "2006-09-27", "2015-04-14", "2015-04-14", "2015-07-06") )) expect_true(all( omopgenerics::attrition(cdm$cohort3)$reason == c("Initial qualifying events", "Initial qualifying events", "Initial qualifying events") )) expect_true(all(omopgenerics::attrition(cdm$cohort3)$reason_id == c(1, 1, 1))) - expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_records == c(4, 4, 2))) - expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_subjects == c(3, 2, 2))) + expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_records == c(2, 4, 4))) + expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_subjects == c(2, 3, 2))) expect_true(all(omopgenerics::attrition(cdm$cohort3)$excluded_records == c(0, 0, 0))) expect_true(all(omopgenerics::attrition(cdm$cohort3)$excluded_subjects == c(0, 0, 0))) @@ -225,35 +229,35 @@ test_that("keepOriginalCohorts", { cdm$cohort2 <- intersectCohorts( cohort = cdm$cohort1, name = "cohort2", - mutuallyExclusive = FALSE, keepOriginalCohorts = TRUE + returnNonOverlappingCohorts = FALSE, keepOriginalCohorts = TRUE ) - expect_true(nrow(dplyr::collect(cdm$cohort2)) == 0) + expect_true(nrow(dplyr::collect(cdm$cohort2)) == nrow(dplyr::collect(cdm$cohort1))) + expect_equal(collectCohort(cdm$cohort1, 1), collectCohort(cdm$cohort2, 2)) + expect_equal(collectCohort(cdm$cohort1, 2), collectCohort(cdm$cohort2, 3)) expect_true(all( omopgenerics::attrition(cdm$cohort2)$reason == - c("Initial qualifying events") + rep("Initial qualifying events", 3) )) - expect_true(omopgenerics::attrition(cdm$cohort2)$reason_id == 1) - expect_true(omopgenerics::attrition(cdm$cohort2)$number_records == 0) - expect_true(omopgenerics::attrition(cdm$cohort2)$number_subjects == 0) - expect_true(omopgenerics::attrition(cdm$cohort2)$excluded_records == 0) - expect_true(omopgenerics::attrition(cdm$cohort2)$excluded_subjects == 0) + expect_true(all(omopgenerics::attrition(cdm$cohort2)$reason_id == rep(1, 3))) + expect_true(all(omopgenerics::attrition(cdm$cohort2)$number_records == c(0, 4, 4))) + expect_true(all(omopgenerics::attrition(cdm$cohort2)$number_subjects == c(0, 3, 2))) + expect_true(all(omopgenerics::attrition(cdm$cohort2)$excluded_records == rep(0, 3))) + expect_true(all(omopgenerics::attrition(cdm$cohort2)$excluded_subjects == rep(0, 3))) # nUll combination, return individuals cdm$cohort4 <- intersectCohorts( cohort = cdm$cohort1, name = "cohort4", - mutuallyExclusive = FALSE, keepOriginalCohorts = FALSE + returnNonOverlappingCohorts = FALSE, keepOriginalCohorts = FALSE ) - expect_true(nrow(dplyr::collect(cdm$cohort4)) == nrow(dplyr::collect(cdm$cohort1))) - expect_true(all( - omopgenerics::attrition(cdm$cohort4)$reason == - c("Initial qualifying events", "Initial qualifying events", "Initial qualifying events") - )) - expect_true(all(omopgenerics::attrition(cdm$cohort4)$reason_id == c(1, 1, 1))) - expect_true(all(omopgenerics::attrition(cdm$cohort4)$number_records == c(4, 4, 0))) - expect_true(all(omopgenerics::attrition(cdm$cohort4)$number_subjects == c(3, 2, 0))) - expect_true(all(omopgenerics::attrition(cdm$cohort4)$excluded_records == c(0, 0, 0))) - expect_true(all(omopgenerics::attrition(cdm$cohort4)$excluded_subjects == c(0, 0, 0))) - expect_true(nrow(cdm$cohort1 |> dplyr::anti_join(cdm$cohort4, by = colnames(cdm$cohort4)) |> dplyr::collect()) == 0) + expect_true(nrow(dplyr::collect(cdm$cohort4)) == 0) + expect_true(omopgenerics::attrition(cdm$cohort4)$reason == "Initial qualifying events") + expect_true(all(omopgenerics::attrition(cdm$cohort4)$reason_id == 1)) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$number_records == 0)) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$number_subjects == 0)) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$excluded_records == 0)) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$excluded_subjects == 0)) + expect_true(settings(cdm$cohort4)$cohort_name == "cohort_1_cohort_2") + # not null combination cdm_local <- omock::mockCdmReference() |> @@ -263,62 +267,74 @@ test_that("keepOriginalCohorts", { cdm <- cdm_local |> copyCdm() cdm$cohort3 <- intersectCohorts( cohort = cdm$cohort1, name = "cohort3", - mutuallyExclusive = FALSE, keepOriginalCohorts = TRUE, gap = 1 + returnNonOverlappingCohorts = FALSE, keepOriginalCohorts = TRUE, gap = 1 ) + expect_true(all(settings(cdm$cohort3)$cohort_name |> sort() == c( + "cohort_1", "cohort_1_cohort_2_cohort_3", "cohort_2", "cohort_3" + ))) + expect_equal(collectCohort(cdm$cohort1, 1), collectCohort(cdm$cohort3, 2)) + expect_equal(collectCohort(cdm$cohort1, 2), collectCohort(cdm$cohort3, 3)) + expect_equal(collectCohort(cdm$cohort1, 3), collectCohort(cdm$cohort3, 4)) expect_equal( cdm$cohort3 |> - dplyr::collect() %>% - dplyr::arrange(.data$cohort_start_date) %>% + dplyr::filter(.data$cohort_definition_id == 1) |> + dplyr::collect() |> dplyr::pull(.data$cohort_start_date), - as.Date(c("1997-10-22", "2001-03-30", "2015-03-05", "2015-03-25", "2015-03-25", "2015-03-25")) + as.Date("2015-03-25") ) expect_equal( cdm$cohort3 |> - dplyr::collect() %>% - dplyr::arrange(cohort_end_date) %>% + dplyr::filter(.data$cohort_definition_id == 1) |> + dplyr::collect() |> dplyr::pull(cohort_end_date), - as.Date(c("1999-05-28", "2005-11-23", "2015-04-14", "2015-04-14", "2015-04-14", "2015-07-06")) + as.Date("2015-04-14") ) expect_true(nrow(omopgenerics::settings(cdm$cohort3)) == 4) - expect_true(all(omopgenerics::settings(cdm$cohort3)$cohort_1 == c(1, 1, 1, 0))) - expect_true(all(omopgenerics::settings(cdm$cohort3)$cohort_2 == c(1, 1, 0, 1))) - expect_true(all(omopgenerics::settings(cdm$cohort3)$cohort_3 == c(0, 1, 1, 1))) - expect_false(any(omopgenerics::settings(cdm$cohort3)$mutually_exclusive)) + expect_equal(omopgenerics::settings(cdm$cohort3)$cohort_1, c(1, NA, NA, NA)) + expect_equal(omopgenerics::settings(cdm$cohort3)$cohort_2, c(1, NA, NA, NA)) + expect_equal(omopgenerics::settings(cdm$cohort3)$cohort_3, c(1, NA, NA, NA)) expect_true(all( omopgenerics::attrition(cdm$cohort3)$reason == c("Initial qualifying events", "Initial qualifying events", "Initial qualifying events", "Initial qualifying events") )) expect_true(all(omopgenerics::attrition(cdm$cohort3)$reason_id == c(1, 1, 1, 1))) - expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_records == c(2, 1, 2, 1))) - expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_subjects == c(2, 1, 2, 1))) + expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_records == c(1, 4, 4, 4))) + expect_true(all(omopgenerics::attrition(cdm$cohort3)$number_subjects == c(1, 3, 2, 3))) expect_true(all(omopgenerics::attrition(cdm$cohort3)$excluded_records == c(0, 0, 0, 0))) expect_true(all(omopgenerics::attrition(cdm$cohort3)$excluded_subjects == c(0, 0, 0, 0))) cdm$cohort4 <- intersectCohorts( cohort = cdm$cohort1, name = "cohort4", - mutuallyExclusive = TRUE, keepOriginalCohorts = TRUE, gap = 1 + returnNonOverlappingCohorts = TRUE, keepOriginalCohorts = TRUE, gap = 1 ) - + expect_true(nrow(settings(cdm$cohort4)) == 7) + expect_equal(settings(cdm$cohort4)$non_overlapping, c(NA, TRUE, TRUE, TRUE, NA, NA, NA)) + expect_equal(collectCohort(cdm$cohort1, 1), collectCohort(cdm$cohort4, 5)) + expect_equal(collectCohort(cdm$cohort1, 2), collectCohort(cdm$cohort4, 6)) + expect_equal(collectCohort(cdm$cohort1, 3), collectCohort(cdm$cohort4, 7)) expect_equal( cdm$cohort4 |> - dplyr::collect() %>% - dplyr::arrange(.data$cohort_start_date) %>% - dplyr::pull(.data$cohort_start_date), - as.Date(c("1997-10-22", "2001-03-30", "2015-03-05", "2015-03-25", "2015-04-15")) + dplyr::filter(.data$cohort_definition_id %in% 1:4) |> + dplyr::collect() |> + dplyr::pull(.data$cohort_start_date) |> + sort(), + as.Date(c('1994-06-17', '1999-05-29', '1999-12-19', '2000-06-23', + '2005-11-24', '2015-01-19', '2015-03-25', '2015-07-07')) ) expect_equal( cdm$cohort4 |> - dplyr::collect() %>% - dplyr::arrange(cohort_end_date) %>% - dplyr::pull(cohort_end_date), - as.Date(c("1999-05-28", "2005-11-23", "2015-03-24", "2015-04-14", "2015-07-06")) + dplyr::filter(.data$cohort_definition_id %in% 1:4) |> + dplyr::collect() |> + dplyr::pull(cohort_end_date) |> sort(), + as.Date(c('1997-10-21', '2001-03-29', '2001-08-26', '2006-09-27', + '2007-08-06', '2015-03-04', '2015-04-14', '2015-09-14')) ) - expect_true(nrow(omopgenerics::settings(cdm$cohort4)) == 4) - expect_true(all(omopgenerics::settings(cdm$cohort4)$cohort_1 == c(1, 1, 1, 0))) - expect_true(all(omopgenerics::settings(cdm$cohort4)$cohort_2 == c(1, 1, 0, 1))) - expect_true(all(omopgenerics::settings(cdm$cohort4)$cohort_3 == c(0, 1, 1, 1))) - expect_true(all(omopgenerics::settings(cdm$cohort4)$mutually_exclusive)) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$reason_id |> sort() == c(rep(1, 7), rep(2, 3)))) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$number_records |> sort() == c(0, 1, 2, rep(4, 6), 5))) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$number_subjects |> sort() == c(0, 1, 1, 2, 2, rep(3, 5)))) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$excluded_records |> sort() == c(-1, rep(0, 7), 2, 4))) + expect_true(all(omopgenerics::attrition(cdm$cohort4)$excluded_subjects |> sort() == c(rep(0, 8), 1, 3))) PatientProfiles::mockDisconnect(cdm) }) @@ -340,26 +356,32 @@ test_that("attrition and cohortId", { cdm$cohort1 <- intersectCohorts( cohort = cdm$cohort1, cohortId = c("cohort_1", "cohort_2"), - name = "cohort1", mutuallyExclusive = TRUE + name = "cohort1", returnNonOverlappingCohorts = TRUE, + keepOriginalCohorts = FALSE ) + expect_true(nrow(settings(cdm$cohort1)) == 3) + expect_equal(settings(cdm$cohort1)$non_overlapping, c(NA, TRUE, TRUE)) expect_true(all( - omopgenerics::attrition(cdm$cohort1)$reason == - c("Initial qualifying events", "cohort_start_date after 1990-01-01", "cohort_start_date before 2025-01-01", - "Sex requirement: Female", "Age requirement: 0 to 40", "Mutually exclusive cohorts", - "Initial qualifying events", "cohort_start_date after 1990-01-01", "cohort_start_date before 2025-01-01", - "Sex requirement: Female", "Age requirement: 0 to 40", "Mutually exclusive cohorts", - "Initial qualifying events" ) + omopgenerics::attrition(cdm$cohort1)$reason %in% + c('Initial qualifying events', 'Initial qualifying events', + 'cohort_start_date after 1990-01-01', 'cohort_start_date before 2025-01-01', + 'Sex requirement: Female', 'Age requirement: 0 to 40', + 'Trim to non overlapping entries', 'Initial qualifying events', + 'cohort_start_date after 1990-01-01', 'cohort_start_date before 2025-01-01', + 'Sex requirement: Female', 'Age requirement: 0 to 40', + 'Trim to non overlapping entries') )) - expect_true(all(omopgenerics::attrition(cdm$cohort1)$reason_id == c(1:6, 1:6, 1))) - expect_true(all(omopgenerics::attrition(cdm$cohort1)$number_records == - c(4, 4, 4, 1, 1, 1, 4, 4, 4, 0, 0, 0, 0))) - expect_true(all(omopgenerics::attrition(cdm$cohort1)$number_subjects == - c(3, 3, 3, 1, 1, 1, 2, 2, 2, 0, 0, 0, 0))) - expect_true(all(omopgenerics::attrition(cdm$cohort1)$excluded_records == - c(0, 0, 0, 3, 0, 0, 0, 0, 0, 4, 0, 0, 0))) - expect_true(all(omopgenerics::attrition(cdm$cohort1)$excluded_subjects == - c(0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0))) - expect_true(all(omopgenerics::settings(cdm$cohort1)$cohort_name == c("cohort_1", "cohort_2", "cohort_1_cohort_2"))) + expect_true(all(omopgenerics::attrition(cdm$cohort1)$reason_id == c(1, 1:6, 1:6))) + expect_true(all(omopgenerics::attrition(cdm$cohort1)$number_records |> sort() == + c(0, 0, 0, 0, 1, 1, 1, 4, 4, 4, 4, 4, 4))) + expect_true(all(omopgenerics::attrition(cdm$cohort1)$number_subjects |> sort() == + c(0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3))) + expect_true(all(omopgenerics::attrition(cdm$cohort1)$excluded_records |> sort() == + c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4))) + expect_true(all(omopgenerics::attrition(cdm$cohort1)$excluded_subjects |> sort() == + c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2))) + expect_true(all(omopgenerics::settings(cdm$cohort1)$cohort_name |> sort() == + c("cohort_1_cohort_2", "only_in_cohort_1", "only_in_cohort_2"))) PatientProfiles::mockDisconnect(cdm) }) @@ -403,46 +425,45 @@ test_that("codelist", { cdm$cohort2 <- intersectCohorts(cdm$cohort1, name = "cohort2") expect_true(all( cdm$cohort2 %>% dplyr::pull("cohort_start_date") %>% sort() == - c("2009-12-22", "2010-01-01", "2010-01-11", "2010-05-31", "2012-01-21", - "2012-01-21", "2012-09-27", "2014-02-09", "2014-02-09", "2014-12-06") + c("2012-01-21", "2014-02-09") )) expect_true(all( cdm$cohort2 %>% dplyr::pull("cohort_end_date") %>% sort() == - c("2010-05-04", "2011-08-24", "2012-03-11", "2012-03-11", "2014-02-09", - "2014-03-31", "2014-03-31", "2014-05-20", "2014-12-10", "2015-06-24") + c("2012-03-11", "2014-03-31") )) expect_true(all( - cdm$cohort2 %>% dplyr::pull("subject_id") %>% sort() == c(1, 1, 1, 1, 1, 1, 2, 3, 3, 3) + cdm$cohort2 %>% dplyr::pull("subject_id") %>% sort() == c(1, 1) )) codes <- attr(cdm$cohort2, "cohort_codelist") - expect_true(all(codes |> dplyr::pull("codelist_name") |> sort() == c(rep("c1", 4), rep("c2", 2)))) - expect_true(all(codes |> dplyr::pull("concept_id") |> sort() == c(1, 1, 2, 2, 3, 3))) - expect_true(all(codes |> dplyr::pull("type") |> sort()== rep("index event", 6))) - expect_true(all(codes |> dplyr::pull("cohort_definition_id") |> sort() == c(1, 1, 2, 3, 3, 3))) + expect_true(all(codes |> dplyr::pull("codelist_name") |> sort() == c(rep("c1", 2), "c2"))) + expect_true(all(codes |> dplyr::pull("concept_id") |> sort() == c(1, 2, 3))) + expect_true(all(codes |> dplyr::pull("type") |> sort() == rep("index event", 3))) + expect_true(all(codes |> dplyr::pull("cohort_definition_id") |> sort() == c(1, 1, 1))) # mutually esclusive - cdm$cohort3 <- intersectCohorts(cdm$cohort1, mutuallyExclusive = TRUE, name = "cohort3") + cdm$cohort3 <- intersectCohorts(cdm$cohort1, returnNonOverlappingCohorts = TRUE, name = "cohort3") + expect_equal(collectCohort(cdm$cohort3, 1), collectCohort(cdm$cohort2, 1)) expect_true(all( cdm$cohort3 %>% dplyr::pull("cohort_start_date") %>% sort() == - c("2009-12-22", "2010-01-01", "2010-01-11", "2010-05-31", "2012-01-21", - "2012-03-12", "2012-09-27", "2014-02-09", "2014-04-01", "2014-12-06") + c('2009-12-22', '2010-01-01', '2010-01-11', '2010-05-31', '2012-01-21', + '2012-03-12', '2012-09-27', '2014-02-09', '2014-04-01', '2014-12-06') )) expect_true(all( cdm$cohort3 %>% dplyr::pull("cohort_end_date") %>% sort() == - c("2010-05-04", "2011-08-24", "2012-01-20", "2012-03-11", "2014-02-08", - "2014-02-09", "2014-03-31", "2014-05-20", "2014-12-10", "2015-06-24") + c('2010-05-04', '2011-08-24', '2012-01-20', '2012-03-11', '2014-02-08', + '2014-02-09', '2014-03-31', '2014-05-20', '2014-12-10', '2015-06-24') )) expect_true(all( cdm$cohort3 %>% dplyr::pull("subject_id") %>% sort() == c(1, 1, 1, 1, 1, 1, 2, 3, 3, 3) )) - codes <- attr(cdm$cohort2, "cohort_codelist") + codes <- attr(cdm$cohort3, "cohort_codelist") expect_true(all(codes |> dplyr::pull("codelist_name") |> sort() == c(rep("c1", 4), rep("c2", 2)))) expect_true(all(codes |> dplyr::pull("concept_id") |> sort() == c(1, 1, 2, 2, 3, 3))) expect_true(all(codes |> dplyr::pull("type") |> sort()== rep("index event", 6))) - expect_true(all(codes |> dplyr::pull("cohort_definition_id") |> sort() == c(1, 1, 2, 3, 3, 3))) + expect_true(all(codes |> dplyr::pull("cohort_definition_id") |> sort() == c(1, 1, 1, 2, 2, 3))) # only comb - cdm$cohort4 <- intersectCohorts(cdm$cohort1, keepOriginalCohorts = TRUE, name = "cohort4") + cdm$cohort4 <- intersectCohorts(cdm$cohort1, keepOriginalCohorts = FALSE, name = "cohort4") expect_true(all( cdm$cohort4 %>% dplyr::pull("cohort_start_date") %>% sort() == c("2012-01-21", "2014-02-09") @@ -464,10 +485,10 @@ test_that("codelist", { cdm <- omopgenerics::bind(cdm$cohort, cdm$cohort1, name = "cohort5") cdm$cohort6 <- intersectCohorts(cdm$cohort5, name = "cohort6") codes <- attr(cdm$cohort6, "cohort_codelist") - expect_true(all(codes |> dplyr::pull("codelist_name") |> sort() == c(rep("c1", 8), rep("c2", 4)))) - expect_true(all(codes |> dplyr::pull("concept_id") |> sort() == c(rep(1, 4), rep(2, 4), rep(3, 4)))) - expect_true(all(codes |> dplyr::pull("type") |> sort() == rep("index event", 12))) - expect_true(all(codes |> dplyr::pull("cohort_definition_id") |> sort() == c(2, 2, 3, 3, 4, 5, 6, 6, 6, 7, 7, 7))) + expect_true(all(codes |> dplyr::pull("codelist_name") |> sort() == c(rep("c1", 2), "c2"))) + expect_true(all(codes |> dplyr::pull("concept_id") |> sort() == c(1, 2, 3))) + expect_true(all(codes |> dplyr::pull("type") |> sort() == rep("index event", 3))) + expect_true(all(codes |> dplyr::pull("cohort_definition_id") |> sort() == c(1, 1, 1))) PatientProfiles::mockDisconnect(cdm) }) diff --git a/vignettes/a01_building_base_cohorts.Rmd b/vignettes/a01_building_base_cohorts.Rmd index 37bb1277..93fd1673 100644 --- a/vignettes/a01_building_base_cohorts.Rmd +++ b/vignettes/a01_building_base_cohorts.Rmd @@ -2,7 +2,7 @@ title: "Building base cohorts" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{a01_building_concept_cohorts} + %\VignetteIndexEntry{a01_building_base_cohorts} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---