From 42e88432bf0edccf57b983916ca969b912f4da5e Mon Sep 17 00:00:00 2001 From: Daniel Chudnov Date: Thu, 24 Dec 2020 10:26:09 -0500 Subject: [PATCH] Replaces "Duplicates" with "Extraneous-Same-Day" in exclusion names and code references to "duplicate[s]", closes https://github.com/mitre/growthcleanr/issues/15 --- DESCRIPTION | 2 +- R/growth.R | 252 +++++++++++++++++------------------ README-adjustcarryforward.md | 28 ++-- README.md | 64 ++++----- man/cleangrowth.Rd | 2 +- tests/testthat/test-utils.R | 2 +- 6 files changed, 175 insertions(+), 175 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 208f4f4..c7a0de6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Authors@R: c( person("Campos","Diego",,"camposd@email.chop.edu","aut") ) Maintainer: Robert Grundmeier -Description: Cleans growth data that may contain implausible data based on unit or data range +Description: growthcleanr cleans growth data that may contain implausible data based on unit or data range. Imports: data.table (>= 1.13.0), tidyr (>= 1.1.0), diff --git a/R/growth.R b/R/growth.R index ace5602..afbf75d 100644 --- a/R/growth.R +++ b/R/growth.R @@ -1,10 +1,10 @@ -#' Helper function for cleanbatch to identify subset of observations that are either "included" or a "temporary duplicate" +#' Helper function for cleanbatch to identify subset of observations that are either "included" or a "temporary extraneous" #' #' @keywords internal #' @noRd valid <- function(df, - include.temporary.duplicates = F, - include.duplicates = F, + include.temporary.extraneous = F, + include.extraneous = F, include.carryforward = F) { exclude <- if (is.data.frame(df)) df$exclude @@ -13,9 +13,9 @@ valid <- function(df, return( exclude < 'Exclude' | - include.temporary.duplicates & - exclude == 'Exclude-Temporary-Duplicate' - | include.duplicates & exclude == 'Exclude-Duplicate' + include.temporary.extraneous & + exclude == 'Exclude-Temporary-Extraneous-Same-Day' + | include.extraneous & exclude == 'Exclude-Extraneous-Same-Day' | include.carryforward & exclude == 'Exclude-Carried-Forward' @@ -31,63 +31,63 @@ na_as_false <- function(v) { v } -#' Function for temporary duplicates (step 5): -#' 5. Temporary duplicates: I use duplicates to refer to more than more than one recorded value for a parameter on the same day, +#' Function for temporary extraneous (step 5): +#' 5. Temporary extraneous: I use extraneous to refer to more than more than one recorded value for a parameter on the same day, #' and we need to select which one to include in our analysis. The overall strategy will be to select a measurement using a simple #' strategy that will be used temporarily, and select permanently in a later step after we have a somewhat cleaner dataset that -#' can help us identify the best duplicate. -#' a. For subjects/parameters with duplicates: Determine median_tbc*sd for both parameters: the median tbc*sd for each subject -#' and parameter including only non-duplicate values with exc_*==0. The median of the same parameter as the duplicate will +#' can help us identify the best extraneous. +#' a. For subjects/parameters with extraneous: Determine median_tbc*sd for both parameters: the median tbc*sd for each subject +#' and parameter including only non-extraneous values with exc_*==0. The median of the same parameter as the extraneous will #' be referred to as median_tbc*sd, the median of the other parameter will be referred to as median_tbcOsd. -#' b. For each subject/parameter with duplicates and at least one value for the subject/parameter on a day with no duplicates, -#' select the value closest to the median_tbc*sd for temporary inclusion, and assign all other duplicates exc_*=2. -#' i. For each subject/parameter with duplicates and no values for the subject/parameter on a day with no duplicates, select the -#' value closest to the median_tbcOsd for temporary inclusion, and assign all other duplicates exc_*=2. -#' If median_tbcOsd is missing because there are no values for the other parameter, randomly choose one duplicate value for -#' each subject/parameter/age to keep as exc_*=0 and replace exc_*=2 for all other duplicates for that subject/parameter/age. +#' b. For each subject/parameter with extraneous and at least one value for the subject/parameter on a day with no extraneous, +#' select the value closest to the median_tbc*sd for temporary inclusion, and assign all other extraneous exc_*=2. +#' i. For each subject/parameter with extraneous and no values for the subject/parameter on a day with no extraneous, select the +#' value closest to the median_tbcOsd for temporary inclusion, and assign all other extraneous exc_*=2. +#' If median_tbcOsd is missing because there are no values for the other parameter, randomly choose one extraneous value for +#' each subject/parameter/age to keep as exc_*=0 and replace exc_*=2 for all other extraneous for that subject/parameter/age. #' #' @keywords internal #' @noRd -temporary_duplicates <- function(df) { +temporary_extraneous <- function(df) { # add subjid and param if needed (may be missing depending on where this is called from) if (is.null(df$subjid)) df[, subjid := NA] if (is.null(df$param)) df[, param := NA] - # only operate on valid rows (but include rows that may have previously been flagged as a "temporary duplicate") - valid.rows <- valid(df, include.temporary.duplicates = T) + # only operate on valid rows (but include rows that may have previously been flagged as a "temporary extraneous") + valid.rows <- valid(df, include.temporary.extraneous = T) # make a small copy of df with only the fields we need for efficiency df <- df[j = .(tbc.sd), keyby = .(subjid, param, agedays, index)] # initialize some useful fields df[, `:=`( median.sd = as.double(NA), delta.median.sd = as.double(NA), - duplicates.this.day = F, - duplicate = F + extraneous.this.day = F, + extraneous = F )] - # determine on which days there are duplicate measurements (more than 1 valid measurement on that age day) - df[valid.rows, duplicates.this.day := (.N > 1), by = .(subjid, param, agedays)] - # calculate median of measurements on days where there is no duplicate + # determine on which days there are extraneous measurements (more than 1 valid measurement on that age day) + df[valid.rows, extraneous.this.day := (.N > 1), by = .(subjid, param, agedays)] + # calculate median of measurements on days where there is no extraneous df[valid.rows & - !duplicates.this.day, median.sd := median(tbc.sd), by = .(subjid, param)] - # distribute median to other valid or potential duplicate rows for same parameter for that subject + !extraneous.this.day, median.sd := median(tbc.sd), by = .(subjid, param)] + # distribute median to other valid or potential extraneous rows for same parameter for that subject df[valid.rows, median.sd := sort(median.sd)[1], by = .(subjid, param)] # take care of subject/parameters with more than one day with a valid observation # determine the absolute difference between the measurements sd score and the median for that parameter for each child df[valid.rows & - duplicates.this.day, delta.median.sd := abs(tbc.sd - median.sd), by = .(subjid, param)] - # identify subjects that have duplicates on all days of observation for that parameter (i.e. delta.median.sd undefined) + extraneous.this.day, delta.median.sd := abs(tbc.sd - median.sd), by = .(subjid, param)] + # identify subjects that have duplicates/extraneous on all days of observation for that parameter (i.e. delta.median.sd undefined) subj.all.dups <- df[valid.rows & - duplicates.this.day & + extraneous.this.day & is.na(delta.median.sd), unique(subjid)] df[valid.rows & subjid %in% subj.all.dups, delta.median.sd := (function(subj.df) { # iterate over parameters where delta.median.sd is not yet defined # pass 1: take median from other parameter(s) for (p in subj.df[is.na(delta.median.sd) & - duplicates.this.day, unique(param)]) { + extraneous.this.day, unique(param)]) { median.other.sd <- subj.df[param != p & - !duplicates.this.day, median(tbc.sd)] + !extraneous.this.day, median(tbc.sd)] subj.df[param == p, delta.median.sd := abs(tbc.sd - median.other.sd)] } return(subj.df$delta.median.sd) @@ -95,15 +95,15 @@ temporary_duplicates <- function(df) { # Final pass: take median as zero (i.e. if no measurements from a different parameter) # NOTE -- this is not exactly the same as taking a random parameter df[valid.rows & - duplicates.this.day & + extraneous.this.day & is.na(delta.median.sd), delta.median.sd := abs(tbc.sd)] - # flag any duplicate value on the same day that is not the minimum distance from the median sd score + # flag any extraneous value on the same day that is not the minimum distance from the median sd score # NOTE: in the case of exact ducplicates, "which.min" will pick the first df[valid.rows & - duplicates.this.day, duplicate := seq_along(delta.median.sd) != which.min(delta.median.sd), by = + extraneous.this.day, extraneous := seq_along(delta.median.sd) != which.min(delta.median.sd), by = .(subjid, param, agedays)] - # return the duplicated valid rows (i.e. the ones that should be temporarily excluded) - return(df$duplicate & valid.rows) + # return the extraneous valid rows (i.e. the ones that should be temporarily excluded) + return(df$extraneous & valid.rows) } #' Function to identify switches (step 6): @@ -198,13 +198,13 @@ cleanbatch <- function(data.df, if (!quietly) cat(sprintf( - "[%s] Preliminarily identify potential duplicates...\n", + "[%s] Preliminarily identify potential extraneous...\n", Sys.time() )) - data.df$exclude[temporary_duplicates(data.df)] <- 'Exclude-Temporary-Duplicate' + data.df$exclude[temporary_extraneous(data.df)] <- 'Exclude-Temporary-Extraneous-Same-Day' - # capture a list of subjects with possible duplicates for efficiency later - subj.dup <- data.df[exclude == 'Exclude-Temporary-Duplicate', unique(subjid)] + # capture a list of subjects with possible extraneous for efficiency later + subj.dup <- data.df[exclude == 'Exclude-Temporary-Extraneous-Same-Day', unique(subjid)] # 7a. For each day on which a subject had both a weight and a height recorded, calculate tbc*sd_sw: SD scores as if the weight had been recorded as the height # and the height had been recorded as the weight, recentered using rcsd_*. I intentionally did not allow values that were the first or last for a subject/parameter to be replaced as a switch. @@ -398,7 +398,7 @@ cleanbatch <- function(data.df, # is fairly small, and when this is the case the carried forward measurements provide little new information. # a. Calculate d_prev_wt=wt-wtprev and d_prev_ht=ht-htprev. Use original measurements rather than transformed # measurements (unit errors and switches). - # b. Unlike most steps, do this step for all duplicate values (exc_*==2) in addition to included values (exc_*==0), comparing all values for one day to all + # b. Unlike most steps, do this step for all extraneous values (exc_*==2) in addition to included values (exc_*==0), comparing all values for one day to all # values from the prior day – if there are any values with a d_prev*==0, the value on the latter day should be excluded. # c. Replace exc_*=3 for all values with d_prev*==0 & (exc_*==0 OR exc_*==2) if (!include.carryforward) { @@ -412,13 +412,13 @@ cleanbatch <- function(data.df, data.df[, prev.v := as.double(NaN)] data.df[valid(data.df), prev.v := c(NA, v.orig[-.N]), by = .(subjid, param)] - # optimize "carry forward" for children without duplicates. + # optimize "carry forward" for children without extraneous. data.df[!(subjid %in% subj.dup) & v.orig == prev.v, exclude := 'Exclude-Carried-Forward'] - # need to handle children with duplicate measurements on same day separately + # need to handle children with extraneous measurements on same day separately data.df[subjid %in% subj.dup & - valid(data.df, include.temporary.duplicates = T), exclude := (function(df) { + valid(data.df, include.temporary.extraneous = T), exclude := (function(df) { setkey(df, agedays) ages = unique(agedays) # no point in looking for measurements carried forward if all measurements are from a single day of life @@ -436,9 +436,9 @@ cleanbatch <- function(data.df, })(copy(.SD)), .SDcols = c('agedays', 'exclude', 'v.orig'), by = .(subjid, param)] } - # 9d. Replace exc_*=0 if exc_*==2 & redo step 5 (temporary duplicates) - data.df[exclude == 'Exclude-Temporary-Duplicate', exclude := 'Include'] - data.df[temporary_duplicates(data.df), exclude := 'Exclude-Temporary-Duplicate'] + # 9d. Replace exc_*=0 if exc_*==2 & redo step 5 (temporary extraneous) + data.df[exclude == 'Exclude-Temporary-Extraneous-Same-Day', exclude := 'Include'] + data.df[temporary_extraneous(data.df), exclude := 'Exclude-Temporary-Extraneous-Same-Day'] # 10. Exclude extreme errors with SD cutoffs. For this, a cutoff of |SD|>25 is used. Because of differences in SD and z score, there are some very extreme values # with a |z|>25 that are implausible with an |SD|<25, so both are used to exclude extreme errors. This works better than using a lower value for the limit @@ -452,16 +452,16 @@ cleanbatch <- function(data.df, Sys.time() )) data.df[na_as_false( - valid(data.df, include.temporary.duplicates = T) & abs(tbc.sd) > sd.extreme + valid(data.df, include.temporary.extraneous = T) & abs(tbc.sd) > sd.extreme | - exclude %in% c('Include', 'Exclude-Temporary-Duplicate') & + exclude %in% c('Include', 'Exclude-Temporary-Extraneous-Same-Day') & abs(z.orig) > z.extreme ), exclude := 'Exclude-SD-Cutoff'] - # 10d. Redo temporary duplicates as in step 5. - data.df[exclude == 'Exclude-Temporary-Duplicate', exclude := 'Include'] - data.df[temporary_duplicates(data.df), exclude := 'Exclude-Temporary-Duplicate'] + # 10d. Redo temporary extraneous as in step 5. + data.df[exclude == 'Exclude-Temporary-Extraneous-Same-Day', exclude := 'Include'] + data.df[temporary_extraneous(data.df), exclude := 'Exclude-Temporary-Extraneous-Same-Day'] # 11. Exclude extreme errors with EWMA # a. Erroneous measurements can distort the EWMA for measurements around them. Therefore, if the EWMA method identifies more than one value for a subject and @@ -476,15 +476,15 @@ cleanbatch <- function(data.df, )) data.df <- data.df[, exclude := (function(df) { num.ewma.excluded <- 0 - # optimization: determine whether this subject has any duplicates - has.duplicates <- subjid %in% subj.dup + # optimization: determine whether this subject has any extraneous + has.extraneous <- subjid %in% subj.dup while (T) { df[, (ewma.fields) := as.double(NaN)] df[valid(exclude), (ewma.fields) := ewma(agedays, tbc.sd, ewma.exp, T)] # note: at this point, only one ewma exists per param on a given day for a subject, so sort(ewma.all)[1] will returns the non-missing ewma.all - # restrict to children with possible duplicates for efficiency - if (has.duplicates) { + # restrict to children with possible extraneous for efficiency + if (has.extraneous) { df[, `:=`( ewma.all = sort(ewma.all)[1], ewma.before = sort(ewma.before)[1], @@ -551,11 +551,11 @@ cleanbatch <- function(data.df, df[worst.row, exclude := 'Exclude-EWMA-Extreme-Pair'] } - # 11h. Recalculate temporary duplicates as in step 5 - # optimize: only perform these steps if this subject is known to have duplicate measurements - if (has.duplicates) { - df[exclude == 'Exclude-Temporary-Duplicate', exclude := 'Include'] - df[temporary_duplicates(df), exclude := 'Exclude-Temporary-Duplicate'] + # 11h. Recalculate temporary extraneous as in step 5 + # optimize: only perform these steps if this subject is known to have extraneous measurements + if (has.extraneous) { + df[exclude == 'Exclude-Temporary-Extraneous-Same-Day', exclude := 'Include'] + df[temporary_extraneous(df), exclude := 'Exclude-Temporary-Extraneous-Same-Day'] } # 11i. If there was at least one subject who had a potential exclusion identified in step 11c, repeat steps 11b-11g. If there were no subjects with potential @@ -572,24 +572,24 @@ cleanbatch <- function(data.df, - # 12. Redo duplicates using EWMA. This will be the final time duplicates are done. - # For some duplicates it is very difficult to tell which one is likely representative. If the duplicates are very similar to each - # other, we will select one. If it is very difficult to tell which one is correct and the duplicates are not very similar, we will - # exclude all duplicates for that subject/parameter on that day - # a. Replace exc_*=0 for all temporarily excluded duplicates (exc_*==0) + # 12. Redo extraneous using EWMA. This will be the final time extraneous are done. + # For some extraneous it is very difficult to tell which one is likely representative. If the extraneous are very similar to each + # other, we will select one. If it is very difficult to tell which one is correct and the extraneous are not very similar, we will + # exclude all extraneous for that subject/parameter on that day + # a. Replace exc_*=0 for all temporarily excluded extraneous (exc_*==0) if (!quietly) - cat(sprintf("[%s] Exclude duplicates based on EWMA...\n", Sys.time())) - data.df[exclude == 'Exclude-Temporary-Duplicate', exclude := 'Include'] + cat(sprintf("[%s] Exclude extraneous based on EWMA...\n", Sys.time())) + data.df[exclude == 'Exclude-Temporary-Extraneous-Same-Day', exclude := 'Include'] - # 12b. Select which duplicate to include in EWMA calculations using the same criteria as in step 5. However, do not include values + # 12b. Select which extraneous to include in EWMA calculations using the same criteria as in step 5. However, do not include values # in these medians that were excluded in steps 9-11 (exc_*=3, 4, 5 or 6) # i. Determine median_tbc*sd and median_tbcOsd as in step 5. - # ii. For each subject/parameter with duplicates and at least one non-duplicate value, select the value closest to the + # ii. For each subject/parameter with extraneous and at least one non-extraneous value, select the value closest to the # median_tbc*sd for inclusion in EWMA calculations. - # This is functionally the same as re-doing the "temporary duplicate" step before doing the EWMA - temp.dups <- temporary_duplicates(data.df) - data.df[temp.dups, exclude := 'Exclude-Temporary-Duplicate'] + # This is functionally the same as re-doing the "temporary extraneous" step before doing the EWMA + temp.dups <- temporary_extraneous(data.df) + data.df[temp.dups, exclude := 'Exclude-Temporary-Extraneous-Same-Day'] # prepare a list of valid rows and initialize variables for convenience valid.rows <- valid(data.df) @@ -597,26 +597,26 @@ cleanbatch <- function(data.df, ewma.all = as.double(NaN), abssum2 = as.double(NaN), median.other.sd = as.double(NaN), - duplicate = F + extraneous = F )] - # 12c. Calculate a EWMA step for all subjects/parameters with duplicates and at least one non-duplicate value with the - # following modifications - # i. For calculating the EWMA, include only the duplicate selected in 12c - # ii. Calculate dewma_* for all values of duplicates + # 12c. Calculate a EWMA step for all subjects/parameters with duplicates/extraneous and at + # least one non-extraneous value with the following modifications + # i. For calculating the EWMA, include only the extraneous selected in 12c + # ii. Calculate dewma_* for all values of extraneous # iii. You do not need to calculate EWMAbef or EWMAaft for this step - # determine proportion of days with duplication for each parameter ahead of time for efficiency + # determine proportion of days with extraneous/duplication for each parameter ahead of time for efficiency dup.ratio.df <- data.df[subjid %in% subj.dup & (valid.rows | temp.dups), list(dup = (.N > 1)), by = .(subjid, param, agedays)][j = list(dup.ratio = mean(dup)), keyby = .(subjid, param)] - # identify subject/parameters where there isduplication but at least one day with no duplicates for that parameter + # identify subject/parameters where there is duplication but at least one day with no extraneous for that parameter subj.param.not.all.dups <- dup.ratio.df[dup.ratio < 1.0, list(subjid, param)] # identify subject/parameters where there is duplication for all days for that parameter subj.param.all.dups <- dup.ratio.df[dup.ratio == 1, list(subjid, param)] subj.all.dups <- subj.param.all.dups[, unique(subjid)] - # perform ewma for subjects with duplicates + # perform ewma for subjects with extraneous data.df[subjid %in% subj.dup & valid.rows, ewma.all := ewma(agedays, tbc.sd, ewma.exp, ewma.adjacent = F), by = .(subjid, param)] @@ -627,17 +627,17 @@ cleanbatch <- function(data.df, # NOTE: only children with more than one ageday with valid measurements will have a valid ewma from above data.df[, abssum2 := 2 * abs(tbc.sd - ewma.all) + abs(tbc.sd)] - # 12d. For each subject/parameter/age with duplicates and at least one non-duplicate value: + # 12d. For each subject/parameter/age with extraneous and at least one non-extraneous value: # i. Replace exc_*=7 for all values except the value that has the smallest abssum2_*. - data.df[J(subj.param.not.all.dups), duplicate := seq_along(abssum2) != which.min(abssum2), by = + data.df[J(subj.param.not.all.dups), extraneous := seq_along(abssum2) != which.min(abssum2), by = .(subjid, param, agedays)] data.df[temp.dups, exclude := 'Include'] data.df[(valid.rows | temp.dups) & - duplicate, exclude := 'Exclude-Duplicate'] - # ii. Determine dup_tot_* (# of days with duplicates for that subject/parameter) and nodup_tot_* (# of days with nonexlcuded - # non-duplicates for that subject/parameter). - # iii. If dup_tot_*/(dup_tot_*+nodup_tot_*) is greater than 1/2, replace exc_*=7 for all duplicates for that subject/parameter + extraneous, exclude := 'Exclude-Extraneous-Same-Day'] + # ii. Determine dup_tot_* (# of days with extraneous for that subject/parameter) and nodup_tot_* (# of days with nonexlcuded + # non-extraneous for that subject/parameter). + # iii. If dup_tot_*/(dup_tot_*+nodup_tot_*) is greater than 1/2, replace exc_*=7 for all extraneous for that subject/parameter # for each age where the largest measurement minus the smallest measurement for that subject/parameter/age is larger than # the maximum difference (ht 3cm; wt 0-9.999 kg 0.25kg; wt 10-29.9999 kg 0.5 kg; wt 30kg and higher 1 kg). @@ -646,8 +646,8 @@ cleanbatch <- function(data.df, tbc.sd.max = as.double(NaN))] df[valid( exclude, - include.duplicates = T, - include.temporary.duplicates = T + include.extraneous = T, + include.temporary.extraneous = T ), `:=`(tbc.sd.min = min(tbc.sd), tbc.sd.max = max(tbc.sd))] df[tbc.sd.max - tbc.sd.min > ifelse(param == 'HEIGHTCM', @@ -657,48 +657,48 @@ cleanbatch <- function(data.df, ifelse(tbc.sd.min < 10, 0.25, ifelse(tbc.sd.min < 30, 0.5, 1)), NA )), - exclude := 'Exclude-Duplicate'] + exclude := 'Exclude-Extraneous-Same-Day'] return(df$exclude) })(copy(.SD)), .SDcols = c('exclude', 'tbc.sd'), by = .(subjid, param, agedays)] - # 12e. For each subject/parameter/age with duplicates and no nonduplicate values: + # 12e. For each subject/parameter/age with extraneous and no nonextraneous values: # i. Replace exc_*=7 for all values except the value with the smallest |tbc*sd-median_tbcOsd|. If median_tbcOsd is missing because there are no values - # for the other parameter, randomly choose one duplicate value for each subject/parameter/age to keep as exc_*=0 and replace exc_*=7 for all other - # duplicates for that subject/parameter/age. + # for the other parameter, randomly choose one extraneous value for each subject/parameter/age to keep as exc_*=0 and replace exc_*=7 for all other + # extraneous for that subject/parameter/age. # ii. If the largest measurement minus the smallest measurement for that subject/parameter/age is larger than the maximum difference - # (ht 3cm; wt 0-9.999 kg 0.25kg; wt 10-29.9999 kg 0.5 kg; wt 30kg and higher 1 kg)., replace exc_*=7 for all duplicates for that + # (ht 3cm; wt 0-9.999 kg 0.25kg; wt 10-29.9999 kg 0.5 kg; wt 30kg and higher 1 kg)., replace exc_*=7 for all extraneous for that # subject/parameter/age. # calculate median for other parameter (restrict to subjects with all duplication for at least one parameter) data.df[subjid %in% subj.all.dups, exclude := (function(subj.df) { - # flag days that have duplicates / potentially valid parameters + # flag days that have extraneous / potentially valid parameters subj.df[, `:=`( - duplicates.this.day = F, - duplicate = F, + extraneous.this.day = F, + extraneous = F, tbc.sd.min = as.double(NaN), tbc.sd.max = as.double(NaN) )] valid.rows = valid( subj.df, - include.duplicates = T, - include.temporary.duplicates = T + include.extraneous = T, + include.temporary.extraneous = T ) - subj.df[valid.rows, duplicates.this.day := (.N > 1), by = .(param, agedays)] + subj.df[valid.rows, extraneous.this.day := (.N > 1), by = .(param, agedays)] for (p in subj.df[j = unique(param)]) { median.sd <- subj.df[param != p & - !duplicates.this.day, median(tbc.sd)] + !extraneous.this.day, median(tbc.sd)] subj.df[param == p, median.other.sd := median.sd] } # safety check -- assign median.other.sd==0 to ensure "which.min" functions correctly below subj.df[is.na(median.other.sd), median.other.sd := 0] - # identify rows as duplicate where |tbc*sd-median_tbcOsd| is not at the minimum value - subj.df[duplicates.this.day == T, duplicate := (seq_along(median.other.sd) != which.min(abs(tbc.sd - median.other.sd))), by = + # identify rows as extraneous where |tbc*sd-median_tbcOsd| is not at the minimum value + subj.df[extraneous.this.day == T, extraneous := (seq_along(median.other.sd) != which.min(abs(tbc.sd - median.other.sd))), by = .(param, agedays)] - subj.df[duplicates.this.day & - !duplicate, exclude := 'Include'] - subj.df[duplicates.this.day & - duplicate, exclude := 'Exclude-Duplicate'] - subj.df[duplicates.this.day == T, `:=`(tbc.sd.min = min(tbc.sd), + subj.df[extraneous.this.day & + !extraneous, exclude := 'Include'] + subj.df[extraneous.this.day & + extraneous, exclude := 'Exclude-Extraneous-Same-Day'] + subj.df[extraneous.this.day == T, `:=`(tbc.sd.min = min(tbc.sd), tbc.sd.max = max(tbc.sd)), by = .(param, agedays)] subj.df[tbc.sd.max - tbc.sd.min > ifelse(param == 'HEIGHTCM', 3, @@ -707,28 +707,28 @@ cleanbatch <- function(data.df, ifelse(tbc.sd.min < 10, 0.25, ifelse(tbc.sd.min < 30, 0.5, 1)), NA )), - exclude := 'Exclude-Duplicate'] + exclude := 'Exclude-Extraneous-Same-Day'] - # identify kids who had an SD or EWMA extreme excluded that was a duplicate and re-label as "Exclude-Duplicate" - subj.df[, duplicates.this.day := F] - # consider any non-missing measurement when determining presence of duplicates - subj.df[exclude != 'Missing', duplicates.this.day := (.N > 1), by = + # identify kids who had an SD or EWMA extreme excluded that was a extraneous and re-label as "Exclude-Extraneous-Same-Day" + subj.df[, extraneous.this.day := F] + # consider any non-missing measurement when determining presence of extraneous + subj.df[exclude != 'Missing', extraneous.this.day := (.N > 1), by = .(param, agedays)] - subj.df[duplicates.this.day & + subj.df[extraneous.this.day & exclude %in% c('Exclude-SD-Cutoff', 'Exclude-EWMA-Extreme', - 'Exclude-EWMA-Extreme-Pair'), exclude := 'Exclude-Duplicate'] + 'Exclude-EWMA-Extreme-Pair'), exclude := 'Exclude-Extraneous-Same-Day'] return(subj.df$exclude) })(copy(.SD)), .SDcols = c('param', 'agedays', 'exclude', 'tbc.sd'), by = .(subjid)] - # 12f. For any values that were excluded with exc_*=4, 5, or 6 that are also duplicates, replace exc_*=7. + # 12f. For any values that were excluded with exc_*=4, 5, or 6 that are also extraneous, replace exc_*=7. data.df[subjid %in% subj.dup, exclude := (function(subj.df) { if (.N > 1) { subj.df[exclude %in% c('Exclude-SD-Cutoff', 'Exclude-EWMA-Extreme', - 'Exclude-EWMA-Extreme-Pair'), exclude := 'Exclude-Duplicate'] + 'Exclude-EWMA-Extreme-Pair'), exclude := 'Exclude-Extraneous-Same-Day'] } return(subj.df$exclude) })(copy(.SD)), .SDcols = c('exclude'), by = .(subjid, param, agedays)] @@ -1528,7 +1528,7 @@ cleanbatch <- function(data.df, # CD e-mail 2/10/15: It looks like there are two problems. One is carried forward measurements. The other is that I forgot to include an # important part of the rule in the English - tot_exc_* has to be >=2. I'm forwarding an updated English version. # Also, because your exclusions are handled a little differently I wanted to specify that unit errors and swaps are not included - # in tot_exc_* and are included in tot_inc_*, whereas carried forwards and duplicates are not included in either count. + # in tot_exc_* and are included in tot_inc_*, whereas carried forwards and extraneous are not included in either count. # NOTE: updated to include optional argument "include.carryforward=T" in the valid() function, and added the # constraints "exclude.count.this.parameter >= 2" and "exclude.count.other.parameter >= 2" in the code below @@ -1544,7 +1544,7 @@ cleanbatch <- function(data.df, inc.exc <- subj.df[, j = list(exclude.count = sum( !valid( exclude, - include.duplicates = T, + include.extraneous = T, include.carryforward = T ) ), @@ -1638,7 +1638,7 @@ cleanbatch <- function(data.df, #' #' * 'Include', 'Unit-Error-High', 'Unit-Error-Low', 'Swapped-Measurements', 'Missing', #' * 'Exclude-Carried-Forward', 'Exclude-SD-Cutoff', 'Exclude-EWMA-Extreme', 'Exclude-EWMA-Extreme-Pair', -#' * 'Exclude-Duplicate', +#' * 'Exclude-Extraneous-Same-Day', #' * 'Exclude-EWMA-8', 'Exclude-EWMA-9', 'Exclude-EWMA-10', 'Exclude-EWMA-11', 'Exclude-EWMA-12', 'Exclude-EWMA-13', 'Exclude-EWMA-14', #' * 'Exclude-Min-Height-Change', 'Exclude-Max-Height-Change', #' * 'Exclude-Pair-Delta-17', 'Exclude-Pair-Delta-18', 'Exclude-Pair-Delta-19', @@ -1714,7 +1714,7 @@ cleangrowth <- function(subjid, num.batches <- getDoParWorkers() } # variables needed for parallel workers - var_for_par <- c("temporary_duplicates", "valid", "swap_parameters", + var_for_par <- c("temporary_extraneous", "valid", "swap_parameters", "na_as_false", "ewma", "read_anthro", "as_matrix_delta", "sd_median") @@ -1806,8 +1806,8 @@ cleangrowth <- function(subjid, # 2. Data set-up # a. I always code sex as 0=Male, 1=Female, so I recoded the variable sex that way and left a variable sexorigcode the way the data was sent to me (1=Female 2=Male) - # b. Remove rows that are duplicates for subjid, param, and measurement from further analysis - # NOTE: this step is not needed -- handled automatically by "temporary duplicate" step. + # b. Remove rows that are extraneous for subjid, param, and measurement from further analysis + # NOTE: this step is not needed -- handled automatically by "temporary extraneous" step. # c. I generated separate variables for weight (wt) and height (ht), as well as exc_* and subjid_* variables. Set exc_*=0 if value is not missing # and exc_*=1 if value is missing. In all future steps, exc_* should only be changed if it is 0. This helps to keep track of which step excluded a value. # I also kept the measurement variable there and untouched because sometimes wt and ht got transformed to something else. @@ -1851,12 +1851,12 @@ cleangrowth <- function(subjid, 'Swapped-Measurements', 'Exclude', 'Missing', - 'Exclude-Temporary-Duplicate', + 'Exclude-Temporary-Extraneous-Same-Day', 'Exclude-Carried-Forward', 'Exclude-SD-Cutoff', 'Exclude-EWMA-Extreme', 'Exclude-EWMA-Extreme-Pair', - 'Exclude-Duplicate', + 'Exclude-Extraneous-Same-Day', 'Exclude-EWMA-8', 'Exclude-EWMA-9', 'Exclude-EWMA-10', diff --git a/README-adjustcarryforward.md b/README-adjustcarryforward.md index bf884a8..e883c10 100644 --- a/README-adjustcarryforward.md +++ b/README-adjustcarryforward.md @@ -92,15 +92,15 @@ parameters). * Warning: this will take much longer! The default number of sweep steps is 9; this can be changed with the option -`--gridlength`. +`--gridlength`. For testing options of handling strings of multiple carried forward -values, several options from 0 to 3 have been incorporated. 0 (no change) is the -default option, and can be changed `--exclude_opt`. More information on each +values, several options from 0 to 3 have been incorporated. 0 (no change) is the +default option, and can be changed `--exclude_opt`. More information on each option can be found in the `adjustcarryforward()` documentation. In addition to multiple options for carried-forward strings, "answers" for a given -dataset have been incorporated. When the `--add_answers` flag is set to `TRUE` +dataset have been incorporated. When the `--add_answers` flag is set to `TRUE` (`TRUE` by default), a column called `acf_answers` will have, for each height value, "Definitely Exclude", "Definitely Include", or "Unknown" (if it does not fall in either category). Weight values are set as `NA`. @@ -180,16 +180,16 @@ sweep (hence the examples w/5 and 9 step sweeps). And the first few result rows in `test_adjustcarrforward_DATE_TIME.csv` would be: ```R -id subjid sex agedays param measurement clean_value run-1 run-2 run-3 run-4 run-5 -1510 775155 0 889 HEIGHTCM 84.9 Exclude-Duplicate Missing Missing Missing Missing Missing -1511 775155 0 889 HEIGHTCM 89.06 Include No Change No Change No Change No Change No Change -1512 775155 0 1071 HEIGHTCM 92.5 Include No Change No Change No Change No Change No Change -1513 775155 0 1253 HEIGHTCM 96.2 Include No Change No Change No Change No Change No Change -1514 775155 0 1435 HEIGHTCM 96.2 Exclude-Carried-Forward No Change No Change Include Include Include -1515 775155 0 1435 HEIGHTCM 99.692 Include No Change No Change No Change No Change No Change -1516 775155 0 1806 HEIGHTCM 106.1 Include No Change No Change No Change No Change No Change -1517 775155 0 2177 HEIGHTCM 112.3 Include No Change No Change No Change No Change No Change -1518 775155 0 889 WEIGHTKG 13.1 Include No Change No Change No Change No Change No Change +id subjid sex agedays param measurement clean_value run-1 run-2 run-3 run-4 run-5 +1510 775155 0 889 HEIGHTCM 84.9 Exclude-Extraneous-Same-Day Missing Missing Missing Missing Missing +1511 775155 0 889 HEIGHTCM 89.06 Include No Change No Change No Change No Change No Change +1512 775155 0 1071 HEIGHTCM 92.5 Include No Change No Change No Change No Change No Change +1513 775155 0 1253 HEIGHTCM 96.2 Include No Change No Change No Change No Change No Change +1514 775155 0 1435 HEIGHTCM 96.2 Exclude-Carried-Forward No Change No Change Include Include Include +1515 775155 0 1435 HEIGHTCM 99.692 Include No Change No Change No Change No Change No Change +1516 775155 0 1806 HEIGHTCM 106.1 Include No Change No Change No Change No Change No Change +1517 775155 0 2177 HEIGHTCM 112.3 Include No Change No Change No Change No Change No Change +1518 775155 0 889 WEIGHTKG 13.1 Include No Change No Change No Change No Change No Change ``` The fifth row in the example above demonstrates the results of the experimental script; diff --git a/README.md b/README.md index 2a509b7..170b6ca 100644 --- a/README.md +++ b/README.md @@ -378,31 +378,31 @@ with `cleangrowth()` will likely take a few minutes to complete. > setkey(data, subjid, param, agedays) > cleaned_data <- data[, clean_value:=cleangrowth(subjid, param, agedays, sex, measurement)] > head(cleaned_data) - id subjid sex agedays param measurement clean_value -1: 1510 775155 0 889 HEIGHTCM 84.900 Exclude-Duplicate -2: 1511 775155 0 889 HEIGHTCM 89.060 Include -3: 1512 775155 0 1071 HEIGHTCM 92.500 Include -4: 1513 775155 0 1253 HEIGHTCM 96.200 Include -5: 1514 775155 0 1435 HEIGHTCM 96.200 Exclude-Carried-Forward -6: 1515 775155 0 1435 HEIGHTCM 99.692 Include + id subjid sex agedays param measurement clean_value +1: 1510 775155 0 889 HEIGHTCM 84.900 Exclude-Extraneous-Same-Day +2: 1511 775155 0 889 HEIGHTCM 89.060 Include +3: 1512 775155 0 1071 HEIGHTCM 92.500 Include +4: 1513 775155 0 1253 HEIGHTCM 96.200 Include +5: 1514 775155 0 1435 HEIGHTCM 96.200 Exclude-Carried-Forward +6: 1515 775155 0 1435 HEIGHTCM 99.692 Include > cleaned_data %>% group_by(clean_value) %>% tally(sort=TRUE) # A tibble: 14 x 2 - clean_value n - - 1 Include 38875 - 2 Exclude-Duplicate 10546 - 3 Exclude-Carried-Forward 6694 - 4 Exclude-SD-Cutoff 168 - 5 Exclude-EWMA-8 135 - 6 Exclude-EWMA-Extreme 95 - 7 Exclude-EWMA-9 93 - 8 Exclude-Min-Height-Change 65 - 9 Swapped-Measurements 16 -10 Exclude-Too-Many-Errors 6 -11 Exclude-EWMA-11 5 -12 Exclude-EWMA-12 2 -13 Exclude-Pair-Delta-18 2 -14 Exclude-Max-Height-Change 1 + clean_value n + + 1 Include 38875 + 2 Exclude-Extraneous-Same-Day 10546 + 3 Exclude-Carried-Forward 6694 + 4 Exclude-SD-Cutoff 168 + 5 Exclude-EWMA-8 135 + 6 Exclude-EWMA-Extreme 95 + 7 Exclude-EWMA-9 93 + 8 Exclude-Min-Height-Change 65 + 9 Swapped-Measurements 16 +10 Exclude-Too-Many-Errors 6 +11 Exclude-EWMA-11 5 +12 Exclude-EWMA-12 2 +13 Exclude-Pair-Delta-18 2 +14 Exclude-Max-Height-Change 1 ``` If you are able to run these steps and see a similar result, you have the @@ -596,7 +596,7 @@ the algorithm's step labels and labels used in comment text in `growthcleanr`. | - | - | - | - | | 2d | 0 | Include | - | | 2d | 1 | Missing | - | -| 5b | 2 | Exclude-Temporary-Duplicate | - | +| 5b | 2 | Exclude-Temporary-Extraneous-Same-Day | - | | 7d | - | Swapped-Measurement | - | | 8f | - | Unit-Error-High | - | | 8f | - | Unit-Error-Low | - | @@ -605,7 +605,7 @@ the algorithm's step labels and labels used in comment text in `growthcleanr`. | 10c | 4 | Exclude-SD-Cutoff | 10d, 10e | | 11d | 5 | Exclude-EWMA-Extreme | 11e | | 11f.ii | 6 | Exclude-EWMA-Extreme-Pair | 11i (R only) | -| 12d.i | 7 | Exclude-Duplicate | 12diii, 12ei, 12f | +| 12d.i | 7 | Exclude-Extraneous-Same-Day | 12diii, 12ei, 12f | | 14f.i | 8 | Exclude-EWMA-8 | Set in 14h (in R) | | 14f.ii | 9 | Exclude-EWMA-9 | Set in 14h (in R) | | 14f.iii | 10 | Exclude-EWMA-10 | Set in 14h (in R) | @@ -826,13 +826,13 @@ parameter type as `type`, specify each, with quotes: ```R > head(my_cleaned_data) - id subjid sex aged type measurement clean_value -1: 1510 775155 0 889 HEIGHTCM 84.90 Exclude-Duplicate -2: 1511 775155 0 889 HEIGHTCM 89.06 Include -3: 1518 775155 0 889 WEIGHTKG 13.10 Include -4: 1512 775155 0 1071 HEIGHTCM 92.50 Include -5: 1519 775155 0 1071 WEIGHTKG 14.70 Include -6: 1513 775155 0 1253 HEIGHTCM 96.20 Include + id subjid sex aged type measurement clean_value +1: 1510 775155 0 889 HEIGHTCM 84.90 Exclude-Extraneous-Same-Day +2: 1511 775155 0 889 HEIGHTCM 89.06 Include +3: 1518 775155 0 889 WEIGHTKG 13.10 Include +4: 1512 775155 0 1071 HEIGHTCM 92.50 Include +5: 1519 775155 0 1071 WEIGHTKG 14.70 Include +6: 1513 775155 0 1253 HEIGHTCM 96.20 Include > longwide(my_cleaned_data, agedays="aged", param="type") ``` diff --git a/man/cleangrowth.Rd b/man/cleangrowth.Rd index e45feff..c9f755f 100644 --- a/man/cleangrowth.Rd +++ b/man/cleangrowth.Rd @@ -113,7 +113,7 @@ Possible values for each code are: \itemize{ \item 'Include', 'Unit-Error-High', 'Unit-Error-Low', 'Swapped-Measurements', 'Missing', \item 'Exclude-Carried-Forward', 'Exclude-SD-Cutoff', 'Exclude-EWMA-Extreme', 'Exclude-EWMA-Extreme-Pair', -\item 'Exclude-Duplicate', +\item 'Exclude-Extraneous-Same-Day', \item 'Exclude-EWMA-8', 'Exclude-EWMA-9', 'Exclude-EWMA-10', 'Exclude-EWMA-11', 'Exclude-EWMA-12', 'Exclude-EWMA-13', 'Exclude-EWMA-14', \item 'Exclude-Min-Height-Change', 'Exclude-Max-Height-Change', \item 'Exclude-Pair-Delta-17', 'Exclude-Pair-Delta-18', 'Exclude-Pair-Delta-19', diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 2bf049b..d9aecd4 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -340,7 +340,7 @@ test_that("longwide works as expected with custom values", { # run longwide on changed data with some exclusion types included inc_types <- c("Include", "Exclude-Carried-Forward", - "Exclude-Duplicate") + "Exclude-Extraneous-Same-Day") wide_syn <- longwide(sub_syn, clean_value = "cv", inclusion_types = inc_types)