Merge pull request #34 from mitre/t9-mitre-reference-medians

Add NHANES reference medians, updating behavior of sd.recenter; closes mitre#9
carriedaymont · Mar 1, 2021 · 7a8d8e3 · 7a8d8e3
2 parents 7842e55 + 34bbbbf
commit 7a8d8e3
Show file tree

Hide file tree

Showing 10 changed files with 29,698 additions and 69 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: growthcleanr
 Type: Package
 Title: Growth Measurements Cleaner
-Version: 1.2.3
+Version: 1.2.5
 Authors@R: c(
     person("Daymont","Carrie",,"carriedaymont@gmail.com",c("aut","cre")),
     person("Grundmeier","Robert",,"grundmeier@email.chop.edu","aut"),

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,21 @@
 # growthcleanr
 
+## [1.2.5] - 2021-02-26
+
+### Changed
+
+- Updated behavior of `sd.recenter` option to include new NHANES reference
+  medians and explicit specification with "NHANES" or "derive"
+  (https://github.com/mitre/growthcleanr/issues/9)
+- Switched `README.md` to be generated from `README.Rmd` w/knitr (thanks
+  @mcanouil) (#17)
+- Switched to use `file.path()` more consistently in `R/growth.R`
+
+### Added
+
+- Added `inst/extdata/nhanes-reference-medians.csv`, reference medians for
+  recentering derived from NHANES (described in README)
+
 ## [1.2.4] - 2021-01-14
 
 ### Changed

diff --git a/R/extdata.R b/R/extdata.R
@@ -43,6 +43,20 @@ NULL
 #'
 NULL
 
+#' NHANES reference medians
+#'
+#' Contains reference median values for default recentering, derived from NHANES
+#' years 2009-2018
+#'
+#' @name nhanes-reference-medians
+#'
+#' @section nhanes-reference-medians.csv:
+#'
+#' Used in function `cleangrowth()`
+#'
+#'
+NULL
+
 #' Tanner Growth Velocity Table
 #'
 #' Part of default CDC-derived tables
@@ -177,4 +191,4 @@ NULL
 #' Used to test function `ext_bmiz()`
 #'
 #'
-NULL
+NULL
diff --git a/R/growth.R b/R/growth.R
@@ -1604,10 +1604,24 @@ cleanbatch <- function(data.df,
 #' considering excluding all measurements. Defaults to 2.
 #' @param error.load.threshold threshold of percentage of excluded measurement count to included measurement
 #' count that must be exceeded before excluding all measurements of either parameter. Defaults to 0.5.
-#' @param sd.recenter Data frame or table with median SD-scores per day of life
-#' by gender and parameter. Columns in the table must include param, sex,
-#' agedays, and sd.median.  If not supplied, the median values will be
-#' calculated using the growth data that is being cleaned. Defaults to NA.
+#' @param sd.recenter specifies how to recenter medians. May be a data frame or
+#' table w/median SD-scores per day of life by gender and parameter, or "nhanes"
+#' or "derive" (matched case-insensitively) as a character vector.
+#' \itemize{
+#'   \item If `sd.recenter` is specified as a data set, use the data set
+#'   \item If `sd.recenter` is specified as "`nhanes`", use NHANES reference medians
+#'   \item If `sd.recenter` is specified as "`derive`", derive from input
+#'   \item If `sd.recenter` is not specified or `NA`:
+#'     \itemize{
+#'       \item If the input set has at least 5,000 observations, derive medians from input
+#'       \item If the input set has fewer than 5,000 observations, use NHANES
+#'     }
+#' }
+#'
+#' If specifying a data set, columns must include param, sex, agedays, and sd.median
+#' (referred to elsewhere as "modified Z-score"), and those medians will be used
+#' for recentering. A summary of how the NHANES reference medians were derived is
+#' available in README.md. Defaults to NA.
 #' @param sdmedian.filename Name of file to save sd.median data calculated on the input dataset to as CSV.
 #' Defaults to "", for which this data will not be saved. Use for extracting medians for parallel processing
 #' scenarios other than the built-in parallel option.
@@ -1739,9 +1753,8 @@ cleangrowth <- function(subjid,
   # recode column names to match syntactic style ("." rather than "_" in variable names)
   tanner_ht_vel_path <- ifelse(
     ref.data.path == "",
-    system.file("extdata/tanner_ht_vel.csv", package = "growthcleanr"),
-    paste(ref.data.path, "tanner_ht_vel.csv", sep =
-            "")
+    system.file(file.path("extdata", "tanner_ht_vel.csv"), package = "growthcleanr"),
+    file.path(ref.data.path, "tanner_ht_vel.csv")
   )
 
   tanner.ht.vel <- fread(tanner_ht_vel_path)
@@ -1756,16 +1769,14 @@ cleangrowth <- function(subjid,
 
   who_max_ht_vel_path <- ifelse(
     ref.data.path == "",
-    system.file("extdata/who_ht_maxvel_3sd.csv", package = "growthcleanr"),
-    paste(ref.data.path, "who_ht_maxvel_3sd.csv", sep =
-            "")
+    system.file(file.path("extdata", "who_ht_maxvel_3sd.csv"), package = "growthcleanr"),
+    file.path(ref.data.path, "who_ht_maxvel_3sd.csv")
   )
 
   who_ht_vel_3sd_path <- ifelse(
     ref.data.path == "",
-    system.file("extdata/who_ht_vel_3sd.csv", package = "growthcleanr"),
-    paste(ref.data.path, "who_ht_vel_3sd.csv", sep =
-            "")
+    system.file(file.path("extdata", "who_ht_vel_3sd.csv"), package = "growthcleanr"),
+    file.path(ref.data.path, "who_ht_vel_3sd.csv")
   )
   who.max.ht.vel <- fread(who_max_ht_vel_path)
   who.ht.vel <- fread(who_ht_vel_3sd_path)
@@ -1902,24 +1913,62 @@ cleangrowth <- function(subjid,
     cat(sprintf("[%s] Re-centering data...\n", Sys.time()))
 
   # see function definition below for explanation of the re-centering process
-  # returns a data table indexed by param, sex, agedays
+  # returns a data table indexed by param, sex, agedays. can use NHANES reference
+  # data, derive from input, or use user-supplied data.
   if (!is.data.table(sd.recenter)) {
-    sd.recenter <- data.all[exclude < 'Exclude', sd_median(param, sex, agedays, sd.orig)]
-    if (sdmedian.filename != "") {
-      write.csv(sd.recenter, sdmedian.filename, row.names = F)
+    # Use NHANES medians if the string "nhanes" is specified instead of a data.table
+    # or if sd.recenter is not specified as "derive" and N < 5000.
+    if ((is.character(sd.recenter) & tolower(sd.recenter) == "nhanes") |
+      (!(is.character(sd.recenter) & tolower(sd.recenter) == "derive") & (data.all[, .N] < 5000))) {
+      nhanes_reference_medians_path <- ifelse(
+        ref.data.path == "",
+        system.file(file.path("extdata", "nhanes-reference-medians.csv"), package = "growthcleanr"),
+        file.path(ref.data.path, "nhanes-reference-medians.csv")
+      )
+      sd.recenter <- fread(nhanes_reference_medians_path)
       if (!quietly)
         cat(
           sprintf(
-            "[%s] Wrote re-centering medians to %s...\n",
-            Sys.time(),
-            sdmedian.filename
+            "[%s] Using NHANES reference medians...\n",
+            Sys.time()
           )
         )
+    } else {
+      # Derive medians from input data
+      sd.recenter <- data.all[exclude < 'Exclude', sd_median(param, sex, agedays, sd.orig)]
+      if (!quietly)
+        cat(
+          sprintf(
+            "[%s] Using re-centering medians derived from input...\n",
+            Sys.time()
+          )
+        )
+      if (sdmedian.filename != "") {
+        write.csv(sd.recenter, sdmedian.filename, row.names = F)
+        if (!quietly)
+          cat(
+            sprintf(
+              "[%s] Wrote re-centering medians to %s...\n",
+              Sys.time(),
+              sdmedian.filename
+          )
+        )
+      }
     }
   } else {
-    # ensure passed-in medians are sorted correctly
-    setkey(sd.recenter, param, sex, agedays)
+    # Use specified data
+    if (!quietly)
+      cat(
+        sprintf(
+          "[%s] Using specified re-centering medians...\n",
+          Sys.time()
+        )
+      )
   }
+
+  # ensure recentering medians are sorted correctly
+  setkey(sd.recenter, param, sex, agedays)
+
   # add sd.recenter to data, and recenter
   setkey(data.all, param, sex, agedays)
   data.all <- sd.recenter[data.all]
@@ -1938,6 +1987,19 @@ cleangrowth <- function(subjid,
       )
   }
 
+  # notification: ensure awareness of small subsets in data
+  if (!quietly) {
+    year.counts <- data.all[, .N, floor(agedays / 365.25)]
+    if (year.counts[N < 100, .N] > 0) {
+      cat(
+        sprintf(
+          "[%s] Note: input data has at least one age-year with < 100 subjects...\n",
+          Sys.time()
+        )
+      )
+    }
+  }
+
   # safety check: treat observations where tbc.sd cannot be calculated as missing
   data.all[is.na(tbc.sd), exclude := 'Missing']
 
@@ -2025,22 +2087,22 @@ read_anthro <- function(path = "", cdc.only = F) {
   # set correct path based on input reference table path (if any)
   weianthro_path <- ifelse(
     path == "",
-    system.file(file.path("extdata","weianthro.txt"), package = "growthcleanr"),
+    system.file(file.path("extdata", "weianthro.txt"), package = "growthcleanr"),
     file.path(path, "weianthro.txt")
   )
   lenanthro_path <- ifelse(
     path == "",
-    system.file(file.path("extdata","lenanthro.txt"), package = "growthcleanr"),
+    system.file(file.path("extdata", "lenanthro.txt"), package = "growthcleanr"),
     file.path(path, "lenanthro.txt")
   )
   bmianthro_path <- ifelse(
     path == "",
-    system.file(file.path("extdata","bmianthro.txt"), package = "growthcleanr"),
+    system.file(file.path("extdata", "bmianthro.txt"), package = "growthcleanr"),
     file.path(path, "bmianthro.txt")
   )
   growth_cdc_ext_path <- ifelse(
     path == "",
-    system.file(file.path("extdata","growthfile_cdc_ext.csv"), package = "growthcleanr"),
+    system.file(file.path("extdata", "growthfile_cdc_ext.csv"), package = "growthcleanr"),
     file.path(path, "growthfile_cdc_ext.csv")
   )
 

diff --git a/README.Rmd b/README.Rmd
@@ -246,6 +246,17 @@ devtools::install_github("carriedaymont/growthcleanr", ref="main")
 Note that `ref="main"` is required; the default branch is "main", and must be
 referred to explicitly.
 
+If you are unable to install `devtools`, a similar function is available in the
+`remotes` package:
+
+```{r, eval = FALSE}
+install.packages("remotes")
+remotes::install_github("carriedaymont/growthcleanr", ref="main")
+```
+
+Note that `ref="main"` is required; the default branch is "main", and must be
+referred to explicitly.
+
 ### Source-level install for developers
 
 If you want to work with and potentially change the `growthcleanr` code itself,
@@ -507,8 +518,9 @@ The following options change the behavior of the growthcleanr algorithm.
   and included as valid measurements for cleaning.
 
 - `sd.extreme` - default `25`; a very extreme value check on modified
-  (recentered) Z-scores used as a first-pass elimination of clearly implausable
+  (recentered) Z-scores used as a first-pass elimination of clearly implausible
   values, often due to misplaced decimals.
+
 - `z.extreme` - default `25`; similar usage as `sd.extreme`, for absolute
   Z-scores.
 
@@ -555,10 +567,23 @@ techniques.
   - `flag.both` - in case of two measurements with at least one beyond
     thresholds, flag both instead of one (as in default)
 
-- `sd.recenter` - defaults to NA; data frame or table w/median SD-scores per day
-  of life by gender and parameter. Columns must include param, sex, agedays, and
-  sd.median (referred to elsewhere as "modified Z-score"). By default, median
-  values will be calculated using growth data to be cleaned.
+- `sd.recenter` - default `NA`; specifies how to recenter medians. May be a data frame
+  or table w/median SD-scores per day of life by gender and parameter, or "`nhanes`"
+  or "`derive`" as a character vector.
+
+  - If `sd.recenter` is specified as a data set, use the data set
+  - If `sd.recenter` is specified as "`nhanes`", use NHANES reference medians
+  - If `sd.recenter` is specified as "`derive`", derive from input
+  - If `sd.recenter` is not specified or `NA`:
+    - If the input set has at least 5,000 observations, derive medians from input
+    - If the input set has fewer than 5,000 observations, use NHANES
+
+  If specifying a data set, columns must include param, sex, agedays, and sd.median
+  (referred to elsewhere as "modified Z-score"), and those medians will be used for
+  recentering. This data set must include a row for every ageday present in the dataset
+  to be cleaned; the NHANES reference medians include a row for every ageday in the
+  range (731-7305 days). A summary of how the NHANES reference medians were derived is
+  below under [NHANES reference data](#nhanes).
 
 ### Operational options
 
@@ -959,20 +984,77 @@ for `cleangrowth()`.
 
 ## <a name="related"></a>Related tools
 
-The CDC provides a
-[SAS Program for the 2000 CDC Growth Charts](https://www.cdc.gov/nccdphp/dnpao/growthcharts/resources/sas.htm)
-which can also be used to identify biologically implausible values using a different
-approach, as also implemented for `growthcleanr` in the function `ext_bmiz()`, described
-above under [Computing BMI percentiles and Z-scores](#bmi).
+The CDC provides a [SAS Program for the 2000 CDC Growth
+Charts](https://www.cdc.gov/nccdphp/dnpao/growthcharts/resources/sas.htm) which can
+also be used to identify biologically implausible values using a different approach, as
+also implemented for `growthcleanr` in the function `ext_bmiz()`, described above under
+[Computing BMI percentiles and Z-scores](#bmi).
 
 [GrowthViz](https://github.com/mitre/GrowthViz) provides insights into how
-`growthcleanr` assesses data, packaged in a Jupyter notebook. It ships with the
-same `syngrowth` synthetic example dataset as `growthcleanr`, with results
-included.
+`growthcleanr` assesses data, packaged in a Jupyter notebook. It ships with the same
+`syngrowth` synthetic example dataset as `growthcleanr`, with results included.
+
+## <a name="nhanes"></a>NHANES reference medians
+
+`growthcleanr` [releases](https://github.com/carriedaymont/growthcleanr/releases) up to
+1.2.4 offered two options for recentering medians, either the default of deriving
+medians from the input set, or supplying an externally-defined set of medians. These
+left out an option for researchers working with either small datasets or with data
+which might otherwise not be representative of the population, as deriving medians from
+the input set in those cases might be problematic. To provide a standard default
+reference to address these latter cases, a set of medians was derived from the
+[National Health and Nutrition Examination
+Survey](https://wwwn.cdc.gov/nchs/nhanes/Default.aspx) (NHANES). A summary of that
+process is below. As of release 1.2.5, the default behavior is:
+
+- If `sd.recenter` is specified as a data set, use the data set
+- If `sd.recenter` is specified as `nhanes`, use NHANES
+- If `sd.recenter` is specified as `derive`, derive from input
+- If `sd.recenter` is not specified or `NA`:
+  - If the input set has at least 5,000 observations, derive medians from input
+  - If the input set has fewer than 5,000 observations, use NHANES
+
+With the verbose `cleangrowth()` option `quietly = FALSE`, the recentering medians
+approach used will be noted in the output. If the input set has fewer than 100
+observations for any age-year, this will also be noted in the output.
+
+The NHANES reference medians are based primarily on data from NHANES 2009-2010 through
+2017-2018, including approximately 39,000 heights/lengths and weights from children and
+adolescents between the ages of 0 months and <240 months.
+The [L, M, and S
+parameters](https://www.cdc.gov/growthcharts/percentile_data_files.htm) for the [CDC
+growth charts](https://www.cdc.gov/nccdphp/dnpao/growthcharts/resources/sas.htm) were
+used as the reference to calculate weight and height SD scores for the NHANES 2009-2010
+through 2017-2018 samples. Based on the distributions of age-days in children at 0
+months, an age adjustment was made based on the median number of days among these
+infants. This adjustment was made after consultation with the National Center for
+Health Statistics confirmed that a general assumption of ages occurring at the midpoint
+of the indicated integer month of age did not apply to children recorded as 0 months,
+and uses 0.75 months instead.
+
+Weights were supplemented with a random sample of birthweights from NCHS's [Vital
+Statistics Natality Birth
+Data](https://www.nber.org/research/data/vital-statistics-natality-birth-data) for 2018. These had sample weights assigned so that the sum of the sample weights for the
+sample equalled the sum of the sample weights for each month for infants in NHANES, as
+NHANES is a multi-stage complex survey. The reference data was then smoothed using the
+`svysmooth()` function in the R
+[`survey`](https://cran.r-project.org/web/packages/survey/index.html) package to
+estimate the weight and height SD scores for each day up to 7,305 days, with a
+bandwidth chosen to balance between over- and under-fitting, and interpolation between
+the estimates from this function was used to obtain an estimate for each day of age.
+Predictions from a regression model fit to smoothed height SDs between 23 and 365 days
+(the youngest child in NHANES had an estimated age in days of 23) were used to extend
+smoothed height SD scores to children between 1 and 22 days of age.
 
 ## <a name="changes"></a>Changes
 
-For a detailed history of released versions, see `NEWS.md`.
+For a detailed history of released versions, see `NEWS.md`. Tagged releases, starting
+with 1.2.3 in January 2021, are listed [at
+GitHub](https://github.com/carriedaymont/growthcleanr/releases).
+
+In release 1.2.5 in February 2021, the default behavior of recentering medians
+changed as described in [NHANES reference medians](#nhanes). To confirm prior
+results based on derived medians, specify the `sd.recenter` option "derive".
 
 In release 1.2.4 in January 2021, an update was made to the WHO height velocity 3sd
 files to correct a small number of errors: