Merge pull request #49 from abigailsnyder/acs-fldgen-subsetting-outpu…

…t-fcn acs-intermediate fix to saved fldgen memory bloat
JGCRI · Jul 15, 2020 · ca50388 · ca50388
2 parents b85eff4 + d28b269
commit ca50388
Show file tree

Hide file tree

Showing 10 changed files with 269 additions and 123 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -7,6 +7,7 @@ export(concatGrids)
 export(concatGrids.general)
 export(coord_array)
 export(drop_NAs)
+export(emulator_reducer)
 export(eof_analyze)
 export(extract_box)
 export(file.pairer)
@@ -16,7 +17,6 @@ export(fldgen_object_TP)
 export(fldts2df)
 export(generate.TP.fullgrids)
 export(generate.TP.resids)
-export(loadmodel)
 export(mkcorrts)
 export(normalize.resids)
 export(phase_eqn_coef)
@@ -32,7 +32,6 @@ export(read.temperatures)
 export(read_globalAvg)
 export(readtgav)
 export(reconst_fields)
-export(savemodel)
 export(splitGrids)
 export(splitGrids.general)
 export(split_eof)

diff --git a/R/generateTPresids.R b/R/generateTPresids.R
@@ -23,7 +23,7 @@
 #' @export
 generate.TP.resids <- function(emulator, ngen, method = 1){
 
-    Ngrid <- ncol(emulator$meanfldT$r)
+    Ngrid <- nrow(emulator$meanfldT$w)
 
     newgrids <- lapply(1:ngen,
                        function(x) {

diff --git a/R/readdata.R b/R/readdata.R
@@ -619,29 +619,6 @@ read.precipitations <- function(filename, len=NULL, tag=basename(filename), varn
 }
 
 
-#' @rdname saving_and_restoring
-#' @export
-loadmodel <- function(file, oldfmt=FALSE)
-{
-    if(oldfmt) {
-        load(file)
-        if(!exists('modeldata', inherits=FALSE)) {
-            modeldata <- NULL               # silence check notes.
-            stop('No model data in file.')
-        }
-    }
-    else {
-        modeldata <- readRDS(file)
-    }
-
-    if(!inherits(modeldata, 'fldgen')) {
-        stop('Object loaded from file is not of type "fldgen".')
-    }
-
-    modeldata
-}
-
-
 #' Read and format global mean temperature
 #'
 #' Read global mean temperature from an input netCDF file and format for use

diff --git a/R/writedata.R b/R/writedata.R
@@ -1,5 +1,97 @@
 #### Output functions
 
+#' Subset a trained emulator.
+#'
+#' A trained fldgen emulator features a large amount of data for both
+#' using the emulator and rigorously validating an emulator.
+#'
+#' If one is just interested in the use of an emulator for generating
+#' fields, this function can be called to reduce a trained emulator to
+#' the bare essential list entries, which can then be saved and called
+#' the same as an unreduced emulator by generate.TP.resids and
+#' generate.TP.fullgrids
+#'
+#' Note that with this reduced emulator, there is NO way to reconstruct
+#' the training data. A fully trained emulator contains a copy of the
+#' training data, in addition to the training regressor values (tgav),
+#' and the estimated linear model parameters and residuals
+#'  (meanfldT$b, w, r), which together can also reconstruct the data.
+#'
+#' Even though the coordinate information stored in an emulator$griddataT
+#' is not needed directly to generate a new field of residuals or full data,
+#' it is often needed in downstream use of the fields. Therefore the entries
+#' reducedEmulator$griddataT$coord and reducedEmulator$griddataP$coord are
+#' kept. Each is a matrix of coordinates for each grid cell, with cells in
+#' rows and latitude, longitude in the two columns. The memory cost of
+#' keeping these coordinate matrices for T and P is negligible.
+#'
+#' Finally, the reduced emulator produced by this function is specifically
+#' meant for temperature and precipitation only, and is not robust to
+#' extension to other variables.
+#'
+#' Lastly, if a user is interested in a different subset of
+#' list entries in a trained emulator, they are encouraged to subset and
+#' save it themselves, as appropriate for their project.
+#'
+#' @param emulator A trained fldgen emulator, with all entries needed
+#' for generating new residuals and for rigorously validating the
+#' quality of the trained emulator
+#'
+#' @return reducedEmulator A trained fldgen emulator with only the list
+#' entries needed by generate.TP.resids and generate.TP.fullgrids for
+#' generating new fields:
+#' \describe{
+#' \item{griddataT}{Only the coordinate ids (gridid_full) and the coordinate matrix (coord).}
+#' \item{griddataP}{Only the coordinate ids (gridid_full) and the coordinate matrix (coord),
+#' and the function to convert from logP to P (pvarconvert_fcn).}
+#' \item{tgav}{The Tgav data from training.}
+#' \item{meanfldT}{the slope (w) and intercept (b) terms from the mean field
+#' fit.}
+#' \item{meanfldP}{the slope (w) and intercept (b) terms from the mean field
+#' fit.}
+#'\item{tfuns}{The empirical quantile functions for temperature, mapping
+#'N(0,1) to the native distribution in each grid cell.}
+#'\item{pfuns}{The empirical quantile functions for logP, mapping
+#'N(0,1) to the native distribution in each grid cell.}
+#'\item{reof}{The EOFs.}
+#'\item{fx}{Time coefficients for each EOF from training data.}
+#'\item{infiles}{The names of the files used for training the emulator.}
+#' }
+#'
+#' @author ACS July 2020
+#' @export
+emulator_reducer <- function(emulator){
+
+    # A full emulator has 10 list entries; fewer means entries are missing.
+    # NOTE(review): the list built below also has 10 entries, so this guard
+    # will not fire on an emulator already reduced by this function.
+    if(length(names(emulator)) < 10){
+        stop('Your emulator is already reduced (missing at least one list entry)')
+    }
+
+    gridT <- emulator$griddataT
+    gridP <- emulator$griddataP
+
+    # Keep the nested layout expected by generate.TP.resids and
+    # generate.TP.fullgrids while shrinking the object. The mean-field
+    # residuals are left out because the training data is not being
+    # reconstructed.
+    list(griddataT = list(gridid_full = gridT$gridid_full,
+                          coord = gridT$coord),
+         griddataP = list(gridid_full = gridP$gridid_full,
+                          coord = gridP$coord,
+                          pvarconvert_fcn = gridP$pvarconvert_fcn),
+         tgav = emulator$tgav,
+         meanfldT = list(w = emulator$meanfldT$w, b = emulator$meanfldT$b),
+         meanfldP = list(w = emulator$meanfldP$w, b = emulator$meanfldP$b),
+         tfuns = list(quant = emulator$tfuns$quant),
+         pfuns = list(quant = emulator$pfuns$quant),
+         reof = emulator$reof,
+         fx = emulator$fx,
+         infiles = emulator$infiles)
+}
+
+
 #' Write a temperature field as a netcdf file.
 #'
 #' Format a field as a netcdf file and write it to the specified file.  The lat,
@@ -56,34 +148,3 @@ write.temperature <- function(fld, file, griddata, varname='tas', varunit='K',
 
     ncdf4::nc_close(ncout)
 }
-
-#' Load and save emulator training data
-#'
-#' \code{savemodel} saves the results of training an emulator in a portable
-#' format.  \code{loadmodel} loads a model from a file created this way and
-#' returns it as a \code{fldgen} object.
-#'
-#' @param modeldata A \code{fldgen} object returned by either
-#' \code{\link{train}} or \code{\link{fldgen_object}}.
-#' @param file Name of the file to write the data to.
-#' @param clobber Flag indicating whether it's ok to overwrite an existing file
-#' @param oldfmt Flag indicating that we should try to load the old (.rda) format from
-#' pre-2.1 versions of fldgen.
-#' @name saving_and_restoring
-NULL
-
-#' @rdname saving_and_restoring
-#' @export
-savemodel <- function(modeldata, file, clobber=FALSE)
-{
-    compress='xz'
-
-    if(!inherits(modeldata, 'fldgen')) {
-        stop('modeldata must be a fldgen object.')
-    }
-
-    if(!clobber && file.exists(file)) {
-        stop('File ', file, ' exists, and noclobber is set.')
-    }
-    saveRDS(modeldata, file=file, compress=compress)
-}
diff --git a/inst/scripts/train-emulators.R b/inst/scripts/train-emulators.R
@@ -0,0 +1,29 @@
+library('fldgen')
+
+train_models <- function(models, tasvar='tasAdjust', prvar='prAdjust',
+                         datadir='./training-data') {
+    ## Train one T/P emulator per entry of `models`; save the full emulator and
+    ## a reduced copy as .rds files in the working directory.
+    ## The following would give you the complete set of models:
+    ## models <- c('GFDL-ESM2M', 'HadGEM2-ES', 'IPSL-CM5A-LR', 'MIROC5')
+
+    for (model in models) {
+        datafiles <- list.files(path=datadir, pattern=model, full.names=TRUE)
+        cat('Processing model ', model, '  datafiles:\n', paste(datafiles, collapse='\n'),'\n')
+        emu <- trainTP(datafiles, tvarname=tasvar, pvarname=prvar)
+        emu$griddataP$vardata_raw <- NULL
+        outfilename <- paste0('fldgen-',model, '.rds')
+        coord <- emu$griddataT$coord
+        ## NOTE(review): hard-coded overwrite of one coordinate row -- confirm.
+        coord[67382, ] <- c(-49.75, 178.75)
+        emu$griddataT$coord <- coord
+        emu$griddataP$coord <- coord
+        saveRDS(emu, outfilename)
+
+        ## Bind the reduced emulator to the name that is saved below; it was
+        ## previously assigned to `emulator`, leaving `reducedEmulator` undefined.
+        reducedEmulator <- emulator_reducer(emu)
+        outfilename <- paste0('fldgen-',model, '_reducedEmulator.rds')
+        saveRDS(reducedEmulator, outfilename)
+    }
+}
+
+
diff --git a/inst/scripts/train-emulators.zsh b/inst/scripts/train-emulators.zsh
@@ -0,0 +1,16 @@
+#!/bin/zsh
+
+#SBATCH -p short
+#SBATCH -t 180
+#SBATCH -A IHESD
+
+## Usage: train-emulators.zsh MODEL   (MODEL is passed to train_models)
+module purge
+module load gcc/8.1.0
+module load netcdf
+module load R/3.4.3
+
+## Log the command about to run. The whole string is quoted so the shell does
+## not try to interpret the parentheses and semicolon of the R expression.
+echo "Rscript -e \"source('train-emulators.R'); train_models('$1')\""
+Rscript -e "source('train-emulators.R'); train_models('$1')"
diff --git a/man/emulator_reducer.Rd b/man/emulator_reducer.Rd
diff --git a/man/saving_and_restoring.Rd b/man/saving_and_restoring.Rd