ncss-tech · brownag · Dec 27, 2023 · Dec 27, 2023 · Dec 27, 2023 · Dec 27, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rosettaPTF
 Title: R Frontend for Rosetta Pedotransfer Functions
-Version: 0.1.3
+Version: 0.1.4
 Author: Soil and Plant Science Division Staff
 Maintainer: Andrew G. Brown <andrew.g.brown@usda.gov>
 Description: Access Python rosetta-soil pedotransfer functions in an R environment. Rosetta is a neural network-based model for predicting unsaturated soil hydraulic parameters from basic soil characterization data. The model predicts parameters for the van Genuchten unsaturated soil hydraulic properties model, using sand, silt, and clay, bulk density and water content. The codebase is now maintained by Dr. Todd Skaggs and other U.S. Department of Agriculture employees. This R package is intended to provide for use cases that involve many thousands of calls to the pedotransfer function. Less demanding use cases are encouraged to use the web interface or API endpoint. There are additional wrappers of the API endpoints provided by the soilDB R package `ROSETTA()` method.

diff --git a/R/find_python.R b/R/find_python.R
@@ -69,7 +69,7 @@ find_python <- function(envname = NULL,
   if (length(pypath) == 0) {
 
     # find newest python installation with rosetta installed
-    x <- try(reticulate::py_discover_config("rosetta"))
+    x <- try(reticulate::py_discover_config("rosetta"), silent = TRUE)
     if (!is.null(x$python_versions)) {
       xxx <- lapply(x$python_versions, function(x) {
         y <- gsub("Python ", "", system(paste(shQuote(x), "--version"), intern = TRUE, ignore.stdout = TRUE, ignore.stderr = TRUE))
@@ -82,7 +82,7 @@ find_python <- function(envname = NULL,
 
     # otherwise find newest python installation
     if (is.null(res) || inherits(res, 'try-error')) {
-      x <- try(reticulate::py_discover_config())
+      x <- try(reticulate::py_discover_config(), silent = TRUE)
       if (!is.null(x$python_versions)) {
         xxx <- lapply(x$python_versions, function(x) {
           y <- gsub("Python ", "", system(paste(shQuote(x), "--version"), intern = TRUE, ignore.stdout = TRUE, ignore.stderr = TRUE, show.output.on.console = FALSE))
@@ -109,8 +109,8 @@ find_python <- function(envname = NULL,
       options(rosettaPTF.python_path = PYEXE_PATH)
       options(rosettaPTF.arcpy_path = arcpy_path)
 
-      subres
-    }, silent = TRUE)
+      subres[grep("envs/arcgispro-py3", subres, fixed = TRUE)]
+    }, silent = FALSE)
 
   # User can/should use regular reticulate methods for this
 

diff --git a/R/install_rosetta.R b/R/install_rosetta.R
@@ -6,7 +6,10 @@
 #' @param method `"auto"`, `"virtualenv"`, or `"conda"`; Default: `"auto"`
 #' @param conda Default: `"auto"`
 #' @param pip _logical_. Use `pip` for package installation? Default: `TRUE`. This is only relevant when Conda environments are used, as otherwise packages will be installed from the Conda repositories.
+#' @param system _logical_. Default: `FALSE`. If `TRUE`, try installing to system (user) site library with `system()` and set reticulate to use system Python.
+#' @param user _logical_. Default: `FALSE`. Pass `--user` flag. This should only be done if other installation methods fail and it is impossible to use a virtual environment.
 #' @param arcpy_path Argument passed to `find_python()`. Path to ArcGIS Pro Python installation e.g. ``. Set as `NULL` (default) to prevent use of ArcGIS Pro instance.
+#'
 #' @details From `reticulate::py_install()`: On Linux and OS X the "virtualenv" method will be used by default ("conda" will be used if virtualenv isn't available). On Windows, the "conda" method is always used.
 #'
 #' @export
@@ -17,17 +20,32 @@ install_rosetta <- function(envname = NULL,
                             method = "auto",
                             conda = "auto",
                             pip = TRUE,
+                            user = FALSE,
+                            system = FALSE,
                             arcpy_path = getOption("rosettaPTF.arcpy_path")) {
 
   # use heuristics to find python executable
-  if (!is.null(arcpy_path) && dir.exists(arcpy_path)){
+  if (!is.null(arcpy_path) && dir.exists(arcpy_path)) {
     find_python(envname = envname, arcpy_path = arcpy_path)
   }
 
-  # get rosetta-soil (and numpy if needed)
-  try(reticulate::py_install("rosetta-soil", envname = envname, method = method, conda = conda, pip = pip))
+  if (!is.null(arcpy_path) && dir.exists(arcpy_path)) {
+    p <- find_python(envname = envname, arcpy_path = arcpy_path)
+    system <- TRUE
+  } else {
+    p <- Sys.which("python")
+  }
+
+  if (isFALSE(system)) {
+    # get rosetta-soil (and numpy if needed)
+    try(reticulate::py_install("rosetta-soil", envname = envname, method = method,
+                               conda = conda, pip = pip, pip_options = ifelse(user, "--user", "")), silent = TRUE)
+  } else {
+    reticulate::use_python(p, required = TRUE)
+    try(system(paste(shQuote(p), "-m pip install --upgrade --user rosetta-soil")), silent = TRUE)
+  }
 
-    # load modules globally in package (prevents having to reload rosettaPTF library in session)
+  # load modules globally in package (prevents having to reload rosettaPTF library in session)
   .loadModules()
 
 }
diff --git a/R/run_rosetta.R b/R/run_rosetta.R
@@ -54,6 +54,7 @@ run_rosetta <- function(soildata,
                         vars = NULL,
                         rosetta_version = 3,
                         cores = 1,
+                        core_thresh = NULL,
                         file = NULL,
                         nrows = NULL,
                         overwrite = NULL)
@@ -122,8 +123,9 @@ run_rosetta.RasterStack <- function(soildata,
                                     vars = NULL,
                                     rosetta_version = 3,
                                     cores = 1,
-                                    file = paste0(tempfile(),".grd"),
-                                    nrows = nrow(soildata),
+                                    core_thresh = 20000L,
+                                    file = paste0(tempfile(), ".tif"),
+                                    nrows = nrow(soildata) / (terra::ncell(soildata) / core_thresh),
                                     overwrite = TRUE) {
   ## for in memory only, can just convert to data.frame and use that method
   # res <- run_rosetta(raster::as.data.frame(soildata),
@@ -153,8 +155,9 @@ run_rosetta.RasterBrick <- function(soildata,
                                     vars = NULL,
                                     rosetta_version = 3,
                                     cores = 1,
-                                    file = paste0(tempfile(),".grd"),
-                                    nrows = nrow(soildata),
+                                    core_thresh = 20000L,
+                                    file = paste0(tempfile(), ".tif"),
+                                    nrows = nrow(soildata) / (terra::ncell(soildata) / core_thresh),
                                     overwrite = TRUE) {
   run_rosetta(terra::rast(soildata),
               vars = vars,
@@ -166,8 +169,9 @@ run_rosetta.RasterBrick <- function(soildata,
   )
 }
 #' @param cores number of cores; used only for processing _SpatRaster_ or _Raster*_ input
+#' @param core_thresh Magic number for determining processing chunk size. Default `20000L`. Used to calculate default `nrows`
 #' @param file path to write incremental raster processing output for large inputs that do not fit in memory; passed to `terra::writeStart()` and used only for processing _SpatRaster_ or _Raster*_ input; defaults to a temporary file created by `tempfile()` if needed
-#' @param nrows number of rows to use per block; passed to `terra::readValues()` `terra::writeValues()`; used only for processing _SpatRaster_ or _Raster*_ input; defaults to number of rows in dataset if needed
+#' @param nrows number of rows to use per block chunk; passed to `terra::readValues()` and `terra::writeValues()`; used only for processing _SpatRaster_ or _Raster*_ inputs. Defaults to the total number of rows divided by the number of cells divided by `core_thresh`.
 #' @param overwrite logical; overwrite `file`? passed to `terra::writeStart()`; defaults to `TRUE` if needed
 #' @export
 #' @rdname run_rosetta
@@ -177,10 +181,19 @@ run_rosetta.SpatRaster <- function(soildata,
                                    vars = NULL,
                                    rosetta_version = 3,
                                    cores = 1,
-                                   file = paste0(tempfile(),".grd"),
-                                   nrows = nrow(soildata),
+                                   core_thresh = 20000L,
+                                   file = paste0(tempfile(), ".tif"),
+                                   nrows = nrow(soildata) / (terra::ncell(soildata) / core_thresh),
                                    overwrite = TRUE) {
-  terra::readStart(soildata)
+
+  if (!terra::inMemory(soildata)) {
+    terra::readStart(soildata)
+    on.exit({
+      try({
+        terra::readStop(soildata)
+      }, silent = TRUE)
+    }, add = TRUE)
+  }
 
   # create template brick
   out <- terra::rast(soildata)
@@ -191,23 +204,30 @@ run_rosetta.SpatRaster <- function(soildata,
   names(out) <- cnm
   out_info <- terra::writeStart(out, filename = file, overwrite = overwrite)
 
+  on.exit({
+    try({
+      out <- terra::writeStop(out)
+    }, silent = TRUE)
+  }, add = TRUE)
+
   start_row <- seq(1, out_info$nrows, nrows)
   n_row <- diff(c(start_row, out_info$nrows + 1))
 
-  if (cores > 1 && out_info$nrows*ncol(soildata) > 20000) {
+  if (cores > 1 && out_info$nrows*ncol(soildata) > core_thresh) {
     cls <- parallel::makeCluster(cores)
     on.exit(parallel::stopCluster(cls))
 
     # TODO: can blocks be parallelized?
-    for(i in seq_along(start_row)) {
+    for (i in seq_along(start_row)) {
       if (n_row[i] > 0) {
         blockdata <- terra::readValues(soildata, row = start_row[i], nrows = n_row[i], dataframe = TRUE)
         ids <- 1:nrow(blockdata)
         # soilDB makeChunks logic; what is tradeoff between chunk size and number of requests?
         # run_rosetta is a "costly" function and not particularly fast, so in theory parallel would help
 
         # parallel within-block processing
-        X <- split(blockdata, rep(seq(from = 1, to = floor(length(ids) / 20000) + 1), each = 20000)[1:length(ids)])
+        n <- floor(length(ids) / core_thresh / cores) + 1
+        X <- split(blockdata, rep(seq(from = 1, to = n, each = n)))[1:length(ids)]
         r <- do.call('rbind', parallel::clusterApply(cls, X, function(x) rosettaPTF::run_rosetta(x,
                                                                                                  vars = vars,
                                                                                                  rosetta_version = rosetta_version)))
@@ -216,7 +236,7 @@ run_rosetta.SpatRaster <- function(soildata,
       }
     }
   } else {
-    for(i in seq_along(start_row)) {
+    for (i in seq_along(start_row)) {
       if (n_row[i] > 0) {
         foo <- rosettaPTF::run_rosetta(terra::readValues(soildata, row = start_row[i], nrows = n_row[i], dataframe = TRUE),
                                        vars = vars,
@@ -226,11 +246,9 @@ run_rosetta.SpatRaster <- function(soildata,
     }
   }
 
-  out <- terra::writeStop(out)
-  terra::readStop(soildata)
+  # replace NaN with NA_real_ (note: not compatible with calling writeStop() on.exit())
+  # out <- terra::classify(out, cbind(NaN, NA_real_)) #terra::values(out)[is.nan(terra::values(out))] <- NA_real_
 
-  # replace NaN with NA_real_
-  terra::values(out)[is.nan(terra::values(out))] <- NA_real_
   out
 }
 

diff --git a/man/install_rosetta.Rd b/man/install_rosetta.Rd
diff --git a/man/run_rosetta.Rd b/man/run_rosetta.Rd
diff --git a/tests/testthat/test-install_rosetta.R b/tests/testthat/test-install_rosetta.R
@@ -8,7 +8,7 @@ test_that("rosetta-soil module can be installed", {
   cat("\n\n")
 
   # use pip (if available) or use ArcGIS Pro Conda environment (if available)
-  res <- try(install_rosetta(pip = TRUE, arcpy_path = "C:/Program Files/ArcGIS/Pro/bin/Python/"))
+  res <- try(install_rosetta(system = TRUE, arcpy_path = "C:/Program Files/ArcGIS/Pro/bin/Python/"))
 
   if (inherits(res, 'try-error'))
     skip("Unable to install")

diff --git a/tests/testthat/test-rosetta.R b/tests/testthat/test-rosetta.R
@@ -50,11 +50,11 @@ test_that("run on SSURGO data", {
   resrose <- rosettaPTF::run_rosetta(soildata[,varnames])
   resrose$mukey <- resprop$mukey
 
-  rdf <- data.frame(mukey = as.numeric(terra::cats(res)[[1]][["category"]]))
+  rdf <- data.frame(mukey = as.numeric(terra::cats(res)[[1]][[1]]))
   rdf2 <- merge(rdf, resprop, by = "mukey", all.x = TRUE, sort = FALSE, incomparables = NA)
   rdf3 <- merge(rdf2, resrose, by = "mukey", all.x = TRUE, sort = FALSE, incomparables = NA)
   rdf3 <- rdf3[match(rdf3[["mukey"]], umukeys, incomparables = NA),][1:nrow(resprop),]
-  levels(res) <- data.frame(ID = 1:nrow(rdf3), rdf3)
+  terra::set.cats(res, 1, data.frame(ID = 1:nrow(rdf3), rdf3))
 
   # @params x a SpatRaster with `levels()` set such that `cats(x)[[1]]` defines the mapping between raster values and one or more new attributes
   # @params columns character vector of column names to map from the categorical levels to raster values