diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..3777d4f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +data-raw/ diff --git a/.gitignore b/.gitignore index b46a5f2..9ebfcc7 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ .RData .Ruserdata metastore_db/ +inst/python/.ipynb_checkpoints/ .DS_Store vignettes/metastore_db/ vignettes/*.RDS diff --git a/DESCRIPTION b/DESCRIPTION index 6ccf1f4..ecdbd78 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,19 @@ Package: analysisPipelines Type: Package -Title: Compose interoperable analysis pipelines, and put them into production -Version: 0.0.0.9003 +Title: This Package Allows Data Scientists to Compose Interoperable Analysis Pipelines, and Put Them into Production +Version: 0.0.0.9004 Authors@R: c( - person("Naren","Srinivasan", email = "Naren.Srinivasan@mu-sigma.com", role = c("aut")), + person("Naren","Srinivasan", email = "naren1991@gmail.com", role = c("aut")), person("Zubin Dowlaty","", email = "Zubin.Dowlaty@mu-sigma.com", role = c("ctb")), person("Sanjay","", email = "Sanjay@mu-sigma.com", role = c("ctb")), person("Neeratyoy","Mallik", email = "Neeratyoy.Mallik@mu-sigma.com", role = c("ctb")), person("Anoop S","", email = "Anoop.S@mu-sigma.com", role = c("ctb")), person("Mu Sigma, Inc.", email = "ird.experiencelab@mu-sigma.com", role = c("cre")) ) -Description: The package aims at enabling data scientists to compose pipelines of analysis which consist of data manipulation, exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python. Credits to Mu Sigma for supporting the development of the package. -Depends: R (>= 3.4.0), tibble, magrittr, data.table, pipeR, devtools -Imports: ggplot2, dplyr, futile.logger, RCurl, proto -Suggests: plotly, knitr, rmarkdown, SparkR, parallel, visNetwork, rjson, DT, shiny -Remotes: github::cran/SparkR +Description: This package aims at enabling data scientists to compose pipelines of analysis which consist of data manipulation, exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python. Credits to Mu Sigma for supporting the development of the package. 
+Depends: R (>= 3.4.0), magrittr, pipeR, methods +Imports: ggplot2, dplyr, futile.logger, RCurl, proto, rlang, purrr, devtools +Suggests: plotly, knitr, rmarkdown, parallel, visNetwork, rjson, DT, shiny, R.devices, corrplot, reticulate Encoding: UTF-8 License: Apache License 2.0 LazyLoad: yes @@ -28,5 +27,6 @@ Collate: 'core-functions-meta-pipelines.R' 'core-streaming-functions.R' 'r-batch-eda-utilities.R' + 'r-helper-utilites-python.R' 'spark-structured-streaming-utilities.R' 'zzz.R' diff --git a/NAMESPACE b/NAMESPACE index 8bbd076..9608cec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,13 +15,18 @@ export(exportAsMetaPipeline) export(generateReport) export(genericPipelineException) export(getDatatype) +export(getFeaturesForPyClassification) export(getInput) export(getLoggerDetails) export(getOutputById) export(getPipeline) export(getPipelinePrototype) export(getRegistry) +export(getResponse) +export(getTargetForPyClassification) +export(getTerm) export(ignoreCols) +export(isDependencyParam) export(loadMetaPipeline) export(loadPipeline) export(loadPredefinedFunctionRegistry) @@ -34,6 +39,7 @@ export(savePipeline) export(saveRegistry) export(setInput) export(setLoggerDetails) +export(setPythonEnvir) export(sparkRSessionCreateIfNotPresent) export(univarCatDistPlots) export(updateObject) @@ -44,5 +50,19 @@ exportClasses(MetaAnalysisPipeline) exportClasses(StreamingAnalysisPipeline) exportMethods(checkSchemaMatch) exportMethods(generateOutput) -exportMethods(initialize) -import(SparkR) +importFrom(graphics,image) +importFrom(magrittr,"%>%") +importFrom(methods,getClass) +importFrom(methods,new) +importFrom(methods,removeMethod) +importFrom(methods,setClassUnion) +importFrom(methods,setGeneric) +importFrom(methods,setOldClass) +importFrom(pipeR,"%>>%") +importFrom(rlang,.data) +importFrom(stats,as.formula) +importFrom(stats,lm) +importFrom(stats,reorder) +importFrom(stats,terms) +importFrom(utils,installed.packages) +importFrom(utils,read.csv) diff --git a/R/analysisPipelines_package.R b/R/analysisPipelines_package.R index 49eb1d6..ec868c5 100644 --- a/R/analysisPipelines_package.R +++ b/R/analysisPipelines_package.R @@ -3,6 +3,9 @@ #' The package aims at enabling data scientists to compose pipelines of analysis which consist of data manipulation, #' exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools #' of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python. 
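[Annotation] With 'reticulate' moved into Suggests and Python helpers such as setPythonEnvir() and getFeaturesForPyClassification() now exported in the NAMESPACE above, the batch executor later in this diff moves data between engines with reticulate::r_to_py() and reticulate::py_to_r(). A minimal sketch of that round trip, assuming reticulate and a Python interpreter with pandas are available; the mtcars dataset is only an illustration:

```r
library(reticulate)

if (py_module_available("pandas")) {
  r_df   <- mtcars                  # any R data.frame
  pd_df  <- r_to_py(r_df)           # R data.frame -> pandas DataFrame
  class(pd_df)[1]                   # "pandas.core.frame.DataFrame", the class the registry dispatches on
  r_back <- py_to_r(pd_df)          # pandas DataFrame -> back to an R data.frame
  all.equal(dim(r_back), dim(r_df)) # the round trip preserves the shape
}
```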
+#' +#' Important Note - This package uses 'SparkR' to interact with Spark and automatically installs it if not present +#' from a Github repo as 'SparkR' is not distrubuted on CRAN #' @docType package #' @name analysisPipelines NULL diff --git a/R/core-functions-batch.R b/R/core-functions-batch.R index 1639c0e..0075a74 100644 --- a/R/core-functions-batch.R +++ b/R/core-functions-batch.R @@ -5,14 +5,16 @@ # Description: An R package version - Currently supports R and Spark ################################################################################################## -#' @name AnalysisPipeline +#' @importFrom magrittr %>% +NULL + +#' @name AnalysisPipeline-class +#' @rdname AnalysisPipeline-class #' @title Class for constructing Analysis Pipelines for batch/ one-time analyeses #' @details Inherits the base class \link{BaseAnalysisPipeline} class which holds the metadata including the registry of available functions, #' the data on which the pipeline is to be applied, as well as the pipeline itself #' @details Additionally, this class is meant to be used for batch/ one-time processing. Contains additional slots to #' hold the data frame to be used for the pipeline and associated schema -#' @details More details of how an object of this class should be initialized is provided in the -#' constructor - \link{initializeAnalysisPipeline} #' @slot input The input dataset on which analysis is to be performed #' @slot originalSchemaDf Empty data frame representing the schema of the input #' @family Package core functions for batch/one-time analyses @@ -26,18 +28,13 @@ AnalysisPipeline <- setClass("AnalysisPipeline", originalSchemaDf = "data.frame" ), contains = "BaseAnalysisPipeline") -#' @name initializeAnalysisPipeline +#' AnalysisPipeline constructor +#' @docType methods +#' @rdname initialize-methods #' @title Constructor for the \link{AnalysisPipeline} class -#' @param .Object The \code{AnalysisPipeline} object -#' @param input The data frame on which operations need to be performed -#' @param filePath File path for a .csv file to directly read in the dataset from -#' @details -#' Either one of \code{input} or \code{filePath} need to be provided i.e. either the -#' data frame or the file path to a csv file -#' @return an object of class \code{AnalysisPipeline}, initialized with the input data frame provided #' @include core-functions.R #' @family Package core functions for batch/one-time analyses -#' @export +#' @keywords internal setMethod( f = "initialize", @@ -202,7 +199,7 @@ checkSchema <- function(dfOld, dfNew){ ## Check engine setup object %>>% assessEngineSetUp -> engineAssessment - engineAssessment %>>% dplyr::filter(requiredForPipeline == T) -> requiredEngines + engineAssessment %>>% dplyr::filter(.data$requiredForPipeline == T) -> requiredEngines if(!all(requiredEngines$isSetup)){ m <- paste0("All engines required for the pipelines have not been configured. ", @@ -243,9 +240,9 @@ checkSchema <- function(dfOld, dfNew){ # Set Input data and set type to engine with max. 
number of operations - pipelineRegistryOrderingJoin %>>% dplyr::group_by(engine) %>>% dplyr::summarise(numOp = dplyr::n()) -> engineCount + pipelineRegistryOrderingJoin %>>% dplyr::group_by(.data$engine) %>>% dplyr::summarise(numOp = dplyr::n()) -> engineCount - engineCount %>>% dplyr::filter(numOp == max(numOp)) -> maxEngine + engineCount %>>% dplyr::filter(.data$numOp == max(.data$numOp)) -> maxEngine if(nrow(maxEngine) == 1){ @@ -271,27 +268,27 @@ checkSchema <- function(dfOld, dfNew){ name='logger.func') }else if(maxEngineName == "python"){ - # TODO: convert to Python data frame + startTypeConv <- Sys.time() + + inputToExecute <- reticulate::r_to_py(object@input) + + endTypeConv <- Sys.time() + typeConvTime <- endTypeConv - startTypeConv + futile.logger::flog.info(paste("|| Initial Type conversion from R dataframe to Pandas DataFrame,", + "as maximum number of operations are on the Python engine.", + "Time taked : %s seconds ||"), + typeConvTime, + name='logger.func') } batches <- unique(pipelineRegistryOrderingJoin$level) numBatches <- max(as.numeric(batches)) - # no_cores <- parallel::detectCores() - 1 - # cl <- parallel::makeCluster(no_cores) - # parallel::clusterExport(cl = cl, envir = .GlobalEnv, varlist = ls()) - # .executeBatch <- function(y, functionsInBatch){ - # - # return(NULL) - # } - - - # Iterate across batches lapply(batches, function(x, object, pipelineRegistryOrderingJoin, outputCache){ startBatch <- Sys.time() - pipelineRegistryOrderingJoin %>>% dplyr::filter(level == x) -> functionsInBatch + pipelineRegistryOrderingJoin %>>% dplyr::filter(.data$level == x) -> functionsInBatch futile.logger::flog.info("|| Executing Batch Number : %s/%s containing functions '%s' ||", x, numBatches, paste(functionsInBatch$operation, collapse = ", "), name='logger.batch') @@ -299,12 +296,12 @@ checkSchema <- function(dfOld, dfNew){ # Garbage cleaning in the cache - Previous batch outputs prevBatch <- as.character(as.numeric(x) - 1) - pipelineRegistryOrderingJoin %>>% dplyr::filter(level == prevBatch) -> funcInPrevBatch + pipelineRegistryOrderingJoin %>>% dplyr::filter(.data$level == prevBatch) -> funcInPrevBatch if(nrow(funcInPrevBatch)>0){ possiblePrevCacheOutputNames <- paste0("f", funcInPrevBatch$id, ".out") previousCachedOutputNames <- intersect(possiblePrevCacheOutputNames, ls(outputCache)) - pipelineRegistryOrderingJoin %>>% dplyr::filter(storeOutput == TRUE) -> requiredOutputs + pipelineRegistryOrderingJoin %>>% dplyr::filter(.data$storeOutput == TRUE) -> requiredOutputs requiredOutputs <- paste0("f", requiredOutputs$id, ".out") unrequiredCachedOutputNames <- setdiff(possiblePrevCacheOutputNames, @@ -327,32 +324,59 @@ checkSchema <- function(dfOld, dfNew){ ## Replace formula parameters with actual outputs startFunc <- Sys.time() - functionsInBatch %>>% dplyr::filter(id == y) %>>% as.list -> funcDetails + functionsInBatch %>>% dplyr::filter(.data$id == y) %>>% as.list -> funcDetails futile.logger::flog.info("|| Function ID '%s' named '%s' STARTED on the '%s' engine ||", funcDetails$id, funcDetails$operation, funcDetails$engine, name='logger.func') + # Set parameters - if(funcDetails$isDataFunction){ - if(funcDetails$outAsIn && funcDetails$id != "1"){ - dataOpFn <- paste0("f", as.numeric(funcDetails$id) - 1) - actualDataObjectName <- paste0(dataOpFn, ".out") - inputToExecute <- get(actualDataObjectName, envir = outputCache) + params <- unlist(funcDetails$parameters, recursive = F) + dep <- unique(unlist(funcDetails$dependencies, recursive = F)) + depTerms <- paste0("f", dep) + + 
params <- lapply(params, function(p, depTerms, outputCache){ + if(class(p) == "formula"){ + isDepParam <- analysisPipelines::isDependencyParam(p) + if(isDepParam){ + formulaTerm <- analysisPipelines::getTerm(p) + argName <- analysisPipelines::getResponse(p) + if(formulaTerm %in% depTerms){ + ## Formula of previous function in pipeline + actualParamObjectName <- paste0(formulaTerm, ".out") + p <- get(actualParamObjectName, envir = outputCache) + } + } } - #Check engine - ###TODO: Python to be added + return(p) + }, depTerms, outputCache) + + if(funcDetails$isDataFunction){ + # Not passed as a formula + if(any(class(params[[1]]) == "rlang_fake_data_pronoun")){ + # Checking for outAsIn + if(funcDetails$outAsIn && funcDetails$id != "1"){ + dataOpFn <- paste0("f", as.numeric(funcDetails$id) - 1) + actualDataObjectName <- paste0(dataOpFn, ".out") + inputToExecute <- get(actualDataObjectName, envir = outputCache) + } + }else if(any(class(params[[1]]) %in% c("pandas.core.frame.DataFrame", "data.frame","SparkDataFrame"))){ + inputToExecute <- params[[1]] + } + } currEngine <- funcDetails$engine - prevEngine <- ifelse(class(inputToExecute) == "SparkDataFrame", 'spark', - ifelse(class(inputToExecute) == "data.frame" || class(inputToExecute) == "tibble", - 'r', 'python')) + prevEngine <- ifelse(any(class(inputToExecute) == "SparkDataFrame"), 'spark', + ifelse(any(class(inputToExecute) == "data.frame") || + any(class(inputToExecute) == "tibble"), + 'r', ifelse(any(class(inputToExecute) == "pandas.core.frame.DataFrame"), + 'python', + 'r'))) #Check engine - ###TODO: Python to be added - if(prevEngine != currEngine){ if(prevEngine == 'spark'){ @@ -367,7 +391,15 @@ checkSchema <- function(dfOld, dfNew){ typeConvTime, name='logger.func') }else if(currEngine == 'python'){ - #TODO: python + startTypeConv <- Sys.time() + + inputToExecute <- SparkR::as.data.frame(inputToExecute) %>>% reticulate::r_to_py() + + endTypeConv <- Sys.time() + typeConvTime <- endTypeConv - startTypeConv + futile.logger::flog.info("|| Type conversion from Spark DataFrame to Pandas DataFrame took %s seconds ||", + typeConvTime, + name='logger.func') } }else if(prevEngine == 'r'){ @@ -381,65 +413,50 @@ checkSchema <- function(dfOld, dfNew){ futile.logger::flog.info("|| Type conversion from R dataframe to Spark DataFrame took %s seconds ||", typeConvTime, name='logger.func') - }else if(prevEngine == 'python'){ - # TODO: python + }else if(currEngine == 'python'){ + startTypeConv <- Sys.time() + + inputToExecute <- reticulate::r_to_py(inputToExecute) + + endTypeConv <- Sys.time() + typeConvTime <- endTypeConv - startTypeConv + futile.logger::flog.info("|| Type conversion from Pandas DataFrame to R dataframe took %s seconds ||", + typeConvTime, + name='logger.func') } }else if(prevEngine == 'python'){ - #TODO: python - } - } - } + if(currEngine == "spark"){ + startTypeConv <- Sys.time() + inputToExecute <- reticulate::py_to_r(inputToExecute) %>>% SparkR::as.DataFrame() - # Set parameters - - params <- unlist(funcDetails$parameters, recursive = F) - dep <- unique(unlist(funcDetails$dependencies, recursive = F)) - depTerms <- paste0("f", dep) + endTypeConv <- Sys.time() + typeConvTime <- endTypeConv - startTypeConv + futile.logger::flog.info("|| Type conversion from Pandas DataFrame to Spark DataFrame took %s seconds ||", + typeConvTime, + name='logger.func') + }else if(currEngine == 'r'){ + startTypeConv <- Sys.time() - params <- lapply(params, function(p, depTerms, outputCache){ - if(class(p) == "formula"){ - isDepParam <- 
analysisPipelines:::isDependencyParam(p) - if(isDepParam){ - formulaTerm <- analysisPipelines:::getTerm(p) - argName <- analysisPipelines:::getResponse(p) - if(formulaTerm %in% depTerms){ + inputToExecute <- reticulate::py_to_r(inputToExecute) - ## Formula of previous function in pipeline - actualParamObjectName <- paste0(formulaTerm, ".out") - p <- get(actualParamObjectName, envir = outputCache) + endTypeConv <- Sys.time() + typeConvTime <- endTypeConv - startTypeConv + futile.logger::flog.info("|| Type conversion from Pandas DataFrame to R dataframe took %s seconds ||", + typeConvTime, + name='logger.func') } } } - return(p) - }, depTerms, outputCache) + #Setting converted dataframe for first parameter of data function + if(funcDetails$isDataFunction){ + inputToExecute -> params[[1]] + } #Call - - #Assign as named parameters - #Get names of params - # paramNames <- lapply(params, function(p){ - # return(names(p)) - # }) %>>% unlist - # params <-lapply(params, function(p){ - # names(p) <- NULL - # return(p) - # }) - # names(params) <- paramNames args <- params - if(funcDetails$isDataFunction){ - formals(funcDetails$operation) %>>% as.list %>>% names %>>% dplyr::first() -> firstArgName - firstArg <- list(inputToExecute) - names(firstArg) <- firstArgName - args <- append(firstArg, params) - } - # }else{ - # firstParam <- params[1] - # names(firstParam) <- "object" - # args <- append(firstParam, params[-1]) - # } output <- tryCatch({do.call(what = funcDetails$operation, args = args)}, error = function(e){ @@ -476,14 +493,15 @@ checkSchema <- function(dfOld, dfNew){ futile.logger::flog.info("|| Batch Number %s/%s COMPLETE. Time taken : %s seconds ||", x, numBatches, batchExecTime, name='logger.batch') - }, object, pipelineRegistryOrderingJoin, outputCache) + }, object, + pipelineRegistryOrderingJoin, outputCache) futile.logger::flog.info("|| Performing final garbage cleaning and collection of outputs ||", name='logger.pipeline') #Final garbage cleaning - pipelineRegistryOrderingJoin %>>% dplyr::filter(storeOutput == TRUE) -> requiredOutputs + pipelineRegistryOrderingJoin %>>% dplyr::filter(.data$storeOutput == TRUE) -> requiredOutputs requiredOutputs <- paste0("f", requiredOutputs$id, ".out") unrequiredCachedOutputNames <- setdiff(ls(outputCache), requiredOutputs) @@ -497,12 +515,6 @@ checkSchema <- function(dfOld, dfNew){ object@output <- mget(ls(outputCache), envir = outputCache) rm(list = ls(outputCache), envir = outputCache) - # stopCluster(cl) - # object@output <- object@pipelineExecutor$cache - # - # #Clear cache - # object@pipelineExecutor$cache <- NULL - endPipelineExecution <- Sys.time() executionTime <- endPipelineExecution - startPipelineExecution @@ -528,6 +540,7 @@ setMethod( #' @name generateReport +#' @rdname generateReport #' @title Generate a HTML report from an \code{AnalysisPipeline} object #' @details #' The sequence of operations stored in the \code{AnalysisPipeline} object are run, outputs generated, @@ -546,6 +559,7 @@ setGeneric( } ) +#' @rdname generateReport setMethod( f = "generateReport", signature = c("AnalysisPipeline", "character"), @@ -560,7 +574,8 @@ setMethod( } # object <- updateObject(object, "emptyRow", "emptyRow",list("emptyRow"),F) - opEngineDetails <- object@pipeline %>>% dplyr::filter(storeOutput == T) + opEngineDetails <- dplyr::left_join(object@pipeline %>>% dplyr::filter(.data$storeOutput == T), + getRegistry(), by = c("operation" = "functionName")) if(!all(unique(opEngineDetails$engine) == 'r')){ futile.logger::flog.warn(paste("|| Pipeline contains 
engines other than R.", "Will attempt coercing of outputs for rendinring through 'rmarkdown'.", diff --git a/R/core-functions-meta-pipelines.R b/R/core-functions-meta-pipelines.R index a2f49e1..eb2cfb2 100644 --- a/R/core-functions-meta-pipelines.R +++ b/R/core-functions-meta-pipelines.R @@ -8,7 +8,8 @@ # proto' is an S3 class whic is used as a slot, and hence it is defined in the environment setOldClass("proto") -#' @name MetaAnalysisPipeline +#' @name MetaAnalysisPipeline-class +#' @rdname MetaAnalysisPipeline-class #' @title Class for creating and working with meta-pipelines #' @details This class works with the \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} classes, and allows the #' pipeline to be exported as meta-pipeline. A meta-pipeline is a construct, where the input dataset as well as the arguments @@ -28,14 +29,12 @@ MetaAnalysisPipeline <- setClass("MetaAnalysisPipeline", type = "character" )) -#' @name initializeMetaAnalysisPipeline +#' MetaAnalysisPipeline constructor +#' @docType methods +#' @rdname initialize-methods #' @title This is the constructor for the \link{MetaAnalysisPipeline} class -#' @param .Object The \code{MetaAnalysisPipeline} object -#' @param type A string defining whether it is a batch or streaming pipeline. Acceptable values are 'batch' & 'streaming' -#' @details This method is a constructor for the \code{MetaAnalysisPipeline} class -#' @return an object of class \code{MetaAnalysisPipeline}" #' @family Package core functions -#' @export +#' @keywords internal setMethod( f = "initialize", @@ -43,7 +42,7 @@ setMethod( definition = function(.Object, type = "batch") { tryCatch({ - .Object@pipeline <- tibble( + .Object@pipeline <- dplyr::tibble( id = character(), operation = character(), heading = character(), @@ -63,10 +62,11 @@ setMethod( ) #' @name exportAsMetaPipeline +#' @rdname exportAsMetaPipeline #' @title Method to export a meta-pipeline #' @details This method exports a Pipeline object i.e. of the classes \code{AnalysisPipeline} or #' \code{StreamingAnalysisPipeline} as a meta-pipeline -#' @param .Object A Pipeline object +#' @param object A Pipeline object #' @return an object of class "\code{MetaAnalysisPipeline}" #' @family Package core functions #' @export @@ -101,9 +101,9 @@ setGeneric( purrr::imap(parameters, function(p, np){ # n <- names(p) if(class(p) == "formula"){ - if(analysisPipelines:::isDependencyParam(p)){ - n <- analysisPipelines:::getResponse(p) - p <- paste0("~", analysisPipelines:::getTerm(p)) %>>% as.formula + if(analysisPipelines::isDependencyParam(p)){ + n <- analysisPipelines::getResponse(p) + p <- paste0("~", analysisPipelines::getTerm(p)) %>>% as.formula() } } assign(x = paste0(np), @@ -118,6 +118,7 @@ setGeneric( return(metaPipeline) } +#' @rdname exportAsMetaPipeline setMethod( f = "exportAsMetaPipeline", signature = "BaseAnalysisPipeline", @@ -126,8 +127,9 @@ setMethod( #' @name getPipelinePrototype +#' @rdname getPipelinePrototype #' @title Obtain the prototype of the functions in the pipeline -#' @param object A \code{MetaAnalysisPipeline} object +#' @param metaPipelineObj A \code{MetaAnalysisPipeline} object #' @details This method returns the prototype of functions in the pipeline and their respective arguments as \code{proto} object. #' Functions in the pipeline can be accessed easily by using the '$' operator, and within the functions the arguments can #' be accessed the same way. These can be accessed and set to new values. 
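[Annotation] The dependency mechanics used in exportAsMetaPipeline() and in the executors in this diff store dependent arguments as formulas of the form argName ~ f<id>; the exported helpers getResponse(), getTerm() and isDependencyParam() (documented further down in this diff) unpack them. A base-R sketch of what the first two extract; the names 'dataset' and 'f1' are illustrative:

```r
# A dependency parameter: argument 'dataset' should receive the output of pipeline function id 1
dep <- dataset ~ f1

dimnames(attr(stats::terms(dep), "factors"))[[1]][1]  # "dataset" (what getResponse(dep) returns)
attr(stats::terms(dep), "term.labels")                # "f1"      (what getTerm(dep) returns)
```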
This pipeline prototype can then be passed to the @@ -146,6 +148,7 @@ setGeneric( return(metaPipelineObj@pipelinePrototype) } +#' @rdname getPipelinePrototype setMethod( f = "getPipelinePrototype", signature = "MetaAnalysisPipeline", @@ -154,6 +157,7 @@ setMethod( #' @name createPipelineInstance +#' @rdname createPipelineInstance #' @title Create a Pipeline object from a meta-pipeline #' @param metaPipelineObj A \code{MetaAnalysisPipeline} object #' @param newParams Either a nested named list containing all the functions in the pipeline, their arguments and @@ -173,11 +177,6 @@ setGeneric( .createPipelineInstance <- function(metaPipelineObj, newParams){ - ## deal with formulas - ## get named arguments when creating pipeline - ## provide a template to fill - ## error handling - if(metaPipelineObj@type == "batch"){ pipelineObj <- AnalysisPipeline() }else if(metaPipelineObj@type == "streaming"){ @@ -188,39 +187,46 @@ setGeneric( newParamList <- newParams if(any(class(newParams) == "proto")){ - names(newParams) %>>% grep(x = ., pattern = "^[.]", value = T, invert = T ) -> fnNames + names(newParams) %>>% grep(pattern = "^[.]", value = T, invert = T ) -> fnNames newParamList <- purrr::imap(fnNames, function(fn, nfn){ fnEnvir <- get(fn, envir = newParams) - fnEnvir %>>% names %>>% grep(x = ., pattern = "^[.]", invert = T, value = T ) -> argNames + fnEnvir %>>% names %>>% grep(pattern = "^[.]", invert = T, value = T ) -> argNames params <- mget(x = argNames, envir = newParams[[fn]]) params <- purrr::imap(params, function(p, np){ if(class(p) == "formula"){ - if(analysisPipelines:::isDependencyParam(p)){ - p <- paste(np, "~", analysisPipelines:::getTerm(p)) %>>% as.formula - # names(p) <- NULL + if(analysisPipelines::isDependencyParam(p)){ + p <- paste(np, "~", analysisPipelines::getTerm(p)) %>>% as.formula } #TODO: Deal with normal formula parameters - } #else{ - # names(p) <- np - # } + } return(p) }) - # names(params) <- NULL return(params) }) names(newParamList) <- fnNames } + # Match pipeline table order tblOrder <- match(pipelineObj@pipeline$operation, names(newParamList)) newParamList <- newParamList[tblOrder] - names(newParamList) <- NULL + #Match argument list orders + newParamList <- purrr::imap(newParamList, function(params, fnName){ + pipelineParams <- pipelineObj@pipeline %>>% dplyr::filter(.data$operation == fnName) + pipelineParams <- unlist(pipelineParams$parameters, recursive = F) + argOrder <- match(names(pipelineParams), names(params)) + params <- params[argOrder] + return(params) + }) + + names(newParamList) <- NULL pipelineObj@pipeline %>>% dplyr::mutate(parameters = newParamList) -> pipelineObj@pipeline return(pipelineObj) } +#' @rdname createPipelineInstance setMethod( f = "createPipelineInstance", signature = "MetaAnalysisPipeline", @@ -240,6 +246,7 @@ setMethod( return(vis) } +#' @rdname visualizePipeline setMethod( f = "visualizePipeline", signature = "MetaAnalysisPipeline", @@ -264,6 +271,7 @@ setMethod( }) } +#' @rdname savePipeline setMethod( f = "savePipeline", signature = "MetaAnalysisPipeline", @@ -282,15 +290,16 @@ setMethod( #' @export loadMetaPipeline <- function(path){ tryCatch({ - + object <- NULL futile.logger::flog.warn("|| The existing registry will be overwritten with the registry from the RDS file ||", name = "logger.base") load(path, envir = environment()) functionNames = setdiff(ls(envir = environment()), c("path", "object", ".registry")) - .setRegistry(.registry) + eval(parse(paste0(".setRegistry(.registry)"))) lapply(functionNames, function(x){ - 
assign(x, get(x, environment()), globalenv()) + assign(x, get(x, environment()), globEnv) }) + return(object) },error = function(e){ futile.logger::flog.error(e, name = "logger.base") diff --git a/R/core-functions.R b/R/core-functions.R index 64abb41..40ff9f0 100644 --- a/R/core-functions.R +++ b/R/core-functions.R @@ -6,11 +6,32 @@ # Spark DataFrames including Structured Streaming ################################################################################################## +#' @importFrom pipeR %>>% +#' @importFrom rlang .data +#' @importFrom graphics image +#' @importFrom methods getClass new removeMethod setClassUnion setGeneric setOldClass +#' @importFrom stats as.formula lm reorder terms +#' @importFrom utils installed.packages read.csv +NULL + +pos <- 1 +globEnv = as.environment(pos) + +try({ + if(!("SparkR" %in% installed.packages())){ + futile.logger::flog.warn(paste0("|| The SparkR package is not installed. Please ensure the right SparkR version ", + "compatible with the Spark distribution you plan to use is installed. You can use the 'devtools' ", + "package to do the same using 'devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg')' ||"), + name = "logger.base") + } +}, silent = TRUE) + + #' This section defines the environment which the package uses for maintaining the registry and an outputCache #' @keywords internal .analysisPipelinesEnvir <- new.env(parent = emptyenv()) -.analysisPipelinesEnvir$.functionRegistry <- tibble( +.analysisPipelinesEnvir$.functionRegistry <- dplyr::tibble( functionName = character(), heading = character(), engine = character(), @@ -21,19 +42,18 @@ ) .analysisPipelinesEnvir$.outputCache <- new.env() -#' @name BaseAnalysisPipeline +#' @name BaseAnalysisPipeline-class +#' @rdname BaseAnalysisPipeline-class #' @title Base class for \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} objects #' @details The class which holds the metadata including the registry of available functions, #' the data on which the pipeline is to be applied, as well as the pipeline itself, and serves #' as the base class for various types of Pipeline objects such as Batch and Streaming. #' @details This base class which contains the slots related to the registry, pipeline and output can be extended #' to create custom class for specific scenarios if required. 
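[Annotation] Since SparkR is no longer pulled in through Depends/Remotes, the warning added in this hunk asks users to install a SparkR build matching their Spark distribution themselves. A sketch of doing that; the version tag is a placeholder:

```r
if (!("SparkR" %in% rownames(utils::installed.packages()))) {
  # use the tag matching your Spark distribution, e.g. v2.3.0
  devtools::install_github("apache/spark@v2.3.0", subdir = "R/pkg")
}
```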
-#' @details The details of the constructor for this class can be found at \link{initializeBaseAnalysisPipeline} #' @details In the documentation, objects of classes which are subclasses of this class are referred to as 'Pipeline' objects #' @slot pipeline A tibble which holds functions to be called #' @slot pipelineExecutor A list containing details of the execution, such as topological ordering of functions to be executed, #' dependency map of functions, as well as logger configuration -#' @slot registry A tibble which holds all the registered functions #' @slot output A list which holds all the functions output #' @family Package core functions #' @exportClass BaseAnalysisPipeline @@ -42,21 +62,16 @@ BaseAnalysisPipeline <- setClass("BaseAnalysisPipeline", slots = c( pipeline = "tbl", - # registry = "tbl", pipelineExecutor = "list", output = "list" )) -#' @name initializeBaseAnalysisPipeline +#' BaseAnalysisPipeline constructor +#' @docType methods +#' @rdname initialize-methods #' @title This is the constructor for the \link{BaseAnalysisPipeline} class -#' @param .Object The \code{BaseAnalysisPipeline} object -#' @param loggerDetails Provide logger details -#' @details -#' This is a constructor function for the base class for various types of Analysis Pipelines. This method gets -#' internally called by \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} constructors. -#' @return an object of class \code{BaseAnalysisPipeline}" #' @family Package core functions -#' @export +#' @keywords internal setMethod( f = "initialize", @@ -64,7 +79,7 @@ setMethod( definition = function(.Object) { tryCatch({ - .Object@pipeline <- tibble( + .Object@pipeline <- dplyr::tibble( id = character(), operation = character(), heading = character(), @@ -74,9 +89,9 @@ setMethod( ) .Object@pipelineExecutor <- list( - topologicalOrdering = tibble(id = character(), + topologicalOrdering = dplyr::tibble(id = character(), level = character()), - dependencyLinks = tibble(from = character(), + dependencyLinks = dplyr::tibble(from = character(), to = character()), loggerDetails <- list() ) @@ -105,6 +120,7 @@ setMethod( #' @param functionType type of function - 'batch' for \code{AnalysisPipeline} objects, 'streaming' for \code{StreamingAnalysisPipeline} objects #' @param engine specifies which engine the function is to be run on. Available engines include "r", "spark", and "python" #' @param isDataFunction logical parameter which defines whether the function to be registered operates on data i.e. the first parameter is a dataframe +#' @param exceptionFunction R object corresponding to the exception function #' @param firstArgClass character string with the class of the first argument to the function, if it is a non-data function #' @param loadPipeline logical parameter to see if function is being used in loadPipeline or not. This is for internal working #' @param userDefined logical parameter defining whether the function is user defined. 
By default, set to true @@ -160,7 +176,10 @@ registerFunction <- function( functionName, heading = "", dataFrameClass <- "data.frame" if(engine == "spark" || engine == 'spark-structured-streaming'){ - dataFrameClass <- "SparkDataFrame" + # dataFrameClass <- "SparkDataFrame" + dataFrameClass <- "ANY" + }else if(engine == 'python'){ + dataFrameClass <- "pandas.core.frame.DataFrame" } @@ -219,36 +238,69 @@ registerFunction <- function( functionName, heading = "", newArgs <- alist() f <- get(functionName, .GlobalEnv) - originalArgs <- formals(f) %>>% as.list - firstArg <- names(originalArgs)[1] - - - if(isDataFunction){ - # originalArgs <- originalArgs[-1] - newArgs <- originalArgs - firstArgClass <- dataFrameClass - paramsToBeParsed <- paste0(originalArgs[-1] %>>% names, collapse = ", ") - genericSignature <- names(originalArgs)[1] - objectName <- names(originalArgs)[1] - - packageMethodSignature <- paste0('"', childClass, '"') - origMethodSignature <-paste0('"', dataFrameClass, '"') + origF <- f + originalArgs <- list() + + argEnv <- NULL + #Checking for direct python functions + if(any(class(f) == "python.builtin.function")){ + inspect <- reticulate::import("inspect") + argEnv <- inspect$getargspec(f) + originalArgs <- argEnv$args %>>% lapply(function(x){ + a <- eval(parse(text = paste0("alist(", x, " = )"))) + return(a) + }) %>>% unlist %>>% as.list }else{ + originalArgs <- formals(f) %>>% as.list + } + firstArg <- names(originalArgs)[1] + # originalArgs <- formals(f) %>>% as.list + # firstArg <- names(originalArgs)[1] + + + # if(isDataFunction){ + # # originalArgs <- originalArgs[-1] + # newArgs <- originalArgs + # firstArgClass <- dataFrameClass + # paramsToBeParsed <- paste0(originalArgs[-1] %>>% names, collapse = ", ") + # genericSignature <- names(originalArgs)[1] + # objectName <- names(originalArgs)[1] + # + # packageMethodSignature <- paste0('"', childClass, '"') + # origMethodSignature <-paste0('"', dataFrameClass, '"') + # }else{ + if(isDataFunction){ + firstArgClass <- dataFrameClass + originalArgs[[1]] <- rlang::.data + } newArgs <- append(objectArg, originalArgs) paramsToBeParsed <- paste0(originalArgs %>>% names, collapse = ", ") genericSignature <- c("object", names(originalArgs)[1]) - formulaUnionClassName <- paste0("formulaOR", firstArgClass) - setClassUnion(name = formulaUnionClassName, - c("formula", firstArgClass), - where = .GlobalEnv) - packageMethodSignature <- c(childClass, formulaUnionClassName) + ## Adding missing signature to method + if(isDataFunction){ + firstArgClassName <- paste0("formulaOR", firstArgClass, "ORmissing") + setClassUnion(name = firstArgClassName, + c("formula", firstArgClass, "missing"), + where = .GlobalEnv) + }else{ + firstArgClassName <- paste0("formulaOR", firstArgClass) + setClassUnion(name = firstArgClassName, + c("formula", firstArgClass), + where = .GlobalEnv) + } + + # formulaMissingUnionClassName <- paste0("formulaOR", firstArgClass, "ORmissing") + # setClassUnion(name = formulaUnionClassName, + # c("formula", firstArgClass, "missing"), + # where = .GlobalEnv) + packageMethodSignature <- c(childClass, firstArgClassName) origMethodSignature <- c("missing", firstArgClass) #Converting to string packageMethodSignature <- paste0('c("', paste(packageMethodSignature, collapse = '", "'), '")') origMethodSignature <- paste0('c("', paste(origMethodSignature, collapse = '", "'), '")') - } + # } parametersName <- paste0(newArgs %>>% names, collapse = ", ") methodParams <- paste0(originalArgs %>>% names, collapse = ", ") @@ -258,12 +310,21 @@ 
registerFunction <- function( functionName, heading = "", # } methodBody <- paste0(utils::capture.output(body(eval(parse(text=functionName)))),collapse="\n") + + # if(engine == 'python'){ + # gsub(pattern = "py_resolve_dots", replacement = "reticulate:::py_resolve_dots", methodBody) -> methodBody + # gsub(pattern = "py_call_impl", replacement = "reticulate:::py_call_impl", methodBody) -> methodBody + # } + + # if(isDataFunction && childClass == "StreamingAnalysisPipeline"){ + # methodBody <- gsub(pattern = "\\{", replacement = paste0("{ check", firstArg , " = object;"), x = methodBody) + # } # methodBody <- gsub(pattern = "\\{", replacement = paste0("{", firstArg , " = object;"), x = methodBody) ##Assigning the exception function to the global Environment assign(exceptionFunction, get(x = exceptionFunction, envir = environment()), - envir = .GlobalEnv) + envir = globEnv) @@ -271,6 +332,18 @@ registerFunction <- function( functionName, heading = "", formals(f) <- genericArgs body(f) <- paste('standardGeneric("', functionName,'")') + ## Suffix for python & Spark functions + if(engine == 'python'){ + methodBody <- paste0('{', + 'val <- ', functionName, '(', methodParams, ');', + 'return(val);}') + functionName <- paste0(functionName, "_py") + }else if(engine == "spark"){ + functionName <- paste0(functionName, "_spark") + }else if(engine == "spark-structured-streaming"){ + functionName <- paste0(functionName, "_sparkSS") + } + registerFunText <- paste0( # Adding to pipeline when run on a Analysis Pipeline object @@ -284,8 +357,8 @@ registerFunction <- function( functionName, heading = "", 'parametersPassed <- lapply(parametersList, function(x){', 'val <- eval(parse(text = x));', 'if(class(val) == "formula"){', - 'if(analysisPipelines:::isDependencyParam(val)){', - 'val <- as.formula(paste(x,"~",analysisPipelines:::getTerm(val)))', + 'if(analysisPipelines::isDependencyParam(val)){', + 'val <- as.formula(paste(x,"~",analysisPipelines::getTerm(val)))', '};',#else{', # 'names(val) <- x', # '};', @@ -305,6 +378,9 @@ registerFunction <- function( functionName, heading = "", 'signature = ', origMethodSignature, ',', 'definition = function( ', methodParams,')', methodBody, ')' + # 'setMethod(f = "',functionName,'",', + # 'signature = ', origMethodSignature, ',', + # 'definition = origF)' ) #Register function @@ -417,6 +493,7 @@ loadPredefinedFunctionRegistry <- function(){ } #' @name setInput +#' @rdname setInput #' @title Sets the input for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object #' @details #' Assigns the input to the pipeline for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object @@ -428,6 +505,7 @@ loadPredefinedFunctionRegistry <- function(){ #' @return Updated \code{AnalysisPipeline} \code{StreamingAnalysisPipeline} object #' @family Package core functions #' @export + setGeneric( name = "setInput", def = function(object, @@ -452,6 +530,7 @@ setGeneric( }) } +#' @rdname setInput setMethod( f = "setInput", signature = "BaseAnalysisPipeline", @@ -459,6 +538,7 @@ setMethod( ) #' @name updateObject +#' @rdname updateObject #' @title Update the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by adding an operation to the pipeline #' @details #' The specified operation along with the heading and parameters is updated in the pipeline slot @@ -496,7 +576,7 @@ setGeneric( }else{ id = max(as.numeric(object@pipeline$id)) + 1 } - object@pipeline %>>% add_row(id = id, + object@pipeline %>>% dplyr::add_row(id = id, operation = operation, 
heading = heading, parameters = list(parameters), @@ -511,6 +591,7 @@ setGeneric( }) } +#' @rdname updateObject setMethod( f = "updateObject", signature = "BaseAnalysisPipeline", @@ -518,15 +599,15 @@ setMethod( ) #' @name assessEngineSetUp +#' @rdname assessEngineSetUp #' @title Assesses engine (R, Spark, Python, Spark Structured Streaming) set up #' @details #' Assesses whether engines required for executing functions in an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} #' object have been set up -#' @details This method is implemented on the base class as it is a shared functionality types of Analysis Pipelines -#' which extend this class -#' @param object object that contains input, pipeline, registry and output -#' @return Tibble containing the details of available engines, whether they are required for a recipe, a logical reporting -#' whether the engine has been set up, and comments. +#' @details This method is implemented on the base class as it is a shared functionality across Pipeline objects +#' @param object A Pipeline object +#' @return Tibble containing the details of available engines, whether they are required for a pipeline, a logical value +#' reporting whether the engine has been set up, and comments. #' @family Package core functions #' @export @@ -544,7 +625,7 @@ setGeneric( startEngineAssessment <- Sys.time() futile.logger::flog.info("|| Engine Assessment for pipeline STARTED ||" , name='logger.engine.assessment') - engineAssessment <- tibble(engine = character(), + engineAssessment <- dplyr::tibble(engine = character(), requiredForPipeline = logical(), isSetup = logical(), comments = character()) @@ -589,7 +670,28 @@ setGeneric( isSetup = isSparkSetup, comments = sparkComments) -> engineAssessment - #TO DO - Python + #Python + isPythonSetup <- F + pythonComments <- "" + checkSession <- "" + checkSession <- tryCatch({ + reticulate::py_run_string('a = "Is session running"') %>>% reticulate::py_to_r() -> sess + if("a" %in% names(sess)){ + isPythonSetup <- T + pythonComments <- reticulate::py_config() %>% as.character %>>% paste(collapse = "\n") + }else{ + pythonComments <- paste0("There does not seem to be a Python Session initialized through reticulate ", + "which is required to execute pipelines containing Python functions. ") + } + }, error = function(e){ + pythonComments <- paste0("There does not seem to be a Python Session initialized through reticulate ", + "which is required to execute pipelines containing Python functions. 
") + }) + + engineAssessment %>>% dplyr::add_row(engine = "python", + requiredForPipeline = ifelse("python" %in% requiredEngines, T, F), + isSetup = isPythonSetup, + comments = pythonComments) -> engineAssessment } @@ -607,7 +709,7 @@ setGeneric( }) } - +#' @rdname assessEngineSetUp setMethod( f = "assessEngineSetUp", signature = "BaseAnalysisPipeline", @@ -615,6 +717,7 @@ setMethod( ) #' @name savePipeline +#' @rdname savePipeline #' @title Saves the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object to the file system without outputs #' @details #' The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object is saved to the file system in the paths specified @@ -650,6 +753,7 @@ setGeneric( }) } +#' @rdname savePipeline setMethod( f = "savePipeline", signature = "BaseAnalysisPipeline", @@ -658,6 +762,7 @@ setMethod( #' @name getPipeline +#' @rdname getPipeline #' @title Obtain the pipeline #' @param object The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object #' @details @@ -680,6 +785,7 @@ setGeneric( return(object@pipeline) } +#' @rdname getPipeline setMethod( f = "getPipeline", signature = "BaseAnalysisPipeline", @@ -701,6 +807,7 @@ getRegistry <- function(){ } #' @name getInput +#' @rdname getInput #' @title Obtains the initializedInput #' @param object The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object #' @details @@ -723,6 +830,7 @@ setGeneric( return(object@input) } +#' @rdname getInput setMethod( f = "getInput", signature = "BaseAnalysisPipeline", @@ -730,9 +838,10 @@ setMethod( ) #' @name getOutputById +#' @rdname getOutputById #' @title Obtains a specific output #' @param object The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object -#' @param id The position of the function for which the output is desired in the sequence of operations in the pipeline. +#' @param reqId The position of the function for which the output is desired in the sequence of operations in the pipeline. #' @param includeCall Logical which defines whether the call used to generate the output should be returned. By, default this is false #' @details #' Obtains a specific output from the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by passing the position @@ -760,7 +869,7 @@ setGeneric( op <- list(call = data.frame(), output = list()) reqId <- as.character(reqId) - object@pipeline %>>% dplyr::filter(id == reqId) -> call + object@pipeline %>>% dplyr::filter(.data$id == reqId) -> call if(call$storeOutput){ object@output[[paste0("f", reqId, ".out")]] -> output }else{ @@ -783,6 +892,7 @@ setGeneric( }) } +#' @rdname getOutputById setMethod( f = "getOutputById", signature = "BaseAnalysisPipeline", @@ -793,23 +903,33 @@ setMethod( #' @name getResponse #' @title Obtains the response term from the formula -#' @keywords internal +#' @param f formula from which term is to be extracted. +#' @details This is a helper function to extract the response variable from a formula +#' @return The response variable in the formula as a string +#' @export getResponse <- function(f){ - resp <- dimnames(attr(terms(f), "factors"))[[1]][1] + resp <- dimnames(attr(stats::terms(f), "factors"))[[1]][1] return(resp) } #' @name getTerm #' @title Obtains the dependency term from the formula -#' @keywords internal +#' @param f formula from which term is to be extracted. 
+#' @details This is a helper function to extract the terms from a formula +#' @return String with the terms +#' @export getTerm <- function(f){ - t <- attr(terms(f), "term.labels") + t <- attr(stats::terms(f), "term.labels") return(t) } #' @name isDependencyParam #' @title Checks if the parameter is the dependency parameter -#' @keywords internal +#' @param f formula from which term is to be extracted. +#' @details This is a helper function to check if the formula provided is a dependency parameter, +#' as per the package's formula semantics, capturing function dependencies +#' @return Logical as to whether it is a dependency parameter +#' @export isDependencyParam <- function(f){ termRegexPattern <- "[f]|[:digit:]" t <- NULL @@ -924,13 +1044,13 @@ computeEdges <- function(pipelineRegistryJoin){ } return(edges) - }) %>>% dplyr::bind_rows(.) -> edgesDf + }) %>>% dplyr::bind_rows(.data) -> edgesDf if(nrow(edgesDf) == 0 && ncol(edgesDf) == 0){ - edgesDf <- tibble(from = character(), + edgesDf <- dplyr::tibble(from = character(), to = character()) }else{ - edgesDf %>>% dplyr::distinct(from, to, .keep_all = TRUE) -> edgesDf + edgesDf %>>% dplyr::distinct(.data$from, .data$to, .keep_all = TRUE) -> edgesDf } return(edgesDf) },error = function(e){ @@ -981,7 +1101,7 @@ identifyTopLevelRecursively <- function(input = list(topDf = dplyr::tibble(), }else{ startingPoints <- getStartingPoints(nodes, edgeDf) topDf %>>% dplyr::bind_rows(dplyr::bind_cols(id = startingPoints, level = rep(as.character(l), length(startingPoints)))) -> topDf - edgeDf %>>% dplyr::filter(!(from %in% startingPoints)) -> edgeDf + edgeDf %>>% dplyr::filter(!(.data$from %in% startingPoints)) -> edgeDf nodes %>>% setdiff(startingPoints) -> nodes output <- list(topDf = topDf, @@ -1017,9 +1137,11 @@ identifyTopologicalLevels <- function( } ####################### Execution prep ############################# -#' @rdname prepExecution + #' @name prepExecution +#' @rdname prepExecution #' @title Prepare the pipleline for execution +#' @param object A Pipeline object #' @details The pipeline is prepared for execution by identifying the graph of the pipeline as well as its topological ordering, #' and dependency map in order to prepare for execution #' @return Updated \code{AnalysisPipeline} \code{StreamingAnalysisPipeline} object @@ -1067,6 +1189,7 @@ setGeneric( }) } +#' @rdname prepExecution setMethod( f = "prepExecution", signature = "BaseAnalysisPipeline", @@ -1074,6 +1197,7 @@ setMethod( ) #' @name visualizePipeline +#' @rdname visualizePipeline #' @title Visualizes the pipeline as a graph #' @details Indicates dependencies amongst functions as well as functions for which output #' needs to be stored @@ -1141,34 +1265,36 @@ setGeneric( edge_df <- object@pipelineExecutor$dependencyLinks - storedOutputs <- node_df %>>% dplyr::filter(storeOutput == T) + storedOutputs <- node_df %>>% dplyr::filter(.data$storeOutput == T) storedOutputs <- storedOutputs$id - spData <- node_df %>>% dplyr::filter(isDataFunction == T) - spData <- spData %>>% dplyr::filter(level == min(as.numeric(level))) + spData <- node_df %>>% dplyr::filter(.data$isDataFunction == T) + spData <- spData %>>% dplyr::filter(.data$level == min(as.numeric(.data$level))) spDataIds <- spData$id - spParam <-node_df %>>% dplyr::filter(isDataFunction == F) - spParam <- spParam %>>% dplyr::filter(level == min(as.numeric(level))) + spParam <-node_df %>>% dplyr::filter(.data$isDataFunction == F) + spParam <- spParam %>>% dplyr::filter(.data$level == min(as.numeric(.data$level))) 
spParamIds <- spParam$id - node_df %>>% dplyr::mutate(image = ifelse(engine == "r", rLogo, - ifelse(engine == "spark", sparkLogo, - ifelse(engine == 'spark-structured-streaming', sparkSsLogo, + node_df %>>% dplyr::mutate(image = ifelse(.data$engine == "r", rLogo, + ifelse(.data$engine == "spark", sparkLogo, + ifelse(.data$engine == 'spark-structured-streaming', sparkSsLogo, pythonLogo)))) -> node_df node_df$shape <- "image" # node_df %>>% dplyr::mutate(group = ifelse(storeOutput == T, "Stored output", "Auxiliary step")) node_df$group <- "function" node_df %>>% - dplyr::select(id, operation, group, shape, image) -> node_df + dplyr::select(.data$id, .data$operation, .data$group, .data$shape, .data$image) -> node_df colnames(node_df) <- c("id", "label", "group","shape", "image") - node_df %>>% dplyr::add_row(id = "d0", label = "Data", group = "data", shape = "image", image = dataLogo ) -> node_df + node_df %>>% dplyr::add_row(id = "d0", label = "Data", group = "data", shape = "image", + image = dataLogo ) -> node_df for(o in storedOutputs){ - node_df %>>% dplyr::add_row(id = paste0("o",o), label = paste("Output ID:", o ), group = "output", shape = "image", + node_df %>>% dplyr::add_row(id = paste0("o",o), label = paste("Output ID:", o ), group = "output", + shape = "image", image = outputLogo) -> node_df } @@ -1180,7 +1306,8 @@ setGeneric( for(s in spParamIds){ pId <- paste0("p", s) - node_df %>>% dplyr::add_row(id = pId, label = "Non-data parameter", group = "parameter", shape = "image", + node_df %>>% dplyr::add_row( id = pId, label = "Non-data parameter", group = "parameter", + shape = "image", image = paramLogo ) -> node_df edge_df %>>% dplyr::add_row(from = paste0("p", s), to = s) -> edge_df } @@ -1219,6 +1346,7 @@ setGeneric( } +#' @rdname visualizePipeline setMethod( f = "visualizePipeline", signature = "BaseAnalysisPipeline", @@ -1229,8 +1357,9 @@ setMethod( ########### Changing generics ############################################ -#' @rdname generateOutput + #' @name generateOutput +#' @rdname generateOutput #' @title Generate a list of outputs from Pipeline objects #' @details \code{generateOutput} is a generic function that is implemented for various types of pipeline objects #' such as \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} @@ -1239,7 +1368,7 @@ setMethod( #' are run and outputs generated, stored in a list #' @param object object that contains input, pipeline, registry and output #' @return Updated Pipeline object with the outputs at each step stored in the \code{output} slot. -#' @return Specific outputs can be obtained by using the \link{getOuputByOrderId} function +#' @return Specific outputs can be obtained by using the \link{getOutputById} function #' @family Package core functions #' @include core-functions.R #' @exportMethod generateOutput @@ -1252,8 +1381,9 @@ setGeneric( } ) -#' @rdname checkSchemaMatch + #' @name checkSchemaMatch +#' @rdname checkSchemaMatch #' @title Checks the schema of the input to a Pipeline object against the original #' @param object A Pipeline object #' @param newData The newData that the pipeline is to be initialized with @@ -1274,14 +1404,15 @@ setGeneric( ######### Logging functions ################### -#' @rdname setLoggerDetails + #' @name setLoggerDetails +#' @rdname setLoggerDetails #' @title Sets the logger configuration for the pipeline #' @details This function sets the logger configuration for the pipeline. #' @param object A Pipeline object #' @param target A string value. 
'console' for appending to console, 'file' for appending to a file, or 'console&file' for both #' @param targetFile File name of the log file in case the target is 'file' -#' @param targetLayout Specify the layout according to 'futile.logger' package convention +#' @param layout Specify the layout according to 'futile.logger' package convention #' @family Package core functions #' @export @@ -1304,14 +1435,16 @@ setGeneric( return(object) } +#' @rdname setLoggerDetails setMethod( f = "setLoggerDetails", signature = "BaseAnalysisPipeline", definition = .setLoggerDetails ) -#' @rdname getLoggerDetails + #' @name getLoggerDetails +#' @rdname getLoggerDetails #' @title Obtains the logger configuration for the pipeline #' @details This function obtains the logger configuration for the pipeline. #' @param object A Pipeline object @@ -1329,6 +1462,7 @@ setGeneric( return(object@pipelineExecutor$loggerDetails ) } +#' @rdname getLoggerDetails setMethod( f = "getLoggerDetails", signature = "BaseAnalysisPipeline", @@ -1372,7 +1506,7 @@ initializeLoggers <- function(object){ #' @export genericPipelineException <- function(error){ message <- error$message - m <- paste0("EXCEPTION OCCURED WHILE RUNNING THE PIPELINE FUNCTION WITH PROVIDED PARAMETERS: ", message) + m <- paste0("|| EXCEPTION OCCURED WHILE RUNNING THE PIPELINE FUNCTION WITH PROVIDED PARAMETERS: ", message, " ||") futile.logger::flog.error(m, name = 'logger.func') stop(m) } @@ -1407,10 +1541,11 @@ loadPipeline <- function(path, input = data.frame() , filePath = ""){ lapply(functionNames, function(x){ - assign(x, get(x, environment()), globalenv()) + assign(x, get(x, environment()), globEnv) }) - .setRegistry(.registry) + eval(parse(paste0(".setRegistry(.registry)"))) + futile.logger::flog.info("|| Registry loaded succesfully ||", name = "logger.base") @@ -1419,29 +1554,29 @@ loadPipeline <- function(path, input = data.frame() , filePath = ""){ schemaCheck <- object %>>% checkSchemaMatch(input) if(!schemaCheck$isSchemaSame){ if(length(schemaCheck$removedColumns) > 0){ - m <- paste0("Some columns which were present in the original schema ", + m <- paste0("|| Some columns which were present in the original schema ", "for the pipeline, ", "are not present in the new data frame. Some pipeline functions ", "may not execute as expected. Use the checkSchemaMatch function to obtain ", - "a detailed comparison") + "a detailed comparison ||") futile.logger::flog.warn(m, name = 'logger.pipeline') warning(m) } if(length(schemaCheck$addedColumns) > 0){ - m <- paste0("Some new columns have been added to the new data frame ", + m <- paste0("|| Some new columns have been added to the new data frame ", "as compared to the original schema for the pipeline. ", "Use the checkSchemaMatch function to obtain ", - "a detailed comparison") + "a detailed comparison ||") futile.logger::flog.warn(m, name = 'logger.pipeline') warning(m) } if(length(schemaCheck$addedColumns) == 0 && length(schemaCheck$removedColumns) == 0){ - m <- paste0("Colummn names are the same but types have changed", + m <- paste0("|| Colummn names are the same but types have changed", "Some pipeline functions may not execute as expected. 
", "Use the checkSchemaMatch function to obtain ", - "a detailed comparison") + "a detailed comparison ||") futile.logger::flog.warn(m, name = 'logger.pipeline') warning(m) } @@ -1475,18 +1610,34 @@ initDfBasedOnType <- function(input, filePath){ if(!all(dim(input) == c(0,0))){ #Check for R, Spark, Python data frame if(class(input) == "SparkDataFrame"){ - input <- SparkR::as.data.frame(input) - }else if(class(input) == "data.frame" || class(input) == "tibble"){ - #do nothing for R - }else{ - m <- "The provided input is not of class - data.frame or SparkDataFrame" + if("SparkR" %in% installed.packages()){ + input <- SparkR::as.data.frame(input) + }else{ + futile.logger::flog.error(paste0("|| 'SparkR' is not installed. Please install before initializing the pipeline", + " with a SparkDataFrame ||"), + name = 'logger.pipeline') + stop() + } + }else if(any(class(input) == "pandas.core.frame.DataFrame")){ + if("reticulate" %in% installed.packages()){ + input <- reticulate::py_to_r(input) + }else{ + futile.logger::flog.error(paste0("|| 'reticulate' is not installed. Please install before initializing the pipeline", + " with a Pandas DataFrame ||"), + name = 'logger.pipeline') + stop() + } + }else if(any(class(input) %in% c("data.frame", "tibble"))){ + # do nothing for R - Check is required so that the exception is not thrown + } else{ + m <- "|| The provided input is not of class - data.frame, SparkDataFrame or Pandas DataFrame ||" futile.logger::flog.error(m, name = 'logger.pipeline') - stop(m) + stop() } } } else{ - input <- read.csv(filePath) + input <- utils::read.csv(filePath) } return(input) @@ -1531,10 +1682,11 @@ loadRegistry <- function(path){ load(path, envir = environment()) functionNames = setdiff(ls(envir = environment()), c("path", ".registry")) - .setRegistry(.registry) + + eval(parse(paste0(".setRegistry(.registry)"))) lapply(functionNames, function(x){ - assign(x, get(x, environment()), globalenv()) + assign(x, get(x, environment()), globEnv) }) diff --git a/R/core-streaming-functions.R b/R/core-streaming-functions.R index 6233409..4241b78 100644 --- a/R/core-streaming-functions.R +++ b/R/core-streaming-functions.R @@ -12,15 +12,14 @@ # - Test loadPipeline function #' @include core-functions.R +NULL - -#' @name StreamingAnalysisPipeline +#' @name StreamingAnalysisPipeline-class +#' @rdname StreamingAnalysisPipeline-class #' @title Class for constructing Analysis Pipelines for streaming analyeses #' @details Inherits the base class \link{BaseAnalysisPipeline} class which holds the metadata including the registry of available functions, #' the data on which the pipeline is to be applied, as well as the pipeline itself #' @details This class currently only supports Apache Spark Structured Streaming, implemented through the SparkR interface -#' @details More details of how an object of this class should be initialized is provided in the -#' constructor - \link{initializeStreamingAnalysisPipeline} #' @slot input The input Spark DataFrame on which analysis is to be performed #' @slot originalSchemaDf Empty Spark DataFrame representing the schema of the input #' @family Package core functions for Streaming Analyses @@ -28,23 +27,20 @@ #' @exportClass StreamingAnalysisPipeline #' @export StreamingAnalysisPipeline + StreamingAnalysisPipeline <- setClass("StreamingAnalysisPipeline", slots = c( - input = "SparkDataFrame", - originalSchemaDf = "SparkDataFrame" + input = "ANY", + #Should be a SparkDataFrame, but unable to specify as SparkR is not distributed on CRAN + originalSchemaDf = 
"ANY" ), contains = "BaseAnalysisPipeline") -#' @name initializeStreamingAnalysisPipeline +#' StreamingAnalysisPipeline constructor +#' @docType methods +#' @rdname initialize-methods #' @title Constructor for the \code{StreamingAnalysisPipeline} object -#' @param .Object The \code{StreamingAnalysisPipeline} object -#' @param input The Spark DataFrame on which operations need to be performed -#' @details -#' \code{input} needs to be provded and the argument needs to be of class \code{SparkDataFrame}, which is -#' generally created through operations using SparkR -#' @return an object of class "\code{StreamingAnalysisPipeline}", initialized with the input Spark DataFrame provided -#' @family Package core functions for Streaming Analyses #' @include core-functions.R -#' @export +#' @keywords internal setMethod( f = "initialize", @@ -59,6 +55,13 @@ setMethod( } ) +.checkSparkDataFrame <- function(obj){ + if(class(obj) != "SparkDataFrame"){ + futile.logger::flog.error("|| The input should be of class 'SparkDataFrame' from the 'SparkR' package ||", + name = "logger.base") + stop() + } +} .executeStream<- function(object){ @@ -72,7 +75,6 @@ setMethod( dplyr::left_join(object@pipeline, getRegistry(), by = c("operation" = "functionName")) %>>% dplyr::left_join(object@pipelineExecutor$topologicalOrdering, by = c("id" = "id")) -> pipelineRegistryOrderingJoin - # pipelineRegistryJoin <- dplyr::left_join(object@pipeline, getRegistry(), by = c("operation" = "functionName")) batches <- unique(pipelineRegistryOrderingJoin$level) numBatches <- max(as.numeric(batches)) @@ -80,30 +82,17 @@ setMethod( # Iterate across batches i.e. sets of independent functions lapply(batches, function(x, object, pipelineRegistryOrderingJoin, outputCache){ - # startBatch <- Sys.time() - pipelineRegistryOrderingJoin %>>% dplyr::filter(level == x) -> functionsInBatch - # futile.logger::flog.info("|| Executing Batch Number : %s/%s containing functions '%s' ||", - # x, numBatches, paste(functionsInBatch$operation, collapse = ", "), - # name='logger.batch') + pipelineRegistryOrderingJoin %>>% dplyr::filter(.data$level == x) -> functionsInBatch ## Function execution in a stream lapply(functionsInBatch$id, function(y, object, functionsInBatch, outputCache){ - functionsInBatch %>>% dplyr::filter(id == y) %>>% as.list -> funcDetails + functionsInBatch %>>% dplyr::filter(.data$id == y) %>>% as.list -> funcDetails futile.logger::flog.info("|| Function ID '%s' named '%s' STARTED on the '%s' engine ||", funcDetails$id, funcDetails$operation, funcDetails$engine, name='logger.func') - # Set Input data - inputToExecute <- object@input - - - if(funcDetails$outAsIn && funcDetails$id != "1"){ - dataOpFn <- paste0("f", as.numeric(funcDetails$id) - 1) - actualDataObjectName <- paste0(dataOpFn, ".out") - inputToExecute <- get(actualDataObjectName, envir = outputCache) - } # Set parameters @@ -111,12 +100,14 @@ setMethod( dep <- unique(unlist(funcDetails$dependencies, recursive = F)) depTerms <- paste0("f", dep) + # Datasets passed as a formula are updated here + params <- lapply(params, function(p, depTerms, outputCache){ if(class(p) == "formula"){ - isDepParam <- analysisPipelines:::isDependencyParam(p) + isDepParam <- analysisPipelines::isDependencyParam(p) if(isDepParam){ - formulaTerm <- analysisPipelines:::getTerm(p) - argName <- analysisPipelines:::getResponse(p) + formulaTerm <- analysisPipelines::getTerm(p) + argName <- analysisPipelines::getResponse(p) if(formulaTerm %in% depTerms){ ## Formula of previous function in pipeline @@ -129,26 
+120,26 @@ setMethod( return(p) }, depTerms, outputCache) + # No type conversion for Streaming pipelines + + if(funcDetails$isDataFunction){ + # Not passed as a formula + if(any(class(params[[1]]) == "rlang_fake_data_pronoun")){ + # Checking for outAsIn + if(funcDetails$outAsIn && funcDetails$id != "1"){ + dataOpFn <- paste0("f", as.numeric(funcDetails$id) - 1) + actualDataObjectName <- paste0(dataOpFn, ".out") + params[[1]] <- get(actualDataObjectName, envir = outputCache) + }else{ + # On original input + params[[1]]<- object@input + } + } + } #Call startFunc <- Sys.time() - #Assign as named parameters - #Get names of params - # paramNames <- lapply(params, function(p){ - # return(names(p)) - # }) %>>% unlist - # params <-lapply(params, function(p){ - # names(p) <- NULL - # return(p) - # }) - # names(params) <- paramNames args <- params - if(funcDetails$isDataFunction){ - formals(funcDetails$operation) %>>% as.list %>>% names %>>% dplyr::first() -> firstArgName - firstArg <- list(inputToExecute) - names(firstArg) <- firstArgName - args <- append(firstArg, params) - } output <- tryCatch({do.call(what = funcDetails$operation, args = args)}, error = function(e){ @@ -163,11 +154,6 @@ setMethod( endFunc <- Sys.time() funcExecTime <- endFunc - startFunc - # ##outAsIn - # if(funcDetails$outAsIn){ - # outputCache$workingInput <- output - # } - opName <- paste0("f", funcDetails$id, ".out") #eg: f1.out if(funcDetails$storeOutput){ assign(opName, value = output, envir = outputCache) @@ -213,7 +199,7 @@ setMethod( ## Check engine setup object %>>% assessEngineSetUp -> engineAssessment - engineAssessment %>>% dplyr::filter(requiredForPipeline == T) -> requiredEngines + engineAssessment %>>% dplyr::filter(.data$requiredForPipeline == T) -> requiredEngines if(!all(requiredEngines$isSetup)){ m <- paste0("All engines required for the pipelines have not been configured. ", @@ -242,40 +228,3 @@ setMethod( signature = "StreamingAnalysisPipeline", definition = .generateStreamingOutput ) - -# .generateStreamingOutput = function(object) -# { -# inputToExecute <- object@input -# -# ## Check engine setup -# object %>>% assessEngineSetUp -> engineAssessment -# engineAssessment %>>% dplyr::filter(requiredForPipeline == T) -> requiredEngines -# -# if(!all(requiredEngines$isSetup)){ -# stop(paste0("All engines required for the pipelines have not been configured. 
", -# "Please use the analysisPipelines::assessEngine() function to check")) -# } -# pipelineRegistryJoin <- dplyr::left_join(object@pipeline, getRegistry(), by = c("operation" = "functionName")) -# -# if(nrow(pipelineRegistryJoin) > 0){ -# for(rowNo in 1:nrow(pipelineRegistryJoin)){ -# -# ## Check outAsIn and engine conversion accordingly -# if(pipelineRegistryJoin[['outAsIn']][rowNo] == T && rowNo > 1){ -# inputToExecute <- object@output[[rowNo-1]] -# }else{ -# inputToExecute <- object@input -# } -# object@output[[rowNo]] <- do.call(pipelineRegistryJoin[['operation']][[rowNo]], -# append(list(inputToExecute), -# pipelineRegistryJoin[['parameters']][[rowNo]])) -# } -# }else{ -# stop("No functions have been added to the pipeline") -# } -# -# return(object) -# } - - - diff --git a/R/r-batch-eda-utilities.R b/R/r-batch-eda-utilities.R index 0a64d8e..45e0a27 100644 --- a/R/r-batch-eda-utilities.R +++ b/R/r-batch-eda-utilities.R @@ -37,6 +37,7 @@ ignoreCols <- function(data, columns){ #' @name univarCatDistPlots #' @title Univariate Categoric Distribution #' @details A univariate distribution graph on the selected categorical columns from the dataframe +#' @param data the dataset where the column on which the plot is to be generated is present #' @param uniCol the name of column on which the plot needs to be generated #' @param priColor the primary color for the plots #' @param optionalPlots A Flag for optional plots @@ -48,8 +49,10 @@ univarCatDistPlots <- function(data, uniCol, priColor,optionalPlots){ data[[uniCol]][is.na(data[[uniCol]])] <- "NA" data <- data %>% dplyr::group_by_(.dots = c(uniCol)) %>% dplyr::summarise(count = dplyr::n()) y=data[[uniCol]] + data %>>% dplyr::arrange(.data$count) -> data + catPlot <- ggplot2::ggplot(data, - ggplot2::aes(x = reorder(y, count), y=count)) + + ggplot2::aes_(x = as.name(uniCol), y= as.name("count"))) + ggplot2::geom_bar(stat = "identity", fill = priColor,alpha=0.7) + ggplot2::xlab(uniCol) + ggplot2::ylab("Frequency") + ggplot2::theme_bw() + @@ -93,7 +96,7 @@ outlierPlot <- function(data,method,columnName,cutoffValue, priColor,optionalPlo Outlier<-data$Outlier Value<-data[,columnName] outlierPlotObj <- ggplot2::ggplot(data) + - ggplot2::geom_histogram(ggplot2::aes(x = Value, fill = Outlier),bins=30,alpha=0.7) + + ggplot2::geom_histogram(ggplot2::aes(x = Value, fill = .data$Outlier),bins=30,alpha=0.7) + ggplot2::scale_fill_manual(values = c(priColor, "red"),breaks=c("FALSE", "TRUE"), labels=c("Normal", "Outlier"),name = "Status") + ggplot2::theme_bw() + @@ -107,7 +110,7 @@ outlierPlot <- function(data,method,columnName,cutoffValue, priColor,optionalPlo y<-data[,columnName] outlierPlotObj <- ggplot2::ggplot(data, ggplot2::aes(x = Zscore, y = y)) + - ggplot2::geom_point(ggplot2::aes(color = Outlier),alpha=0.7)+ + ggplot2::geom_point(ggplot2::aes(color = .data$Outlier),alpha=0.7)+ ggplot2::scale_color_manual("Status", values = c("TRUE" = "red","FALSE" =priColor))+ ggplot2::ylab(columnName)+ ggplot2::theme_bw() + @@ -148,7 +151,7 @@ multiVarOutlierPlot <- function(data,depCol,indepCol,sizeCol, priColor,optionalP y<-data[,depCol] size<-data[,sizeCol] outlierPlot <- ggplot2::ggplot(data,ggplot2::aes(x = x,y = y),alpha=0.6)+ - ggplot2::geom_point(ggplot2::aes(color = Outlier, size = size),alpha=0.7)+ + ggplot2::geom_point(ggplot2::aes(color = .data$Outlier, size = size),alpha=0.7)+ ggplot2::scale_color_manual("",values = c("Outlier" = "red", "Normal" = priColor))+ ggplot2::labs(title = paste(depCol,"vs",indepCol)) + ggplot2::theme_bw() + 
ggplot2::theme(panel.border=ggplot2::element_rect(size=0.1),panel.grid.minor.x=ggplot2::element_blank(),legend.position = "bottom") + @@ -213,7 +216,7 @@ bivarPlots <- function(dataset, select_var_name_1, select_var_name_2, priColor = a=as.vector(as.character(unique(new_df[[select_var_name_1]]))) y=new_df[[select_var_name_1]] label=new_df[[select_var_name_2]] - bivarPlot <-ggplot2::ggplot(new_df, ggplot2::aes(x = y, y= n, fill = label)) + + bivarPlot <-ggplot2::ggplot(new_df, ggplot2::aes(x = y, y= .data$n, fill = label)) + ggplot2::geom_bar(position = "dodge", stat = "identity",alpha=0.9) + ggplot2::guides(fill=ggplot2::guide_legend(title=select_var_name_2)) + ggplot2::coord_flip()+ @@ -259,6 +262,7 @@ bivarPlots <- function(dataset, select_var_name_1, select_var_name_2, priColor = #' @title Correlation Matrix Plot #' @description A correlation matrix is created and plotted across all the columns in the dataset #' @param dataset the dataset that needs to be loaded +#' @param methodused methods to be used for computing correlation #' @return Correlation Matrix graph #' @family Package EDA Utilites functions #' @export diff --git a/R/r-helper-utilites-python.R b/R/r-helper-utilites-python.R new file mode 100644 index 0000000..5f72d67 --- /dev/null +++ b/R/r-helper-utilites-python.R @@ -0,0 +1,61 @@ +#' @name setPythonEnvir +#' @title Sets the python environment to be used +#' @details Wrapper function over reticulate functions to set a python environment to be used +#' @param type Type of python environment. Takes three possible vales - 'conda' for Anaconda environments, +#' 'virtualenv' for Virtual environments, and 'python' to manually set the python path to use +#' @param pathOrEnvirName Name of the environment for Anaconda and Virtual environments, +#' or the Python path when type is 'python' +#' @family R helper utilities for Python +#' @export +setPythonEnvir <- function(type = 'conda', pathOrEnvirName = 'base'){ + tryCatch({ + if(type == 'conda'){ + reticulate::use_condaenv(pathOrEnvirName, required = T) + futile.logger::flog.info("|| Using conda environment of name '%s' ||", pathOrEnvirName, + name = "logger.base") + }else if(type == 'virtualenv'){ + reticulate::use_virtualenv(pathOrEnvirName, required = T) + futile.logger::flog.info("|| Using virtual environment of name '%s' ||", pathOrEnvirName, + name = "logger.base") + }else if (type == 'python'){ + reticulate::use_python(pathOrEnvirName, required = T) + futile.logger::flog.info("|| Using python at path: '%s' ||", pathOrEnvirName, + name = "logger.base") + }else{ + futile.logger::flog.error("|| Invalid type - Should be one of 'conda', 'virtualenv', or 'python' ||") + } + }, error = function(e){ + futile.logger::flog.error("|| %s ||", e, name = 'logger.base') + }) +} + + +#' @name getFeaturesForPyClassification +#' @title Extracts selected columns from a data frame as a Python array +#' @details Helper function, which when provided an R data frame and a set of column/ feature names, +#' extracts them from the R data frame as a matrix and converts them to the equivalent Python array. +#' @details Typically this function can be used when providing a feature matrix to a Python machine learning function +#' @param dataset an R data frame +#' @param featureNames Column names to be extracted from the R data frames. A character vector. 
+#' @family R helper utilities for Python +#' @export +getFeaturesForPyClassification <- function(dataset, featureNames){ + dataset %>% dplyr::select(!!featureNames) %>% as.matrix %>% reticulate::r_to_py() -> featureMatrix + return(featureMatrix) +} + +#' @name getTargetForPyClassification +#' @title Extracts selected column from a data frame a binary class Python array +#' @details Helper function, which when provided an R dataframe and a binary categorical column, +#' extracts it from the R data frame, converts it to 1/0 class coding, and converts it to a Python array +#' @details Typically this function can be used to extract a target variable for a classifier to be provided to a +#' Python machine learning function +#' @export +getTargetForPyClassification <- function(dataset, targetVarName, positiveClass){ + dataset %>% dplyr::mutate(target = ifelse(!!rlang::sym(targetVarName) == !!(positiveClass) , 1, 0)) %>% dplyr::select(target) %>% + as.list() %>% unlist -> targetList + names(targetList) <- NULL + targetList %>% as.factor %>% reticulate::r_to_py() -> target + return(target) +} + diff --git a/R/spark-structured-streaming-utilities.R b/R/spark-structured-streaming-utilities.R index 4da3719..34a5df4 100644 --- a/R/spark-structured-streaming-utilities.R +++ b/R/spark-structured-streaming-utilities.R @@ -5,7 +5,6 @@ # Description: Functions to work with Spark, incuding Structured Streaming ###################################################################################################### -#' @import SparkR #' @name sparkRSessionCreateIfNotPresent #' @title Connect to a Spark session @@ -31,8 +30,7 @@ sparkRSessionCreateIfNotPresent <- function(...){ .libPaths(c(file.path(sparkHome, "R", "lib"), .libPaths())) } - library(SparkR) - sparkR.session(...) + SparkR::sparkR.session(...) } #' @name castKafkaStreamAsString @@ -62,7 +60,7 @@ castKafkaStreamAsString <- function(streamObj){ #' @export convertKafkaValueFromJson <- function(streamObj, schema){ - streamObj <- select(streamObj, from_json(streamObj$value, + streamObj <- SparkR::select(streamObj, SparkR::from_json(streamObj$value, schema = schema)) return(streamObj) } diff --git a/R/sysdata.rda b/R/sysdata.rda index 12b6e1c..7693e77 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/zzz.R b/R/zzz.R index de70863..453f0bd 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,3 +1,4 @@ + .onAttach <- function(libName, pkgName){ loadPredefinedFunctionRegistry() } diff --git a/README.md b/README.md index a0ee871..2b84101 100644 --- a/README.md +++ b/README.md @@ -41,21 +41,22 @@ The package allows both the use of *magrittr* pipe **(%>%)** or the *pipeR* pipe ## Supported engines -As of this version, the package supports functions executed on *R*, or *Spark* through the SparkR interface for batch pipelines. It also supports *Apache Spark Structured Streaming* pipelines for streaming analyses. In subsequent releases, *Python* will also be supported. +As of this version, the package supports functions executed on *R*, or *Spark* through the SparkR interface, as well as Python functions run through *reticulate* for batch pipelines. It also supports *Apache Spark Structured Streaming* pipelines for streaming analyses. 
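The Python interoperability mentioned above is backed by the helper utilities added in `R/r-helper-utilites-python.R`. A minimal sketch of how they can be combined is shown below; it assumes a conda environment named 'base' is available, and the `iris`-based data frame, column names and positive class are purely illustrative:

```r
library(analysisPipelines)

# Point reticulate at a Python environment (type can also be 'virtualenv' or 'python')
setPythonEnvir(type = "conda", pathOrEnvirName = "base")

# Illustrative two-class data
df <- iris[iris$Species != "setosa", ]

# Feature matrix and binary (1/0) target, converted to Python objects via reticulate
X <- getFeaturesForPyClassification(dataset = df,
                                    featureNames = c("Sepal.Length", "Sepal.Width"))
y <- getTargetForPyClassification(dataset = df,
                                  targetVarName = "Species",
                                  positiveClass = "versicolor")

# X and y can now be passed to Python functions registered in a pipeline, such as the
# decisionTreeTrainAndTest() sample shipped in inst/python/sampleFunctions.py
```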
## Available vignettes -This package contains 6 vignettes: +This package contains 7 vignettes: * **Analysis pipelines - Core functionality and working with R data frames and functions** - This is the main vignette describing the package's core functionality, and explaining this through **batch** pipelines in just **R** -* **Analysis pipelines for working with Spark DataFrames for one-time/ batch analyses** - This vignette describes creating **batch** pipelines to execute in solely in a *Spark* environment +* **Analysis pipelines for working with Spark DataFrames for one-time/ batch analyses** - This vignette describes creating **batch** pipelines to execute solely in a *Spark* environment +* **Analysis pipelines for working with Python functions** - This vignette describes creating **batch** pipelines to execute solely in a *Python* environment * **Interoperable analysis pipelines** - This vignette describes creating and executing **batch** pipelines which are composed of functions executing across *supported engines* * **Streaming Analysis Pipelines for working with Apache Spark Structured Streaming** - This vignette describes setting up streaming pipelines on *Apache Spark Structured Streaming* * **Using pipelines inside Shiny widgets or apps** - A brief vignette which illustrates an example of using a pipeline inside a shiny widget with reactive elements and changing data -* **An introduction to meta-pipelines** - This vignette which illustrates the use of meta-pipelines +* **An introduction to meta-pipelines** - This vignette illustrates the use of meta-pipelines -#Usage +# Usage ## Loading the package diff --git a/data-raw/predefFunctions.R b/data-raw/predefFunctions.R index 107b085..8303f76 100644 --- a/data-raw/predefFunctions.R +++ b/data-raw/predefFunctions.R @@ -39,6 +39,18 @@ exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), isDataFunction = T, firstArgClass = "") -> .batchPredefFunctions +.batchPredefFunctions %>>% dplyr::add_row(functionName = "getFeaturesForPyClassification", + heading = "", + engine = "r", + exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), + isDataFunction = T, + firstArgClass = "") -> .batchPredefFunctions +.batchPredefFunctions %>>% dplyr::add_row(functionName = "getTargetForPyClassification", + heading = "", + engine = "r", + exceptionHandlingFunction = c(as.character(substitute(genericPipelineException))), + isDataFunction = T, + firstArgClass = "") -> .batchPredefFunctions ################################################################################################## diff --git a/inst/python/sampleFunctions.py b/inst/python/sampleFunctions.py new file mode 100644 index 0000000..c5b92f9 --- /dev/null +++ b/inst/python/sampleFunctions.py @@ -0,0 +1,17 @@ +import pandas as pd +from sklearn import datasets +from sklearn import metrics +from sklearn.tree import DecisionTreeClassifier + +def getColMeans(df): + meanList = [] + for x in df.columns: + meanList.append(df[x].mean()) + return meanList + +def decisionTreeTrainAndTest(data, target, newData): + model = DecisionTreeClassifier() + model.fit(data, target) + testPred = model.predict(newData) + return testPred + diff --git a/inst/report.Rmd b/inst/report.Rmd index 51f9bfd..8c8613b 100644 --- a/inst/report.Rmd +++ b/inst/report.Rmd @@ -37,12 +37,13 @@ storedOps <- pipelineDetails %>>% dplyr::filter(storeOutput == T) for(i in storedOps$id){ opTable <- storedOps %>>% dplyr::filter(id == i) obj%>>% getOutputById(i) -> op + eval(parse(text = paste0("op_", i, " <- op"))) 
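# The retrieved output is copied into a uniquely named object (op_<id>), so that each
# chunk generated below refers to the result stored for this particular id rather than
# to the shared 'op' variable, whose value changes on every iteration.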
knit_expanded <- paste0( "\n```{r chunk",i,",results='asis', fig.width = 12, out.width = '100%', echo=FALSE} \n\n cat('## ",opTable$heading," \n') \n\n - op + op_", i, " \n``` \n\n" ) diff --git a/man/AnalysisPipeline.Rd b/man/AnalysisPipeline-class.Rd similarity index 84% rename from man/AnalysisPipeline.Rd rename to man/AnalysisPipeline-class.Rd index 51399af..138bf5e 100644 --- a/man/AnalysisPipeline.Rd +++ b/man/AnalysisPipeline-class.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions-batch.R \docType{class} -\name{AnalysisPipeline} +\name{AnalysisPipeline-class} +\alias{AnalysisPipeline-class} \alias{AnalysisPipeline} \title{Class for constructing Analysis Pipelines for batch/ one-time analyeses} \description{ @@ -13,9 +14,6 @@ the data on which the pipeline is to be applied, as well as the pipeline itself Additionally, this class is meant to be used for batch/ one-time processing. Contains additional slots to hold the data frame to be used for the pipeline and associated schema - -More details of how an object of this class should be initialized is provided in the -constructor - \link{initializeAnalysisPipeline} } \section{Slots}{ @@ -28,6 +26,6 @@ constructor - \link{initializeAnalysisPipeline} \seealso{ Other Package core functions for batch/one-time analyses: \code{\link{checkSchema}}, \code{\link{generateReport}}, - \code{\link{initializeAnalysisPipeline}} + \code{\link{initialize,BaseAnalysisPipeline-method}} } \concept{Package core functions for batch/one-time analyses} diff --git a/man/BaseAnalysisPipeline.Rd b/man/BaseAnalysisPipeline-class.Rd similarity index 85% rename from man/BaseAnalysisPipeline.Rd rename to man/BaseAnalysisPipeline-class.Rd index 1a95b1a..da1ce01 100644 --- a/man/BaseAnalysisPipeline.Rd +++ b/man/BaseAnalysisPipeline-class.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R \docType{class} -\name{BaseAnalysisPipeline} +\name{BaseAnalysisPipeline-class} +\alias{BaseAnalysisPipeline-class} \alias{BaseAnalysisPipeline} \title{Base class for \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} objects} \description{ @@ -15,8 +16,6 @@ as the base class for various types of Pipeline objects such as Batch and Stream This base class which contains the slots related to the registry, pipeline and output can be extended to create custom class for specific scenarios if required. 
-The details of the constructor for this class can be found at \link{initializeBaseAnalysisPipeline} - In the documentation, objects of classes which are subclasses of this class are referred to as 'Pipeline' objects } \section{Slots}{ @@ -27,13 +26,11 @@ In the documentation, objects of classes which are subclasses of this class are \item{\code{pipelineExecutor}}{A list containing details of the execution, such as topological ordering of functions to be executed, dependency map of functions, as well as logger configuration} -\item{\code{registry}}{A tibble which holds all the registered functions} - \item{\code{output}}{A list which holds all the functions output} }} \seealso{ -Other Package core functions: \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -45,8 +42,7 @@ Other Package core functions: \code{\link{MetaAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/MetaAnalysisPipeline.Rd b/man/MetaAnalysisPipeline-class.Rd similarity index 90% rename from man/MetaAnalysisPipeline.Rd rename to man/MetaAnalysisPipeline-class.Rd index 1808889..6605700 100644 --- a/man/MetaAnalysisPipeline.Rd +++ b/man/MetaAnalysisPipeline-class.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions-meta-pipelines.R \docType{class} -\name{MetaAnalysisPipeline} +\name{MetaAnalysisPipeline-class} +\alias{MetaAnalysisPipeline-class} \alias{MetaAnalysisPipeline} \title{Class for creating and working with meta-pipelines} \description{ @@ -24,7 +25,7 @@ functions in the pipeline and their respective arguments} }} \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -36,8 +37,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/StreamingAnalysisPipeline.Rd b/man/StreamingAnalysisPipeline-class.Rd similarity index 75% rename from man/StreamingAnalysisPipeline.Rd rename to man/StreamingAnalysisPipeline-class.Rd index 5d1b901..a66c1dc 100644 --- a/man/StreamingAnalysisPipeline.Rd +++ b/man/StreamingAnalysisPipeline-class.Rd @@ -1,7 +1,8 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-streaming-functions.R \docType{class} -\name{StreamingAnalysisPipeline} +\name{StreamingAnalysisPipeline-class} +\alias{StreamingAnalysisPipeline-class} \alias{StreamingAnalysisPipeline} \title{Class for constructing Analysis Pipelines for streaming analyeses} \description{ @@ -12,9 +13,6 @@ Inherits the base class 
\link{BaseAnalysisPipeline} class which holds the metada the data on which the pipeline is to be applied, as well as the pipeline itself This class currently only supports Apache Spark Structured Streaming, implemented through the SparkR interface - -More details of how an object of this class should be initialized is provided in the -constructor - \link{initializeStreamingAnalysisPipeline} } \section{Slots}{ @@ -24,7 +22,4 @@ constructor - \link{initializeStreamingAnalysisPipeline} \item{\code{originalSchemaDf}}{Empty Spark DataFrame representing the schema of the input} }} -\seealso{ -Other Package core functions for Streaming Analyses: \code{\link{initializeStreamingAnalysisPipeline}} -} \concept{Package core functions for Streaming Analyses} diff --git a/man/analysisPipelines.Rd b/man/analysisPipelines.Rd index cfd2b57..a527f60 100644 --- a/man/analysisPipelines.Rd +++ b/man/analysisPipelines.Rd @@ -10,3 +10,7 @@ The package aims at enabling data scientists to compose pipelines of analysis wh exploratory analysis & reporting, as well as modeling steps. It also aims to enable data scientists to use tools of their choice through an R interface, and compose interoperable pipelines between R, Spark, and Python. } +\details{ +Important Note - This package uses 'SparkR' to interact with Spark and automatically installs it if not present +from a Github repo as 'SparkR' is not distrubuted on CRAN +} diff --git a/man/assessEngineSetUp.Rd b/man/assessEngineSetUp.Rd index 2541fc1..e72e8ac 100644 --- a/man/assessEngineSetUp.Rd +++ b/man/assessEngineSetUp.Rd @@ -1,17 +1,21 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{assessEngineSetUp} \alias{assessEngineSetUp} +\alias{assessEngineSetUp,BaseAnalysisPipeline-method} \title{Assesses engine (R, Spark, Python, Spark Structured Streaming) set up} \usage{ assessEngineSetUp(object) + +\S4method{assessEngineSetUp}{BaseAnalysisPipeline}(object) } \arguments{ -\item{object}{object that contains input, pipeline, registry and output} +\item{object}{A Pipeline object} } \value{ -Tibble containing the details of available engines, whether they are required for a recipe, a logical reporting - whether the engine has been set up, and comments. +Tibble containing the details of available engines, whether they are required for a pipeline, a logical value + reporting whether the engine has been set up, and comments. 
} \description{ Assesses engine (R, Spark, Python, Spark Structured Streaming) set up @@ -20,12 +24,11 @@ Assesses engine (R, Spark, Python, Spark Structured Streaming) set up Assesses whether engines required for executing functions in an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object have been set up -This method is implemented on the base class as it is a shared functionality types of Analysis Pipelines -which extend this class +This method is implemented on the base class as it is a shared functionality across Pipeline objects } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, \code{\link{exportAsMetaPipeline}}, @@ -36,8 +39,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/checkSchema.Rd b/man/checkSchema.Rd index 4acb12a..875c070 100644 --- a/man/checkSchema.Rd +++ b/man/checkSchema.Rd @@ -23,9 +23,9 @@ Compares the schemas of two dataframes, providing information on added and remov as compared to the old } \seealso{ -Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline}}, +Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}}, \code{\link{generateReport}}, - \code{\link{initializeAnalysisPipeline}} + \code{\link{initialize,BaseAnalysisPipeline-method}} } \concept{Package core functions for batch/one-time analyses} \keyword{internal} diff --git a/man/checkSchemaMatch.Rd b/man/checkSchemaMatch.Rd index 8a02c1e..b50e9f5 100644 --- a/man/checkSchemaMatch.Rd +++ b/man/checkSchemaMatch.Rd @@ -27,8 +27,8 @@ Checks the schema of the new data frame that the pipeline is to be initialized w the original schema that the pipeline was saved with. 
Provides a detailed comparison } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{createPipelineInstance}}, \code{\link{exportAsMetaPipeline}}, @@ -39,8 +39,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/correlationMatPlot.Rd b/man/correlationMatPlot.Rd index 3d6f3fb..40dd4b3 100644 --- a/man/correlationMatPlot.Rd +++ b/man/correlationMatPlot.Rd @@ -8,6 +8,8 @@ correlationMatPlot(dataset, methodused = "everything") } \arguments{ \item{dataset}{the dataset that needs to be loaded} + +\item{methodused}{methods to be used for computing correlation} } \value{ Correlation Matrix graph diff --git a/man/createPipelineInstance.Rd b/man/createPipelineInstance.Rd index f82a3fb..4502cb2 100644 --- a/man/createPipelineInstance.Rd +++ b/man/createPipelineInstance.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions-meta-pipelines.R +\docType{methods} \name{createPipelineInstance} \alias{createPipelineInstance} +\alias{createPipelineInstance,MetaAnalysisPipeline-method} \title{Create a Pipeline object from a meta-pipeline} \usage{ createPipelineInstance(metaPipelineObj, newParams) + +\S4method{createPipelineInstance}{MetaAnalysisPipeline}(metaPipelineObj, + newParams) } \arguments{ \item{metaPipelineObj}{A \code{MetaAnalysisPipeline} object} @@ -24,8 +29,8 @@ This method instantiates a Pipeline object (both \code{AnalysisPipeline} and \co a meta-pipeline as well as an object containing the new set of values for the arguments of all the functions in the pipeline. 
} \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{exportAsMetaPipeline}}, @@ -36,8 +41,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/exportAsMetaPipeline.Rd b/man/exportAsMetaPipeline.Rd index e4cfae2..95d1d61 100644 --- a/man/exportAsMetaPipeline.Rd +++ b/man/exportAsMetaPipeline.Rd @@ -1,13 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions-meta-pipelines.R +\docType{methods} \name{exportAsMetaPipeline} \alias{exportAsMetaPipeline} +\alias{exportAsMetaPipeline,BaseAnalysisPipeline-method} \title{Method to export a meta-pipeline} \usage{ exportAsMetaPipeline(object) + +\S4method{exportAsMetaPipeline}{BaseAnalysisPipeline}(object) } \arguments{ -\item{.Object}{A Pipeline object} +\item{object}{A Pipeline object} } \value{ an object of class "\code{MetaAnalysisPipeline}" @@ -20,8 +24,8 @@ This method exports a Pipeline object i.e. of the classes \code{AnalysisPipeline \code{StreamingAnalysisPipeline} as a meta-pipeline } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -32,8 +36,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/generateOutput.Rd b/man/generateOutput.Rd index 31d1fc1..91a2b53 100644 --- a/man/generateOutput.Rd +++ b/man/generateOutput.Rd @@ -20,7 +20,7 @@ generateOutput(object) \value{ Updated Pipeline object with the outputs at each step stored in the \code{output} slot. 
-Specific outputs can be obtained by using the \link{getOuputByOrderId} function +Specific outputs can be obtained by using the \link{getOutputById} function } \description{ Generate a list of outputs from Pipeline objects @@ -33,8 +33,8 @@ The sequence of operations stored in the pipeline object are run and outputs generated, stored in a list } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -45,8 +45,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/generateReport.Rd b/man/generateReport.Rd index bd0d6c5..b41315c 100644 --- a/man/generateReport.Rd +++ b/man/generateReport.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions-batch.R +\docType{methods} \name{generateReport} \alias{generateReport} +\alias{generateReport,AnalysisPipeline,character-method} \title{Generate a HTML report from an \code{AnalysisPipeline} object} \usage{ generateReport(object, path) + +\S4method{generateReport}{AnalysisPipeline,character}(object, path = ".") } \arguments{ \item{object}{object that contains input, pipeline, registry and output} @@ -22,8 +26,8 @@ The sequence of operations stored in the \code{AnalysisPipeline} object are run, and a HTML report is generated with outputs in the same sequence as the pipeline created by the user } \seealso{ -Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline}}, +Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline-class}}, \code{\link{checkSchema}}, - \code{\link{initializeAnalysisPipeline}} + \code{\link{initialize,BaseAnalysisPipeline-method}} } \concept{Package core functions for batch/one-time analyses} diff --git a/man/genericPipelineException.Rd b/man/genericPipelineException.Rd index 005d2fa..d02d61f 100644 --- a/man/genericPipelineException.Rd +++ b/man/genericPipelineException.Rd @@ -19,8 +19,8 @@ incurred during the call of the function being registered can be passed by the u will be called instead of this function } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -31,8 +31,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git 
a/man/getFeaturesForPyClassification.Rd b/man/getFeaturesForPyClassification.Rd new file mode 100644 index 0000000..1862b8e --- /dev/null +++ b/man/getFeaturesForPyClassification.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/r-helper-utilites-python.R +\name{getFeaturesForPyClassification} +\alias{getFeaturesForPyClassification} +\title{Extracts selected columns from a data frame as a Python array} +\usage{ +getFeaturesForPyClassification(dataset, featureNames) +} +\arguments{ +\item{dataset}{an R data frame} + +\item{featureNames}{Column names to be extracted from the R data frames. A character vector.} +} +\description{ +Extracts selected columns from a data frame as a Python array +} +\details{ +Helper function, which when provided an R data frame and a set of column/ feature names, +extracts them from the R data frame as a matrix and converts them to the equivalent Python array. + +Typically this function can be used when providing a feature matrix to a Python machine learning function +} +\seealso{ +Other R helper utilities for Python: \code{\link{setPythonEnvir}} +} +\concept{R helper utilities for Python} diff --git a/man/getInput.Rd b/man/getInput.Rd index 8072278..8124664 100644 --- a/man/getInput.Rd +++ b/man/getInput.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{getInput} \alias{getInput} +\alias{getInput,BaseAnalysisPipeline-method} \title{Obtains the initializedInput} \usage{ getInput(object) + +\S4method{getInput}{BaseAnalysisPipeline}(object) } \arguments{ \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} @@ -22,8 +26,8 @@ This method is implemented on the base class as it is a shared functionality typ which extend this class } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -35,8 +39,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/getLoggerDetails.Rd b/man/getLoggerDetails.Rd index fc93697..0a85597 100644 --- a/man/getLoggerDetails.Rd +++ b/man/getLoggerDetails.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{getLoggerDetails} \alias{getLoggerDetails} +\alias{getLoggerDetails,BaseAnalysisPipeline-method} \title{Obtains the logger configuration for the pipeline} \usage{ getLoggerDetails(object) + +\S4method{getLoggerDetails}{BaseAnalysisPipeline}(object) } \arguments{ \item{object}{A Pipeline object} @@ -19,8 +23,8 @@ Obtains the logger configuration for the pipeline This function obtains the logger configuration for the pipeline. 
} \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -31,8 +35,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/getOutputById.Rd b/man/getOutputById.Rd index eb96d4a..7da3d18 100644 --- a/man/getOutputById.Rd +++ b/man/getOutputById.Rd @@ -1,17 +1,22 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{getOutputById} \alias{getOutputById} +\alias{getOutputById,BaseAnalysisPipeline-method} \title{Obtains a specific output} \usage{ getOutputById(object, reqId, includeCall = F) + +\S4method{getOutputById}{BaseAnalysisPipeline}(object, reqId, + includeCall = F) } \arguments{ \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} -\item{includeCall}{Logical which defines whether the call used to generate the output should be returned. By, default this is false} +\item{reqId}{The position of the function for which the output is desired in the sequence of operations in the pipeline.} -\item{id}{The position of the function for which the output is desired in the sequence of operations in the pipeline.} +\item{includeCall}{Logical which defines whether the call used to generate the output should be returned. 
By, default this is false} } \value{ If includeCall = F, the output object generated by the function is returned @@ -32,8 +37,8 @@ This method is implemented on the base class as it is a shared functionality typ which extend this class } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -44,8 +49,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/getPipeline.Rd b/man/getPipeline.Rd index cab2a18..ac783d6 100644 --- a/man/getPipeline.Rd +++ b/man/getPipeline.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{getPipeline} \alias{getPipeline} +\alias{getPipeline,BaseAnalysisPipeline-method} \title{Obtain the pipeline} \usage{ getPipeline(object) + +\S4method{getPipeline}{BaseAnalysisPipeline}(object) } \arguments{ \item{object}{The \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} @@ -22,8 +26,8 @@ This method is implemented on the base class as it is a shared functionality typ which extend this class } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -35,8 +39,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/getPipelinePrototype.Rd b/man/getPipelinePrototype.Rd index cb79ddd..b9bf433 100644 --- a/man/getPipelinePrototype.Rd +++ b/man/getPipelinePrototype.Rd @@ -1,13 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions-meta-pipelines.R +\docType{methods} \name{getPipelinePrototype} \alias{getPipelinePrototype} +\alias{getPipelinePrototype,MetaAnalysisPipeline-method} \title{Obtain the prototype of the functions in the pipeline} \usage{ getPipelinePrototype(metaPipelineObj) + +\S4method{getPipelinePrototype}{MetaAnalysisPipeline}(metaPipelineObj) } \arguments{ -\item{object}{A \code{MetaAnalysisPipeline} object} +\item{metaPipelineObj}{A \code{MetaAnalysisPipeline} object} } \value{ An object og class \code{proto} from the 'proto' package @@ -22,8 +26,8 @@ be accessed the same way. These can be accessed and set to new values. 
This pipe \code{createPipelineInstance} method which will instantiate an executable pipeline with the inputs set in the prototype } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -34,8 +38,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getOutputById}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/getRegistry.Rd b/man/getRegistry.Rd index 94f9af1..294964a 100644 --- a/man/getRegistry.Rd +++ b/man/getRegistry.Rd @@ -16,8 +16,8 @@ Obtains the function registry Obtains the function registry as a tibble, including both predefined and user defined functions } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -29,8 +29,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/getResponse.Rd b/man/getResponse.Rd index 5713fb9..2a2b018 100644 --- a/man/getResponse.Rd +++ b/man/getResponse.Rd @@ -6,7 +6,15 @@ \usage{ getResponse(f) } +\arguments{ +\item{f}{formula from which term is to be extracted.} +} +\value{ +The response variable in the formula as a string +} \description{ Obtains the response term from the formula } -\keyword{internal} +\details{ +This is a helper function to extract the response variable from a formula +} diff --git a/man/getTargetForPyClassification.Rd b/man/getTargetForPyClassification.Rd new file mode 100644 index 0000000..a4291b5 --- /dev/null +++ b/man/getTargetForPyClassification.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/r-helper-utilites-python.R +\name{getTargetForPyClassification} +\alias{getTargetForPyClassification} +\title{Extracts selected column from a data frame a binary class Python array} +\usage{ +getTargetForPyClassification(dataset, targetVarName, positiveClass) +} +\description{ +Extracts selected column from a data frame a binary class Python array +} +\details{ +Helper function, which when provided an R dataframe and a binary categorical column, +extracts it from the R data frame, converts it to 1/0 class coding, and converts it to a Python array + +Typically this function can be used to extract a target variable for a classifier to be provided to a +Python machine learning function +} diff --git a/man/getTerm.Rd b/man/getTerm.Rd index d028884..a298cd0 100644 --- a/man/getTerm.Rd +++ b/man/getTerm.Rd @@ 
-6,7 +6,15 @@ \usage{ getTerm(f) } +\arguments{ +\item{f}{formula from which term is to be extracted.} +} +\value{ +String with the terms +} \description{ Obtains the dependency term from the formula } -\keyword{internal} +\details{ +This is a helper function to extract the terms from a formula +} diff --git a/man/initDfBasedOnType.Rd b/man/initDfBasedOnType.Rd index 9dac48e..ea4f284 100644 --- a/man/initDfBasedOnType.Rd +++ b/man/initDfBasedOnType.Rd @@ -22,8 +22,8 @@ Transforms provided inputs into R data frame regardless of the input provided, b or Python data frames } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -34,8 +34,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getOutputById}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/initialize-methods.Rd b/man/initialize-methods.Rd new file mode 100644 index 0000000..315dccd --- /dev/null +++ b/man/initialize-methods.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/core-functions.R, R/core-functions-batch.R, +% R/core-functions-meta-pipelines.R, R/core-streaming-functions.R +\docType{methods} +\name{initialize,BaseAnalysisPipeline-method} +\alias{initialize,BaseAnalysisPipeline-method} +\alias{initialize,AnalysisPipeline-method} +\alias{initialize,MetaAnalysisPipeline-method} +\alias{initialize,StreamingAnalysisPipeline-method} +\title{This is the constructor for the \link{BaseAnalysisPipeline} class} +\usage{ +\S4method{initialize}{BaseAnalysisPipeline}(.Object) + +\S4method{initialize}{AnalysisPipeline}(.Object, ..., + input = data.frame(), filePath = "") + +\S4method{initialize}{MetaAnalysisPipeline}(.Object, type = "batch") + +\S4method{initialize}{StreamingAnalysisPipeline}(.Object, input) +} +\description{ +BaseAnalysisPipeline constructor + +AnalysisPipeline constructor + +MetaAnalysisPipeline constructor + +StreamingAnalysisPipeline constructor +} +\seealso{ +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, + \code{\link{assessEngineSetUp}}, + \code{\link{checkSchemaMatch}}, + \code{\link{createPipelineInstance}}, + \code{\link{exportAsMetaPipeline}}, + \code{\link{generateOutput}}, + \code{\link{genericPipelineException}}, + \code{\link{getInput}}, \code{\link{getLoggerDetails}}, + \code{\link{getOutputById}}, + \code{\link{getPipelinePrototype}}, + \code{\link{getPipeline}}, \code{\link{getRegistry}}, + \code{\link{initDfBasedOnType}}, + \code{\link{loadMetaPipeline}}, + \code{\link{loadPipeline}}, + \code{\link{loadPredefinedFunctionRegistry}}, + \code{\link{loadRegistry}}, \code{\link{prepExecution}}, + \code{\link{registerFunction}}, + \code{\link{savePipeline}}, \code{\link{saveRegistry}}, + \code{\link{setInput}}, \code{\link{setLoggerDetails}}, + \code{\link{updateObject}}, + \code{\link{visualizePipeline}} + +Other Package core functions for batch/one-time analyses: 
\code{\link{AnalysisPipeline-class}}, + \code{\link{checkSchema}}, \code{\link{generateReport}} + +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, + \code{\link{assessEngineSetUp}}, + \code{\link{checkSchemaMatch}}, + \code{\link{createPipelineInstance}}, + \code{\link{exportAsMetaPipeline}}, + \code{\link{generateOutput}}, + \code{\link{genericPipelineException}}, + \code{\link{getInput}}, \code{\link{getLoggerDetails}}, + \code{\link{getOutputById}}, + \code{\link{getPipelinePrototype}}, + \code{\link{getPipeline}}, \code{\link{getRegistry}}, + \code{\link{initDfBasedOnType}}, + \code{\link{loadMetaPipeline}}, + \code{\link{loadPipeline}}, + \code{\link{loadPredefinedFunctionRegistry}}, + \code{\link{loadRegistry}}, \code{\link{prepExecution}}, + \code{\link{registerFunction}}, + \code{\link{savePipeline}}, \code{\link{saveRegistry}}, + \code{\link{setInput}}, \code{\link{setLoggerDetails}}, + \code{\link{updateObject}}, + \code{\link{visualizePipeline}} +} +\concept{Package core functions} +\concept{Package core functions for batch/one-time analyses} +\keyword{internal} diff --git a/man/initializeAnalysisPipeline.Rd b/man/initializeAnalysisPipeline.Rd deleted file mode 100644 index 1b5350e..0000000 --- a/man/initializeAnalysisPipeline.Rd +++ /dev/null @@ -1,32 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/core-functions-batch.R -\docType{methods} -\name{initializeAnalysisPipeline} -\alias{initializeAnalysisPipeline} -\title{Constructor for the \link{AnalysisPipeline} class} -\usage{ -\S4method{initialize}{AnalysisPipeline}(.Object, ..., - input = data.frame(), filePath = "") -} -\arguments{ -\item{.Object}{The \code{AnalysisPipeline} object} - -\item{input}{The data frame on which operations need to be performed} - -\item{filePath}{File path for a .csv file to directly read in the dataset from} -} -\value{ -an object of class \code{AnalysisPipeline}, initialized with the input data frame provided -} -\description{ -Constructor for the \link{AnalysisPipeline} class -} -\details{ -Either one of \code{input} or \code{filePath} need to be provided i.e. either the - data frame or the file path to a csv file -} -\seealso{ -Other Package core functions for batch/one-time analyses: \code{\link{AnalysisPipeline}}, - \code{\link{checkSchema}}, \code{\link{generateReport}} -} -\concept{Package core functions for batch/one-time analyses} diff --git a/man/initializeBaseAnalysisPipeline.Rd b/man/initializeBaseAnalysisPipeline.Rd deleted file mode 100644 index d460e27..0000000 --- a/man/initializeBaseAnalysisPipeline.Rd +++ /dev/null @@ -1,50 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/core-functions.R -\docType{methods} -\name{initializeBaseAnalysisPipeline} -\alias{initializeBaseAnalysisPipeline} -\title{This is the constructor for the \link{BaseAnalysisPipeline} class} -\usage{ -\S4method{initialize}{BaseAnalysisPipeline}(.Object) -} -\arguments{ -\item{.Object}{The \code{BaseAnalysisPipeline} object} - -\item{loggerDetails}{Provide logger details} -} -\value{ -an object of class \code{BaseAnalysisPipeline}" -} -\description{ -This is the constructor for the \link{BaseAnalysisPipeline} class -} -\details{ -This is a constructor function for the base class for various types of Analysis Pipelines. This method gets - internally called by \code{AnalysisPipeline} and \code{StreamingAnalysisPipeline} constructors. 
-} -\seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, - \code{\link{assessEngineSetUp}}, - \code{\link{checkSchemaMatch}}, - \code{\link{createPipelineInstance}}, - \code{\link{exportAsMetaPipeline}}, - \code{\link{generateOutput}}, - \code{\link{genericPipelineException}}, - \code{\link{getInput}}, \code{\link{getLoggerDetails}}, - \code{\link{getOutputById}}, - \code{\link{getPipelinePrototype}}, - \code{\link{getPipeline}}, \code{\link{getRegistry}}, - \code{\link{initDfBasedOnType}}, - \code{\link{initializeMetaAnalysisPipeline}}, - \code{\link{loadMetaPipeline}}, - \code{\link{loadPipeline}}, - \code{\link{loadPredefinedFunctionRegistry}}, - \code{\link{loadRegistry}}, \code{\link{prepExecution}}, - \code{\link{registerFunction}}, - \code{\link{savePipeline}}, \code{\link{saveRegistry}}, - \code{\link{setInput}}, \code{\link{setLoggerDetails}}, - \code{\link{updateObject}}, - \code{\link{visualizePipeline}} -} -\concept{Package core functions} diff --git a/man/initializeMetaAnalysisPipeline.Rd b/man/initializeMetaAnalysisPipeline.Rd deleted file mode 100644 index edb2f78..0000000 --- a/man/initializeMetaAnalysisPipeline.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/core-functions-meta-pipelines.R -\docType{methods} -\name{initializeMetaAnalysisPipeline} -\alias{initializeMetaAnalysisPipeline} -\title{This is the constructor for the \link{MetaAnalysisPipeline} class} -\usage{ -\S4method{initialize}{MetaAnalysisPipeline}(.Object, type = "batch") -} -\arguments{ -\item{.Object}{The \code{MetaAnalysisPipeline} object} - -\item{type}{A string defining whether it is a batch or streaming pipeline. Acceptable values are 'batch' & 'streaming'} -} -\value{ -an object of class \code{MetaAnalysisPipeline}" -} -\description{ -This is the constructor for the \link{MetaAnalysisPipeline} class -} -\details{ -This method is a constructor for the \code{MetaAnalysisPipeline} class -} -\seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, - \code{\link{assessEngineSetUp}}, - \code{\link{checkSchemaMatch}}, - \code{\link{createPipelineInstance}}, - \code{\link{exportAsMetaPipeline}}, - \code{\link{generateOutput}}, - \code{\link{genericPipelineException}}, - \code{\link{getInput}}, \code{\link{getLoggerDetails}}, - \code{\link{getOutputById}}, - \code{\link{getPipelinePrototype}}, - \code{\link{getPipeline}}, \code{\link{getRegistry}}, - \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{loadMetaPipeline}}, - \code{\link{loadPipeline}}, - \code{\link{loadPredefinedFunctionRegistry}}, - \code{\link{loadRegistry}}, \code{\link{prepExecution}}, - \code{\link{registerFunction}}, - \code{\link{savePipeline}}, \code{\link{saveRegistry}}, - \code{\link{setInput}}, \code{\link{setLoggerDetails}}, - \code{\link{updateObject}}, - \code{\link{visualizePipeline}} -} -\concept{Package core functions} diff --git a/man/initializeStreamingAnalysisPipeline.Rd b/man/initializeStreamingAnalysisPipeline.Rd deleted file mode 100644 index 25e1765..0000000 --- a/man/initializeStreamingAnalysisPipeline.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/core-streaming-functions.R -\docType{methods} -\name{initializeStreamingAnalysisPipeline} -\alias{initializeStreamingAnalysisPipeline} -\title{Constructor for the 
\code{StreamingAnalysisPipeline} object} -\usage{ -\S4method{initialize}{StreamingAnalysisPipeline}(.Object, input) -} -\arguments{ -\item{.Object}{The \code{StreamingAnalysisPipeline} object} - -\item{input}{The Spark DataFrame on which operations need to be performed} -} -\value{ -an object of class "\code{StreamingAnalysisPipeline}", initialized with the input Spark DataFrame provided -} -\description{ -Constructor for the \code{StreamingAnalysisPipeline} object -} -\details{ -\code{input} needs to be provded and the argument needs to be of class \code{SparkDataFrame}, which is - generally created through operations using SparkR -} -\seealso{ -Other Package core functions for Streaming Analyses: \code{\link{StreamingAnalysisPipeline}} -} -\concept{Package core functions for Streaming Analyses} diff --git a/man/isDependencyParam.Rd b/man/isDependencyParam.Rd index 749e8f9..ff4984b 100644 --- a/man/isDependencyParam.Rd +++ b/man/isDependencyParam.Rd @@ -6,7 +6,16 @@ \usage{ isDependencyParam(f) } +\arguments{ +\item{f}{formula from which term is to be extracted.} +} +\value{ +Logical as to whether it is a dependency parameter +} \description{ Checks if the parameter is the dependency parameter } -\keyword{internal} +\details{ +This is a helper function to check if the formula provided is a dependency parameter, +as per the package's formula semantics, capturing function dependencies +} diff --git a/man/loadMetaPipeline.Rd b/man/loadMetaPipeline.Rd index 0976f03..e3457bf 100644 --- a/man/loadMetaPipeline.Rd +++ b/man/loadMetaPipeline.Rd @@ -23,8 +23,8 @@ Note - When a meta-pipeline is loaded, the existing registry is overwritten with meta-pipeline } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -36,8 +36,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, \code{\link{loadRegistry}}, \code{\link{prepExecution}}, diff --git a/man/loadPipeline.Rd b/man/loadPipeline.Rd index 3b9ae74..0630c3a 100644 --- a/man/loadPipeline.Rd +++ b/man/loadPipeline.Rd @@ -35,8 +35,8 @@ Note - When a pipeline is loaded, the existing registry is overwritten with the pipeline } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -48,8 +48,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, \code{\link{loadRegistry}}, 
\code{\link{prepExecution}}, diff --git a/man/loadPredefinedFunctionRegistry.Rd b/man/loadPredefinedFunctionRegistry.Rd index 21855a1..52c9fb0 100644 --- a/man/loadPredefinedFunctionRegistry.Rd +++ b/man/loadPredefinedFunctionRegistry.Rd @@ -13,8 +13,8 @@ Loading the registry of predefined functions Loads the registry of predefined functions } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -26,8 +26,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadRegistry}}, \code{\link{prepExecution}}, diff --git a/man/loadRegistry.Rd b/man/loadRegistry.Rd index 8dbc39d..914529a 100644 --- a/man/loadRegistry.Rd +++ b/man/loadRegistry.Rd @@ -17,8 +17,8 @@ This function loads a function registry and associated function definition store environment. The existing registry is overwritten with the newly loaded registry } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -30,8 +30,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/prepExecution.Rd b/man/prepExecution.Rd index b1ccb48..28c1f7d 100644 --- a/man/prepExecution.Rd +++ b/man/prepExecution.Rd @@ -1,10 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{prepExecution} \alias{prepExecution} +\alias{prepExecution,BaseAnalysisPipeline-method} \title{Prepare the pipleline for execution} \usage{ prepExecution(object) + +\S4method{prepExecution}{BaseAnalysisPipeline}(object) +} +\arguments{ +\item{object}{A Pipeline object} } \value{ Updated \code{AnalysisPipeline} \code{StreamingAnalysisPipeline} object @@ -17,8 +24,8 @@ The pipeline is prepared for execution by identifying the graph of the pipeline and dependency map in order to prepare for execution } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -30,8 +37,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, 
\code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/registerFunction.Rd b/man/registerFunction.Rd index 6aebbd6..85d79b0 100644 --- a/man/registerFunction.Rd +++ b/man/registerFunction.Rd @@ -19,6 +19,8 @@ registerFunction(functionName, heading = "", functionType = "batch", \item{engine}{specifies which engine the function is to be run on. Available engines include "r", "spark", and "python"} +\item{exceptionFunction}{R object corresponding to the exception function} + \item{isDataFunction}{logical parameter which defines whether the function to be registered operates on data i.e. the first parameter is a dataframe} \item{firstArgClass}{character string with the class of the first argument to the function, if it is a non-data function} @@ -37,8 +39,8 @@ If the function already exists in the registry, registration will be skipped. In to be reassigned in the Global Environment and then the \code{registerFunction} called again. } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -50,8 +52,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/savePipeline.Rd b/man/savePipeline.Rd index 0a81d45..786e7c9 100644 --- a/man/savePipeline.Rd +++ b/man/savePipeline.Rd @@ -1,10 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/core-functions.R +% Please edit documentation in R/core-functions.R, +% R/core-functions-meta-pipelines.R +\docType{methods} \name{savePipeline} \alias{savePipeline} +\alias{savePipeline,BaseAnalysisPipeline-method} +\alias{savePipeline,MetaAnalysisPipeline-method} \title{Saves the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object to the file system without outputs} \usage{ savePipeline(object, path) + +\S4method{savePipeline}{BaseAnalysisPipeline}(object, path) + +\S4method{savePipeline}{MetaAnalysisPipeline}(object, path) } \arguments{ \item{object}{object that contains input, pipeline, registry and output} @@ -25,8 +33,8 @@ This method is implemented on the base class as it is a shared functionality typ which extend this class } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -38,8 +46,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, 
\code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/saveRegistry.Rd b/man/saveRegistry.Rd index b9f74ce..687e83f 100644 --- a/man/saveRegistry.Rd +++ b/man/saveRegistry.Rd @@ -17,8 +17,8 @@ This function saves the existing function registry and associated function defi environment into a file. } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -30,8 +30,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/setInput.Rd b/man/setInput.Rd index b739052..522ff1b 100644 --- a/man/setInput.Rd +++ b/man/setInput.Rd @@ -1,10 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{setInput} \alias{setInput} +\alias{setInput,BaseAnalysisPipeline-method} \title{Sets the input for an \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} \usage{ setInput(object, input, filePath = "") + +\S4method{setInput}{BaseAnalysisPipeline}(object, input, filePath = "") } \arguments{ \item{object}{object that contains input, pipeline, registry and output} @@ -26,8 +30,8 @@ This method is implemented on the base class as it is a shared functionality typ which extend this class } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -39,8 +43,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/setLoggerDetails.Rd b/man/setLoggerDetails.Rd index 69d98e8..8f1c210 100644 --- a/man/setLoggerDetails.Rd +++ b/man/setLoggerDetails.Rd @@ -1,11 +1,17 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{setLoggerDetails} \alias{setLoggerDetails} +\alias{setLoggerDetails,BaseAnalysisPipeline-method} \title{Sets the logger configuration for the pipeline} \usage{ setLoggerDetails(object, target = "console", targetFile = "pipelineExecution.out", layout = "layout.simple") + +\S4method{setLoggerDetails}{BaseAnalysisPipeline}(object, + target = 
"console", targetFile = "pipelineExecution.out", + layout = "layout.simple") } \arguments{ \item{object}{A Pipeline object} @@ -14,7 +20,7 @@ setLoggerDetails(object, target = "console", \item{targetFile}{File name of the log file in case the target is 'file'} -\item{targetLayout}{Specify the layout according to 'futile.logger' package convention} +\item{layout}{Specify the layout according to 'futile.logger' package convention} } \description{ Sets the logger configuration for the pipeline @@ -23,8 +29,8 @@ Sets the logger configuration for the pipeline This function sets the logger configuration for the pipeline. } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -36,8 +42,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/setPythonEnvir.Rd b/man/setPythonEnvir.Rd new file mode 100644 index 0000000..082792a --- /dev/null +++ b/man/setPythonEnvir.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/r-helper-utilites-python.R +\name{setPythonEnvir} +\alias{setPythonEnvir} +\title{Sets the python environment to be used} +\usage{ +setPythonEnvir(type = "conda", pathOrEnvirName = "base") +} +\arguments{ +\item{type}{Type of python environment. 
Takes three possible vales - 'conda' for Anaconda environments, +'virtualenv' for Virtual environments, and 'python' to manually set the python path to use} + +\item{pathOrEnvirName}{Name of the environment for Anaconda and Virtual environments, +or the Python path when type is 'python'} +} +\description{ +Sets the python environment to be used +} +\details{ +Wrapper function over reticulate functions to set a python environment to be used +} +\seealso{ +Other R helper utilities for Python: \code{\link{getFeaturesForPyClassification}} +} +\concept{R helper utilities for Python} diff --git a/man/univarCatDistPlots.Rd b/man/univarCatDistPlots.Rd index 22205ce..317087c 100644 --- a/man/univarCatDistPlots.Rd +++ b/man/univarCatDistPlots.Rd @@ -7,6 +7,8 @@ univarCatDistPlots(data, uniCol, priColor, optionalPlots) } \arguments{ +\item{data}{the dataset where the column on which the plot is to be generated is present} + \item{uniCol}{the name of column on which the plot needs to be generated} \item{priColor}{the primary color for the plots} diff --git a/man/updateObject.Rd b/man/updateObject.Rd index ac4e297..27e12ef 100644 --- a/man/updateObject.Rd +++ b/man/updateObject.Rd @@ -1,11 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/core-functions.R +\docType{methods} \name{updateObject} \alias{updateObject} +\alias{updateObject,BaseAnalysisPipeline-method} \title{Update the \code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object by adding an operation to the pipeline} \usage{ updateObject(object, operation, heading = "", parameters, outAsIn = F, storeOutput = F) + +\S4method{updateObject}{BaseAnalysisPipeline}(object, operation, + heading = "", parameters, outAsIn = F, storeOutput = F) } \arguments{ \item{object}{object that contains input, pipeline, registry and output} @@ -35,8 +40,8 @@ This method is implemented on the base class as it is a shared functionality typ which extend this class } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -48,8 +53,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/man/visualizePipeline.Rd b/man/visualizePipeline.Rd index e31b7f1..b829acd 100644 --- a/man/visualizePipeline.Rd +++ b/man/visualizePipeline.Rd @@ -1,10 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/core-functions.R +% Please edit documentation in R/core-functions.R, +% R/core-functions-meta-pipelines.R +\docType{methods} \name{visualizePipeline} \alias{visualizePipeline} +\alias{visualizePipeline,BaseAnalysisPipeline-method} +\alias{visualizePipeline,MetaAnalysisPipeline-method} \title{Visualizes the pipeline as a graph} \usage{ visualizePipeline(object) + +\S4method{visualizePipeline}{BaseAnalysisPipeline}(object) + +\S4method{visualizePipeline}{MetaAnalysisPipeline}(object) } \arguments{ \item{object}{The 
\code{AnalysisPipeline} or \code{StreamingAnalysisPipeline} object} @@ -20,8 +28,8 @@ Indicates dependencies amongst functions as well as functions for which output needs to be stored } \seealso{ -Other Package core functions: \code{\link{BaseAnalysisPipeline}}, - \code{\link{MetaAnalysisPipeline}}, +Other Package core functions: \code{\link{BaseAnalysisPipeline-class}}, + \code{\link{MetaAnalysisPipeline-class}}, \code{\link{assessEngineSetUp}}, \code{\link{checkSchemaMatch}}, \code{\link{createPipelineInstance}}, @@ -33,8 +41,7 @@ Other Package core functions: \code{\link{BaseAnalysisPipeline}}, \code{\link{getPipelinePrototype}}, \code{\link{getPipeline}}, \code{\link{getRegistry}}, \code{\link{initDfBasedOnType}}, - \code{\link{initializeBaseAnalysisPipeline}}, - \code{\link{initializeMetaAnalysisPipeline}}, + \code{\link{initialize,BaseAnalysisPipeline-method}}, \code{\link{loadMetaPipeline}}, \code{\link{loadPipeline}}, \code{\link{loadPredefinedFunctionRegistry}}, diff --git a/vignettes/Analysis_pipelines_for_working_with_Python_functions.Rmd b/vignettes/Analysis_pipelines_for_working_with_Python_functions.Rmd new file mode 100644 index 0000000..8467ac6 --- /dev/null +++ b/vignettes/Analysis_pipelines_for_working_with_Python_functions.Rmd @@ -0,0 +1,97 @@ +--- +title: "Analysis pipelines for working with Python functions" +author: "Naren Srinivasan" +date: "11/27/2018" +output: + rmarkdown::html_vignette: + toc: true + fig_width: 8 +vignette: > + %\VignetteIndexEntry{Analysis pipelines for working with Python functions} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- +# Introduction + +*Python* has grown exponentially over the past few years in terms of usage for data science, and specifically machine learning. It provides an extensive set of modules for executing various machine learning tasks. The *reticulate* R package provides a mechanism for interoperability between R and Python. It provides direct translation between equivalent commonly used object types, as well as functions. + +The *analysisPipelines* package uses the *reticulate* package under the hood, and provides a consistent high-level interface for the data scientist, as discussed in other vignettes. + +The vignette describes defining and executing *Python*-only pipelines using the *analysisPipelines* package. + +# Important Note + +The functionality of adding Python functions to the pipeline is enabled under the hood by the *reticulate* package. As the *reticulate* package itself is in its early stages of development and usage, some things might not work as expected. Additionally, for reticulating *Python* code itself in R MarkDown chunks (as opposed to sourcing Python files) **RStudio 1.2** is required, though it is still in Preview phase, as of the time of writing this vignette. + +On a separate note, there is a slight difference between how *SparkR* and *reticulate* are designed. SparkR provides wrappers to Spark functions and stays true to the conventions and classes used in *Apache Spark*, with the main type conversion offered being that on a data frame. *reticulate* is different in the sense that its aim is to provide interoperability, and provides type conversion between a wide range of object types between R and Python. + +The biggest difference is in terms of functions - in SparkR, functions written in Scala, etc. in a Python session cannot be accessed from an R session. 
However, using *reticulate*, user-defined functions written in Python and sourced can be accessed as objects in an R session. This allows greater flexibility to write custom functions in Python, source the file, and then call those functions from R. This difference in design is important to understand, in order to construct functions which can then be used to compose pipelines. +```{r} +knitr::opts_chunk$set( + eval = FALSE + ) +``` + + +# Setup + +The *analysisPipelines* package provides a couple of helper functions in R, making it easier to interact with the Python environment. One of them is to set the Python environment, which we do like so: + +```{r} + +library(analysisPipelines) + +analysisPipelines::setPythonEnvir('python', '/Users/naren/anaconda3/bin/python') +os <- reticulate::import("os") +numpy <- reticulate::import("numpy") +pandas <- reticulate::import("pandas") +sklearn <- reticulate::import("sklearn") + +reticulate::source_python(system.file("python/sampleFunctions.py", package = "analysisPipelines")) + +reticulate::py_config() +``` + +# Registering Python functions + +Python functions which have been sourced through *reticulate* are available as references in the R environment and can be directly registered as part of the pipeline, through the usual mechanism. + +For non-R engines, such as Spark and Python, a suffix with the engine name is added to the function name on registration. So, functions with this suffix need to be used when pipelining to an *Analysis Pipeline* object. The engine is added as a suffix for better readability. A suffix is used (as opposed to a prefix) to enable easier auto-completes. + +The *analysisPipelines* package creates wrapper methods which contain the *argument* signature of the Python function. This allows the user to know what arguments need to be passed. Normal *reticulate* imports have a `...` signature. + +In our Python sample function file, we have a function called `decisionTreeTrainAndTest` which was sourced. We register this function: + +```{r} +registerFunction('decisionTreeTrainAndTest', engine = "python", isDataFunction = F, firstArgClass = "numpy.ndarray") +getRegistry() +``` + +# Defining pipelines + +Pipelines are defined and executed as usual. Regardless of the engine being used, the high-level interface remains the same.
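+As a quick, illustrative check (assuming the registration above created the wrapper `decisionTreeTrainAndTest_py` in the session), the argument signature exposed by the wrapper can be inspected before pipelining:
+
+```{r}
+# Inspect the signature of the wrapper generated by registerFunction(),
+# as opposed to the bare `...` signature of a raw reticulate import
+args(decisionTreeTrainAndTest_py)
+```
+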
+ +```{r} +data("iris") +trainSample <- sample(1:150, size = 100) +train <- iris[trainSample,] +test <- iris[-trainSample,] #%>>% getFeaturesForPyClassification(featureNames = colnames(iris)[-ncol(iris)]) +obj <- AnalysisPipeline(input = train) + +obj %>>% getFeaturesForPyClassification(featureNames = colnames(train)[-ncol(train)]) %>>% + getTargetForPyClassification(targetVarName = "Species", positiveClass = "setosa") %>>% + getFeaturesForPyClassification(dataset = test, featureNames = colnames(test)[-ncol(test)]) %>>% + decisionTreeTrainAndTest_py(data = ~f1, target = ~f2, newData = ~f3, storeOutput = T) -> objDecisionTree + +objDecisionTree %>>% assessEngineSetUp +objDecisionTree %>>% visualizePipeline +``` + +# Execution + +```{r} +objDecisionTree %>>% generateOutput -> op +#op %>>% generateReport("~/Desktop") +op %>>% getOutputById("4") +``` + diff --git a/vignettes/Analysis_pipelines_for_working_with_R_dataframes.Rmd b/vignettes/Analysis_pipelines_for_working_with_R_dataframes.Rmd index 26e8528..ab90b7d 100644 --- a/vignettes/Analysis_pipelines_for_working_with_R_dataframes.Rmd +++ b/vignettes/Analysis_pipelines_for_working_with_R_dataframes.Rmd @@ -14,13 +14,13 @@ vignette: > # An overview of the package -In a typical data science workflow there are multiple steps involved from data aggregation, cleaning, exploratory analysis, modeling and so on. As the data science community matures, we are seeing that there are a variety of languages which provide better capabilities for specific steps in the data science workflow. *R* is typically used for data transformations, statistical models, and visualizations, while *Python* provides more robust functions for machine learning. In addition to this, *Spark* provides an environment to process high volume data - both as one-time/ batch or as streams. +In a typical data science workflow there are multiple steps involved; from data aggregation, cleaning, exploratory analysis, modeling and so on. As the data science community matures, we are seeing that there are a variety of languages which provide better capabilities for specific steps in the data science workflow. *R* is typically used for data transformations, statistical models, and visualizations, while *Python* provides more robust functions for machine learning. In addition to this, *Spark* provides an environment to process high volume data - both as one-time/ batches or as streams. The job of today's data scientist is changing from one where they are married to a specific tool or language, to one where they are using all these tools for their specialized purposes. The key problem then becomes one of translation between these tools for seamless analysis. Additionally, in the work of a data scientist, there is a need to perform the same task repeatedly, as well as put certain analysis flows (or) pipelines into production to work on new data periodically, or work on streaming data. Recently, interfaces for using these various tools have been published. In terms of R packages, the *reticulate* package provides an interface to Python, and the *SparkR* and *sparklyr* packages provide an interface to Spark. -The *analysisPipelines* package uses these interfaces to enable *Interoperable Pipelines* i.e. the ability compose and execute a reusable data science pipeline which can contain functions to be executed in an *R* environment, in a *Python* environment or in a *Spark* environment. These pipelines can saved and loaded, to enable batch operation as datasets get updated with new data. 
+The *analysisPipelines* package uses these interfaces to enable *Interoperable Pipelines* i.e. the ability to compose and execute a reusable data science pipeline which can contain functions to be executed in an *R* environment, in a *Python* environment or in a *Spark* environment. These pipelines can be saved and loaded, to enable batch operation as datasets get updated with new data. The goal of the *analysisPipelines* package is to make the job of the data scientist easier and help them compose pipelines of analysis which consist of data manipulation, exploratory analysis & reporting, as well as modeling steps. The idea is for data scientists to use tools of their choice through an *R* interface, using this package Essentially, it allows data scientists to: @@ -58,9 +58,11 @@ This package contains 5 vignettes: * This is the main vignette describing the package's core functionality, and explaining this through **batch** pipelines in just **R** * **Analysis pipelines for working with Spark DataFrames for one-time/ batch analyses** - This vignette describes creating **batch** pipelines to execute solely in a *Spark* environment +* **Analysis pipelines for working with Python functions** - This vignette describes creating **batch** pipelines to execute solely in a *Python* environment * **Interoperable analysis pipelines** - This vignette describes creating and executing **batch** pipelines which are composed of functions executing across *supported engines* * **Streaming Analysis Pipelines for working with Apache Spark Structured Streaming** - This vignette describes setting up streaming pipelines on *Apache Spark Structured Streaming* * **Using pipelines inside Shiny widgets or apps** - A brief vignette which illustrates an example of using a pipeline inside a shiny widget with reactive elements and changing data +* **An introduction to meta-pipelines** - This vignette illustrates the use of meta-pipelines # Usage @@ -254,19 +256,11 @@ op %>>% getOutputById("3") ## Differences between `outAsIn` and formulae - When to use what -The package provides 2 mechanisms to pass outputs from previous functions to subsequent ones. The first one is the `outAsIn` parameter. This is limited only to **transformations on data that the pipeline is instantiated with** which need to be passed on and is limited to the output of the *immediate previous function* when defining the pipeline. This is provided as an easy-to-use intuitive interface for the common use case of performing a set of sequential data transformations on the input data before performing some kind of analysis. Therefore, this should be sufficient for simple, or linear pipelines. In essence, the `outAsIn` parameter, can be used for data transformations in the main *path* of the pipeline, i.e. the primary flow. +The package provides 2 mechanisms to pass outputs from previous functions to subsequent ones. The first one is the `outAsIn` parameter. This is limited only to **transformations on data that the pipeline is instantiated with** which need to be passed on and is limited to the output of the *immediate previous function* when defining the pipeline. This is provided as an easy-to-use intuitive interface for the common use case of performing a set of sequential data transformations on the input data before performing some kind of analysis. Therefore, this should be sufficient for simple, or linear pipelines. -Formula semantics are provided to implement more complex pipelines, and are not limited to data parameters.
Any type of object which is an output of a previous function can be used in a subsequent function. The typical use of these formulae are to provide parameters to certain functions in the pipeline, which are the result of previous functions. Formula semantics can be used for *auxiliary* flows in the pipeline which eventually merge into the main *path.* +Formula semantics are provided to implement more complex pipelines, and are not limited to data parameters. Any type of object which is an output of a previous function can be used in a subsequent function. The typical use of these formulae is to provide parameters to certain functions in the pipeline, which are the result of previous functions. Formula semantics can be used for the **data parameter** of data functions as well. This uses the output of the function specified, instead of the input data the object was instantiated with. -A confusion may arise when the first argument of the function is of type 'data.frame' but this function is not designed to operate on the input data tha the pipeline object was instantiated with, or any transformation of it. In this case, the expected practice is to register the function as *non-data* function, and use formula semantics to define the value to be passed to the first input. - -**Important Note** - When using formula semantics for the first argument of a *non-data* function, the `outAsIn` parameter should be specified as `FALSE` (the default), as it is essentially replicating the specification. - -For example, in the below flow, the main *path* is: - -Data -> filtering the data -> Plot a line chart. - -However, there is an auxiliary flow, where summarized data is required for another chart. +In essence, you can implement any kind of complex pipeline with formula semantics. The `outAsIn` parameter is provided as an easy-to-use shortcut for simpler linear pipelines. When a formula is specified for the first argument, the `outAsIn` parameter is rendered irrelevant. ```{r} exampleObj <- AnalysisPipeline(input = iris) @@ -297,22 +291,21 @@ plotSummary <- function(d, summaryVar, groupingVar){ registerFunction("filterData") registerFunction("summarizeData") registerFunction("plotLine") +registerFunction("plotSummary") + -## Non-data function operating on an auxiliary flow -registerFunction("plotSummary", isDataFunction = F, firstArgClass = "data.frame") exampleObj %>>% summarizeData(conditionVar = "Species") %>>% filterData(conditionVar = "Species", val = "setosa", outAsIn = F, storeOutput = F) %>>% - plotLine(y = "Sepal.Length", "Sepal.Width", outAsIn = T, storeOutput = T) %>>% + plotLine(y = "Sepal.Length", x = "Sepal.Width", outAsIn = T, storeOutput = T) %>>% plotSummary(d = ~f1, summaryVar = "Sepal.Length", groupingVar = "Species", storeOutput = T) -> exampleObj +exampleObj %>>% visualizePipeline exampleObj %>>% generateOutput %>>% getOutputById("4") ``` -Note that `plotSummary` which works on an aggregated dataset which is not created on the main *path* of the set of transformations on the input data is defined as a *non-data* function. - # Visualizing pipelines These pipelines can be visualized by calling the `visualizePipeline` method. This generates the whole pipeline as a network, showing the engines on which each function is run, and which outputs are stored.
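+
+As noted in the overview, a pipeline composed this way can also be persisted and re-used when the dataset is refreshed. A minimal sketch, using the `exampleObj` pipeline from above and an illustrative file name:
+
+```{r}
+# Save the pipeline (without outputs), reload it, and point it at data again
+exampleObj %>>% savePipeline(path = "examplePipeline.RDS")
+reloadedPipeline <- loadPipeline(path = "examplePipeline.RDS")
+reloadedPipeline %>>% setInput(input = iris) -> reloadedPipeline
+```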
diff --git a/vignettes/Analysis_pipelines_for_working_with_sparkR.Rmd b/vignettes/Analysis_pipelines_for_working_with_sparkR.Rmd index a50edec..1551b53 100644 --- a/vignettes/Analysis_pipelines_for_working_with_sparkR.Rmd +++ b/vignettes/Analysis_pipelines_for_working_with_sparkR.Rmd @@ -15,11 +15,30 @@ vignette: > *Apache Spark* can be leveraged to process large volumes of distributed data that are typically impossible to process on standalone R servers. The vignette describes defining and executing *Spark*-only pipelines using the *analysisPipelines* package. +# Important Note + +Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. The *SparkR* version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly. + +To install from Github, run the following command, if you know the Spark version: + +```{r eval = F} +devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') +``` + +The other option is to install *SparkR* by running the following *terminal* commands, if Spark has already been installed. + +```{bash eval = F} +$ export SPARK_HOME=/path/to/spark/directory +$ cd $SPARK_HOME/R/lib/SparkR/ +$ R -e "devtools::install('.')" +``` + + # Initialize libraries * Load the *analysisPipelines* and *SparkR* libraries -* Ensure you have a local installation of spark and SparkR package is installed * Check if the SPARK_HOME environment variable is set to spark installation folder. Else, define it using `sys.setenv()` function. + ```{r, include=FALSE} knitr::opts_chunk$set( eval = FALSE ) ``` @@ -49,10 +68,11 @@ sparkRSessionCreateIfNotPresent(master = sparkMaster, sparkPackages = sparkPacka ``` # Read data from csv and initialize pipeline object -Spark can connect to datasources like Hive, Kafka. Besides, it also also read parquet, json and csv files. In this example we will read a csv file. + +Spark can connect to data sources like Hive and Kafka. It can also read parquet, json and csv files. In this example we will read a csv file. + ```{r} pipelineObj <- AnalysisPipeline(filePath = system.file("hotel_new.csv", package = "analysisPipelines")) -# pipelineObj <- AnalysisPipeline(input = SparkR::as.data.frame(system.file("hotel_new.csv", package = "analysisPipelines"))) ``` # User-defined spark functions @@ -83,7 +103,9 @@ bookingChannelsAnalysis <- function(inputDataset, groupByColumn) { # Registering user-defined functions to the pipeline object -Each user-defined function needs to be registered to the pipeline object. Post registration, the function can be used to construct a pipeline. A pipeline is a set of multiple functions called in a particular sequence. +Each user-defined function needs to be registered to the pipeline object. For non-R engines, such as Spark and Python, a suffix with the engine name is added to the function name on registration. So, functions with this suffix need to be used when pipelining to an *Analysis Pipeline* object. The engine is added as a suffix for better readability. A suffix is used (as opposed to a prefix) to enable easier auto-completes. + +Post registration, the function can be used to construct a pipeline. A pipeline is a set of multiple functions called in a particular sequence.
```{r} # Register user-defined functions @@ -96,8 +118,8 @@ registerFunction("bookingChannelsAnalysis", "Booking channels analysis", getRegistry() # Define pipeline from list of registered functions -pipelineObj %>% occupancyAnalysis(groupByColumn = "location_type", storeOutput = T) %>% - bookingChannelsAnalysis(groupByColumn = "location_type", storeOutput = T) -> pipelineObj +pipelineObj %>% occupancyAnalysis_spark(groupByColumn = "location_type", storeOutput = T) %>% + bookingChannelsAnalysis_spark(groupByColumn = "location_type", storeOutput = T) -> pipelineObj pipelineObj %>>% getPipeline pipelineObj %>>% visualizePipeline @@ -116,17 +138,10 @@ occupancyAnalysisResult <- pipelineObj %>>% getOutputById(1) occupancyAnalysisResultDf <- as.data.frame(occupancyAnalysisResult) DT::datatable(head(occupancyAnalysisResultDf),options = list(scrollX = T, scrollY = T)) - -# ggplot2::ggplot(occupancyAnalysisResultDf) + ggplot2::geom_col(ggplot2::aes(location_type, avgOccupancy, color=location_type)) - # Show booking channels analysis result bookingChannelsAnalysisResult <- pipelineObj %>>% getOutputById(2) bookingChannelsAnalysisResultDf <- as.data.frame(bookingChannelsAnalysisResult) DT::datatable(head(bookingChannelsAnalysisResultDf),options = list(scrollX = T, scrollY = T)) - - -# ggplot2::ggplot(bookingChannelsAnalysisResultDf) + ggplot2::geom_col(ggplot2::aes(location_type, avgCallCenterBookingPct, color=location_type)) + ggplot2::scale_y_continuous(labels = scales::percent) - ``` # Supplementary Note diff --git a/vignettes/Interoperable_Pipelines.Rmd b/vignettes/Interoperable_Pipelines.Rmd index d2fb6a1..b556252 100644 --- a/vignettes/Interoperable_Pipelines.Rmd +++ b/vignettes/Interoperable_Pipelines.Rmd @@ -12,12 +12,16 @@ vignette: > %\VignetteEncoding{UTF-8} --- -# Objective +# Introduction This vignette explains how **interoperable pipelines** containing functions operating on different engines such as R, Spark and Python can be configured and executed through the **analysisPipelines** package. Currently, the package supports interoperable pipelines containing R and Spark batch functions. If the package is new to you, it is recommended that you go through the *Analysis pipelines - Core functionality and working with R data frames and functions* vignette first. +# Important Note + +Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. 
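+
+For reference, the same GitHub installation route shown in the Spark batch and streaming vignettes applies here as well; substitute the Spark version you are running for `v2.x.x`:
+
+```{r eval = F}
+# Install the SparkR package that ships with a specific Apache Spark release
+devtools::install_github('apache/spark@v2.x.x', subdir = 'R/pkg')
+```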
+ ```{r echo = FALSE} library(analysisPipelines) knitr::opts_chunk$set( @@ -44,6 +48,21 @@ inputDataset <- SparkR::read.df(path = system.file("hotel_new.csv", package = "a ``` +## Initializing Python connection + +```{r} +analysisPipelines::setPythonEnvir('python', '/Users/naren/anaconda3/bin/python') +os <- reticulate::import("os") +numpy <- reticulate::import("numpy") +pandas <- reticulate::import("pandas") +sklearn <- reticulate::import("sklearn") + +reticulate::source_python(system.file("python/sampleFunctions.py", package = "analysisPipelines")) + +reticulate::py_config() +``` + + ## Creating an analysisPipeline object We then initialize an *AnalysisPipeline*, with the input dataset @@ -65,13 +84,13 @@ getSchema <- function(inputDataset) { return(sparkSchema) } -sparkFilterData <- function(inputDataset, condition) { +filterData <- function(inputDataset, condition) { filteredData <- SparkR::filter(inputDataset, condition) return(filteredData) } registerFunction(functionName = "getSchema", engine = "spark") -registerFunction(functionName = "sparkFilterData", engine = "spark") +registerFunction(functionName = "filterData", engine = "spark") getRegistry() @@ -165,12 +184,17 @@ registerFunction(functionName = "rBivarPlots", engine = "r", heading = "Bivariat getRegistry() ``` +## Registering Python functions + +```{r} +registerFunction("decisionTreeTrainAndTest", engine = "python", isDataFunction = F, firstArgClass = "numpy.ndarray") +getRegistry() +``` -## Interoperable pipeline containing both R & Spark functions +## Interoperable pipeline containing R, Spark and Python functions -* Here we consider a typical use case of performing data filtering/ aggregations and so on and Spark, and then using R to perform analysis -* We use ggplot2 in R to visualize the data that is filtered in Spark +* Here we consider a typical use case of performing data filtering/ aggregations and so on in Spark, and then using R to visualize, and Python to run a machine learning model We first visualize the data without filtering: @@ -188,7 +212,7 @@ opWithoutFilter %>>% getOutputById(1) We then perform filtering on one of the variables in Spark, before visualizing in R ```{r} -pipelineObj %>>% sparkFilterData("Class == 'good'") %>>% +pipelineObj %>>% filterData_spark(condition = "Class == 'good'") %>>% rBivarPlots(select_var_name_1 = "Compet_Occupancy", select_var_name_2 = "Occupancy", priColor = "blue", secColor = "green", outAsIn = T, storeOutput = T) -> singleFilterPipeline singleFilterPipeline %>>% visualizePipeline @@ -197,57 +221,27 @@ singleFilterPipeline %>>% generateOutput -> opWithFilter opWithFilter %>>% getOutputById(2) ``` -Finally, we show a case, where sequential filtering steps are performed in Spark, before visualizing in R - -```{r} -pipelineObj %>>% sparkFilterData("Class == 'good'") %>>% - sparkFilterData("Occupancy > 0.7", outAsIn = T, storeOutput = T) %>>% - rBivarPlots(select_var_name_1 = "Compet_Occupancy", select_var_name_2 = "Occupancy", - priColor = "blue", secColor = "green", outAsIn = T, storeOutput = T) -> twoFilterPipeline -twoFilterPipeline %>>% generateOutput -> opWith2Filters -opWith2Filters %>>% getOutputById(3) -opWith2Filters %>>% visualizePipeline -# opWith2Filters %>>% generateReport(path = "~/Desktop") -``` - +Finally, we show a case, where sequential filtering steps are performed in Spark, before visualizing in R, and running a decision tree model in Python.
-# Saving and reloading the pipeline - -First, we save the pipeline - -```{r} -twoFilterPipeline %>>% savePipeline(path = "~/Downloads/twoFilterPipeline.Rda") -``` - -Then, we clear the R environment and restart the R session +Note that in this case, we register `getFeaturesForPyClassification` and `getTargetForPyClassification` as *non-data* functions. In this particular pipeline, there is no main *path* as such, as the pipeline branches into 2 paths - one in R and the other in Python. In such cases, using `outAsIn` or the `isDataFunction` parameter with formula semantics is just a **question of convenience**. If the first argument of a *non-data* function is of a data frame class in R, Python (Pandas) or Spark, the package automatically performs type conversions when environments are switched (R -> Spark, Spark -> Python, and so on). ```{r} -rm(list=ls(all=TRUE)) # Remove all objects in the R environment -#.rs.restartR() # Run on console to restart R session -``` - -We then reload the package, and start the Spark connection - -```{r sourcing} -library(analysisPipelines) -sparkHome <- "/Users/naren/softwares/spark-2.3.1-bin-hadoop2.7/" -sparkMaster <- "local[1]" -sparkPackages <- c("org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.1") - -sparkRSessionCreateIfNotPresent(sparkHome = sparkHome, master = sparkMaster, sparkPackages = sparkPackages) - -inputDataset <- SparkR::read.df(path=system.file("hotel_new.csv", package = "analysisPipelines"),source="csv",header = TRUE, inferSchema = "true") -``` +pipelineObj %>>% filterData_spark(condition = "Class == 'good'") %>>% + filterData_spark(condition = "Occupancy > 0.7", outAsIn = T) %>>% + rBivarPlots(select_var_name_1 = "Compet_Occupancy", select_var_name_2 = "Occupancy", + priColor = "blue", secColor = "green", outAsIn = T, storeOutput = T) %>>% + getFeaturesForPyClassification(dataset = ~f2, featureNames = c("direct_call_booking_pct", + "loyalty_pct", + "call_center_booking_pct")) %>>% + getTargetForPyClassification(dataset = ~f2, targetVarName = "Class", positiveClass = "good") %>>% + decisionTreeTrainAndTest_py(data = ~f4, target = ~f5, newData = ~f4, storeOutput = T) -> twoFilterPipeline -Then, we load the pipeline and execute on the new data which is the just the first hundred rows of the original dataset +twoFilterPipeline %>>% visualizePipeline -```{r} +twoFilterPipeline %>>% generateOutput -> opWith2Filters +opWith2Filters %>>% getOutputById(3) +opWith2Filters %>>% getOutputById(6) -pipelineObjLoaded <- loadPipeline(path = "~/Downloads/twoFilterPipeline.Rda") -pipelineObjLoaded %>>% checkSchemaMatch(newData = SparkR::as.data.frame(inputDataset)[1:100,]) -> schemaCheck -pipelineObjLoaded %>>% setInput(input = inputDataset) -> pipelineObjLoaded -pipelineObjLoaded %>>% generateOutput -> op -op %>>% getOutputById("3") ``` # Supplementary Note diff --git a/vignettes/Meta_Pipelines.Rmd b/vignettes/Meta_Pipelines.Rmd index 9092cca..213774e 100644 --- a/vignettes/Meta_Pipelines.Rmd +++ b/vignettes/Meta_Pipelines.Rmd @@ -114,7 +114,9 @@ We set the input of the pipeline object to the `iris` dataset and then execute t ```{r} complexMetaPipeline %>>% createPipelineInstance(pipelineProto) -> newPipelineObj -newPipelineObj %>>% setInput(input = iris) %>>% generateOutput %>>% getOutputById("3") +newPipelineObj %>>% setInput(input = iris) -> newPipelineObj + +newPipelineObj %>>% generateOutput %>>% getOutputById("3") ``` diff --git a/vignettes/Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd
b/vignettes/Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd index ae23162..acffe72 100644 --- a/vignettes/Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd +++ b/vignettes/Streaming_pipelines_for_working_Apache_Spark_Structured_Streaming.Rmd @@ -18,6 +18,24 @@ The vignette aims to show examples of using SparkR as an interface to run stream * Implement a pipeline using SparkR dataframes for streaming use cases +# Important Note + +Using *Spark* as an engine requires the *SparkR* package to be installed. *SparkR* is distributed natively with *Apache Spark* and is not distributed on CRAN. The *SparkR* version needs to directly map to the Spark version (hence the native distribution), and care needs to be taken to ensure that this is configured properly. + +To install from Github, run the following command, if you know the Spark version: + +```{r eval = F} +devtools::install_github('apache/spark@v2.x.x', subdir='R/pkg') +``` + +The other option is to install *SparkR* by running the following *terminal* commands, if Spark has already been installed. + +```{bash eval = F} +$ export SPARK_HOME=/path/to/spark/directory +$ cd $SPARK_HOME/R/lib/SparkR/ +$ R -e "devtools::install('.')" +``` + # Initialize libraries * Initialize the analysisPipelines and SparkR libraries @@ -166,8 +184,8 @@ getRegistry() # Define pipeline # Do data prep -pipelineObj %>% castKafkaStreamAsString() %>% - convertKafkaValueFromJson(schema = consumerDataSchema, outAsIn = T) %>% convertStructToDf(outAsIn = T) %>% castDfColumns(outAsIn = T, storeOutput = T) -> pipelineObj +pipelineObj %>% castKafkaStreamAsString_sparkSS() %>% + convertKafkaValueFromJson_sparkSS(schema = consumerDataSchema, outAsIn = T) %>% convertStructToDf_sparkSS(outAsIn = T) %>% castDfColumns_sparkSS(outAsIn = T, storeOutput = T) -> pipelineObj pipelineObj %>>% getPipeline pipelineObj %>>% visualizePipeline @@ -177,7 +195,7 @@ pipelineObj %>>% visualizePipeline The pipeline is run by calling the `generateOutput()` function. The `output` attribute of the pipeline object contains the resultant Spark dataframe(s). -In this example the Spark dataframes are converted to R dataframes to help visualizing the result. +In this example the Spark DataFrames are converted to R dataframes to help understand the result. ```{r} diff --git a/vignettes/Using_pipelines_inside_shiny_widgets.Rmd b/vignettes/Using_pipelines_inside_shiny_widgets.Rmd index 0657be7..7ccb399 100644 --- a/vignettes/Using_pipelines_inside_shiny_widgets.Rmd +++ b/vignettes/Using_pipelines_inside_shiny_widgets.Rmd @@ -2,7 +2,7 @@ title: "Using pipelines inside Shiny widgets or apps" author: "Naren Srinivasan" date: "11/14/2018" -runtime: shiny-prerendered +runtime: shiny output: rmarkdown::html_vignette: fig_width: 8