From 7817017d45c1bfb853ff5863b11f77c4ae3bb951 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sat, 24 Jan 2026 00:34:30 +1100 Subject: [PATCH 1/4] refactor nemo_osfx --- NAMESPACE | 1 + R/write.R | 47 +++++++++++++------ man/nemo_osfx.Rd | 26 ++++++++++ .../test-roxytest-testexamples-write.R | 11 ++++- 4 files changed, 70 insertions(+), 15 deletions(-) create mode 100644 man/nemo_osfx.Rd diff --git a/NAMESPACE b/NAMESPACE index c3658e2..58aedb5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,6 +18,7 @@ export(list_files_dir) export(nemo_cli) export(nemo_log) export(nemo_log_date) +export(nemo_osfx) export(nemo_out_formats) export(nemo_write) export(nemoverse_wf_dispatch) diff --git a/R/write.R b/R/write.R index c176772..661d371 100644 --- a/R/write.R +++ b/R/write.R @@ -47,22 +47,18 @@ nemo_write <- function(d, fpfix = NULL, format = "tsv", dbconn = NULL, dbtab = N } else { stopifnot(!is.null(fpfix)) fpfix <- as.character(fpfix) - sfx <- c(tsv = "tsv.gz", csv = "csv.gz", parquet = "parquet", rds = "rds") - osfx <- function(s) glue("{fpfix}.{sfx[s]}") + osfx <- nemo_osfx(fpfix, format) fs::dir_create(dirname(fpfix)) - if (format == "tsv") { - readr::write_tsv(d, osfx("tsv")) - } else if (format == "csv") { - readr::write_csv(d, osfx("csv")) - } else if (format == "parquet") { - arrow::write_parquet(d, osfx("parquet")) - } else if (format == "rds") { - readr::write_rds(d, osfx("rds")) - } else { - stop("No where else to go, check your output format!") - } + w <- list( + tsv = list(fun = "write_tsv", pkg = "readr"), + csv = list(fun = "write_csv", pkg = "readr"), + parquet = list(fun = "write_parquet", pkg = "arrow"), + rds = list(fun = "write_rds", pkg = "readr") + ) + x <- w[[format]] + fun <- getExportedValue(x[["pkg"]], x[["fun"]]) + fun(d, osfx) } - # also gets returned in case of NULL format return(invisible(d)) } @@ -93,3 +89,26 @@ valid_out_fmt <- function(x, choices = nemo_out_formats()) { nemo_out_formats <- function() { c("parquet", "db", "tsv", "csv", "rds") 
} + +#' Construct Output File Paths with Format Suffix +#' +#' @param fpfix (`character(n)`)\cr +#' Vector of one or more file prefixes e.g. /path/to/foo +#' @param format (`character(1)`)\cr +#' Output format. One of tsv, csv, parquet, rds, or db. +#' @return Character vector of output file paths +#' +#' @examples +#' fpfix <- "path/to/foo" +#' format <- "tsv" +#' o <- nemo_osfx(fpfix, format) +#' @testexamples +#' expect_equal(o, glue("{fpfix}.tsv.gz")) +#' +#' @export +nemo_osfx <- function(fpfix, format) { + valid_out_fmt(format) + fpfix <- as.character(fpfix) + sfx <- c(tsv = "tsv.gz", csv = "csv.gz", parquet = "parquet", rds = "rds") + paste0(fpfix, ".", sfx[format]) +} diff --git a/man/nemo_osfx.Rd b/man/nemo_osfx.Rd new file mode 100644 index 0000000..fa0796a --- /dev/null +++ b/man/nemo_osfx.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/write.R +\name{nemo_osfx} +\alias{nemo_osfx} +\title{Construct Output File Paths with Format Suffix} +\usage{ +nemo_osfx(fpfix, format) +} +\arguments{ +\item{fpfix}{(\code{character(n)})\cr +Vector of one or more file prefixes e.g. /path/to/foo} + +\item{format}{(\code{character(1)})\cr +Output format. 
One of tsv, csv, parquet, rds, or db.} +} +\value{ +Character vector of output file paths +} +\description{ +Construct Output File Paths with Format Suffix +} +\examples{ +fpfix <- "path/to/foo" +format <- "tsv" +o <- nemo_osfx(fpfix, format) +} diff --git a/tests/testthat/test-roxytest-testexamples-write.R b/tests/testthat/test-roxytest-testexamples-write.R index a9955f4..18c02b9 100644 --- a/tests/testthat/test-roxytest-testexamples-write.R +++ b/tests/testthat/test-roxytest-testexamples-write.R @@ -13,7 +13,7 @@ test_that("Function nemo_write() @ L34", { }) -test_that("Function valid_out_fmt() @ L81", { +test_that("Function valid_out_fmt() @ L77", { valid_out_fmt("tsv") expect_true(valid_out_fmt("tsv")) @@ -21,3 +21,12 @@ test_that("Function valid_out_fmt() @ L81", { expect_error(valid_out_fmt(c("tsv", "csv"))) }) + +test_that("Function nemo_osfx() @ L109", { + + fpfix <- "path/to/foo" + format <- "tsv" + o <- nemo_osfx(fpfix, format) + expect_equal(o, glue("{fpfix}.tsv.gz")) +}) + From 5aa731ec386ddda0cbc4107d3676cc15baca4261 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Sun, 25 Jan 2026 15:44:56 +1100 Subject: [PATCH 2/4] rename nemo_id + nemo_pfix; keep outpaths in writer --- R/Tool.R | 18 ++++++++++++------ R/write.R | 3 ++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/R/Tool.R b/R/Tool.R index 14768bd..33a1f30 100644 --- a/R/Tool.R +++ b/R/Tool.R @@ -5,7 +5,6 @@ #' @examples #' \dontrun{ #' path <- here::here("inst/extdata/tool1") -#' x <- Tool$new("tool1", pkg = "nemo", path = path) #' # demo filter + tidy #' x <- Tool1$new(path = path)$ #' filter_files(exclude = "alignments_dupfreq")$ @@ -371,7 +370,6 @@ Tool <- R6::R6Class( # for tidying (and therefore writing). So return NULL. 
return(NULL) } - d_write <- self$tbls |> dplyr::select( "tool_parser", @@ -385,8 +383,8 @@ Tool <- R6::R6Class( tidy_data = list( tidy_data |> tibble::add_column( - nemo_id = as.character(id), - nemo_pfix = as.character(prefix), + input_id = as.character(id), + input_pfix = as.character(prefix), .before = 1 ) ), @@ -411,9 +409,17 @@ Tool <- R6::R6Class( dbconn = dbconn, dbtab = .data$dbtab ) - ) + ), + outpath = attr(out, "outpath") ) |> - dplyr::ungroup() + dplyr::ungroup() |> + dplyr::select( + "tool_parser", + "prefix", + "tidy_data", + "tbl_name", + "outpath" + ) invisible(d_write) }, #' @description Parse, filter, tidy and write files. diff --git a/R/write.R b/R/write.R index 661d371..08207a4 100644 --- a/R/write.R +++ b/R/write.R @@ -59,7 +59,8 @@ nemo_write <- function(d, fpfix = NULL, format = "tsv", dbconn = NULL, dbtab = N fun <- getExportedValue(x[["pkg"]], x[["fun"]]) fun(d, osfx) } - return(invisible(d)) + attr(d, "outpath") <- if (format == "db") NULL else osfx + invisible(d) } #' Output Format is Valid From 0d5d93c178cb05908a767c56316c92c3cccf90b1 Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Mon, 26 Jan 2026 23:59:26 +1100 Subject: [PATCH 3/4] add output_id --- DESCRIPTION | 1 + R/Tool.R | 27 ++++++++++++++++++--------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1f8aad1..250b76e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,6 +29,7 @@ Imports: rlang, tibble, tidyr, + ulid, yaml Suggests: argparse, diff --git a/R/Tool.R b/R/Tool.R index 33a1f30..dc97487 100644 --- a/R/Tool.R +++ b/R/Tool.R @@ -350,12 +350,20 @@ Tool <- R6::R6Class( #' Directory path to output tidy files. Ignored if format is db. #' @param format (`character(1)`)\cr #' Format of output files. - #' @param id (`character(1)`)\cr - #' ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + #' @param input_id (`character(1)`)\cr + #' Input ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). 
#' @param dbconn (`DBIConnection`)\cr #' Database connection object (see `DBI::dbConnect`). + #' @param output_id (`character(1)`)\cr + #' Output ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). #' @return A tibble with the tidy data and their output location prefix. - write = function(odir = ".", format = "tsv", id = NULL, dbconn = NULL) { + write = function( + odir = ".", + format = "tsv", + input_id = NULL, + dbconn = NULL, + output_id = ulid::ulid() + ) { if (format != "db") { if (is.null(odir)) { stop("Output directory must be specified when format is not 'db'.") @@ -363,7 +371,7 @@ Tool <- R6::R6Class( fs::dir_create(odir) odir <- normalizePath(odir) } - stopifnot(!is.null(id)) + stopifnot(!is.null(input_id), !is.null(output_id)) stopifnot("Did you forget to tidy?" = !private$needs_tidying) if (is.null(self$tbls)) { # even though tidying is not needed, there must be no files detected @@ -383,8 +391,9 @@ Tool <- R6::R6Class( tidy_data = list( tidy_data |> tibble::add_column( - input_id = as.character(id), + input_id = as.character(input_id), input_pfix = as.character(prefix), + output_id = as.character(output_id), .before = 1 ) ), @@ -427,8 +436,8 @@ Tool <- R6::R6Class( #' Directory path to output tidy files. #' @param format (`character(1)`)\cr #' Format of output files. - #' @param id (`character(1)`)\cr - #' ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + #' @param input_id (`character(1)`)\cr + #' Input ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). #' @param dbconn (`DBIConnection`)\cr #' Database connection object (see `DBI::dbConnect`). 
#' @param include (`character(n)`)\cr @@ -439,7 +448,7 @@ Tool <- R6::R6Class( nemofy = function( odir = ".", format = "tsv", - id = NULL, + input_id = NULL, dbconn = NULL, include = NULL, exclude = NULL @@ -451,7 +460,7 @@ Tool <- R6::R6Class( write( odir = odir, format = format, - id = id, + input_id = input_id, dbconn = dbconn ) } From 7f87d73fa22b201ddaa8fa6bf6f1e385abd4425b Mon Sep 17 00:00:00 2001 From: pdiakumis Date: Thu, 29 Jan 2026 23:56:17 +1100 Subject: [PATCH 4/4] Tool/Workflow minor refac --- R/Tool.R | 40 ++++++++++++----------- R/Workflow.R | 89 +++++++++++++++++++++++++++++++++++++++------------- 2 files changed, 90 insertions(+), 39 deletions(-) diff --git a/R/Tool.R b/R/Tool.R index dc97487..f2393c8 100644 --- a/R/Tool.R +++ b/R/Tool.R @@ -19,7 +19,7 @@ #' user = "orcabus" #' ) #' lx$nemofy( -#' odir = "nogit/test_data", +#' diro = "nogit/test_data", #' format = "db", # "parquet", #' id = "run2", #' dbconn = dbconn, @@ -346,30 +346,30 @@ Tool <- R6::R6Class( return(invisible(self)) }, #' @description Write tidy tibbles. - #' @param odir (`character(1)`)\cr + #' @param diro (`character(1)`)\cr #' Directory path to output tidy files. Ignored if format is db. #' @param format (`character(1)`)\cr - #' Format of output files. + #' Format of output. #' @param input_id (`character(1)`)\cr - #' Input ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + #' Input ID to use for the dataset (e.g. `run123`). + #' @param output_id (`character(1)`)\cr + #' Output ID to use for the dataset (e.g. `run123`). #' @param dbconn (`DBIConnection`)\cr #' Database connection object (see `DBI::dbConnect`). - #' @param output_id (`character(1)`)\cr - #' Output ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). #' @return A tibble with the tidy data and their output location prefix. 
write = function( - odir = ".", + diro = ".", format = "tsv", input_id = NULL, - dbconn = NULL, - output_id = ulid::ulid() + output_id = ulid::ulid(), + dbconn = NULL ) { if (format != "db") { - if (is.null(odir)) { + if (is.null(diro)) { stop("Output directory must be specified when format is not 'db'.") } - fs::dir_create(odir) - odir <- normalizePath(odir) + fs::dir_create(diro) + diro <- normalizePath(diro) } stopifnot(!is.null(input_id), !is.null(output_id)) stopifnot("Did you forget to tidy?" = !private$needs_tidying) @@ -404,7 +404,7 @@ Tool <- R6::R6Class( paste(.data$tool_parser, .data$tidy_name, sep = "_") ), # used to write when non-db format - fpfix = paste(file.path(odir, .data$prefix), .data$tbl_name, sep = "_"), + fpfix = paste(file.path(diro, .data$prefix), .data$tbl_name, sep = "_"), dbtab = ifelse( format == "db", list(.data$tbl_name), @@ -432,12 +432,14 @@ Tool <- R6::R6Class( invisible(d_write) }, #' @description Parse, filter, tidy and write files. - #' @param odir (`character(1)`)\cr + #' @param diro (`character(1)`)\cr #' Directory path to output tidy files. #' @param format (`character(1)`)\cr - #' Format of output files. + #' Format of output. #' @param input_id (`character(1)`)\cr - #' Input ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + #' Input ID to use for the dataset (e.g. `run123`). + #' @param output_id (`character(1)`)\cr + #' Output ID to use for the dataset (e.g. `run123`). #' @param dbconn (`DBIConnection`)\cr #' Database connection object (see `DBI::dbConnect`). #' @param include (`character(n)`)\cr @@ -446,9 +448,10 @@ Tool <- R6::R6Class( #' Files to exclude. #' @return A tibble with the tidy data and their output location prefix. 
nemofy = function( - odir = ".", + diro = ".", format = "tsv", input_id = NULL, + output_id = ulid::ulid(), dbconn = NULL, include = NULL, exclude = NULL @@ -458,9 +461,10 @@ Tool <- R6::R6Class( filter_files(include = include, exclude = exclude)$ tidy()$ write( - odir = odir, + diro = diro, format = format, input_id = input_id, + output_id = output_id, dbconn = dbconn ) } diff --git a/R/Workflow.R b/R/Workflow.R index cff5421..785cfaf 100644 --- a/R/Workflow.R +++ b/R/Workflow.R @@ -6,10 +6,10 @@ #' path <- system.file("extdata/tool1", package = "nemo") #' tools <- list(tool1 = Tool1) #' wf1 <- Workflow$new(name = "foo", path = path, tools = tools) -#' odir <- tempdir() +#' diro <- tempdir() #' wf1$list_files() -#' wf1$nemofy(odir = odir, format = "parquet", id = "run1") -#' (lf <- list.files(odir, pattern = "tool1.*parquet", full.names = FALSE)) +#' wf1$nemofy(diro = diro, format = "parquet", id = "run1") +#' (lf <- list.files(diro, pattern = "tool1.*parquet", full.names = FALSE)) #' #dbconn <- DBI::dbConnect(drv = RPostgres::Postgres(), dbname = "nemo", user = "orcabus") #' #wf1$nemofy(format = "db", id = "runABC", dbconn = dbconn) #' @testexamples @@ -31,6 +31,9 @@ Workflow <- R6::R6Class( #' @field files_tbl (`tibble(n)`)\cr #' Tibble of files from [list_files_dir()]. files_tbl = NULL, + #' @field written_files (`tibble(n)`)\cr + #' Tibble of files written from `self$write()`. + written_files = NULL, #' @description Create a new Workflow object. #' @param name (`character(1)`)\cr @@ -42,6 +45,8 @@ Workflow <- R6::R6Class( initialize = function(name = NULL, path = NULL, tools = NULL) { self$name <- name private$validate_tools(tools) + private$is_tidied <- FALSE + private$is_written <- FALSE self$path <- normalizePath(path) self$files_tbl <- list_files_dir(self$path) # handle everything in a list of Tools @@ -52,10 +57,12 @@ Workflow <- R6::R6Class( #' @param ... (ignored). print = function(...) 
{ res <- tibble::tribble( - ~var , ~value , - "name" , self$name , - "path" , glue::glue_collapse(self$path, sep = ", ") , - "ntools" , as.character(length(self$tools)) + ~var , ~value , + "name" , self$name , + "path" , glue::glue_collapse(self$path, sep = ", ") , + "ntools" , as.character(length(self$tools)) , + "tidied" , as.character(private$is_tidied) , + "written" , as.character(private$is_written) ) cat("#--- Workflow ---#\n") print(res) @@ -88,30 +95,66 @@ Workflow <- R6::R6Class( #' Should the raw parsed tibbles be kept in the final output? #' @return self invisibly. tidy = function(tidy = TRUE, keep_raw = FALSE) { + # if no tidying needed, early return + if (private$is_tidied) { + return(invisible(self)) + } self$tools <- self$tools |> purrr::map(\(x) x$tidy(tidy = tidy, keep_raw = keep_raw)) - invisible(self) + private$is_tidied <- TRUE + return(invisible(self)) }, #' @description Write tidy tibbles. - #' @param odir (`character(1)`)\cr + #' @param diro (`character(1)`)\cr #' Directory path to output tidy files. #' @param format (`character(1)`)\cr - #' Format of output files. - #' @param id (`character(1)`)\cr - #' ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). + #' Format of output. + #' @param input_id (`character(1)`)\cr + #' Input ID to use for the dataset (e.g. `run123`). + #' @param output_id (`character(1)`)\cr + #' Output ID to use for the dataset (e.g. `run123`). #' @param dbconn (`DBIConnection`)\cr #' Database connection object (see `DBI::dbConnect`). - #' @return A tibble with the tidy data and their output location prefix. - write = function(odir = ".", format = "tsv", id = NULL, dbconn = NULL) { - self$tools |> - purrr::map(\(x) x$write(odir = odir, format = format, id = id, dbconn = dbconn)) |> + #' @return self invisibly. 
+ write = function( + diro = ".", + format = "tsv", + input_id = NULL, + output_id = ulid::ulid(), + dbconn = NULL + ) { + res <- self$tools |> + purrr::map(\(x) { + x$write( + diro = diro, + format = format, + input_id = input_id, + output_id = output_id, + dbconn = dbconn + ) + }) |> dplyr::bind_rows() + private$is_written <- TRUE + self$written_files <- res + # Write metadata + # if (format != "db" && !is.null(res) && nrow(res) > 0) { + # meta <- Metadata$new( + # workflow = self, + # write_result = res, + # output_dir = diro, + # format = format, + # input_id = input_id, + # output_id = output_id + # ) + # meta$write() + # } + return(invisible(self)) }, #' @description Parse, filter, tidy and write files. - #' @param odir (`character(1)`)\cr + #' @param diro (`character(1)`)\cr #' Directory path to output tidy files. #' @param format (`character(1)`)\cr - #' Format of output files. + #' Format of output. #' @param id (`character(1)`)\cr #' ID to use for the dataset (e.g. `wfrid.123`, `prid.456`). #' @param dbconn (`DBIConnection`)\cr @@ -122,7 +165,7 @@ Workflow <- R6::R6Class( #' Files to exclude. #' @return A tibble with the tidy data and their output location prefix. nemofy = function( - odir = ".", + diro = ".", format = "tsv", id = NULL, dbconn = NULL, @@ -133,7 +176,7 @@ Workflow <- R6::R6Class( self$ filter_files(include = include, exclude = exclude)$ tidy()$ - write(odir = odir, format = format, id = id, dbconn = dbconn) + write(diro = diro, format = format, input_id = id, dbconn = dbconn) }, #' @description Get raw schemas for all Tools. #' @return Tibble with names of tool and file, schema and its version. @@ -172,6 +215,10 @@ Workflow <- R6::R6Class( tool_nms <- purrr::map_chr(x, "classname") |> tolower() stopifnot(!is.null(tool_nms)) stopifnot(all(purrr::map(x, "inherit") == as.symbol("Tool"))) - } + }, + # Progress flags: initialize() sets both to FALSE; tidy() and write() flip + # them to TRUE (tidy() returns early once set) and print() reports them.
+ is_tidied = NULL, + is_written = NULL ) # private end )