685. compare atime_data.table_tidyverse vignette.Rmd

---
title: "compare atime. tidyverse vignette"
author: "Doris Amoakohene"
date: "`r Sys.Date()`"
output: html_document
---


```{r}
library(cache)
```


```{r setup, include = FALSE}
## Record the start time, so the total run time can be reported at the end.
time.begin <- Sys.time()
## Default chunk options for the whole document: merge code with output,
## prefix output lines with "#>", evaluate all chunks, and use wide figures.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = TRUE,
  fig.width=18,
  fig.height=10
)
## Never runs during rendering; kept so the commands can be run by hand to
## delete cached results and re-render this document.
if(FALSE){
  unlink("~/R/atime-cache-4.3.1/*")
  rmarkdown::render("compare-data.table-tidyverse.Rmd")
}
```

In this vignette we compare computational requirements (time and
memory) of common operations using `data.table` and tidyverse functions.

## Setup

```{r}
library(data.table)
## Name of this machine; cache() stores it with each result, and aplot()
## appends it to plot titles.
hostname <- system("hostname", intern=TRUE)
## Number of CPUs given by SLURM, or 1 when the variable is not set.
max.threads <- as.integer(
  Sys.getenv("SLURM_JOB_CPUS_PER_NODE", "1"))
print(max.threads)
## Thread counts to benchmark: one, half, and all, without duplicates.
threads.vec <- unique(as.integer(c(
  1,
  ceiling(max.threads/2),
  max.threads)))
## Time limit which is passed to every atime::atime call.
seconds.limit <- 1
## Every result computed or loaded by cache(), by name.
cache.list <- list()
## Compute a value once, store it on disk, and bind it in the caller.
##
## symbol: unquoted name; used for the RDS file name, for the entry in
##   cache.list, and for the variable created in the calling frame.
## code: expression which computes the value; only evaluated when there
##   is no RDS file for symbol. The value must support `$<-`, because the
##   hostname is stored in it.
##
## Side effects: the value is added to the global cache.list, and
## assigned to symbol in the calling frame. The RDS file is only written
## if the cache directory already exists.
cache <- function(symbol, code){
  cache.symb <- substitute(symbol)
  ## The name of the cache directory depends on the R version.
  suffix <- if(grepl("devel",R.version.string)){
    "devel"
  }else{
    gsub("R version | .*", "", R.version.string)
  }
  cache.dir <- paste0("~/R/atime-cache-",suffix)
  cache.rds <- file.path(cache.dir, paste0(cache.symb, ".RDS"))
  if(file.exists(cache.rds)){
    value <- readRDS(cache.rds)
  }else{
    ## Evaluate in the frame of the caller, so that names used in code
    ## are never shadowed by the local variables of this function.
    to.eval <- substitute(code)
    value <- eval(to.eval, parent.frame())
    value$hostname <- hostname
    if(dir.exists(cache.dir))saveRDS(value, cache.rds)
  }
  cache.list[[paste(cache.symb)]] <<- value
  assign(paste(cache.symb), value, parent.frame())
}
## Plot the measurements in an atime result as a function of N, with one
## panel per unit, and one color per expression.
##
## atime.list: result of atime::atime, as stored by cache().
## my.title: start of the plot title; "on" and the hostname are appended.
## xmax: upper limit of the x axis, only used if directlabels is available.
## max.seconds: y value of an invisible point (geom_blank) which is drawn
##   in the seconds panel.
## xlab: title of the x axis.
## color.vec: optional named vector of colors, used for both lines and
##   ribbons; NULL means to use the default colors.
##
## Returns a ggplot, or NULL invisibly if ggplot2 is not available.
## Side effect: the full title is saved in the global variable tit, which
## is used by facetPlot().
aplot <- function(atime.list, my.title, xmax, max.seconds, xlab, color.vec=NULL){
  best.list <- atime::references_best(atime.list)
  meas.dt <- best.list$meas
  blank.dt <- data.table(x=meas.dt$N[1], y=max.seconds, unit="seconds")
  if(!require(ggplot2)){
    return(invisible(NULL))
  }
  hline.df <- with(atime.list, data.frame(seconds.limit, unit="seconds"))
  full.title <- paste(my.title,"on",hostname)
  tit <<- full.title
  gg <- ggplot()+
    ggtitle(full.title)+
    theme_bw()+
    geom_blank(
      aes(x=x, y=y),
      data=blank.dt)+
    facet_grid(unit ~ ., scales="free")+
    geom_hline(
      aes(yintercept=seconds.limit),
      color="grey",
      data=hline.df)+
    geom_line(
      aes(x=N, y=empirical, color=expr.name),
      data=meas.dt)+
    geom_ribbon(
      aes(x=N, ymin=q25, ymax=q75, fill=expr.name),
      data=meas.dt[unit=="seconds"],
      alpha=0.5)+
    scale_x_log10(xlab)+
    scale_y_log10("median line, quartiles band")
  if(!is.null(color.vec)){
    gg <- gg+
      scale_color_manual(values=color.vec)+
      scale_fill_manual(values=color.vec)
  }
  if(!require(directlabels)){
    ## Without direct labels, keep the legend.
    return(gg)
  }
  gg+
    directlabels::geom_dl(
      aes(x=N, y=empirical, color=expr.name, label=expr.name),
      method="right.polygons",
      data=meas.dt)+
    theme(legend.position="none")+
    coord_cartesian(xlim=c(NA,xmax))
}
```
## Writing CSV

First we define some code which will be used in all of the writing
benchmarks,

```{r}
## Benchmark several functions which write a CSV file.
##
## make.input.fun: function which creates the data to write. It is called
##   with two arguments (2,2) to create a small example, and with one
##   argument N to create the data for each benchmark size.
## fmt: sprintf format for the CSV file name, which must have %s for the
##   name of the write function, then %d for the data size N.
##
## Returns the result of atime::atime. CSV files are written to tempdir().
atime_write <- function(make.input.fun, fmt){
  ## fwrite is always benchmarked, once per number of threads.
  grid.args <- list(
    list(THREADS=threads.vec),
    "data.table::fwrite"=quote({
      data.table::setDTthreads(THREADS)
      data.table::fwrite(input, name.list[["fwrite"]], showProgress = FALSE)
    }))
  ## Small example input, only used to check if the input is a data frame.
  small.input <- make.input.fun(2,2)
  if(requireNamespace("arrow") && is.data.frame(small.input)){
    grid.args[["write_csv_arrow"]] <- quote({
      arrow::set_cpu_count(THREADS)
      arrow::write_csv_arrow(input, name.list[["write_csv_arrow"]])
    })
  }
  if(requireNamespace("readr") && is.data.frame(small.input)){
    ##readr can't handle matrix input.
    grid.args[["readr::write_csv"]] <- quote({
      readr::write_csv(
        input, name.list[["write_csv"]], progress = FALSE, num_threads = THREADS)
    })
  }
  expr.list <- do.call(atime::atime_grid, grid.args)
  atime::atime(
    N=as.integer(10^seq(0, 6, by=0.5)),
    setup={
      input <- make.input.fun(N)
      ## One output file per write function and data size.
      name.list <- list()
      for(fun in c("fwrite", "write_csv", "write_csv_arrow", "write.csv")){
        name.list[[fun]] <- file.path(
          tempdir(), sprintf(fmt, fun, N))
      }
    },
    seconds.limit = seconds.limit,
    expr.list=expr.list,
    "utils::write.csv"=utils::write.csv(input, name.list[["write.csv"]]))
}
## Keep only the rows of DT for which expr.name contains "utils" or
## "scan", or ends with "THREADS=1".
one.thread <- function(DT){
  DT[grepl("utils|scan|THREADS=1$", expr.name)]
}
## Plot measurements with asymptotic reference lines, using one panel per
## unit and per expression, and only the rows kept by one.thread().
##
## atime.list: result of atime::atime, as stored by cache().
## fun.name.vec: names of the reference functions to draw.
## N.min: reference lines are only drawn for N at least this large.
##
## Returns a ggplot, or NULL invisibly if ggplot2 is not available.
## Uses two global variables: tit, which is set by the most recent call
## to aplot(), and write.colors. The x axis title is always
## "N = Number of columns".
facetPlot <- function(atime.list, fun.name.vec=c("N^2","N"), N.min=1e2){
  best.list <- atime::references_best(atime.list)
  meas.dt <- one.thread(best.list$meas)
  ref.dt <- one.thread(best.list$ref)[
    fun.name %in% fun.name.vec & N >= N.min]
  if(require(ggplot2)){
    ## Use the time limit of the result being plotted, as aplot() does,
    ## rather than the limit of one particular global result.
    hline.df <- with(atime.list, data.frame(
      seconds.limit, unit="seconds"))
    gg <- ggplot()+
      ggtitle(paste0(tit,", asymptotic complexity"))+
      theme_bw()+
      facet_grid(unit ~ expr.name, scales="free")+
      geom_hline(aes(
        yintercept=seconds.limit),
        color="grey",
        data=hline.df)+
      ## Reference lines are drawn first, so measurements are on top.
      geom_line(aes(
        N, reference, group=paste(expr.name, fun.name)),
        linewidth=2,
        data=ref.dt)+
      geom_line(aes(
        N, empirical, color=expr.name),
        linewidth=1,
        data=meas.dt)+
      geom_ribbon(aes(
        N, ymin=q25, ymax=q75, fill=expr.name),
        data=meas.dt[unit=="seconds"],
        alpha=0.5)+
      scale_x_log10("N = Number of columns")+
      scale_y_log10("median line, quartiles band")+
      scale_color_manual(values=write.colors)+
      scale_fill_manual(values=write.colors)
    if(require(directlabels)){
      ## Label each reference line with the name of its function.
      gg+
        directlabels::geom_dl(aes(
          N, reference,
          label.group=paste(expr.name, fun.name),
          label=fun.name),
          method="left.polygons",
          data=ref.dt)+
        theme(legend.position="none")
    }else{
      gg
    }
  }
}
## Never runs; kept so the commands can be run by hand to display the
## RColorBrewer palettes, and to print the colors of two of them.
if(FALSE){
  RColorBrewer::display.brewer.all()
  dput(RColorBrewer::brewer.pal(Inf, "Set2"))
  dput(RColorBrewer::brewer.pal(Inf, "RdGy"))
}
## Take one value per element of threads.vec from the values in ...,
## and name each one with prefix followed by "THREADS=" and the number
## of threads, which is the form of the expression names used for colors.
NAME <- function(prefix, ...){
  value.vec <- c(...)
  name.vec <- sprintf("%sTHREADS=%d", prefix, threads.vec)
  keep.vec <- value.vec[1:length(threads.vec)]
  names(keep.vec) <- name.vec
  keep.vec
}
## Colors for the write benchmarks. Values created by NAME() are named
## after an expression and a number of threads; the other palette values
## have no name, and are removed by the last line, which also prints the
## colors that remain.
write.colors <- c(
  NAME("readr::write_csv ", "#9970AB","#762A83", "#40004B"), #purple
  "#5AAE61", "#1B7837", "#00441B",#green
  NAME("data.table::fwrite ", "#D6604D", "#B2182B", "#67001F"),#reds
  "#878787", "#4D4D4D", "#1A1A1A",#greys
  NAME("write_csv_arrow ", "#BF812D", "#8C510A", "#543005"),#browns
  "#35978F", "#01665E", "#003C30",#teal polars
  "utils::write.csv"="deepskyblue")
(write.colors <- write.colors[names(write.colors)!=""])
```

### Writing CSV with real numbers

The code below is for real numbers with a constant number of columns,
and a variable number of rows.

```{r}
## Create a N.rows x N.cols matrix of standard normal random numbers.
## The seed is reset on each call, so the result is always the same for
## a given size.
random_real <- function(N.rows, N.cols){
  set.seed(1)
  value.vec <- rnorm(N.rows*N.cols)
  matrix(data=value.vec, nrow=N.rows, ncol=N.cols)
}
## Benchmark writing a real matrix with 10 columns and a variable number
## of rows, then plot the measurements.
cache(
  write.real.vary.rows,
  atime_write(
    function(N.rows, N.cols=10)random_real(N.rows, N.cols),
    "10_real_cols_%s_%d.csv"))
aplot(
  write.real.vary.rows,
  "Write CSV with 10 random normal real columns",
  1e9, 1e1, "Number of rows", write.colors)
```

The plot above shows that all methods are the same, except
`utils::write.csv` memory is increasing with data size, and others are
constant.

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.real.vary.rows)
```

The plot above shows that the memory usage of `utils::write.csv` is
linear.

The code below writes real numbers with a constant number of rows, and
a variable number of columns.

```{r}
## Benchmark writing a real matrix with 10 rows and a variable number of
## columns, then plot the measurements.
cache(
  write.real.vary.cols,
  atime_write(
    function(N.cols, N.rows=10)random_real(N.rows, N.cols),
    "10_real_rows_%s_%d.csv"))
aplot(
  write.real.vary.cols,
  "Write CSV with 10 random normal real rows",
  1e9, 1e1, "Number of columns", write.colors)
```

The plot above shows that `data.table::fwrite` uses asymptotically less
time and memory than the others.

### Write CSV from character matrix

The code below is for a character data matrix with a constant number of
columns, and a variable number of rows.

```{r}
## Create a N.rows x N.cols character matrix, filled by recycling two
## strings which contain single quotes: one with a space, and one with an
## underscore.
chr_mat <- function(N.rows, N.cols){
  separator.vec <- c(" ", "_")
  data.vec <- paste0("'quoted", separator.vec, "data'")
  matrix(data=data.vec, nrow=N.rows, ncol=N.cols)
}
## Benchmark writing a character matrix with 10 columns and a variable
## number of rows, then plot the measurements.
cache(
  write.chrmat.vary.rows,
  atime_write(
    function(N.rows,N.cols=10)chr_mat(N.rows, N.cols),
    "10_chrmat_cols_%s_%d.csv"))
aplot(
  write.chrmat.vary.rows,
  "Write CSV from matrix with 10 character columns",
  1e9, 1e1, "Number of rows", write.colors)
```

TODO

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.chrmat.vary.rows)
```

TODO

```{r}
## Benchmark writing a character matrix with 10 rows and a variable
## number of columns, then plot the measurements.
cache(
  write.chrmat.vary.cols,
  atime_write(
    function(N.cols, N.rows=10)chr_mat(N.rows, N.cols),
    "10_chrmat_rows_%s_%d.csv"))
aplot(
  write.chrmat.vary.cols,
  "Write CSV from matrix with 10 character rows",
  1e9, 1e1, "Number of columns", write.colors)
```

TODO

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.chrmat.vary.cols)
```

TODO

### Write CSV from character data.table

The code below is for a character data.table with a constant number of
columns, and a variable number of rows.

```{r}
## Create a data.table from the N.rows x N.cols character matrix which is
## returned by chr_mat().
chr_dt <- function(N.rows, N.cols){
  data.table(chr_mat(N.rows, N.cols))
}
## Benchmark writing a character data.table with 10 columns and a
## variable number of rows, then plot the measurements.
cache(
  write.chr.vary.rows,
  atime_write(
    function(N.rows,N.cols=10)chr_dt(N.rows, N.cols),
    "10_chr_cols_%s_%d.csv"))
aplot(
  write.chr.vary.rows,
  "Write CSV from data.table with 10 character columns",
  1e9, 1e1, "Number of rows", write.colors)
```

The figure above is useful for comparing different functions, and
shows that all have the same asymptotic time complexity
class. However, we observe a difference in memory usage: linear for
`write.csv` and constant for others. Below, we draw reference lines,
so we can see the complexity class.

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.chr.vary.rows)
```

The figure above shows that all functions are linear time, and
`write.csv` is linear memory.
The code below is for a character data.frame with a constant number of
rows, and a variable number of columns.

```{r}
## Benchmark writing a character data.table with 10 rows and a variable
## number of columns, then plot the measurements.
cache(
  write.chr.vary.cols,
  atime_write(
    function(N.cols, N.rows=10)chr_dt(N.rows, N.cols),
    "10_chr_rows_%s_%d.csv"))
aplot(
  write.chr.vary.cols,
  "Write CSV from data.table with 10 character rows",
  1e9, 1e1, "Number of columns", write.colors)
```

The figure above shows that `data.table::fwrite` clearly has a smaller
slope (linear complexity in number of columns) than the other methods
(quadratic complexity), as shown in the plot below, which includes
best reference lines above and below each empirical measurement
asymptote.

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.chr.vary.cols)
```

The comparisons above show significant advantages for `data.table` for
writing CSV data with a large number of columns: asymptotically less
time and memory (linear rather than quadratic in number of columns).

### Write CSV from data.table with factor columns

The code below is for factor data with a constant number of
columns, and a variable number of rows.

```{r}
## Create a data.table with N.cols factor columns and N.rows rows. Every
## column is the same factor, which has two levels, recycled to the
## number of rows.
fac_dt <- function(N.rows, N.cols){
  data.vec <- factor(paste0("'quoted", c(" ", "_"), "data'"))
  one_column <- function(col.i){
    rep(data.vec, length.out=N.rows)
  }
  column.list <- lapply(1:N.cols, one_column)
  as.data.table(column.list)
}
## Benchmark writing a factor data.table with 10 columns and a variable
## number of rows, then plot the measurements.
cache(
  write.fac.vary.rows,
  atime_write(
    function(N.rows,N.cols=10)fac_dt(N.rows, N.cols),
    "10_fac_cols_%s_%d.csv"))
aplot(
  write.fac.vary.rows,
  "Write CSV from data.table with 10 factor columns",
  1e9, 1e1, "Number of rows", write.colors)
```

TODO. Below, we draw reference lines, so we can see the complexity
class.

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.fac.vary.rows)
```

TODO
The code below is for factor data with a constant number of
rows, and a variable number of columns.

```{r}
## Benchmark writing a factor data.table with 10 rows and a variable
## number of columns, then plot the measurements.
cache(
  write.fac.vary.cols,
  atime_write(
    function(N.cols, N.rows=10)fac_dt(N.rows, N.cols),
    "10_fac_rows_%s_%d.csv"))
aplot(
  write.fac.vary.cols,
  "Write CSV with 10 factor rows",
  1e9, 1e1, "Number of columns", write.colors)
```

TODO.  In the plot below, we include best reference lines above and
below each empirical measurement asymptote.

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.fac.vary.cols)
```

TODO

### Write CSV from data.table with POSIXct columns

The code below is for POSIXct data with a constant number of
columns, and a variable number of rows.

```{r}
## Create a data.table with N.cols POSIXct columns and N.rows rows. Each
## column is the current time, repeated for every row; the time is read
## again for each column.
POSIXct_dt <- function(N.rows, N.cols){
  one_column <- function(col.i){
    rep(Sys.time(), length.out=N.rows)
  }
  column.list <- lapply(1:N.cols, one_column)
  as.data.table(column.list)
}
## Benchmark writing a POSIXct data.table with 10 columns and a variable
## number of rows, then plot the measurements. The file name format says
## POSIXct, so these files do not overwrite the files which were written
## by the factor benchmark.
cache(
  write.POSIXct.vary.rows,
  atime_write(
    function(N.rows,N.cols=10)POSIXct_dt(N.rows, N.cols),
    "10_POSIXct_cols_%s_%d.csv"))
aplot(
  write.POSIXct.vary.rows,
  "Write CSV from data.table with 10 POSIXct columns",
  1e9, 1e1, "Number of rows", write.colors)
```

TODO. Below, we draw reference lines, so we can see the complexity
class.

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.POSIXct.vary.rows)
```

TODO
The code below is for POSIXct data with a constant number of
rows, and a variable number of columns.

```{r}
## Benchmark writing a POSIXct data.table with 10 rows and a variable
## number of columns, then plot the measurements.
cache(
  write.POSIXct.vary.cols,
  atime_write(
    function(N.cols, N.rows=10)POSIXct_dt(N.rows, N.cols),
    "10_POSIXct_rows_%s_%d.csv"))
aplot(
  write.POSIXct.vary.cols,
  "Write CSV with 10 POSIXct rows",
  1e9, 1e1, "Number of columns", write.colors)
```

TODO.  In the plot below, we include best reference lines above and
below each empirical measurement asymptote.

```{r}
## Same measurements, with asymptotic reference lines (N^2 and N).
facetPlot(write.POSIXct.vary.cols)
```

TODO

## Reading CSV

First we define a function which we will use for all of the read
benchmarks,

```{r}
## Expressions to benchmark for reading the CSV file f.csv, which is
## defined by the setup code in atime_read(). Packages which are not
## installed are skipped, because if() without else returns NULL, which is
## dropped by c().
read.expr.list <- c(
  ## readr: one expression per combination of LAZY and THREADS.
  if(requireNamespace("readr"))atime::atime_grid(
    list(LAZY=c(TRUE, FALSE), THREADS=threads.vec),
    "readr::read_csv"={
      readr::read_csv(
        f.csv, num_threads = THREADS, lazy = LAZY,
        ##col_select=1:10,
        ##n_max=10,
        show_col_types=FALSE, progress=FALSE)
    }),
  atime::atime_grid(
    list(THREADS=threads.vec),
    "data.table::fread"={
      data.table::setDTthreads(THREADS)
      data.table::fread(
        f.csv,
        ##nrows=10,
        ##select=1:10,
        showProgress=FALSE)
    }),
  ## Disabled by FALSE, so this placeholder is never used.
  if(FALSE && requireNamespace("polars"))atime::atime_grid(
    ##TODO wait until we know how to set max number of threads.
    "polars::pl$read_csv"={
      TODO
      polars::pl$scan_csv(f.csv)
    }),
  if(requireNamespace("arrow"))atime::atime_grid(
    list(THREADS=threads.vec),
    "read_csv_arrow"={
      arrow::set_cpu_count(THREADS)#https://github.com/apache/arrow/issues/30205#issuecomment-1378060874
      arrow::read_csv_arrow(
        f.csv)#col_select=1:10?
    }),
  ## Base R reference, with no THREADS parameter.
  atime::atime_grid(
    "utils::read.csv"={
      utils::read.csv(f.csv)
    }))
## Benchmark reading the CSV files in tempdir() which match glob.
##
## glob: file name pattern with "*" in place of the data size N, for
##   example "10_real_cols_fwrite_*.csv". The benchmark sizes are the
##   values of N found in the names of the matching files.
## compute: if TRUE, each expression is modified so that the value of its
##   last statement is saved in DF, and then the values in each row of DF
##   are pasted together. This assumes that each expression is a braced
##   block, as in read.expr.list.
## colClasses: optional name of a column type, such as "numeric". If it is
##   given, two more expressions are benchmarked: read.csv with
##   colClasses, and scan with list2DF.
## N.col: number of columns for scan; if missing, the data size N is used.
##
## Returns the result of atime::atime.
atime_read <- function(glob, compute=FALSE, colClasses, N.col){  
  ## sprintf format which gives the file name for a given data size.
  fmt <- sub("[*]", "%d", glob)
  ## Parse the data size from the name of each file which exists.
  csv.dt <- nc::capture_first_vec(
    Sys.glob(file.path(tempdir(), glob)),
    N="[0-9]+", as.integer,
    ".csv")[order(N)]
  read.more.list <- c(
    read.expr.list,
    if(!missing(colClasses))list(
      "read.csv(colClasses)"=substitute(
        utils::read.csv(f.csv, colClasses = CLASS),
        list(CLASS=colClasses)),
      ## FUN() is a call such as numeric() or character(), which creates
      ## the example value that scan needs for each column.
      "list2DF(scan)"=substitute({
        what <- `names<-`(
          rep(list(FUN()), NCOL),
          paste0("V",seq_len(NCOL))
        )
        list2DF(scan(f.csv, what=what, sep=",", skip=1, multi.line=FALSE))
      }, list(
        FUN=as.symbol(colClasses),
        NCOL=if(missing(N.col))quote(N) else N.col)
      )))
  expr.list <- if(compute){
    read.compute.expr.list <- list()
    for(expr.name in names(read.more.list)){
      lang.list <- as.list(read.more.list[[expr.name]])
      LAST <- length(lang.list)
      ## Replace the last statement with an assignment of its value to DF.
      lang.list[[LAST]] <- as.call(c(
        quote(`<-`),
        quote(DF),
        lang.list[[LAST]]))
      ## Then append a statement which pastes the values in each row.
      read.compute.expr.list[[expr.name]] <- as.call(c(
        lang.list,
        quote(apply(DF, 1, paste, collapse=","))))
    }
    read.compute.expr.list
  }else{
    read.more.list
  }
  atime::atime(
    N=csv.dt$N,
    setup={
      f.csv <- file.path(tempdir(), sprintf(fmt, N))
    },
    seconds.limit = seconds.limit,
    expr.list=expr.list)
}
## Colors for the read benchmarks. Values created by NAME() are named
## after an expression and its parameters; the other palette values have
## no name, and are removed by the last line, which also prints the
## colors that remain.
PRGn <- c(
  NAME("readr::read_csv LAZY=FALSE,", "#9970AB","#762A83", "#40004B"), #purple
  "#5AAE61", "#1B7837", "#00441B",#green
  NAME("data.table::fread ", "#D6604D", "#B2182B", "#67001F"),#reds
  NAME("readr::read_csv LAZY=TRUE,", "#878787", "#4D4D4D", "#1A1A1A"),#greys
  NAME("read_csv_arrow ", "#BF812D", "#8C510A", "#543005"),#browns
  "#35978F", "#01665E", "#003C30",#teal polars
  "utils::read.csv"="#00FFFF",#"deepskyblue",
  "read.csv(colClasses)"="#00CCCC",
  "list2DF(scan)"="#009999")
(read.colors <- PRGn[names(PRGn)!=""&!is.na(names(PRGn))])
```

Below we read real numbers with a constant number of columns, and a
variable number of rows.

```{r}
## Benchmark reading the files with 10 real columns which were written by
## fwrite, including the base R methods which use colClasses.
cache(
  read.real.vary.rows,
  atime_read(
    "10_real_cols_fwrite_*.csv",
    compute=FALSE, colClasses="numeric",N.col=10))
aplot(
  read.real.vary.rows,
  "Read CSV with 10 real columns",
  1e9, 1e1, "Number of rows", read.colors)
```

It can be seen in the plot above that the grey results, `read_csv`
with `LAZY=TRUE`, are fastest, which is normal because lazy reading
does not actually read the data values into memory. A fairer
comparison is below, which computes a text string for every row after
reading the CSV,

```{r}
## Same files, but each expression also pastes together the values in
## each row after reading.
cache(
  compute.real.vary.rows,
  atime_read("10_real_cols_fwrite_*.csv", compute=TRUE))
aplot(
  compute.real.vary.rows,
  "Read CSV with 10 real columns, then collapse each row",
  1e9, 1e1, "Number of rows", read.colors)
```

Below we read real numbers with a constant number of rows, and a
variable number of columns.

```{r}
## Benchmark reading the files with 10 real rows which were written by
## fwrite; N.col is missing, so scan uses N columns.
cache(
  read.real.vary.cols,
  atime_read(
    "10_real_rows_fwrite_*.csv",
    compute=FALSE, colClasses="numeric"))
aplot(
  read.real.vary.cols,
  "Read CSV with 10 real rows",
  1e8, 1e1, "Number of columns", read.colors)
```

The plot above shows that all functions have the same asymptotic
memory usage, but `read.csv` has a larger asymptotic time complexity
class than the others. The plot below shows that the time complexity
class of `read.csv` is in fact quadratic, whereas the others are linear.

```{r}
## Same measurements, with three asymptotic reference lines.
facetPlot(read.real.vary.cols,c("N^2","N log N", "N"))
```


Below we read character data with a constant number of columns, and a
variable number of rows.

```{r}
## Benchmark reading the files with 10 character columns which were
## written by fwrite, including the base R methods which use colClasses.
cache(
  read.chr.vary.rows,
  atime_read(
    "10_chr_cols_fwrite_*.csv",
    compute=FALSE, colClasses="character",N.col=10))
aplot(
  read.chr.vary.rows,
  "Read CSV with 10 character columns",
  1e9, 1e1, "Number of rows", read.colors)
```

As with the previous result for real data, the grey results above,
`read_csv` with `LAZY=TRUE`, are fastest, which is normal because lazy
reading does not actually read the data values into memory. A fairer
comparison is below, which computes a text string for every row
after reading the CSV,

```{r}
## Same files, but each expression also pastes together the values in
## each row after reading.
cache(
  compute.chr.vary.rows,
  atime_read("10_chr_cols_fwrite_*.csv", compute=TRUE))
aplot(
  compute.chr.vary.rows,
  "Read CSV with 10 character columns, then collapse each row",
  1e9, 1e1, "Number of rows", read.colors)
```
Below we read character data with a constant number of rows, and a
variable number of columns.

```{r}
## Benchmark reading the files with 10 character rows which were written
## by fwrite; N.col is missing, so scan uses N columns.
cache(
  read.chr.vary.cols,
  atime_read(
    "10_chr_rows_fwrite_*.csv",
    compute=FALSE, colClasses="character"))
aplot(
  read.chr.vary.cols,
  "Read CSV with 10 character rows",
  1e8, 1e1, "Number of columns", read.colors)
```

From the comparisons above, it can be seen that for a small number of
columns, and a large number of rows, all the methods are about the
same (constant factor differences, using more than one thread also
results in small constant factor speedups). However for a small number
of rows and a large number of columns, `data.table::fread` is clearly
the most efficient:

* `data.table::fread` uses constant factors less time and memory than
  `readr::read_csv`.
* `utils::read.csv` uses asymptotically more time (super-linear in the
  number of columns).
  
```{r}
## Same measurements, with three asymptotic reference lines.
facetPlot(read.chr.vary.cols,c("N^2","N log N", "N"))
```

## Reading CSV, first few rows or columns

First we define a function which we will use for all of the read
benchmarks,

```{r}
## Expressions to benchmark for reading only a part of the CSV file
## f.csv, which is defined by the setup code in atime_read_limit().
## readr, data.table, and arrow are set to use one thread. Packages which
## are not installed are skipped, because if() without else returns NULL,
## which is dropped by c().
limit.expr.list <- c(
  if(requireNamespace("readr"))atime::atime_grid(
    "readr::read_csv"={
      readr::read_csv(
        f.csv, num_threads = 1,
        col_select=1:10,
        n_max=10, lazy=FALSE,
        show_col_types=FALSE, progress=FALSE)
    }),
  atime::atime_grid(
    "data.table::fread"={
      data.table::setDTthreads(1)
      data.table::fread(
        f.csv,
        nrows=10,
        select=1:10,
        showProgress=FALSE)
    }),
  if(requireNamespace("polars"))atime::atime_grid(
    "polars::read_csv_"={
      ## https://github.com/pola-rs/r-polars/issues/267
      polars::pl$scan_csv(f.csv)[,1:10,drop=FALSE]$slice(0,10)$collect()
    }),
  ## arrow only limits the columns here, not the rows.
  if(requireNamespace("arrow"))atime::atime_grid(
    "read_csv_arrow"={
      arrow::set_cpu_count(1)
      arrow::read_csv_arrow(f.csv, col_select=1:10)#n_max not possible, https://github.com/apache/arrow/issues/36325#issuecomment-1609738413
    }),
  ## read.csv only limits the rows here, not the columns.
  atime::atime_grid(
    "utils::read.csv"={
      utils::read.csv(f.csv, nrows=10)
    }))
## Benchmark the expressions in limit.expr.list on the CSV files in
## tempdir() which match glob.
##
## glob: file name pattern with "*" in place of the data size N. The
##   benchmark sizes are the values of N found in the names of the
##   matching files.
##
## Returns the result of atime::atime.
atime_read_limit <- function(glob){
  csv.pattern <- file.path(tempdir(), glob)
  csv.file.vec <- Sys.glob(csv.pattern)
  ## Parse the data size from each file name, then sort by size.
  parsed.dt <- nc::capture_first_vec(
    csv.file.vec,
    N="[0-9]+", as.integer,
    ".csv")
  csv.dt <- parsed.dt[order(N)]
  ## sprintf format which gives the file name for a given data size.
  fmt <- sub("[*]", "%d", glob)
  atime::atime(
    N=csv.dt$N,
    setup={
      f.csv <- file.path(tempdir(), sprintf(fmt, N))
    },
    seconds.limit = seconds.limit,
    expr.list=limit.expr.list)
}
## Colors for the benchmarks which read only a part of the CSV file, one
## per expression name in limit.expr.list. Each name appears only once:
## a second value for "readr::read_csv" would never be matched by name.
limit.colors <- c(
  "readr::read_csv"="#9970AB", #purple
  ##"#5AAE61",#green
  "data.table::fread"="#D6604D",#reds
  ##"#878787",#greys
  "read_csv_arrow"="#BF812D",#browns
  "polars::read_csv_"="#35978F",#teal polars
  "utils::read.csv"="#00FFFF")#"deepskyblue",
```

Below we read real numbers with a constant number of columns, and a
variable number of rows.

```{r}
## Benchmark reading a part of the files with 10 real columns which were
## written by fwrite, then plot the measurements.
cache(
  read.real.vary.rows.limit,
  atime_read_limit("10_real_cols_fwrite_*.csv"))
aplot(
  read.real.vary.rows.limit,
  "Read first 10 rows of CSV with 10 real columns",
  1e9, 1e1, "Number of rows in CSV", limit.colors)
```

Below we read real numbers with a constant number of rows, and a
variable number of columns in the CSV file (only first 10 columns read into R).

```{r}

## Benchmark reading a part of the files with 10 real rows which were
## written by fwrite, then plot the measurements.
cache(
  read.real.vary.cols.limit,
  atime_read_limit("10_real_rows_fwrite_*.csv"))
aplot(
  read.real.vary.cols.limit,
  "Read first 10 columns of CSV with 10 real rows",
  1e8, 1e1, "Number of columns in CSV", limit.colors)
```

TODO

```{r}
## Same measurements, with linear and constant reference lines.
facetPlot(read.real.vary.cols.limit,c("N","1"))
```

## Summarize by group

The next problem is motivated by a common operation in machine
learning code: computing the mean/SD over cross-validation folds.

```{r}
## Expressions to benchmark for computing the length, mean, and SD of
## loss, for each combination of set and epoch in loss.dt, which is
## defined by the setup code of the benchmark. Packages which are not
## installed are skipped, because if() without else returns NULL, which is
## dropped by c().
summary.expr.list <- c(
  ## Disabled placeholder for collapse. It is an element of c(), not an
  ## argument of atime_grid, which requires a name for each expression.
  if(FALSE && requireNamespace("collapse"))atime::atime_grid(
    "collapse"={
      TODO
      ## https://sebkrantz.github.io/collapse/#regarding-performance
    }),
  ## data.table is first, with one expression per number of threads.
  atime::atime_grid(
    list(THREADS=threads.vec),
    "[.data.table"={
      data.table::setDTthreads(THREADS)
      loss.dt[, .(
        loss_length=.N,
        loss_mean=mean(loss),
        loss_sd=sd(loss)
      ), by=.(set, epoch)]
    }),
  atime::atime_grid(
    "base::by"={
      base::by(
        loss.dt$loss,
        list(loss.dt$set, loss.dt$epoch),
        function(values)c(
          loss_length=length(values),
          loss_mean=mean(values),
          loss_sd=sd(values)))
    },
    "base::tapply"={
      base::tapply(
        loss.dt$loss,
        list(loss.dt$set, loss.dt$epoch),
        function(values)c(
          loss_length=length(values),
          loss_mean=mean(values),
          loss_sd=sd(values)))
    },
    "stats::aggregate"={
      res <- stats::aggregate(
        loss ~ set + epoch,
        loss.dt,
        function(values)list(c(
          loss_length=length(values),
          loss_mean=mean(values),
          loss_sd=sd(values))))
      ## Convert the list column to one column per statistic.
      data.frame(
        subset(res, select=-loss),
        do.call(rbind, res$loss))
    },
    "data.table::dcast"={
      dcast(
        loss.dt,
        set + epoch ~ .,
        list(length, mean, sd),
        value.var="loss")
    }),
  if(requireNamespace("dplyr"))atime::atime_grid("dplyr::summarise"={
    loss.dt |>
      dplyr::group_by(set, epoch) |>
      dplyr::summarise(
        loss_length=length(loss),
        loss_mean=mean(loss),
        loss_sd=sd(loss))
  }),
  if(requireNamespace("tidyr"))atime::atime_grid("tidyr::pivot_wider"={
    loss.dt |>
      tidyr::pivot_wider(
        id_cols = c(set,epoch),
        values_from=loss,
        names_from=name,
        values_fn=function(values)list(c(
          loss_length=length(values),
          loss_mean=mean(values),
          loss_sd=sd(values)))) |>
      tidyr::unnest_wider(loss)
  }))
## Take one value per element of threads.vec from the values in ....
some <- function(...){
  value.vec <- c(...)
  value.vec[1:length(threads.vec)]
}
## Colors for the summarize benchmark: the first colors are reds, one per
## number of threads, and are followed by other colors.
summary.colors <- c(
  some("#D6604D", "#B2182B", "#67001F"),#dark red
  "#66C2A5", "#FC8D62", "#8DA0CB", "#E78AC3", "#A6D854", "#FFD92F", 
  "#E5C494", "#B3B3B3")
## Assign the expression names to the colors, in order, then remove the
## colors which have an empty name.
names(summary.colors)[1:length(summary.expr.list)] <- names(summary.expr.list)
summary.colors <- summary.colors[names(summary.colors)!=""]
options(dplyr.summarise.inform=FALSE)
cache(summary.atime.list, atime::atime(
  N=as.integer(10^seq(0, 7, by=0.5)),
  setup={
    ## Data with N epochs, two sets, and 10 folds: one loss value for
    ## each of the 2*N*n.folds rows.
    n.folds <- 10
    loss.dt <- data.table(
      name="loss", 
      fold=rep(1:n.folds, each=2*N),
      loss=rnorm(2*N*n.folds),
      set=rep(c("subtrain","validation"),each=N),
      epoch=1:N,
      key=c("set","epoch","fold"))
  },
  seconds.limit=seconds.limit,
  expr.list=summary.expr.list))
aplot(summary.atime.list, "Length, Mean, SD over 10 folds for each epoch and set", 1e7, 1e1, "Number of epochs", summary.colors)
```

The comparison above shows that using `[.data.table` is by far the
most efficient method (by constant factors) to compute the Mean and SD
over folds.

## Join and compute sum

```{r}
## Table to join with: one row and one random x value for each
## combination of a letter and an integer from 1 to 100.
i <- 1:100
DT <- CJ(letter=LETTERS, i)[, x := rnorm(.N)]
setkey(DT, letter, i)
## Data frame copy, with row names such as "A1", which are used for
## lookup by the "[+paste0" expression.
DF <- data.frame(DT)
rownames(DF) <- with(DF, paste0(letter, i))
## The dplyr expression is only included if dplyr is installed.
expr.list <- if(requireNamespace("dplyr"))atime::atime_grid("dplyr::inner_join"=with(dplyr::inner_join(DT, select.dt, by=c('letter','i')), x+y))
cache(atime.join.sum, atime::atime(
  N=10^seq(1, 7),
  setup={
    ## N rows to look up, sampled with replacement, each with a y value.
    select.dt <- data.table(
      letter=sample(LETTERS, N, replace=TRUE),
      i=sample(i, N, replace=TRUE),
      y=rnorm(N))
    setkey(select.dt, letter, i)
    select.df <- data.frame(select.dt)
  },
  seconds.limit=seconds.limit,
  "data.table::`[.data.table`"=DT[select.dt, x+y],
  "data.table::merge"=data.table::merge.data.table(DT,select.dt)[, x+y],
  "base::merge.data.frame"=with(base::merge.data.frame(DF, select.df, by=c('letter','i')), x+y),
  "[+paste0"=with(select.df, DF[paste0(letter,i),"x"]+y),
  expr.list=expr.list))
aplot(atime.join.sum, "Join and sum", 1e9, 1e1, "Size of output vector")
```

## Join and summarize

```{r}
## Table to join with: one row and one random x value for each
## combination of a letter and an integer from 1 to 100.
i <- 1:100
DT <- CJ(letter=LETTERS, i)[, x := rnorm(.N)]
setkey(DT, letter, i)
## Data frame copy, with row names such as "A1", which are used for
## lookup by the "base::by" expression.
DF <- data.frame(DT)
rownames(DF) <- with(DF, paste0(letter, i))
## NOTE(review): dplyr is used below without a requireNamespace check,
## unlike in the other benchmarks.
cache(atime.join.summarize, atime::atime(
  N=as.integer(10^seq(0, 7, by=0.5)),
  setup={
    ## N rows to look up, sampled with replacement, each with a y value.
    select.dt <- data.table(
      letter=sample(LETTERS, N, replace=TRUE),
      i=sample(i, N, replace=TRUE),
      y=rnorm(N))
    setkey(select.dt, letter, i)
    select.df <- data.frame(select.dt)
  },
  seconds.limit=seconds.limit,
  "data.table::`[.data.table`"={
    select.dt[DT, .(rows=.N, diff=mean(y)-x), by=.EACHI, nomatch=0L]
  },
  "base::by"={
    do.call(rbind, base::by(
      select.df, 
      with(select.df, paste0(letter, i)), 
      function(sdf){
        srow <- sdf[1,]
        data.frame(
          srow[,c("letter","i")],
          rows=nrow(sdf), 
          diff=mean(sdf$y)-DF[with(srow,paste0(letter,i)),"x"])
      }))
  }, 
  "dplyr::inner_join"={
    dplyr::inner_join(DT, select.dt, by=c('letter','i')) |> 
      dplyr::group_by(letter, i) |> 
      dplyr::summarise(rows=length(y), diff=mean(y)-x[1])
  }))
aplot(atime.join.summarize, "Join and summarize", 1e10, 1e1, "Rows in join table")
```

## Rolling join

This situation arises when you want to compute the average in a
regular grid over some irregularly spaced numbers.

```{r}
## Regular grid on [0,1]: the grid points are spaced by 10^(-digits), and
## are shifted by half of that space.
digits <- 1
grid.space <- 10^(-digits)
offset <- grid.space/2
cache(atime.roll, atime::atime(
  ## Powers of ten from 10 to 10^7, as in the join and sum benchmark.
  N=10^seq(1, 7),
  setup={
    ## N irregularly spaced X values in [0,1], each with a noisy Y value.
    set.seed(1)
    X <- runif(N)
    Y <- 10*X+rnorm(N)
  },
  seconds.limit=seconds.limit,
  "data.table::[roll=nearest"={
    irreg.dt <- data.table(X, Y, key="X")
    grid <- seq(offset, 1-offset, by=grid.space)
    reg.dt <- data.table(grid, X=grid, key="X")
    ## Assign the nearest grid point to each irregular X value.
    join.dt <- reg.dt[irreg.dt, roll="nearest"]
    join.dt[, .(Y.N=.N, Y.mean=mean(Y), Y.sd=sd(Y)), by=grid]
  },
  "round,data.table"={
    ## Compute the grid point by rounding, rather than by joining.
    data.table(
      grid=round(X+offset, digits=digits)-offset,
      Y
    )[, .(
      Y.N=.N,
      Y.mean=mean(Y),
      Y.sd=sd(Y)
    ), by=grid]
  },
  "round,aggregate"={
    grid <- round(X+offset, digits=digits)-offset
    aggregate(Y ~ grid, FUN=function(values)c(
      N=length(values),
      mean=mean(values),
      sd=sd(values)))
  }))
aplot(atime.roll, "Rolling join", 1e10, 1e1, "Rows in join table")
```

## Total time

```{r}
## Time taken to render this document, which is small if the results were
## loaded from cache files.
Sys.time() - time.begin
## For each result stored by cache(), add up all of its values of time.
seconds.vec <- sapply(cache.list, function(L){
  do.call(sum, L$meas$time)
})
## Adding then subtracting the start time gives a time difference, which
## is printed with units.
sum(seconds.vec)+time.begin-time.begin
```

## session info

```{r}
## Print the version of R and of the packages which were used.
sessionInfo()
```
```