From 23415baf33f0abeb902f5c09de12d185786b66d2 Mon Sep 17 00:00:00 2001 From: Joe Cheng Date: Sat, 8 May 2021 01:22:18 -0700 Subject: [PATCH 1/4] Initial sketch for Rmd-style reporting --- .gitignore | 1 + _functions_compare.R | 78 ++++++++++++++++++++ renv.lock | 165 +++++++++++++++++++++++++++++++++++++------ report.Rmd | 105 +++++++++++++++++++++++++++ report.css | 4 ++ 5 files changed, 330 insertions(+), 23 deletions(-) create mode 100644 _functions_compare.R create mode 100644 report.Rmd create mode 100644 report.css diff --git a/.gitignore b/.gitignore index 5b6a065..35bd117 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +report.html diff --git a/_functions_compare.R b/_functions_compare.R new file mode 100644 index 0000000..ab15502 --- /dev/null +++ b/_functions_compare.R @@ -0,0 +1,78 @@ +fsbench_results <- NULL +fsbench_runs <- character(0) +fsbench_tasks <- character(0) + +fsbench_report_init <- function(params) { + files_to_read <- lapply(params, Sys.glob) + bad_glob <- which(vapply(files_to_read, length, integer(1)) == 0) + if (length(bad_glob) > 0) { + stop(sprintf("Bad params value '%s': no files found that matched '%s'", names(params)[bad_glob[1]], params[bad_glob[1]])) + } + + fsbench_results <<- do.call(rbind, + mapply(read_runs, names(params), files_to_read, SIMPLIFY = FALSE, USE.NAMES = FALSE) + ) + fsbench_runs <<- unique(names(params)) + fsbench_tasks <<- unique(fsbench_results$task) +} + +fsbench_take_results <- function(tasks) { + matches <- fsbench_results$task %in% tasks + fsbench_tasks <<- setdiff(fsbench_tasks, tasks) + df <- fsbench_results[matches,] + df +} + +fsbench_plot <- function(df, scales = c("fixed", "free")) { + scales <- match.arg(scales) + + p <- ggplot(df, aes(run_name, elapsed, fill = run_name)) + + if (all(df$parallelism == 1)) { + p <- p + geom_bar(stat = "identity", show.legend = FALSE) + + xlab("Configuration") + } else { + p <- p + geom_line(aes(x = parallelism, group = run_name, 
color = run_name)) + + geom_point(aes(x = parallelism, color = run_name)) + + xlab("Parallelism") + } + + p <- p + + facet_wrap(~ task, ncol = length(unique(df$task)), scales = scales) + + ylab("Elasped (seconds)") + p +} + +fsbench_table <- function(df) { + df <- df[order(df$task, df$run_name), c("task", "run_name", "elapsed", "parallelism")] + if (all(df$parallelism == 1)) { + df$parallelism <- NULL + df <- tidyr::pivot_wider(df, id_cols = task, names_from = run_name, values_from = elapsed) + } else { + df <- tidyr::pivot_wider(df, id_cols = c(task, parallelism), names_from = run_name, values_from = elapsed) + } + knitr::kable(df, row.names = FALSE) +} + +read_runs <- function(run_name, files) { + # Read each file into a separate data frame, using read.csv(stringsAsFactors=FALSE) + data_frames <- lapply(files, read.csv, stringsAsFactors = FALSE) + # Combine all data frames into a single data frame + data_frame_all <- do.call(rbind, data_frames) + # Fix-up NAs for parallelism - set them to 1 as parallel tests never have a parallelization factor of 1 + # This makes sure that all of the aggregation functions work correctly + data_frame_all$parallelism <- ifelse(is.na(data_frame_all$parallelism), 1, data_frame_all$parallelism) + # This factor() call is necessary to prevent aggregate() from reordering + # by task, alphabetically + data_frame_all$task <- factor(data_frame_all$task, unique(data_frame_all$task)) + # Break data frame into groups of rows based on `task` and `parallelism`, then calculate + # mean(elapsed), and return the result as a data frame + data_frame_mean <- aggregate(elapsed ~ task+parallelism, data_frame_all, mean) + # Return the data in the shape that we ultimately want + data.frame( + run_name = factor(run_name, levels = unique(run_name)), + task = factor(data_frame_mean$task, levels = unique(data_frame_mean$task)), + elapsed = data_frame_mean$elapsed, + parallelism = data_frame_mean$parallelism + ) +} diff --git a/renv.lock b/renv.lock index 
ab00c60..a250916 100644 --- a/renv.lock +++ b/renv.lock @@ -1,6 +1,6 @@ { "R": { - "Version": "4.0.2", + "Version": "4.0.5", "Repositories": [ { "Name": "CRAN", @@ -48,14 +48,14 @@ "Package": "R6", "Version": "2.5.0", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Hash": "b203113193e70978a696b2809525649d" }, "RColorBrewer": { "Package": "RColorBrewer", "Version": "1.1-2", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "e031418365a7f7a766181ab5a41a5716" }, "Rcpp": { @@ -72,39 +72,46 @@ "Repository": "CRAN", "Hash": "50c838a310445e954bc13f26f26a6ecf" }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, "bit": { "Package": "bit", "Version": "4.0.4", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "f36715f14d94678eea9933af927bc15d" }, "bit64": { "Package": "bit64", "Version": "4.0.5", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "9fe98599ca456d6552421db0d6772d8f" }, "cli": { "Package": "cli", "Version": "2.3.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "3e3f28efcadfda442cd18651fbcbbecf" }, "colorspace": { "Package": "colorspace", "Version": "2.0-0", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "abea3384649ef37f60ef51ce002f3547" }, "cpp11": { "Package": "cpp11", "Version": "0.2.6", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "f08909ebdad90b19d8d3930da4220564" }, "crayon": { @@ -125,9 +132,16 @@ "Package": "digest", "Version": "0.6.27", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Hash": "a0cbe758a531d054b537d16dff4d58a1" }, + "dplyr": { + "Package": "dplyr", + "Version": "1.0.6", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "19e84500b64bc7e589cb1e2550e25832" + }, "ellipsis": { 
"Package": "ellipsis", "Version": "0.3.1", @@ -135,6 +149,13 @@ "Repository": "CRAN", "Hash": "fd2844b3a43ae2d27e70ece2df1b4e2a" }, + "evaluate": { + "Package": "evaluate", + "Version": "0.14", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "ec8ca05cffcc70569eaaad8469d2a3a7" + }, "fansi": { "Package": "fansi", "Version": "0.4.2", @@ -146,37 +167,51 @@ "Package": "farver", "Version": "2.1.0", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "c98eb5133d9cb9e1622b8691487f11bb" }, "fst": { "Package": "fst", "Version": "0.9.4", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Hash": "66c2d5dffed8d181b9555617beb2d0f9" }, + "generics": { + "Package": "generics", + "Version": "0.1.0", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "4d243a9c10b00589889fe32314ffd902" + }, "ggplot2": { "Package": "ggplot2", "Version": "3.3.3", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "3eb6477d01eb5bbdc03f7d5f70f2733e" }, "glue": { "Package": "glue", "Version": "1.4.2", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Hash": "6efd734b14c6471cfe443345f3e35e29" }, "gtable": { "Package": "gtable", "Version": "0.3.0", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "ac5c6baf7822ce8732b343f14c072c4d" }, + "highr": { + "Package": "highr", + "Version": "0.9", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "8eb36c8125038e648e5d111c0d7b2ed4" + }, "hms": { "Package": "hms", "Version": "1.0.0", @@ -184,18 +219,39 @@ "Repository": "RSPM", "Hash": "bf552cdd96f5969873afdac7311c7d0d" }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.1.1", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "af2c2531e55df5cf230c4b5444fc973c" + }, "isoband": { "Package": "isoband", "Version": "0.2.4", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "b2008df40fb297e3fef135c7e8eeec1a" }, + 
"jsonlite": { + "Package": "jsonlite", + "Version": "1.7.2", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "98138e0994d41508c7a6b84a0600cfcb" + }, + "knitr": { + "Package": "knitr", + "Version": "1.33", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "0bc1b5da1b0eb07cd4b727e95e9ff0b8" + }, "labeling": { "Package": "labeling", "Version": "0.4.2", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "3d5108641f47470611a32d0bdf357a72" }, "lattice": { @@ -216,9 +272,16 @@ "Package": "magrittr", "Version": "2.0.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Hash": "41287f1ac7d28a92f0a286ed507928d3" }, + "markdown": { + "Package": "markdown", + "Version": "1.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "61e4a10781dd00d7d81dd06ca9b94e95" + }, "mgcv": { "Package": "mgcv", "Version": "1.8-33", @@ -226,11 +289,18 @@ "Repository": "CRAN", "Hash": "eb7b6439bc6d812eed2cddba5edc6be3" }, + "mime": { + "Package": "mime", + "Version": "0.10", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "26fa77e707223e1ce042b2b5d09993dc" + }, "munsell": { "Package": "munsell", "Version": "0.5.0", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "6dfe8bf774944bd5595785e3229d8771" }, "nlme": { @@ -244,7 +314,7 @@ "Package": "pillar", "Version": "1.5.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "24622aa4a0d3de3463c34513edca99b2" }, "pkgconfig": { @@ -289,20 +359,48 @@ "Repository": "CRAN", "Hash": "599df23c40a4fce9c7b4764f28c37857" }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.7", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "edbf4cb1aefae783fd8d3a008ae51943" + }, "scales": { "Package": "scales", "Version": "1.1.1", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "6f76f71042411426ec8df6c54f34e6dd" }, + "stringi": { + "Package": "stringi", + "Version": 
"1.5.3", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a063ebea753c92910a4cca7b18bc1f05" + }, + "stringr": { + "Package": "stringr", + "Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "0759e6b6c0957edb1311028a49a35e76" + }, "tibble": { "Package": "tibble", "Version": "3.1.0", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "4d894a114dbd4ecafeda5074e7c538e6" }, + "tidyr": { + "Package": "tidyr", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "450d7dfaedde58e28586b854eeece4fa" + }, "tidyselect": { "Package": "tidyselect", "Version": "1.1.0", @@ -310,6 +408,13 @@ "Repository": "CRAN", "Hash": "6ea435c354e8448819627cf686f66e0a" }, + "tinytex": { + "Package": "tinytex", + "Version": "0.31", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "25b572f764f3c19fef9aac33b5724f3d" + }, "utf8": { "Package": "utf8", "Version": "1.2.1", @@ -328,7 +433,7 @@ "Package": "viridisLite", "Version": "0.3.0", "Source": "Repository", - "Repository": "RSPM", + "Repository": "CRAN", "Hash": "ce4f6271baa94776db692f1cb2055bee" }, "vroom": { @@ -342,8 +447,22 @@ "Package": "withr", "Version": "2.4.1", "Source": "Repository", - "Repository": "CRAN", + "Repository": "RSPM", "Hash": "caf4781c674ffa549a4676d2d77b13cc" + }, + "xfun": { + "Package": "xfun", + "Version": "0.22", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "eab2f8ba53809c321813e72ecbbd19ba" + }, + "yaml": { + "Package": "yaml", + "Version": "2.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "2826c5d9efb0a88f657c7a679c7106db" } } } diff --git a/report.Rmd b/report.Rmd new file mode 100644 index 0000000..538a75f --- /dev/null +++ b/report.Rmd @@ -0,0 +1,105 @@ +--- +title: "fsbench report" +output: + html_document: + css: report.css +params: + EBS: examples/ebs.csv + EFS: examples/efs.csv +--- + +```{r setup, echo=FALSE} +library(ggplot2) +knitr::opts_chunk$set(echo = FALSE, 
fig.height = 3.5) + +source("_functions_compare.R") +fsbench_report_init(params) +``` + +## Install packages + +```{r} +df <- fsbench_take_results(c("Install MASS", "Install lattice", "Install BH")) +fsbench_plot(df) +fsbench_table(df) +``` + +## Write CSV + +```{r} +df <- fsbench_take_results(sprintf("Write CSV, %s", c("10KB", "1MB", "100MB", "1GB"))) +fsbench_plot(df, scales = "free") +fsbench_table(df) +``` + +```{r} +df <- fsbench_take_results(sprintf("Write CSV, 100MB over %s files", 10^(1:4))) +fsbench_plot(df, scales = "free") +fsbench_table(df) +``` + +## Read CSV + +```{r} +df <- fsbench_take_results(sprintf("Read CSV, %s", c("10KB", "1MB", "100MB", "1GB"))) +fsbench_plot(df, scales = "free") +fsbench_table(df) +``` + +```{r} +df <- fsbench_take_results(sprintf("Read CSV, 100MB over %s files", 10^(1:4))) +fsbench_plot(df, scales = "free") +fsbench_table(df) +``` + +## FST reads + +```{r} +df <- fsbench_take_results(sprintf("FST random reads, 100MB over %s*%s reads", 10^(1:4), c("10MB", "1MB", "100KB", "10KB"))) +fsbench_plot(df, scales = "free") +fsbench_table(df) +``` + +## Realistic CSV reads + +```{r} +df <- fsbench_take_results("Read 14 days of CRAN logs with fread") +fsbench_plot(df) +fsbench_table(df) +``` + +```{r} +df <- fsbench_take_results("Sample 5000 rows from each of 14 CRAN logs with vroom") +fsbench_plot(df) +fsbench_table(df) +``` + +## Parallel sequential reads/writes, 1GB + +```{r} +df <- fsbench_take_results("DD write, 1GB") +fsbench_plot(df) +fsbench_table(df) +``` + +```{r} +df <- fsbench_take_results("DD read, 1GB") +fsbench_plot(df) +fsbench_table(df) +``` + +## Parallel sequential reads/writes, 10MB over 1000 files + +```{r} +df <- fsbench_take_results("DD write, 10MB over 1000 files") +fsbench_plot(df) +fsbench_table(df) +``` + +```{r} +df <- fsbench_take_results("DD read, 10MB over 1000 files") +fsbench_plot(df) +fsbench_table(df) +``` + + diff --git a/report.css b/report.css new file mode 100644 index 0000000..486684d --- 
/dev/null +++ b/report.css @@ -0,0 +1,4 @@ +.table { + width: auto; +} + From 5fab91505bb202d5bb3fe4274e96810df106c52e Mon Sep 17 00:00:00 2001 From: Joe Cheng Date: Mon, 10 May 2021 21:29:55 -0700 Subject: [PATCH 2/4] Use R6 style for result object R6 emphasizes the mutable nature of this object --- _functions_compare.R | 63 +++++++++++++++++++++++++++++++++----------- report.Rmd | 60 ++++++++++++++++++++++++++++------------- report.css | 2 ++ 3 files changed, 91 insertions(+), 34 deletions(-) diff --git a/_functions_compare.R b/_functions_compare.R index ab15502..f1fb5df 100644 --- a/_functions_compare.R +++ b/_functions_compare.R @@ -1,6 +1,44 @@ -fsbench_results <- NULL -fsbench_runs <- character(0) -fsbench_tasks <- character(0) +# Represents a set of fsbench results, incorporating multiple runs and tasks +FSBenchResults <- R6::R6Class("FSBenchResults", + private = list( + results = "data.frame", + runs = factor(), + tasks = factor() + ), + public = list( + initialize = function(results, runs) { + private$results <- results + private$runs <- runs + private$tasks <- unique(results$task) + }, + # Retrieve the results for the specified tasks, and prevents them from + # being returned from future calls to self$remaining() + take = function(tasks, run_names = NULL) { + df <- self$peek(tasks, run_names = run_names) + private$tasks <- setdiff(private$tasks, tasks) + df + }, + # Like self$take(), but doesn't affect self$remaining() + peek = function(tasks, run_names = NULL) { + df <- private$results + df <- df[df$task %in% tasks,] + if (length(run_names) > 0) { + df <- df[df$run_name %in% run_names,] + unseen_run_names <- setdiff(run_names, df$run_name) + warning( + "Run name(s) requested but not found: ", + paste(unseen_run_names, collapse = ", ") + ) + } + df + }, + # Returns tasks that have not yet been returned by take() + remaining = function() { + df <- private$results + df[df$task %in% private$tasks,] + } + ) +) fsbench_report_init <- function(params) { 
files_to_read <- lapply(params, Sys.glob) @@ -9,21 +47,15 @@ fsbench_report_init <- function(params) { stop(sprintf("Bad params value '%s': no files found that matched '%s'", names(params)[bad_glob[1]], params[bad_glob[1]])) } - fsbench_results <<- do.call(rbind, + results <- do.call(rbind, mapply(read_runs, names(params), files_to_read, SIMPLIFY = FALSE, USE.NAMES = FALSE) ) - fsbench_runs <<- unique(names(params)) - fsbench_tasks <<- unique(fsbench_results$task) -} + runs <- unique(names(params)) -fsbench_take_results <- function(tasks) { - matches <- fsbench_results$task %in% tasks - fsbench_tasks <<- setdiff(fsbench_tasks, tasks) - df <- fsbench_results[matches,] - df + FSBenchResults$new(results, runs) } -fsbench_plot <- function(df, scales = c("fixed", "free")) { +fsbench_plot <- function(df, scales = c("fixed", "free"), ncol = length(unique(df$task)), nrow = 1) { scales <- match.arg(scales) p <- ggplot(df, aes(run_name, elapsed, fill = run_name)) @@ -38,8 +70,9 @@ fsbench_plot <- function(df, scales = c("fixed", "free")) { } p <- p + - facet_wrap(~ task, ncol = length(unique(df$task)), scales = scales) + - ylab("Elasped (seconds)") + facet_wrap(~ task, ncol = ncol, nrow = nrow, scales = scales) + + ylab("Elasped (seconds)") + + ylim(0, NA) p } diff --git a/report.Rmd b/report.Rmd index 538a75f..36a09d8 100644 --- a/report.Rmd +++ b/report.Rmd @@ -10,16 +10,28 @@ params: ```{r setup, echo=FALSE} library(ggplot2) -knitr::opts_chunk$set(echo = FALSE, fig.height = 3.5) +knitr::opts_chunk$set(echo = FALSE, fig.height = 3.5, fig.align = "center") source("_functions_compare.R") -fsbench_report_init(params) +results <- fsbench_report_init(params) ``` +All times are in seconds (lower is better). + ## Install packages +Installation of R packages is significantly affected by disk I/O, but still quite usable under EFS. 
+ ```{r} -df <- fsbench_take_results(c("Install MASS", "Install lattice", "Install BH")) +df <- results$take(c("Install MASS", "Install lattice")) +fsbench_plot(df) +fsbench_table(df) +``` + +One notable outlier is BH, which is a worst-case scenario for distributed filesystems: it contains nearly 13,000 small files (C++ header files that comprise a significant subset of the [Boost](https://www.boost.org/) library). Installing BH on EFS is so much slower than usual that users commonly think the session has hung. + +```{r fig.width=3.5} +df <- results$take(c("Install BH")) fsbench_plot(df) fsbench_table(df) ``` @@ -27,13 +39,18 @@ fsbench_table(df) ## Write CSV ```{r} -df <- fsbench_take_results(sprintf("Write CSV, %s", c("10KB", "1MB", "100MB", "1GB"))) -fsbench_plot(df, scales = "free") -fsbench_table(df) +# These tests are not that useful. Take them, but don't display them. +df <- results$take(sprintf("Write CSV, %s", c("10KB", "1MB", "100MB", "1GB"))) +# fsbench_plot(df, scales = "free") +# fsbench_table(df) ``` +Each of the following tests writes 100MB of CSV data, distributed over different numbers of files. The greater the number of files, the smaller each individual CSV file is; for example, 100MB over 1,000 files results in each file being 100KB, while 100MB over 10,000 files results in each file being 10KB. + +The collective overhead of EFS increases as we write to greater numbers of smaller files. 
+ ```{r} -df <- fsbench_take_results(sprintf("Write CSV, 100MB over %s files", 10^(1:4))) +df <- results$take(sprintf("Write CSV, 100MB over %s files", 10^(1:4))) fsbench_plot(df, scales = "free") fsbench_table(df) ``` @@ -41,13 +58,13 @@ fsbench_table(df) ## Read CSV ```{r} -df <- fsbench_take_results(sprintf("Read CSV, %s", c("10KB", "1MB", "100MB", "1GB"))) +df <- results$take(sprintf("Read CSV, %s", c("10KB", "1MB", "100MB", "1GB"))) fsbench_plot(df, scales = "free") fsbench_table(df) ``` ```{r} -df <- fsbench_take_results(sprintf("Read CSV, 100MB over %s files", 10^(1:4))) +df <- results$take(sprintf("Read CSV, 100MB over %s files", 10^(1:4))) fsbench_plot(df, scales = "free") fsbench_table(df) ``` @@ -55,21 +72,21 @@ fsbench_table(df) ## FST reads ```{r} -df <- fsbench_take_results(sprintf("FST random reads, 100MB over %s*%s reads", 10^(1:4), c("10MB", "1MB", "100KB", "10KB"))) +df <- results$take(sprintf("FST random reads, 100MB over %s*%s reads", 10^(1:4), c("10MB", "1MB", "100KB", "10KB"))) fsbench_plot(df, scales = "free") fsbench_table(df) ``` ## Realistic CSV reads -```{r} -df <- fsbench_take_results("Read 14 days of CRAN logs with fread") +```{r fig.width=3.5} +df <- results$take("Read 14 days of CRAN logs with fread") fsbench_plot(df) fsbench_table(df) ``` -```{r} -df <- fsbench_take_results("Sample 5000 rows from each of 14 CRAN logs with vroom") +```{r fig.width=3.5} +df <- results$take("Sample 5000 rows from each of 14 CRAN logs with vroom") fsbench_plot(df) fsbench_table(df) ``` @@ -77,13 +94,13 @@ fsbench_table(df) ## Parallel sequential reads/writes, 1GB ```{r} -df <- fsbench_take_results("DD write, 1GB") +df <- results$take("DD write, 1GB") fsbench_plot(df) fsbench_table(df) ``` ```{r} -df <- fsbench_take_results("DD read, 1GB") +df <- results$take("DD read, 1GB") fsbench_plot(df) fsbench_table(df) ``` @@ -91,15 +108,20 @@ fsbench_table(df) ## Parallel sequential reads/writes, 10MB over 1000 files ```{r} -df <- fsbench_take_results("DD write, 
10MB over 1000 files") +df <- results$take("DD write, 10MB over 1000 files") fsbench_plot(df) fsbench_table(df) ``` ```{r} -df <- fsbench_take_results("DD read, 10MB over 1000 files") +df <- results$take("DD read, 10MB over 1000 files") fsbench_plot(df) fsbench_table(df) ``` - +```{r} +df <- results$remaining() +if (nrow(df) > 0) { + warning("Unreported task(s) detected: ", paste(paste0("'", unique(df$task), "'"), collapse = ", ")) +} +``` diff --git a/report.css b/report.css index 486684d..177540d 100644 --- a/report.css +++ b/report.css @@ -1,4 +1,6 @@ .table { width: auto; + margin-left: auto; + margin-right: auto; } From 7765a459459239eb616146e2dfa39a0efa998a70 Mon Sep 17 00:00:00 2001 From: Joe Cheng Date: Tue, 11 May 2021 17:21:20 -0700 Subject: [PATCH 3/4] Recommendations doc --- _functions_compare.R | 8 +- recommendations.Rmd | 192 +++++++++++++++++++++++++++++++++++++++++++ report.css | 3 + 3 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 recommendations.Rmd diff --git a/_functions_compare.R b/_functions_compare.R index f1fb5df..b5e2dbd 100644 --- a/_functions_compare.R +++ b/_functions_compare.R @@ -63,10 +63,16 @@ fsbench_plot <- function(df, scales = c("fixed", "free"), ncol = length(unique(d if (all(df$parallelism == 1)) { p <- p + geom_bar(stat = "identity", show.legend = FALSE) + xlab("Configuration") + + # If too many runs, turn the x-axis labels 90 degrees so they fit + if (length(unique(df$run_name)) > 5) { + p <- p + theme(axis.text.x = element_text(angle = 90)) + } } else { p <- p + geom_line(aes(x = parallelism, group = run_name, color = run_name)) + geom_point(aes(x = parallelism, color = run_name)) + - xlab("Parallelism") + xlab("Parallelism") + + theme(legend.title = element_blank()) } p <- p + diff --git a/recommendations.Rmd b/recommendations.Rmd new file mode 100644 index 0000000..72d86e6 --- /dev/null +++ b/recommendations.Rmd @@ -0,0 +1,192 @@ +--- +title: "EFS recommendations (DRAFT)" +output: + html_document: + 
css: report.css +--- + +```{r setup, echo=FALSE} +library(ggplot2) +knitr::opts_chunk$set(echo = FALSE, fig.height = 3.5, fig.align = "center") + +source("_functions_compare.R") + +show_results <- function(tasks, ..., scales = "fixed") { + results <- fsbench_report_init(rlang::list2(...)) + df <- results$peek(tasks) + knitr::knit_print(fsbench_plot(df, scales = scales)) + knitr::knit_print(fsbench_table(df)) +} + +task_write_latency <- "Write CSV, 100MB over 1000 files" +task_read_latency <- "Read CSV, 100MB over 1000 files" +task_latency <- c(task_write_latency, task_read_latency) +task_parallel_read <- c("DD read, 1GB", "DD read, 10MB over 1000 files") +task_parallel_write <- c("DD write, 1GB", "DD write, 10MB over 1000 files") +task_parallel_thru <- c("DD read, 1GB", "DD write, 1GB") +task_parallel_latency <- c("DD read, 10MB over 1000 files", "DD write, 10MB over 1000 files") +``` + +With much of EFS performance being based on various usage patterns, this document should serve as a starting point. Be aware that tuning EFS will be a requirement after monitoring customer behavior and likely require monitoring for maintaining long-term performance. + +See the page for test methods and results. + +In the results below, all times are in seconds (lower is better). + +## Max I/O vs. GeneralPurpose + +**Summary: Use GeneralPurpose** + +When creating a filesystem you must choose a performance mode which cannot be altered. We strongly recommend using GeneralPurpose. + +[AWS recommends](https://docs.aws.amazon.com/efs/latest/ug/performance.html): + +> File systems in the Max I/O mode can scale to higher levels of aggregate throughput and operations per second. This scaling is done with a tradeoff of slightly higher latencies for file metadata operations. Highly parallelized applications and workloads, such as big data analysis, media processing, and genomic analysis, can benefit from this mode. 
+ +In our testing maxIO is dramatically worse because of the increased latency - especially in the "many small files" scenario. + +```{r} +show_results(task_latency, scales = "free", + "General Purpose" = "../aws-fsbench/results/modes1/efs*/*.csv", + "MaxIO" = "../aws-fsbench/results/modes2/efs*/*.csv", +) +``` + +## Bursting vs. Provisioned Throughput + +**Summary: Similar performance under normal conditions, but Provisioned lets you pay extra to avoid surprises** + +The default bursting behavior is likely how we will want customers to start using EFS. The reason behind this is because we have no way of predicting how much throughput they will need. The customers will need to monitor their Burst Credit balance and permitted throughput [via CloudWatch](https://docs.aws.amazon.com/efs/latest/ug/efs-metrics.html) to ensure that they are not surprised by throttling if they run out. We highly recommend setting alarms based on these metrics. + +Throttling is remedied by either generating more Burst Credits (writing files to the filesystem or waiting for the Burst Credits to refresh) or converting to Provisioned Throughput mode. Large filesystems (\> 1TB) should be able to theoretically burst for 50% of the time. For smaller filesystems, Provisioned Throughput can be set to maintain a constant performance level. + +Note that generating large files to bump into a larger tier of burst performance is both time consuming and expensive. Weigh these options carefully. Creating \> 1TB of data could cost hundreds of dollars just to store the initial data. + +If migrating to EFS, Provisioned Throughput can help save time if you wish to move a lot of data. In our tests, moving from Bursting to 500MiB Provisioned improved speed by 5x and preserved Burst Credits. + +In most of our testing for Multi AZ EFS, bursting performs better than provisioned. For One Zone, the difference appeared to be minimal. 
+ +```{r} +runs <- list( + "Bursting" = "../aws-fsbench/results/types4/efs*/*.csv", + "Provisioned" = "../aws-fsbench/results/types3/efs*/*.csv" +) + +show_results(c("Read CSV, 100MB", "Read 14 days of CRAN logs with fread"), scales = "free", !!!runs) +show_results(task_parallel_latency, scales = "free", !!!runs) +show_results(task_parallel_thru, scales = "free", !!!runs) +``` + +## Multi AZ vs One Zone + +**Summary: One Zone is significantly faster (and cheaper), Multi AZ has higher availability** + +AWS [currently supports](https://aws.amazon.com/efs/sla/) 99.99% uptime for Multi AZ and 99.9% for One Zone. + +Most of our customers who want fail over will prefer using a Multi AZ filesystem. However, there are major performance gains if they are willing to tolerate using a single availability zone. The One Zone filesystem is still durable, however if that availability zone goes down, there is no failover. This might be a great candidate for fast development environments. + +One Zone has performance that might be imperceptible compared to NFS. + +Read more about [storage classes here](https://docs.aws.amazon.com/efs/latest/ug/storage-classes.html). + +```{r} +show_results(task_latency, scales = "free", + "One Zone" = "../aws-fsbench/results/*/efs_one/*.csv", + "Multi AZ" = "../aws-fsbench/results/*/efs_multi/*.csv", +) +``` + +## Instance Types + +In general for EFS, [AWS recommends](https://docs.aws.amazon.com/efs/latest/ug/performance-tips.html) preferring instance types with more CPU or memory depending on the workload. Prefer memory-optimized or compute-optimized over general purpose instance types. + +For fsbench workloads, we have observed performance gains by using memory-optimized instance types, e.g. r5. For UI-related tasks like "Install BH" this could provide a nicer user experience. + +For servers which utilize many NFS client connections (e.g. Launcher) the enhanced networking might prove to be noticeably better. Consider using the "n" variants, e.g. 
r5n. + +```{r} +show_results(c(task_latency, "Install BH"), scales = "free", + t3.large = "../aws-fsbench/results/types*/efs_t3.large/*.csv", + i3.large = "../aws-fsbench/results/types*/efs_i3en.large/*.csv", + i3en.large = "../aws-fsbench/results/types*/efs_i3en.large/*.csv", + c5.xlarge = "../aws-fsbench/results/types*/efs_c5.xlarge/*.csv", + c5n.xlarge = "../aws-fsbench/results/types*/efs_c5n.xlarge/*.csv", + m5.large = "../aws-fsbench/results/types*/efs_m5.large/*.csv", + m5n.large = "../aws-fsbench/results/types*/efs_m5n.large/*.csv", + r5.large = "../aws-fsbench/results/types*/efs_r5.large/*.csv", + r5n.large = "../aws-fsbench/results/types*/efs_r5n.large/*.csv", +) +show_results(task_parallel_read, scales = "free", + t3.large = "../aws-fsbench/results/types*/efs_t3.large/*.csv", + i3.large = "../aws-fsbench/results/types*/efs_i3en.large/*.csv", + i3en.large = "../aws-fsbench/results/types*/efs_i3en.large/*.csv", + c5.xlarge = "../aws-fsbench/results/types*/efs_c5.xlarge/*.csv", + c5n.xlarge = "../aws-fsbench/results/types*/efs_c5n.xlarge/*.csv", + m5.large = "../aws-fsbench/results/types*/efs_m5.large/*.csv", + m5n.large = "../aws-fsbench/results/types*/efs_m5n.large/*.csv", + r5.large = "../aws-fsbench/results/types*/efs_r5.large/*.csv", + r5n.large = "../aws-fsbench/results/types*/efs_r5n.large/*.csv", +) +``` + +## Instance sizes + +We have observed significant gains in going from large to xlarge instance sizes - primarily in parallelized load. For servers with many users, increasing the instance size is recommended. Do not attempt to use smaller instance types e.g. c5.large with 4GB memory. 
+ +```{r} +show_results(task_latency, scales = "free", + t3.large = "../aws-fsbench/results/types3/efs_t3.large/*.csv", + t3.xlarge = "../aws-fsbench/results/types3/efs_t3.xlarge/*.csv", +) + +show_results(task_parallel_read, scales = "free", + t3.large = "../aws-fsbench/results/types3/efs_t3.large/*.csv", + t3.xlarge = "../aws-fsbench/results/types3/efs_t3.xlarge/*.csv", +) +``` + +## read_ahead_kb vs. default + +Linux kernels (5.4.\*) use a read_ahead_kb of 128, however the AWS docs recommend 15000. The [efs-utils](https://docs.aws.amazon.com/efs/latest/ug/installing-amazon-efs-utils.html) package will set this correctly, but for customers who wish to use only standard NFS utilities will need to [do this manually](https://docs.aws.amazon.com/efs/latest/ug/performance-tips.html#efs-perf-optimize-nfs-read-ahead). + +```{r} +show_results(task_latency, scales = "free", + "Without efs-utils" = "../aws-fsbench/results/types2/efs_t3.large/*.csv", + "With efs-utils" = "../aws-fsbench/results/types3/efs_t3.large/*.csv", +) + +show_results(task_parallel_write, scales = "free", + "Without efs-utils" = "../aws-fsbench/results/types2/efs_t3.large/*.csv", + "With efs-utils" = "../aws-fsbench/results/types3/efs_t3.large/*.csv", +) +``` + +## Mounting considerations + +We strongly recommend using [efs-utils](https://docs.aws.amazon.com/efs/latest/ug/installing-amazon-efs-utils.html) to mount the EFS filesystem. If this is not feasible, standard NFS client connections are possible, but there are [mounting instructions](https://docs.aws.amazon.com/efs/latest/ug/mounting-fs.html) and [additional considerations](https://docs.aws.amazon.com/efs/latest/ug/mounting-fs-mount-cmd-general.html) to take into account. + +## Multiple Users + +When using an EFS filesystem for many users, we recommend splitting up the data between users as much as possible. For example, writing large files will block metadata operations in that directory until the write operation is complete. 
Try to keep the users isolated to separate directories whenever possible. + +## Special Considerations and Product Limitations + +Operations which consume many small files will not perform well in most EFS settings. + +We recommend pre-installing R packages so that users do not have to repeatedly install them. + +Prefer reading large files over splitting data between many small files. + +Project sharing using RSW (in its current state) will not work due to NFS ACLs [not being supported by EFS](https://docs.aws.amazon.com/efs/latest/ug/limits.html). + +For RSW, the default lock type of link-based won't work; the [advisory type must be used](https://docs.rstudio.com/ide/server-pro/latest/load-balancing.html#lock-configuration) instead. + +## Monitoring usage + +Please read about [available CloudWatch metrics](https://docs.aws.amazon.com/efs/latest/ug/monitoring-metric-math.html#metric-math-throughput-utilization) and creating customized metrics using [metric math for EFS](https://docs.aws.amazon.com/efs/latest/ug/monitoring-metric-math.html#metric-math-throughput-utilization). + +If using Bursting mode, be sure to monitor the BurstCreditBalance metric. If this begins to decrease substantially over time, it will be time to consider adding data to bump the filesystem size into a [larger tier](https://docs.aws.amazon.com/efs/latest/ug/performance.html) with more burst credits, or moving to Provisioned Throughput to establish a consistent baseline. + +If using Bursting mode, using [metric math](https://docs.aws.amazon.com/efs/latest/ug/monitoring-metric-math.html#metric-math-throughput-utilization), you can compare MeteredIOBytes to PermittedThroughput to know if you are using all of your available throughput. If this is the case, it might be an indication that you should move to Provisioned Throughput. + +If using Provisioned Throughput, PermittedThroughput can be used to determine whether or not your storage volume has bumped you above your designated throughput setting. 
diff --git a/report.css b/report.css index 177540d..a0216de 100644 --- a/report.css +++ b/report.css @@ -2,5 +2,8 @@ width: auto; margin-left: auto; margin-right: auto; + border: 1px solid #CCC; + border-left: none; + border-right: none; } From 651b9b425a76a721b6d5500d57f1d8c8602d8446 Mon Sep 17 00:00:00 2001 From: Joe Cheng Date: Tue, 11 May 2021 23:55:31 -0700 Subject: [PATCH 4/4] Word wrapping; more commentary --- _functions_compare.R | 7 +++++++ report.Rmd | 33 +++++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/_functions_compare.R b/_functions_compare.R index b5e2dbd..b81b461 100644 --- a/_functions_compare.R +++ b/_functions_compare.R @@ -58,6 +58,13 @@ fsbench_report_init <- function(params) { fsbench_plot <- function(df, scales = c("fixed", "free"), ncol = length(unique(df$task)), nrow = 1) { scales <- match.arg(scales) + df$task <- local({ + task <- lapply(df$task, strwrap, width = 20) + # Join each character vector's elements, using \n + task <- vapply(task, paste, character(1), collapse = "\n") + factor(task, levels = unique(task)) + }) + p <- ggplot(df, aes(run_name, elapsed, fill = run_name)) if (all(df$parallelism == 1)) { diff --git a/report.Rmd b/report.Rmd index 36a09d8..67153bc 100644 --- a/report.Rmd +++ b/report.Rmd @@ -4,8 +4,10 @@ output: html_document: css: report.css params: - EBS: examples/ebs.csv - EFS: examples/efs.csv + EBS: ../aws-fsbench/results/kevin7/ebs/*.csv + EFS: ../aws-fsbench/results/kevin7/efs_multi/*.csv + EFS1: ../aws-fsbench/results/kevin7/efs_one/*.csv + NFS: ../aws-fsbench/results/kevin7/nfs/*.csv --- ```{r setup, echo=FALSE} @@ -18,6 +20,8 @@ results <- fsbench_report_init(params) All times are in seconds (lower is better). +Note that the "EFS" flavor is EFS in Multiple Availability Zone mode, and "EFS1" is EFS in Single Availability Zone mode. + ## Install packages Installation of R packages is significantly affected by disk I/O, but still quite usable under EFS. 
@@ -57,10 +61,13 @@ fsbench_table(df) ## Read CSV +Reading small files from EFS is slower than from EBS/NFS, but the difference is of a smaller magnitude than writing. + ```{r} +# These tests are not that useful. Take them, but don't display them. df <- results$take(sprintf("Read CSV, %s", c("10KB", "1MB", "100MB", "1GB"))) -fsbench_plot(df, scales = "free") -fsbench_table(df) +# fsbench_plot(df, scales = "free") +# fsbench_table(df) ``` ```{r} @@ -71,6 +78,8 @@ fsbench_table(df) ## FST reads +FST read results are similar to CSV reads, despite ostensibly performing random reads instead of sequential. This may be because of aggressive prefetching due to Amazon's recommended NFS client configuration values. + ```{r} df <- results$take(sprintf("FST random reads, 100MB over %s*%s reads", 10^(1:4), c("10MB", "1MB", "100KB", "10KB"))) fsbench_plot(df, scales = "free") @@ -79,6 +88,8 @@ fsbench_table(df) ## Realistic CSV reads +This test reads real CRAN daily download log files, each about 60MB. + ```{r fig.width=3.5} df <- results$take("Read 14 days of CRAN logs with fread") fsbench_plot(df) @@ -91,7 +102,11 @@ fsbench_plot(df) fsbench_table(df) ``` -## Parallel sequential reads/writes, 1GB +## Parallel 1GB reads/writes + +This measures using `dd` to write 1GB using 2, 4, 8, and 16 worker processes, all on a single node. Note that *each* worker writes 1GB; as the number of workers increases, so does the total amount of data being written. + +EFS and EFS1 are actually quite a bit *faster* than EBS and NFS. EBS and NFS have suspiciously identical timings, as do EFS and EFS1. More investigation may be warranted here. ```{r} df <- results$take("DD write, 1GB") @@ -99,13 +114,17 @@ fsbench_plot(df) fsbench_table(df) ``` +EFS and EFS1 are massively faster than EBS and NFS for reads. Again, EFS and EFS1 have suspiciously close timings. 
+ ```{r} df <- results$take("DD read, 1GB") fsbench_plot(df) fsbench_table(df) ``` -## Parallel sequential reads/writes, 10MB over 1000 files +## Parallel 10KB reads/writes + +Similar to the previous "Parallel 1GB reads/writes", but this time, with each worker reading or writing 1000 x 10KB files. The results are very different, with EBS the clear winner and EFS the clear loser. ```{r} df <- results$take("DD write, 10MB over 1000 files") @@ -113,6 +132,8 @@ fsbench_plot(df) fsbench_table(df) +Read performance is much closer though, with all four filesystems performing quite similarly. + ```{r} df <- results$take("DD read, 10MB over 1000 files") fsbench_plot(df)