diff --git a/NEWS.md b/NEWS.md index b14b9491ce..a311eec860 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,6 +38,8 @@ 11. `split.data.table` recognizes `sep=` when splitting with `by=`, just like the default and data.frame methods [#5417](https://github.com/Rdatatable/data.table/issues/5417). +12. `setDT` is faster for data with many columns, thanks @MichaelChirico for reporting and fixing the issue, [#5426](https://github.com/Rdatatable/data.table/issues/5426). + ## BUG FIXES 1. `unique()` returns a copy the case when `nrows(x) <= 1` instead of a mutable alias, [#5932](https://github.com/Rdatatable/data.table/pull/5932). This is consistent with existing `unique()` behavior when the input has no duplicates but more than one row. Thanks to @brookslogan for the report and @dshemetov for the fix. diff --git a/R/data.table.R b/R/data.table.R index 5513fb276f..81b647fb2d 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2846,12 +2846,6 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { stopf("Cannot convert '%1$s' to data.table by reference because binding is locked. It is very likely that '%1$s' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.", cname) } } - # check no matrix-like columns, #3760. Other than a single list(matrix) is unambiguous and depended on by some revdeps, #3581 - if (length(x)>1L) { - idx = vapply_1i(x, function(xi) length(dim(xi)))>1L - if (any(idx)) - warningf("Some columns are a multi-column type (such as a matrix column): %s. setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column.", brackify(which(idx))) - } if (is.data.table(x)) { # fix for #1078 and #1128, see .resetclass() for explanation. setattr(x, 'class', .resetclass(x, 'data.table')) @@ -2859,6 +2853,14 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE)) if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x) } else if (is.data.frame(x)) { + # check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581 + # for performance, only warn on the first such column, #5426 + for (jj in seq_along(x)) { + if (length(dim(x[[jj]])) > 1L) { + .Call(Cwarn_matrix_column_r, jj) + break + } + } rn = if (!identical(keep.rownames, FALSE)) rownames(x) else NULL setattr(x, "row.names", .set_row_names(nrow(x))) if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE)) @@ -2877,22 +2879,11 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { x = null.data.table() } else if (is.list(x)) { # copied from as.data.table.list - except removed the copy - for (i in seq_along(x)) { - if (is.null(x[[i]])) next # allow NULL columns to be created by setDT(list) even though they are not really allowed - # many operations still work in the presence of NULL columns and it might be convenient - # e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which - # fail for NULL columns will give helpful error at that point, #3480 and #3471 - if (inherits(x[[i]], "POSIXlt")) stopf("Column %d is of POSIXlt type. Please convert it to POSIXct using as.POSIXct and run setDT again. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.", i) - } - n = vapply_1i(x, length) - n_range = range(n) - if (n_range[1L] != n_range[2L]) { - tbl = sort(table(n)) - stopf("All elements in argument 'x' to 'setDT' must be of same length, but the profile of input lengths (length:frequency) is: %s\nThe first entry with fewer than %d entries is %d.", brackify(sprintf('%s:%d', names(tbl), tbl)), n_range[2L], which.max(n 1) { + warn_matrix_column(i+1); + test_matrix_cols = false; + } + len_xi = INTEGER(dim_xi)[0]; + } else { + len_xi = LENGTH(xi); + } + if (!base_length) { + base_length = len_xi; + } else if (len_xi != base_length) { + error(_("All elements in argument 'x' to 'setDT' must be of equal length, but input %d has length %d whereas the first non-empty input had length %d"), i+1, len_xi, base_length); + } + } + return ScalarInteger(base_length); +} + SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose) { SEXP names, klass; // klass not class at request of pydatatable because class is reserved word in C++, PR #3129 diff --git a/src/data.table.h b/src/data.table.h index 297167d467..811e6a5a17 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -175,6 +175,7 @@ SEXP dt_na(SEXP x, SEXP cols); SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); +void warn_matrix_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, @@ -275,6 +276,7 @@ SEXP notchin(SEXP x, SEXP table); SEXP setattrib(SEXP, SEXP, SEXP); SEXP assign(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP copy(SEXP); +SEXP setdt_nrows(SEXP); SEXP alloccolwrapper(SEXP, SEXP, SEXP); SEXP selfrefokwrapper(SEXP, SEXP); SEXP truelength(SEXP); @@ -317,6 +319,7 @@ SEXP glast(SEXP); SEXP gfirst(SEXP); SEXP gnthvalue(SEXP, SEXP); SEXP dim(SEXP); +SEXP warn_matrix_column_r(SEXP); SEXP gvar(SEXP, SEXP); SEXP gsd(SEXP, SEXP); SEXP gprod(SEXP, SEXP); diff --git a/src/init.c b/src/init.c index a974a2d95d..54039584fe 100644 --- a/src/init.c +++ b/src/init.c @@ -51,6 +51,7 @@ R_CallMethodDef callMethods[] = { {"Cdogroups", (DL_FUNC) &dogroups, -1}, {"Ccopy", (DL_FUNC) ©, -1}, {"Cshallowwrapper", (DL_FUNC) &shallowwrapper, -1}, +{"Csetdt_nrows", (DL_FUNC) &setdt_nrows, -1}, {"Calloccolwrapper", (DL_FUNC) &alloccolwrapper, -1}, {"Cselfrefokwrapper", (DL_FUNC) &selfrefokwrapper, -1}, {"Ctruelength", (DL_FUNC) &truelength, -1}, @@ -141,6 +142,7 @@ R_CallMethodDef callMethods[] = { {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, +{"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, {NULL, NULL, 0} }; diff --git a/src/wrappers.c b/src/wrappers.c index a23b169685..6587caa97a 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -120,3 +120,7 @@ SEXP dim(SEXP x) return ans; } +SEXP warn_matrix_column_r(SEXP i) { + warn_matrix_column(INTEGER(i)[0]); + return R_NilValue; +}