From e7d1412498022657889ca2afd8c7a7af937d15cb Mon Sep 17 00:00:00 2001 From: antonm Date: Sat, 1 Feb 2025 23:02:06 -0500 Subject: [PATCH 1/6] Fixed nested ':=' reference assignment fails (#6768) *Followed TODO: by mattdowle from resolution to '2-space indentation #2420' *Added tests for jsub that modify DT by-reference *Added test case for interger vector indexing --- NEWS.md | 2 + R/data.table.R | 129 +++++++++++++++++++++++++++--------------- inst/tests/tests.Rraw | 11 +++- 3 files changed, 94 insertions(+), 48 deletions(-) diff --git a/NEWS.md b/NEWS.md index f0ddda29e..515bd7b75 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,6 +133,8 @@ rowwiseDT( 19. An integer overflow in `fread()` with lines longer than `2^(31/2)` bytes is prevented, [#6729](https://github.com/Rdatatable/data.table/issues/6729). The typical impact was no worse than a wrong initial allocation size, corrected later. Thanks to @TaikiSan21 for the report and @aitap for the fix. +20. By reference assignments (':=') with functions that modify the data.table by reference e.g. (`foo=function(DT){DT[,b:=1L];return(2L)}`, `DT[,a:=foo(DT)]`) returned a mallformed data.table due to the the modification of the targeted named column ("a") index before and after the j expression evaluation. Thanks @AntonNM for the the report and fix. + ## NOTES 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181). diff --git a/R/data.table.R b/R/data.table.R index eeb9a1ca2..1e5edef1a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1154,7 +1154,6 @@ replace_dot_alias = function(e) { } else if (is.numeric(lhs)) { m = as.integer(lhs) if (any(m<1L | ncol(x) DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame - if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) { - DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove - n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() - # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). - name = substitute(x) - if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) - catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) - # #1729 -- copying to the wrong environment here can cause some confusion - if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") - - # Verbosity should not issue warnings, so cat rather than warning. - # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. - - # TO DO ... comments moved up from C ... - # Note that the NAMED(dt)>1 doesn't work because .Call - # always sets to 2 (see R-ints), it seems. Work around - # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too - # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2. - # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we - # don't mind. - } - setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope - if (is.name(name)) { - assign(as.character(name),x,parent.frame(),inherits=TRUE) - } else if (.is_simple_extraction(name)) { # TODO(#6702): use a helper here as the code is very similar to setDT(). - k = eval(name[[2L]], parent.frame(), parent.frame()) - if (is.list(k)) { - origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) - if (is.character(j)) { - if (length(j)!=1L) stopf("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is %d", length(j)) - j = match(j, names(k)) - if (is.na(j)) internal_error("item '%s' not found in names of list", origj) # nocov - } - .Call(Csetlistelt,k,as.integer(j), x) - } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { - assign(as.character(name[[3L]]), x, k, inherits=FALSE) - } else if (isS4(k)) { - .Call(CsetS4elt, k, as.character(name[[3L]]), x) - } - } # TO DO: else if env$<- or list$<- - } } } } @@ -1402,6 +1360,85 @@ replace_dot_alias = function(e) { } if (!is.null(lhs)) { + # Re-matches characters names in the lhs after jval to account for jsub's that modify the columns of the data.table (#6768) + # Replaces numerical lhs with respective names_x + if(is.character(lhs)){ + m = chmatch(lhs, names_x) + if(!anyNA(m)){ + # updates by reference to existing columns + cols = as.integer(m) + newnames=NULL + if (identical(irows, integer())) { + # Empty integer() means no rows e.g. logical i with only FALSE and NA + # got converted to empty integer() by the which() above + # Short circuit and do-nothing since columns already exist. If some don't + # exist then for consistency with cases where irows is non-empty, we need to create + # them of the right type and populate with NA. Which will happen via the regular + # alternative branches below, to cover #759. + # We need this short circuit at all just for convenience. Otherwise users may need to + # fix errors in their RHS when called on empty edge cases, even when the result won't be + # used anyway (so it would be annoying to have to fix it.) + if (verbose) { + catf("No rows match i. No new columns to add so not evaluating RHS of :=\nAssigning to 0 row subset of %d rows\n", nrow(x)) + } + .Call(Cassign, x, irows, NULL, NULL, NULL) # only purpose is to write 0 to .Last.updated + .global$print = address(x) + return(invisible(x)) + } + }else{ + # Adding new column(s). + newnames=setdiff(lhs, names_x) + m[is.na(m)] = ncol(x)+seq_len(length(newnames)) + cols = as.integer(m) + # ok <- selfrefok above called without verbose -- only activated when + # ok=-1 which will trigger setalloccol with verbose in the next + # branch, which again calls _selfrefok and returns the message then + # !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame + if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) { + DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove + n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() + # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). + name = substitute(x) + if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) + catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) + # #1729 -- copying to the wrong environment here can cause some confusion + if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") + + # Verbosity should not issue warnings, so cat rather than warning. + # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. + + # TO DO ... comments moved up from C ... + # Note that the NAMED(dt)>1 doesn't work because .Call + # always sets to 2 (see R-ints), it seems. Work around + # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too + # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2. + # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we + # don't mind. + } + setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope + if (is.name(name)) { + assign(as.character(name),x,parent.frame(),inherits=TRUE) + } else if (.is_simple_extraction(name)) { # TODO(#6702): use a helper here as the code is very similar to setDT(). + k = eval(name[[2L]], parent.frame(), parent.frame()) + if (is.list(k)) { + origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame()) + if (is.character(j)) { + if (length(j)!=1L) stopf("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is %d", length(j)) + j = match(j, names(k)) + if (is.na(j)) internal_error("item '%s' not found in names of list", origj) # nocov + } + .Call(Csetlistelt,k,as.integer(j), x) + } else if (is.environment(k) && exists(as.character(name[[3L]]), k)) { + assign(as.character(name[[3L]]), x, k, inherits=FALSE) + } else if (isS4(k)) { + .Call(CsetS4elt, k, as.character(name[[3L]]), x) + } + } # TO DO: else if env$<- or list$<- + } + } + } else if (is.numeric(lhs)) { + lhs = names_x[m] + } # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. .Call(Cassign,x,irows,cols,newnames,jval) return(suppPrint(x)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0649b15ae..81361b341 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1979,8 +1979,9 @@ test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3, test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a)) # Test that with=FALSE by number isn't messed up by dup column names, #2025 -DT = data.table(a=1:3,a=4:6) -test(634, DT[,2:=200L], data.table(a=1:3,a=200L)) +DT = data.table(a=1:3, a=4:6, a=7:9) +test(634, DT[,2:=200L], data.table(a=1:3, a=200L, a=7:9)) +test(634.1, DT[,c(2, 3):=200L], data.table(a=1:3, a=200L, a=200L)) # Test names when not all items are named, #2029 DT = data.table(x=1:3,y=1:3) @@ -21063,3 +21064,9 @@ test(2304.100, set(copy(DT), i=2L, j=c("L1", "L2"), value=list(list(NULL), list( # the integer overflow in #6729 is only noticeable with UBSan test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE }) + +# j expressions that modify a data.table by reference, (#6768) +inner=function(dt){dt[,b:=4:6]} +outer=function(dt){inner(dt); return(7:9)} +DT = data.table(a=1:3) +test(2306, DT[,c:=outer(DT)], data.table(a=1:3, b=4:6, c=7:9)) From a5f40e8d69bfe0ceca27c331dfde739d866b528c Mon Sep 17 00:00:00 2001 From: antonm Date: Sat, 1 Feb 2025 23:21:23 -0500 Subject: [PATCH 2/6] Updated NEWS.md and tests.Rraw --- NEWS.md | 2 +- inst/tests/tests.Rraw | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 515bd7b75..0eee35221 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,7 +133,7 @@ rowwiseDT( 19. An integer overflow in `fread()` with lines longer than `2^(31/2)` bytes is prevented, [#6729](https://github.com/Rdatatable/data.table/issues/6729). The typical impact was no worse than a wrong initial allocation size, corrected later. Thanks to @TaikiSan21 for the report and @aitap for the fix. -20. By reference assignments (':=') with functions that modify the data.table by reference e.g. (`foo=function(DT){DT[,b:=1L];return(2L)}`, `DT[,a:=foo(DT)]`) returned a mallformed data.table due to the the modification of the targeted named column ("a") index before and after the j expression evaluation. Thanks @AntonNM for the the report and fix. +20. By reference assignments (':=') with functions that modified the data.table by reference e.g. (`foo=function(DT){modify(DT);return(1L)}`, `DT[,a:=foo(DT)]`) returned a mallformed data.table due to the modification of the targeted named column index ("a") during the j expression evaluation. Thanks @AntonNM for the report and fix. ## NOTES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 81361b341..4b325cd0a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21068,5 +21068,7 @@ test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE }) # j expressions that modify a data.table by reference, (#6768) inner=function(dt){dt[,b:=4:6]} outer=function(dt){inner(dt); return(7:9)} +foo=function(dt){dt[b:=4:6];return(7:9)} DT = data.table(a=1:3) test(2306, DT[,c:=outer(DT)], data.table(a=1:3, b=4:6, c=7:9)) +test(2306.1, DT[,c:=foo(DT)], data.table(a=1:3, b=4:6, c=7:9)) From da18bd83a0e4743369fd70cbc1ec6708690b9ded Mon Sep 17 00:00:00 2001 From: antonm Date: Sat, 1 Feb 2025 23:48:07 -0500 Subject: [PATCH 3/6] Added reference to issue to NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0eee35221..fa72f5b5d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -133,7 +133,7 @@ rowwiseDT( 19. An integer overflow in `fread()` with lines longer than `2^(31/2)` bytes is prevented, [#6729](https://github.com/Rdatatable/data.table/issues/6729). The typical impact was no worse than a wrong initial allocation size, corrected later. Thanks to @TaikiSan21 for the report and @aitap for the fix. -20. By reference assignments (':=') with functions that modified the data.table by reference e.g. (`foo=function(DT){modify(DT);return(1L)}`, `DT[,a:=foo(DT)]`) returned a mallformed data.table due to the modification of the targeted named column index ("a") during the j expression evaluation. Thanks @AntonNM for the report and fix. +20. By reference assignments (':=') with functions that modified the data.table by reference e.g. (`foo=function(DT){modify(DT);return(1L)}`, `DT[,a:=foo(DT)]`) returned a mallformed data.table due to the modification of the targeted named column index ("a") during the j expression evaluation [#6768](https://github.com/Rdatatable/data.table/issues/6768). Thanks @AntonNM for the report and fix. ## NOTES From f0dce340f96611d2727b258a68c293fe7c5e3e07 Mon Sep 17 00:00:00 2001 From: antonm Date: Sat, 1 Feb 2025 23:52:13 -0500 Subject: [PATCH 4/6] fixed typo --- inst/tests/tests.Rraw | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4b325cd0a..ea7075e65 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21068,7 +21068,7 @@ test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE }) # j expressions that modify a data.table by reference, (#6768) inner=function(dt){dt[,b:=4:6]} outer=function(dt){inner(dt); return(7:9)} -foo=function(dt){dt[b:=4:6];return(7:9)} +foo=function(dt){dt[,b:=4:6];return(7:9)} DT = data.table(a=1:3) test(2306, DT[,c:=outer(DT)], data.table(a=1:3, b=4:6, c=7:9)) test(2306.1, DT[,c:=foo(DT)], data.table(a=1:3, b=4:6, c=7:9)) From e757125c6c42d429d4fe47db9e707c593e264897 Mon Sep 17 00:00:00 2001 From: antonm Date: Sun, 2 Feb 2025 00:03:27 -0500 Subject: [PATCH 5/6] Removed trailing whitespace --- R/data.table.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/data.table.R b/R/data.table.R index 1e5edef1a..7bc8b9b5a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1178,7 +1178,7 @@ replace_dot_alias = function(e) { return(invisible(x)) } } else { - # Adding new column(s). Allocation for columns and recalculation of target cols moved after the jval = eval(jsub) + # Adding new column(s). Allocation for columns and recalculation of target cols moved after the jval = eval(jsub) # in case of error or by-reference modifications to the DT newnames=setdiff(lhs, names_x) m[is.na(m)] = ncol(x)+seq_len(length(newnames)) From 4e05d8da234031e97cfcd7aa98989f83633f8c2d Mon Sep 17 00:00:00 2001 From: antonm Date: Sun, 2 Feb 2025 00:35:30 -0500 Subject: [PATCH 6/6] Removed redundant empty rows check --- R/data.table.R | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 7bc8b9b5a..68243cb1c 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1368,23 +1368,6 @@ replace_dot_alias = function(e) { # updates by reference to existing columns cols = as.integer(m) newnames=NULL - if (identical(irows, integer())) { - # Empty integer() means no rows e.g. logical i with only FALSE and NA - # got converted to empty integer() by the which() above - # Short circuit and do-nothing since columns already exist. If some don't - # exist then for consistency with cases where irows is non-empty, we need to create - # them of the right type and populate with NA. Which will happen via the regular - # alternative branches below, to cover #759. - # We need this short circuit at all just for convenience. Otherwise users may need to - # fix errors in their RHS when called on empty edge cases, even when the result won't be - # used anyway (so it would be annoying to have to fix it.) - if (verbose) { - catf("No rows match i. No new columns to add so not evaluating RHS of :=\nAssigning to 0 row subset of %d rows\n", nrow(x)) - } - .Call(Cassign, x, irows, NULL, NULL, NULL) # only purpose is to write 0 to .Last.updated - .global$print = address(x) - return(invisible(x)) - } }else{ # Adding new column(s). newnames=setdiff(lhs, names_x)