diff --git a/NEWS.md b/NEWS.md index fa5fa3e79b..0b762ca3db 100644 --- a/NEWS.md +++ b/NEWS.md @@ -162,6 +162,8 @@ 23. Added a `data.table` method for `utils::edit` to ensure a `data.table` is returned, for convenience, [#593](https://github.com/Rdatatable/data.table/issues/593). +24. More efficient optimization of many columns in `j` (e.g. from `.SD`), [#1470](https://github.com/Rdatatable/data.table/issues/1470). Thanks @Jorges1000 for the report. + #### BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. diff --git a/R/data.table.R b/R/data.table.R index 6cc4b9cde9..43541e9afb 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1611,11 +1611,11 @@ replace_order = function(isub, verbose, env) { } if (verbose) { if (!identical(oldjsub, jsub)) - cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L),"'\n",sep="") + cat("lapply optimization changed j from '",deparse(oldjsub),"' to '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") else - cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L),"'\n",sep="") + cat("lapply optimization is on, j unchanged as '",deparse(jsub,width.cutoff=200L, nlines=1L),"'\n",sep="") } - dotN = function(x) is.name(x) && x == ".N" # For #5760 + dotN = function(x) is.name(x) && x==".N" # For #5760. TODO: Rprof() showed dotN() may be the culprit if iterated (#1470)?; avoid the == which converts each x to character? # FR #971, GForce kicks in on all subsets, no joins yet. Although joins could work with # nomatch=0L even now.. but not switching it on yet, will deal it separately. if (getOption("datatable.optimize")>=2 && !is.data.table(i) && !byjoin && length(f__) && !length(lhs)) { @@ -1643,7 +1643,9 @@ replace_order = function(isub, verbose, env) { } if (jsub[[1L]]=="list") { GForce = TRUE - for (ii in seq_along(jsub)[-1L]) if (!.ok(jsub[[ii]])) GForce = FALSE + for (ii in seq.int(from=2L, length.out=length(jsub)-1L)) { + if (!.ok(jsub[[ii]])) {GForce = FALSE; break} + } } else GForce = .ok(jsub) if (GForce) { if (jsub[[1L]]=="list") @@ -1665,12 +1667,15 @@ replace_order = function(isub, verbose, env) { nomeanopt=FALSE # to be set by .optmean() using <<- inside it oldjsub = jsub if (jsub[[1L]]=="list") { - for (ii in seq_along(jsub)[-1L]) { - this_jsub = jsub[[ii]] - if (dotN(this_jsub)) next; # For #5760 - # Addressing #1369, #2949 and #1974. Added is.symbol() check to handle cases where expanded function definition is used instead of function names. #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table. - if (is.call(this_jsub) && is.symbol(this_jsub[[1L]]) && this_jsub[[1L]]=="mean") - jsub[[ii]] = .optmean(this_jsub) + # Addressing #1369, #2949 and #1974. This used to be 30s (vs 0.5s) with 30K elements items in j, #1470. Could have been dotN() and/or the for-looped if() + todo = sapply(jsub, function(x) { + is.call(x) && is.symbol(x[[1L]]) && x[[1L]]=="mean" + # jsub[[1]]=="list" so the first item will always be FALSE + # is.symbol() for when expanded function definition is used instead of function names; #1369 results in (function(x) sum(x)) as jsub[[.]] from dcast.data.table + }) + if (any(todo)) { + w = which(todo) + jsub[w] = lapply(jsub[w], .optmean) } } else if (jsub[[1L]]=="mean") { jsub = .optmean(jsub)