diff --git a/NEWS.md b/NEWS.md index b477adf93b..b639b2aedf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -192,24 +192,24 @@ ```R set.seed(108) x = rnorm(1e6); n = 1e3 - rollfun = function(x, n, FUN) { - ans = rep(NA_real_, nx<-length(x)) + base_rollapply = function(x, n, FUN) { + nx = length(x) + ans = rep(NA_real_, nx) for (i in n:nx) ans[i] = FUN(x[(i-n+1):i]) ans } - system.time(ans1<-rollfun(x, n, mean)) - system.time(ans2<-zoo::rollapplyr(x, n, function(x) mean(x), fill=NA)) - system.time(ans3<-zoo::rollmeanr(x, n, fill=NA)) - system.time(ans4<-frollapply(x, n, mean)) - system.time(ans5<-frollmean(x, n)) - sapply(list(ans2,ans3,ans4,ans5), all.equal, ans1) + system.time(base_rollapply(x, n, mean)) + system.time(zoo::rollapplyr(x, n, function(x) mean(x), fill=NA)) + system.time(zoo::rollmeanr(x, n, fill=NA)) + system.time(frollapply(x, n, mean)) + system.time(frollmean(x, n)) ### fun mean sum median - # base rollfun 8.815 5.151 60.175 + # base_rollapply 8.815 5.151 60.175 # zoo::rollapply 34.373 27.837 88.552 - # zoo::roll[fun] 0.215 0.185 NA + # zoo::roll[fun] 0.215 0.185 NA ## median not fully supported # frollapply 5.404 1.419 56.475 - # froll[fun] 0.003 0.002 NA + # froll[fun] 0.003 0.002 NA ## median not yet supported ``` 28. `setnames()` now accepts functions in `old=` and `new=`, [#3703](https://github.com/Rdatatable/data.table/issues/3703). Thanks @smingerson for the feature request and @shrektan for the PR. @@ -219,11 +219,13 @@ setnames(DT, toupper) names(DT) # [1] "A" "B" "C" - setnames(DT, 2:3, tolower) + setnames(DT, c(1,3), tolower) names(DT) - # [1] "A" "b" "c" + # [1] "a" "B" "c" ``` +29. `:=` and `set()` now use zero-copy type coercion. Accordingly, `DT[..., integerColumn:=0]` and `set(DT,i,j,0)` no longer warn about the `0` ('numeric') needing to be `0L` ('integer') because there is no longer any time or space used for this coercion. The old long warning was off-putting to new users ("what and why L?"), whereas advanced users appreciated the old warning so they could avoid the coercion. Although the time and space for one coercion in a single call is unmeasurably small, when placed in a loop the small overhead of any allocation on R's heap could start to become noticeable (more so for `set()` whose purpose is low-overhead looping). Further, when assigning a value across columns of varying types, it could be inconvenient to supply the correct type for every column. Hence, zero-copy coercion was introduced to satisfy all these requirements. A warning is still issued, as before, when fractional data is discarded; e.g. when 3.14 is assigned to an integer column. Zero-copy coercion applies to length>1 vectors as well as length-1 vectors. + ## BUG FIXES 1. `first`, `last`, `head` and `tail` by group no longer error in some cases, [#2030](https://github.com/Rdatatable/data.table/issues/2030) [#3462](https://github.com/Rdatatable/data.table/issues/3462). Thanks to @franknarf1 for reporting. @@ -327,6 +329,8 @@ 44. Grouping could create a `malformed factor` and/or segfault when the factors returned by each group did not have identical levels, [#2199](https://github.com/Rdatatable/data.table/issues/2199) and [#2522](https://github.com/Rdatatable/data.table/issues/2522). Thanks to Václav Hausenblas, @franknarf1, @ben519, and @Henrik-P for reporting. +45. `rbindlist` (and printing a `data.table` with over 100 rows because that uses `rbindlist(head, tail)`) could error with `malformed factor` for unordered factor columns containing a used `NA_character_` level, [#3915](https://github.com/Rdatatable/data.table/issues/3915). This is an unusual input for unordered factors because NA_integer_ is recommended by default in R. Thanks to @sindribaldur for reporting. + ## NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/bmerge.R b/R/bmerge.R index abb7aec2c8..3d6ab028f3 100644 --- a/R/bmerge.R +++ b/R/bmerge.R @@ -75,12 +75,12 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos if (xclass=="character" || iclass=="character" || xclass=="logical" || iclass=="logical" || xclass=="factor" || iclass=="factor") { - if (anyNA(i[[ic]]) && all(is.na(i[[ic]]))) { # TODO: allNA function in C + if (anyNA(i[[ic]]) && allNA(i[[ic]])) { if (verbose) cat("Coercing all-NA i.",names(i)[ic]," (",iclass,") to type ",xclass," to match type of x.",names(x)[xc],".\n",sep="") set(i, j=ic, value=match.fun(paste0("as.", xclass))(i[[ic]])) next } - else if (anyNA(x[[xc]]) && all(is.na(x[[xc]]))) { + else if (anyNA(x[[xc]]) && allNA(x[[xc]])) { if (verbose) cat("Coercing all-NA x.",names(x)[xc]," (",xclass,") to type ",iclass," to match type of i.",names(i)[ic],".\n",sep="") set(x, j=xc, value=match.fun(paste0("as.", iclass))(x[[xc]])) next diff --git a/R/fread.R b/R/fread.R index 4bc4c4dc00..d7ea4de61b 100644 --- a/R/fread.R +++ b/R/fread.R @@ -118,7 +118,7 @@ yaml=FALSE, autostart=NA, tmpdir=tempdir()) } if (!missing(autostart)) warning("'autostart' is now deprecated and ignored. Consider skip='string' or skip=n"); if (is.logical(colClasses)) { - if (!all(is.na(colClasses))) stop("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. See ?fread.") + if (!allNA(colClasses)) stop("colClasses is type 'logical' which is ok if all NA but it has some TRUE or FALSE values in it which is not allowed. Please consider the drop= or select= argument instead. See ?fread.") colClasses = NULL } if (!is.null(colClasses) && is.atomic(colClasses)) { diff --git a/R/test.data.table.R b/R/test.data.table.R index 47ded3002f..59c622c433 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -157,7 +157,7 @@ test.data.table = function(verbose=FALSE, pkg="pkg", silent=FALSE, with.other.pa # inittime=PS_rss=GC_used=GC_max_used=NULL # m = fread("memtest.csv")[inittime==.inittime] # if (nrow(m)) { - # ps_na = all(is.na(m[["PS_rss"]])) # OS with no 'ps -o rss R' support + # ps_na = allNA(m[["PS_rss"]]) # OS with no 'ps -o rss R' support # grDevices::png("memtest.png") # p = graphics::par(mfrow=c(if (ps_na) 2 else 3, 2)) # if (!ps_na) { @@ -282,9 +282,9 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,output=NULL,notOutput=NULL, if (!missing(error) && !missing(y)) stop("Test ",numStr," is invalid: when error= is provided it does not make sense to pass y as well") # nocov - string_match = function(x, y) { - length(grep(x,y,fixed=TRUE)) || # try treating x as literal first; useful for most messages containing ()[]+ characters - length(tryCatch(grep(x,y), error=function(e)NULL)) # otherwise try x as regexp + string_match = function(x, y, ignore.case=FALSE) { + length(grep(x, y, fixed=TRUE)) || # try treating x as literal first; useful for most messages containing ()[]+ characters + length(tryCatch(grep(x, y, ignore.case=ignore.case), error=function(e)NULL)) # otherwise try x as regexp } xsub = substitute(x) @@ -364,10 +364,10 @@ test = function(num,x,y=TRUE,error=NULL,warning=NULL,output=NULL,notOutput=NULL, fail = TRUE # nocov end } - if (length(notOutput) && string_match(notOutput, out)) { + if (length(notOutput) && string_match(notOutput, out, ignore.case=TRUE)) { # nocov start cat("Test",numStr,"produced output but should not have:\n") - cat("Expected absent: <<",gsub("\n","\\\\n",notOutput),">>\n",sep="") + cat("Expected absent (case insensitive): <<",gsub("\n","\\\\n",notOutput),">>\n",sep="") cat("Observed: <<",gsub("\n","\\\\n",out),">>\n",sep="") fail = TRUE # nocov end diff --git a/R/utils.R b/R/utils.R index 0426a06654..de35e709c0 100644 --- a/R/utils.R +++ b/R/utils.R @@ -12,6 +12,7 @@ if (base::getRversion() < "3.5.0") { } isTRUEorNA = function(x) is.logical(x) && length(x)==1L && (is.na(x) || x) isTRUEorFALSE = function(x) is.logical(x) && length(x)==1L && !is.na(x) +allNA = function(x) .Call(C_allNAR, x) if (base::getRversion() < "3.2.0") { # Apr 2015 isNamespaceLoaded = function(x) x %chin% loadedNamespaces() diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index b8739c9560..cdbe482c91 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -17,6 +17,7 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) { # other than here inside this branch. all.equal.data.table = data.table:::all.equal.data.table + allNA = data.table:::allNA any_na = data.table:::any_na as.data.table.array = data.table:::as.data.table.array as.IDate.default = data.table:::as.IDate.default @@ -869,17 +870,17 @@ test(299.03, truelength(DT)>length(DT)) # the := over-allocated, by 100 by def # FR #2551 - old 299.3 and 299.5 are changed to include length(RHS) > 1 to issue the warning DT[,c:=rep(42L,.N)] # plonk test(299.04, DT, data.table(a=1:3, c=42L)) -test(299.05, DT[2:3,c:=c(42, 42)], data.table(a=1:3,c=42L), warning="Coerced double RHS to integer.*column 2 named 'c'.*RHS.*no fractions.*more efficiently.*integer.*Consider.*L") +test(299.05, DT[2:3,c:=c(43, 44)], data.table(a=1:3,c=42:44)) # FR #2551 - length(RHS) = 1 - no warning for type conversion -test(299.06, DT[2,c:=42], data.table(a=1:3,c=42L)) +test(299.06, DT[2,c:=42], data.table(a=1:3,c=INT(42,42,44))) # also see tests 302 and 303. (Ok, new test file for fast assign would be tidier). test(299.07, DT[,c:=rep(FALSE,nrow(DT))], data.table(a=1:3,c=FALSE)) # replace c column with logical -test(299.08, DT[2:3,c:=c(42,0)], data.table(a=1:3,c=c(FALSE,TRUE,FALSE)), warning="Coerced double RHS to logical.*column 2 named 'c'.*If the target column's type logical is correct") +test(299.08, DT[2:3,c:=c(3.14,0)], data.table(a=1:3, c=c(FALSE,TRUE,FALSE)), warning="3.14.*double.*at RHS position 1 taken as TRUE.*column 2 named 'c'.*logical") +test(299.09, DT[2:3,c:=c(0,1)], data.table(a=1:3,c=c(FALSE,FALSE,TRUE))) # no warning # FR #2551 is now changed to fit in / fix bug #5442. Stricter warnings are in place now. Check tests 1294.1-34 below. -test(299.09, DT[2,c:=42], data.table(a=1:3,c=c(FALSE,TRUE,FALSE)), warning="Coerced double RHS to logical to match") -test(299.11, DT[2,c:=42L], data.table(a=1:3,c=c(FALSE,TRUE,FALSE)), warning="Coerced integer RHS to logical to match") -test(299.12, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=FALSE), warning="Coerced integer RHS to logical to match the type of the target column.*If the target column's type logical is correct") - +test(299.10, DT[2,c:=42], data.table(a=1:3, c=c(FALSE,TRUE,TRUE)), warning="42.0.*double.*at RHS position 1.*TRUE") +test(299.11, DT[1,c:=42L], data.table(a=1:3, c=TRUE), warning="42.*integer.*at RHS position 1.*TRUE.*column 2 named 'c'.*logical") +test(299.12, DT[2:3,c:=c(0L, 0L)], data.table(a=1:3,c=c(TRUE,FALSE,FALSE))) # Test bug fix #1468, combining i and by. DT = data.table(a=1:3,b=1:9,v=1:9,key="a,b") @@ -969,8 +970,7 @@ test(335, DT[,2:1]<-NULL, error="Attempt to assign to column") DT = data.table(a=1:2, b=1:6) test(336, DT[,z:=a/b], data.table(a=1:2,b=1:6,z=(1:2)/(1:6))) -test(337, DT[3:4,z:=a*b], data.table(a=1:2,b=1:6,z=c(1,1,3,8,1/5,2/6)), warning="Coerced integer RHS to double to match") - +test(337, DT[3:4,z:=a*b], data.table(a=1:2,b=1:6,z=c(1,1,3,8,1/5,2/6))) # test eval of LHS of := (using with=FALSE gives a warning here from v1.9.3) DT = data.table(a=1:3, b=4:6) @@ -1024,8 +1024,7 @@ test(355, DT[11:2010,f:=newlevels], data.table(f=factor(c(rep("000",10),newlevel DT = data.table(f=c("a","b"),x=1:4) # Test coercing factor to character column test(355.5, DT[3,f:=factor("foo")], data.table(f=c("a","b","foo","b"),x=1:4)) -test(355.6, DT[4,f:=factor("bar"),verbose=TRUE], data.table(f=c("a","b","foo","bar"),x=1:4), output="Coerced factor RHS to character to match the column") - +test(355.6, DT[4,f:=factor("bar"),verbose=TRUE], data.table(f=c("a","b","foo","bar"),x=1:4), notOutput="coerce") # See datatable-help post and NEWS item for 1.6.7 DT = data.table(X=factor(letters[1:10]), Y=1:10) @@ -1215,19 +1214,30 @@ test(422, length(DT)==4) test(423, truelength(DT), 1028L) # Test crash bug fixed, #1656, introduced with the 1.7.0 feature -DT <- data.table(a = factor(c("A", "Z")), b = 1:4) -DT[1,1] <- "Z" -test(424, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) -test(425, DT[1,1] <- 1, 1, warning="Coerced 'double' RHS to 'integer'") +DT = data.table(a = factor(c("A", "Z")), b = 1:4) +test(424.1, DT[1,1] <- "Z", "Z") +test(424.2, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) +test(425, DT[1,1] <- 1, 1) test(426, DT, data.table(a=factor(c("A","Z")),b=1:4)) -DT[1,1] <- 2L -test(427, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) -DT[1,a:="A"] -test(428, DT, data.table(a=factor(c("A","Z","A","Z")),b=1:4)) -DT[1,a:=2L] -test(429, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) -test(430, DT[1,1]<- 3L, 3L, warning="RHS contains 3 which is outside the levels range.*1,2.*of column 1, NAs generated") -test(431, DT[1,1:=4L], data.table(a=factor(c(NA,"Z","A","Z")),b=1:4), warning="RHS contains 4 which is outside the levels range.*1,2.*of column 1, NAs generated") +test(427.1, DT[1,1] <- 2L, 2L) +test(427.2, DT, data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) +test(428, DT[1,a:="A"], data.table(a=factor(c("A","Z","A","Z")),b=1:4)) +test(429, DT[1,a:=2L], data.table(a=factor(c("Z","Z","A","Z")),b=1:4)) + +test(430.1, DT[1,1]<-3, error="Assigning factor numbers to column 1 named 'a'. But 3.0.* is outside the level range [[]1,2[]], or is not a whole number") +test(430.2, DT[1,1]<-1.3, error="Assigning factor numbers to column 1 named 'a'. But 1.3.* is outside the level range [[]1,2[]], or is not a whole number") +test(430.3, DT[1,1:=4L], error="Assigning factor numbers.*1 named 'a'. But 4 is outside.*1,2.*") +test(430.4, DT[1,a:=TRUE], error="Cannot assign 'logical' to 'factor'. Factor columns can be assigned") +test(430.5, DT[2,b:=factor("A")], error="Cannot assign 'factor' to 'integer'. Factors can only be assigned to") + +DT = data.table(a=factor(c("A","B","A","C","B")), b=1:5) +test(431.1, DT[1,1:=NA], data.table(a=factor(c(NA,"B","A","C","B")), b=1:5)) +test(431.2, DT[2,1:=NA_integer_], data.table(a=factor(c(NA,NA,"A","C","B"), levels=LETTERS[1:3]), b=1:5)) +test(431.3, DT[3,1:=NA_real_], data.table(a=factor(c(NA,NA,NA,"C","B"), levels=LETTERS[1:3]), b=1:5)) +test(431.4, DT[4,1:=NA_character_], data.table(a=factor(c(NA,NA,NA,NA,"B"), levels=LETTERS[1:3]), b=1:5)) +if (test_bit64) { + test(431.5, DT[5,1:=as.integer64(NA)], data.table(a=factor(c(NA,NA,NA,NA,NA), levels=LETTERS[1:3]), b=1:5)) +} old = getOption("datatable.alloccol") # Test that unsetting datatable.alloccol is caught, #2014 options(datatable.alloccol=NULL) # In this =NULL case, options() in R 3.0.0 returned TRUE rather than the old value. This R bug was fixed in R 3.1.1. @@ -4846,38 +4856,43 @@ test(1293, ans1, ans2) dt <- data.table(a=1:3, b=c(7,8,9), c=c(TRUE, NA, FALSE), d=as.list(4:6), e=c("a", "b", "c")) test(1294.01, dt[, a := 1]$a, rep(1L, 3L)) -test(1294.02, dt[, a := 1.5]$a, rep(1L, 3L), warning="Coerced double RHS to integer.*column 1 named 'a'.*fractions which have been lost; e.g. item 1 with value 1.5.* truncated to 1.") +test(1294.02, dt[, a := 1.5]$a, rep(1L, 3L), warning="1.5.*double.*position 1 truncated.*column 1 named 'a'.*integer") test(1294.03, dt[, a := NA]$a, rep(NA_integer_, 3L)) test(1294.04, dt[, a := "a"]$a, rep(NA_integer_, 3L), warning=c("NAs introduced by coercion", - "Coerced character RHS to integer.*create the RHS as type integer.*with as.integer.. to avoid this warning.*DT., `a`:=as.character.`a`.*")) -test(1294.05, dt[, a := list(list(1))]$a, rep(1L, 3L), warning="Coerced list RHS to integer to match.*column 1 named 'a'") + "Coerced 'character' RHS to 'integer'.*column 1 named 'a'")) +test(1294.05, dt[, a := list(list(1))]$a, error="Cannot coerce 'list' RHS to 'integer' to match.*column 1 named 'a'") test(1294.06, dt[, a := list(1L)]$a, rep(1L, 3L)) test(1294.07, dt[, a := list(1)]$a, rep(1L, 3L)) -test(1294.08, dt[, a := TRUE]$a, rep(1L, 3L), warning="Coerced logical RHS to integer") +test(1294.08, dt[, a := TRUE]$a, rep(1L, 3L)) test(1294.09, dt[, b := 1L]$b, rep(1,3)) test(1294.10, dt[, b := NA]$b, rep(NA_real_,3)) test(1294.11, dt[, b := "bla"]$b, rep(NA_real_, 3), warning=c("NAs introduced by coercion", - "Coerced character RHS to double to match.*column 2 named 'b'.*DT[, `b`:=as.character(`b`)]")) -test(1294.12, dt[, b := list(list(1))]$b, rep(1,3), warning="Coerced list RHS to double") -test(1294.13, dt[, b := TRUE]$b, rep(1,3), warning="Coerced logical RHS to double") + "Coerced 'character' RHS to 'double' to match.*column 2 named 'b'")) +test(1294.12, dt[, b := list(list(1))]$b, error="Cannot coerce 'list' RHS to 'double' to match.*column 2 named 'b'") +test(1294.13, dt[, b := TRUE]$b, rep(1,3)) test(1294.14, dt[, b := list(1)]$b, rep(1,3)) -test(1294.15, dt[, c := 1]$c, rep(TRUE, 3), warning="Coerced double RHS to logical") -test(1294.16, dt[, c := 1L]$c, rep(TRUE, 3), warning="Coerced integer RHS to logical") +test(1294.15, dt[, c := 1]$c, rep(TRUE, 3)) +test(1294.16, dt[, c := 1L]$c, rep(TRUE, 3)) test(1294.17, dt[, c := NA]$c, rep(NA, 3)) -test(1294.18, dt[, c := list(1)]$c, rep(TRUE, 3), warning="Coerced double RHS to logical") -test(1294.19, dt[, c := list(list(1))]$c, rep(TRUE, 3), warning="Coerced list RHS to logical") -test(1294.20, dt[, c := "bla"]$c, rep(NA, 3), warning="Coerced character RHS to logical") +test(1294.18, dt[, c := list(1)]$c, rep(TRUE, 3)) +test(1294.19, dt[, c := list(list(1))]$c, error="Cannot coerce 'list' RHS to 'logical' to match.*column 3 named 'c'") +test(1294.20, dt[, c := "bla"]$c, rep(NA, 3), warning="Coerced 'character' RHS to 'logical'") test(1294.21, dt[, d := 1]$d, rep(list(1), 3)) test(1294.22, dt[, d := 1L]$d, rep(list(1L), 3)) test(1294.23, dt[, d := TRUE]$d, rep(list(TRUE), 3)) test(1294.24, dt[, d := "bla"]$d, rep(list("bla"), 3)) test(1294.25, dt[, d := list(list(1))]$d, rep(list(1), 3)) -test(1294.26, dt[, e := 1]$e, rep("1", 3), warning="Coerced double RHS to character") -test(1294.27, dt[, e := 1L]$e, rep("1", 3), warning="Coerced integer RHS to character") -test(1294.28, dt[, e := TRUE]$e, rep("TRUE", 3), warning="Coerced logical RHS to character") -test(1294.29, dt[, e := list(list(1))]$e, rep("1", 3), warning="Coerced list RHS to character") +test(1294.26, dt[, e := 1]$e, rep("1", 3)) +test(1294.27, dt[, e := 1L]$e, rep("1", 3)) +test(1294.28, dt[, e := TRUE]$e, rep("TRUE", 3)) +### +### TEMPORARILY OFF IN DEV. TO BE ADDRESSED BEFORE v1.12.4 RELEASE +### REVISIT IN #3909 or #3626 +### +### test(1294.29, dt[, e := list(list(1))]$e, rep("1", 3)) +### test(1294.30, dt[, e := "bla"]$e, rep("bla", 3)) test(1294.31, dt[, e := list("bla2")]$e, rep("bla2", 3)) @@ -9815,7 +9830,7 @@ test(1674, forderv(c(2147483645L, 2147483646L, 2147483647L, 2147483644L), order= A = data.table(foo = c(1, 2, 3), bar = c(4, 5, 6)) A[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)] B = data.table(foo = c(1, 2, 3, 4, 5, 6), bar = structure(c(3L, 3L, 3L, 1L, 2L, NA), .Label=c("Boop","Beep",NA), class="factor")) -test(1675.1, as.integer(B[A, bar := i.bar, on="foo"]$bar), c(1:3,1:2,NA)) +test(1675.1, as.integer(B[A, bar := i.bar, on="foo"]$bar), c(1:2,NA,1:2,NA)) # remove the NA level is change in v1.12.4 A = data.table(foo = c(1, 2, 3), bar = c(4, 5, 6)) B = data.table(foo = c(1, 2, 3, 4, 5, 6), bar = c(NA, NA, NA, 4, 5, 6)) A[, bar := factor(bar, levels = c(4, 5), labels = c("Boop", "Beep"), exclude = 6)] @@ -12927,9 +12942,9 @@ test(1944.6, DT[flag == 1, sum(x), keyby = group], data.table(group=1:4, V1=INT( # assigning an int greater than length(levels) corruption of int, #2984 DT <- data.table(a = factor(c("A", "Z")), b = 1:4) c <- 3L -test(1945.1, DT[1, a:=c][1,a], factor(NA, levels=c("A","Z")), warning="RHS.*outside the levels range.*NAs generated") +test(1945.1, DT[1, a:=c], error=err<-"Assigning factor numbers to column 1 named 'a'. But 3 is outside the level range [1,2]") test(1945.2, c, 3L) -test(1945.3, DT[2,1] <- c, 3L, warning="RHS.*outside the levels range.*NAs generated") +test(1945.3, DT[2,1] <- c, error=err) test(1945.4, c, 3L) # subset a data.table containing an altrep derived from ]<-, ]]<- etc, #3051 @@ -14113,16 +14128,58 @@ test(2004.6, chmatch(c("a","b"), c("b",x2)), c(NA,1L)) # the second fallb test(2004.7, c("a","b") %in% c("b",x1,x2), c(FALSE, TRUE)) # the second fallback might be redundnant though; see comments in chmatch.c # more coverage ... -test(2005.1, truelength(NULL), 0L) -DT = data.table(a=1:3, b=4:6) -test(2005.2, set(DT, 4L, "b", NA), error="i[1] is 4 which is out of range [1,nrow=3]") -test(2005.3, set(DT, 3L, 8i, NA), error="j is type 'complex'. Must be integer, character, or numeric is coerced with warning.") -test(2005.4, set(DT, 1L, 2L, expression(x+2)), error="cannot be coerced to.*integer") # R's error message same as returned by as.integer(expression(x+2)) +test(2005.01, truelength(NULL), 0L) +DT = data.table(a=1:3, b=4:6, c=as.raw(7:9), d=as.logical(c(1,0,1)), e=pi*1:3, f=as.complex(10:12)) +test(2005.02, set(DT, 4L, "b", NA), error="i[1] is 4 which is out of range [1,nrow=3]") +test(2005.03, set(DT, 3L, 8i, NA), error="j is type 'complex'. Must be integer, character, or numeric is coerced with warning.") +test(2005.04, set(DT, 1L, 2L, expression(x+2)), error="type 'expression' cannot be coerced to 'integer'") # similar to R's error for as.integer(expression(x+2)) DT[,foo:=factor(c("a","b","c"))] -test(2005.5, DT[2, foo:=8i], error="Can't assign to column 'foo' (type 'factor') a value of type 'complex' (not character, factor, integer or numeric)") -test(2005.6, DT[2, a:=9, verbose=TRUE], output="Coerced length-1 RHS from double to integer to match column's type. No precision was lost. If this") -test(2005.7, DT[2, a:=NA, verbose=TRUE], output="Coerced length-1 RHS from logical to integer to match column's type. If this") -test(2005.8, DT[2, a:=9.9]$a, INT(1,9,3), warning="Coerced double RHS to integer.*One or more RHS values contain fractions which have been lost.*9.9.*has been truncated to 9") +test(2005.05, DT[2, foo:=8i], error="Can't assign to column 'foo' (type 'factor') a value of type 'complex' (not character, factor, integer or numeric)") +test(2005.06, DT[2, a:=9, verbose=TRUE], notOutput="Coerced") +test(2005.07, DT[2, a:=NA, verbose=TRUE], notOutput="Coerced") +test(2005.08, DT[2, a:=9.9]$a, INT(1,9,3), warning="9.9.*double.*position 1 truncated.*column 1 named 'a'.*integer") +test(2005.09, set(DT, 1L, "c", expression(x+2)), error="type 'expression' cannot be coerced to 'raw'") +test(2005.10, set(DT, 1L, "d", expression(x+2)), error="type 'expression' cannot be coerced to 'logical'") +test(2005.11, set(DT, 1L, "e", expression(x+2)), error="type 'expression' cannot be coerced to 'double'") +test(2005.12, set(DT, 1L, "f", expression(x+2)), error="type 'expression' cannot be coerced to 'complex'") +test(2005.30, DT[2:3,c:=c(TRUE,FALSE), verbose=TRUE]$c, as.raw(INT(7,1,0)), + output="Zero-copy coerce when assigning 'logical' to 'raw' column 3 named 'c'") +test(2005.31, set(DT,1L,"c",NA)$c, as.raw(INT(0,1,0))) +test(2005.32, set(DT,1:2,"c",INT(-1,255))$c, as.raw(INT(0,255,0)), + warning="-1.*integer.*position 1 taken as 0 when assigning to column 3 named 'c'.*raw") +test(2005.33, DT[2:3,c:=INT(NA,256)]$c, as.raw(INT(0,0,0)), + warning="-2147483648.*integer.*position 1 taken as 0 when assigning to column 3 named 'c'.*raw") +test(2005.34, set(DT,2:3,"c",c(NA,3.14))$c, as.raw(INT(0,0,3)), + warning="[nN].*double.*position 1 either truncated.*or taken as 0 when assigning to column 3 named 'c'.*raw") # 'nan' for me but might vary hence [nN] +test(2005.35, DT[1:2,c:=c(Inf,0.78)]$c, as.raw(INT(0,0,3)), + warning="[iI].*double.*position 1 either truncated.*or taken as 0 when assigning to column 3 named 'c'.*raw'") # 'inf' for me but might vary hence [iI] +test(2005.36, DT[1:2,d:=as.raw(c(0,255))]$d, c(FALSE,TRUE,TRUE), + warning="255.*raw.*position 2 taken as TRUE when assigning to column 4 named 'd'.*logical") +test(2005.37, DT[2:3,b:=as.raw(c(0,255))]$b, INT(4,0,255)) +test(2005.38, DT[1:2,e:=as.raw(c(0,255))]$e, c(0,255,pi*3)) +test(2005.39, DT[c(1,3,2), c:=as.raw(c(0,100,255)), verbose=TRUE]$c, as.raw(c(0,255,100)), + notOutput="coerce") +test(2005.40, DT[c(3,1), f:=as.raw(c(20,42))]$f, c(42+0i, 11, 20+0i)) +test(2005.41, DT[2:3, f:=c(NA,FALSE)]$f, c(42+0i, NA, 0+0i)) +test(2005.42, DT[c(1,3), f:=c(-42L,NA)]$f, c(-42+0i, NA, NA)) +test(2005.43, DT[3:2, f:=c(pi,-Inf)]$f, c(-42+0i, -Inf+0i, pi+0i)) +if (test_bit64) { + DT[,g:=as.integer64(c(-9,0,NA))] + test(2005.60, set(DT, 1L, "g", expression(x+2)), error="type 'expression' cannot be coerced to 'integer64'") + test(2005.61, DT[1:2,e:=as.integer64(c(NA,-200))]$e, c(NA_real_, -200, pi*3)) + test(2005.62, DT[2:3, d:=as.integer64(c(2,NA))]$d, c(FALSE,TRUE,NA), + warning="2.*integer64.*position 1 taken as TRUE when assigning to column 4 named 'd'.*logical") + DT[,b:=4:6] + test(2005.63, DT[2:3, b:=as.integer64(c("2147483647","2147483648"))]$b, INT(4,2147483647,NA), + warning="2147483648.*integer64.*position 2 out-of-range [(]NA[)] when assigning to column 2 named 'b'.*integer") + test(2005.64, DT[2:3, b:=as.integer64(c("-2147483648","-2147483647"))]$b, INT(4,NA,-2147483647), + warning="-2147483648.*integer64.*position 1 out-of-range [(]NA[)].*column 2 named 'b'.*integer") + test(2005.65, DT[c(2,1,3), c:=as.integer64(c(-1,255,256))]$c, as.raw(c(255,0,0)), + warning="-1.*integer64.*position 1 taken as 0 when assigning to column 3 named 'c'.*raw") + test(2005.66, DT[2:3, f:=as.integer64(c(NA,"2147483648"))]$f, as.complex(c(-42,NA,2147483648))) + DT[,h:=LETTERS[1:3]] + test(2005.67, DT[2:3, h:=as.integer64(1:2)], error="To assign integer64 to a character column, please use as.character.") +} # rbindlist raw type, #2819 test(2006.1, rbindlist(list(data.table(x = as.raw(1), y=as.raw(3)), data.table(x = as.raw(2))), fill=TRUE), data.table(x=as.raw(1:2), y=as.raw(c(3,0)))) @@ -14132,7 +14189,7 @@ test(2006.2, rbindlist(list(data.table(x = as.raw(1:2), y=as.raw(5:6)), data.tab if (test_bit64) { test(2007.1, rbindlist(list( list(a=as.integer64(1), b=3L), list(a=2L, b=4L) )), data.table(a=as.integer64(1:2), b=3:4)) test(2007.2, rbindlist(list( list(a=3.4, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6), - warning="Column 1 of item 1: coerced to integer64 but contains a non-integer value [(]3.40.* at position 1[)]; precision lost") + warning="Column 1 of item 1: 3.4.*double.*position 1 truncated.*precision lost.*when assigning to column 1 named 'a'.*integer64'") test(2007.3, rbindlist(list( list(a=3.0, b=5L), list(a=as.integer64(4), b=6L) )), data.table(a=as.integer64(3:4), b=5:6)) test(2007.4, rbindlist(list( list(b=5:6), list(a=as.integer64(4), b=7L)), fill=TRUE), data.table(b=5:7, a=as.integer64(c(NA,NA,4)))) # tests writeNA of integer64 test(2007.5, rbindlist(list( list(a=INT(1,NA,-2)), list(a=as.integer64(c(3,NA))) )), data.table(a=as.integer64(c(1,NA,-2,3,NA)))) # int NAs combined with int64 NA @@ -14243,7 +14300,15 @@ test(2014.4, class(fread(test_data_mult, keepLeadingZeros = TRUE)[[1]]), "charac test(2014.5, class(fread(test_data_mult, keepLeadingZeros = FALSE)[[1]]), "integer") # rbindlist should drop NA from levels of source factors, relied on by package emil -test(2015, levels(rbindlist( list( data.frame(a=factor("a",levels=c("a",NA),exclude=NULL)) ))$a), "a") # the NA level should not be retained +test(2015.1, levels(rbindlist( list( data.frame(a=factor("a",levels=c("a",NA),exclude=NULL)) ))$a), "a") # the NA level (unused in this case) should not be retained +# follow-up from #3915; since this was malformed factor (so not relied on); lets just drop the used NA level too in v1.12.4 for these regular (not ordered) factors +DT = data.table(V1 = factor(as.character(c(NA, 1:3, NA)), exclude = NULL)) +test(2015.2, list(levels(DT$V1), as.integer(DT$V1)), list(as.character(c(1:3,NA)), INT(4,1,2,3,4))) # the 4's are now moved to NA_integer_ by rbindlist +test(2015.3, unclass(rbindlist(list(DT), use.names=FALSE)$V1), setattr(INT(NA,1,2,3,NA), "levels", as.character(1:3))) +DT = data.table(V1 = factor(as.character(c(NA, 1:100, NA)), exclude = NULL)) +test(2015.4, print(DT), output="V1.*1:[ ]+.*2:[ ]+1.*101:[ ]+100.*102:[ ]+") +DT = data.table(V1 = factor(as.character(c(NA, 1:3, NA)), exclude = NULL)) +test(2015.5, print(DT), output="V1.*1:[ ]+.*2:[ ]+1.*4:[ ]+3.*5:[ ]+") # better save->load->set() message, #2996 DT = data.table(a=1:3) @@ -15390,11 +15455,11 @@ test(2066.05, DT[, v[2], by=id], data.table(id = 1:2, V1=c(2i, NA))) DT = data.table(A=1:5, B=-3i, C=2147483647L) test(2066.06, DT[, .(sum(B), mean(B)), by=A%%2L], data.table(A=1:0, V1=c(-9i, -6i), V2=-3i)) test(2066.07, DT[2:4, .(sum(B), mean(B)), by=A%%2L], data.table(A=0:1, V1=c(-6i, -3i), V2=-3i)) -DT[4, B:=NA] -test(2066.08, DT[, .(sum(B), mean(B)), by=A%%2L], data.table(A=1:0, V1=c(-9i, NA), V2=c(-3i, NA))) -test(2066.09, DT[2:4, .(sum(B), mean(B)), by=A%%2L], data.table(A=0:1, V1=c(NA, -3i), V2=c(NA, -3i))) -test(2066.10, DT[, .(sum(B, na.rm=TRUE), mean(B, na.rm=TRUE)), by=A%%2L], data.table(A=1:0, V1=c(-9i, -3i), V2=-3i)) -test(2066.11, DT[2:4, .(sum(B, na.rm=TRUE), mean(B, na.rm=TRUE)), by=A%%2L], data.table(A=0:1, V1=c(-3i, -3i), V2=-3i)) +test(2066.08, DT[4, B:=NA]$B, c(-3i,-3i,-3i,NA,-3i)) +test(2066.09, DT[, .(sum(B), mean(B)), by=A%%2L], data.table(A=1:0, V1=c(-9i, NA), V2=c(-3i, NA))) +test(2066.10, DT[2:4, .(sum(B), mean(B)), by=A%%2L], data.table(A=0:1, V1=c(NA, -3i), V2=c(NA, -3i))) +test(2066.11, DT[, .(sum(B, na.rm=TRUE), mean(B, na.rm=TRUE)), by=A%%2L], data.table(A=1:0, V1=c(-9i, -3i), V2=-3i)) +test(2066.12, DT[2:4, .(sum(B, na.rm=TRUE), mean(B, na.rm=TRUE)), by=A%%2L], data.table(A=0:1, V1=c(-3i, -3i), V2=-3i)) # Shift complex values, part of #3690 z = c(1:3) + c(3:1)*1i @@ -16166,6 +16231,48 @@ DT = data.table(x = rep(letters[c(3, 1, 2)], each = 2)) test(2114.7, DT[, `:=`(g=.GRP, f=factor(.GRP)), by = x], data.table(x=rep(c("c","a","b"),each=2), g=rep(1:3,each=2), f=factor(rep(as.character(1:3),each=2)))) +# extra tests from #996 for completeness; no warning no-alloc coerce here of 0 and 1 numerics +DT = data.table(a=1:4, b=c(FALSE, TRUE, NA, FALSE)) +test(2115.1, set(DT,3L,1L,0), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, NA, FALSE))) +test(2115.2, set(DT,3L,2L,0), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, FALSE, FALSE))) +test(2115.3, set(DT,3L,2L,-2L), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, TRUE, FALSE)), # see also test 299 + warning="-2.*integer.*position 1 taken as TRUE.*column 2 named 'b'.*logical") +test(2115.4, set(DT,4L,2L,3.14), data.table(a=INT(1,2,0,4), b=c(FALSE, TRUE, TRUE, TRUE)), + warning="3.14.*double.*position 1 taken as TRUE.*column 2 named 'b'.*logical") +DT = data.table(code=c("c","b","c","a"), val=10:13) +test(2115.5, DT[code=="c", val := val+1], data.table(code=c("c","b","c","a"), val=INT(11,11,13,13))) +DT = data.table(x=factor(LETTERS[1:3]), y=1:6) +test(2115.6, copy(DT)[4:6, x:=LETTERS[1:3]], DT) # identical(RHS, levels) + +# allNA(); an aside in PR#3909 +test(2116.01, !allNA(c("",NA))) +test(2116.02, allNA(NA_character_)) +test(2116.03, allNA(as.character(c(NA,NA)))) +test(2116.04, allNA(character())) # same as all(is.na(character())) +test(2116.05, !allNA(c(Inf,NA))) +test(2116.06, allNA(as.double(c(NA,NA)))) +test(2116.07, allNA(double())) # same as all(is.na(double())) +if (test_bit64) { + test(2116.08, !allNA(as.integer64(c(NA,0)))) + test(2116.09, allNA(as.integer64(c(NA,NA)))) + test(2116.10, allNA(integer64())) # same as all(is.na(integer64())) +} +test(2116.11, !allNA(as.raw(c(0,0)))) +test(2116.12, !allNA(as.raw(c(0,255)))) +test(2116.13, allNA(raw())) # same as all(is.na(raw())) +test(2116.14, allNA(NULL)) # same as all(is.na(NULL)) +test(2116.15, allNA(list())) # same as all(is.na(list())) +# turned off allNA list support for now to avoid accidentally using it internally where we did not intend; allNA not yet exported +# https://github.com/Rdatatable/data.table/pull/3909#discussion_r329065950 +test(2116.16, allNA(list(NA, NA_character_)), error="Unsupported type 'list' passed to allNA") # base R returns true +test(2116.17, allNA(list(NA, NA)), error="Unsupported type 'list' passed to allNA") # base R returns true +test(2116.18, allNA(list(NA, c(NA,NA))), error="Unsupported type 'list' passed to allNA") # base R returns false + +# don't create NA factor level when assigning to factor from character; bug in v1.12.2 noticed in dev part of PR#3909 +DT = data.table(a=factor(LETTERS[1:3])) +test(2117.1, levels(DT[2:3,a:=c("",NA_character_)]$a), c("A","B","C","")) +test(2117.2, DT[1,a:=NA_character_]$a, factor(c(NA,"",NA), levels=c("A","B","C",""))) + ################################### # Add new tests above this line # diff --git a/src/assign.c b/src/assign.c index 2c9d6e95f2..9cddba9fab 100644 --- a/src/assign.c +++ b/src/assign.c @@ -281,7 +281,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign R_len_t i, j, numToDo, targetlen, vlen, r, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; - SEXP targetcol, nullint, thisv, targetlevels, newcol, s, colnam, tmp, colorder, key, index, a, assignedNames, indexNames; + SEXP targetcol, nullint, s, colnam, tmp, colorder, key, index, a, assignedNames, indexNames; bool verbose=GetVerbose(), anytodelete=false; const char *c1, *tc1, *tc2; int *buf, newKeyLength, indexNo; @@ -461,7 +461,6 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // truelengths of both already set by alloccol } for (i=0; i oldncol) { // new column - SET_VECTOR_ELT(dt, coln, newcol=allocNAVectorLike(thisvalue, nrow)); + SET_VECTOR_ELT(dt, coln, targetcol=allocNAVectorLike(thisvalue, nrow)); // initialize with NAs for when 'rows' is a subset and it doesn't touch // do not try to save the time to NA fill (contiguous branch free assign anyway) since being // sure all items will be written to (isNull(rows), length(rows), vlen<1, targetlen) is not worth the risk. - if (isVectorAtomic(thisvalue)) copyMostAttrib(thisvalue,newcol); // class etc but not names + if (isVectorAtomic(thisvalue)) copyMostAttrib(thisvalue,targetcol); // class etc but not names // else for lists (such as data.frame and data.table) treat them as raw lists and drop attribs if (vlen<1) continue; // e.g. DT[,newcol:=integer()] (adding new empty column) - targetcol = newcol; - RHS = thisvalue; } else { // existing column targetcol = VECTOR_ELT(dt,coln); - if (isFactor(targetcol)) { - // Coerce RHS to appropriate levels of LHS, adding new levels as necessary (unlike base) - // If it's the same RHS being assigned to several columns, we have to recoerce for each - // one because the levels of each target are likely different - if (isFactor(thisvalue)) { - thisvalue = PROTECT(asCharacterFactor(thisvalue)); thisprotecti++; - } - targetlevels = getAttrib(targetcol, R_LevelsSymbol); - if (isNull(targetlevels)) error("somehow this factor column has no levels"); - if (isString(thisvalue)) { - savetl_init(); // ** TO DO **: remove allocs that could fail between here and _end, or different way - for (j=0; j0) { - savetl(s); // pre-2.14.0 this will save all the uninitialised truelengths - // so 2.14.0+ may be faster, but isn't required. - // as from v1.8.0 we assume R's internal hash is positive, so don't - // save the uninitialised truelengths that by chance are negative - } - SET_TRUELENGTH(s,0); - } - for (j=0; j0) savetl(s); - SET_TRUELENGTH(s,j+1); - } - R_len_t addi = 0; - SEXP addlevels=NULL; - RHS = PROTECT(allocVector(INTSXP, length(thisvalue))); thisprotecti++; - int *iRHS = INTEGER(RHS); - for (j=0; j= length(addlevels)) { - addlevels = PROTECT(growVector(addlevels, length(addlevels)+1000)); thisprotecti++; - } - SET_STRING_ELT(addlevels,addi++,thisv); - // if-else for #1718 fix - SET_TRUELENGTH(thisv, (thisv != NA_STRING) ? (addi+length(targetlevels)) : NA_INTEGER); - } - iRHS[j] = TRUELENGTH(thisv); - } - if (addi > 0) { - R_len_t oldlen = length(targetlevels); - targetlevels = PROTECT(growVector(targetlevels, oldlen+addi)); thisprotecti++; - for (j=0; jLENGTH(targetlevels)) - && iRHS[j] != NA_INTEGER) { - warning("RHS contains %d which is outside the levels range ([1,%d]) of column %d, NAs generated", iRHS[j], LENGTH(targetlevels), i+1); - iRHS[j] = NA_INTEGER; - } - } - } - } else { - if (TYPEOF(targetcol)==TYPEOF(thisvalue) || TYPEOF(targetcol)==VECSXP) - RHS = thisvalue; - else { - // coerce the RHS to match the type of the column, unlike [<-.data.frame, for efficiency. - if (isString(targetcol) && isFactor(thisvalue)) { - RHS = PROTECT(asCharacterFactor(thisvalue)); thisprotecti++; - if (verbose) Rprintf("Coerced factor RHS to character to match the column's type. Avoid this coercion if possible, for efficiency, by creating RHS as type character.\n"); - // TO DO: datatable.pedantic could turn this into warning - } else { - RHS = PROTECT(coerceVector(thisvalue,TYPEOF(targetcol))); thisprotecti++; - char *s1 = (char *)type2char(TYPEOF(targetcol)); - char *s2 = (char *)type2char(TYPEOF(thisvalue)); - // FR #2551, added test for equality between RHS and thisvalue to not provide the warning when length(thisvalue) == 1 - if ( length(thisvalue)==1 && TYPEOF(RHS)!=VECSXP && ( - ( isReal(thisvalue) && isInteger(targetcol) && REAL(thisvalue)[0]==INTEGER(RHS)[0] ) || // DT[,intCol:=4] rather than DT[,intCol:=4L] - ( isLogical(thisvalue) && LOGICAL(thisvalue)[0] == NA_LOGICAL ) || // DT[,intCol:=NA] - ( isInteger(thisvalue) && isReal(targetcol) ) )) { - if (verbose) Rprintf("Coerced length-1 RHS from %s to %s to match column's type.%s If this assign is happening a lot inside a loop, in particular via set(), then it may be worth avoiding this coercion by using R's type postfix on the value being assigned; e.g. typeof(0) vs typeof(0L), and typeof(NA) vs typeof(NA_integer_) vs typeof(NA_real_).\n", s2, s1, - isInteger(targetcol) && isReal(thisvalue) ? " No precision was lost." : ""); - // TO DO: datatable.pedantic could turn this into warning. Or we could catch and avoid the coerceVector allocation ourselves using a single int. - } else { - if (isReal(thisvalue) && isInteger(targetcol)) { - int w = INTEGER(isReallyReal(thisvalue))[0]; // first fraction present (1-based), 0 if none - if (w>0) { - warning("Coerced double RHS to integer to match the type of the target column (column %d named '%s'). One or more RHS values contain fractions which have been lost; e.g. item %d with value %f has been truncated to %d.", - coln+1, CHAR(STRING_ELT(names, coln)), w, REAL(thisvalue)[w-1], INTEGER(RHS)[w-1]); - } else { - warning("Coerced double RHS to integer to match the type of the target column (column %d named '%s'). The RHS values contain no fractions so would be more efficiently created as integer. Consider using R's 'L' postfix (typeof(0L) vs typeof(0)) to create constants as integer and avoid this warning. Wrapping the RHS with as.integer() will avoid this warning too but it's better if possible to create the RHS as integer in the first place so that the cost of the coercion can be avoided.", coln+1, CHAR(STRING_ELT(names, coln))); - } - } else { - warning("Coerced %s RHS to %s to match the type of the target column (column %d named '%s'). If the target column's type %s is correct, it's best for efficiency to avoid the coercion and create the RHS as type %s. To achieve that consider R's type postfix: typeof(0L) vs typeof(0), and typeof(NA) vs typeof(NA_integer_) vs typeof(NA_real_). You can wrap the RHS with as.%s() to avoid this warning, but that will still perform the coercion. If the target column's type is not correct, it's best to revisit where the DT was created and fix the column type there; e.g., by using colClasses= in fread(). Otherwise, you can change the column type now by plonking a new column (of the desired type) over the top of it; e.g. DT[, `%s`:=as.%s(`%s`)]. If the RHS of := has nrow(DT) elements then the assignment is called a column plonk and is the way to change a column's type. Column types can be observed with sapply(DT,typeof).", - s2, s1, coln+1, CHAR(STRING_ELT(names, coln)), s1, s1, s1, CHAR(STRING_ELT(names, coln)), s2, CHAR(STRING_ELT(names, coln))); - } - } - } - } - } } - memrecycle(targetcol, rows, 0, targetlen, RHS); // also called from dogroups where these arguments are used more - UNPROTECT(thisprotecti); // unprotect inside loop through columns to save protection stack + const char *ret = memrecycle(targetcol, rows, 0, targetlen, thisvalue, coln+1, CHAR(STRING_ELT(names, coln))); + if (ret) warning(ret); } + *_Last_updated = numToDo; // the updates have taken place with no error, so update .Last.updated now assignedNames = PROTECT(allocVector(STRSXP, LENGTH(cols))); protecti++; for (i=0;i0) { - savetl(s); - } else if (tl<0) { - // # nocov start - for (int j=0; j=0) { - if (tl>0) savetl(s); - SET_TRUELENGTH(s, -nTargetLevels-(++nAdd)); + const bool sourceIsI64=isReal(source) && Rinherits(source, char_integer64); + const bool targetIsI64=isReal(target) && Rinherits(target, char_integer64); + if (sourceIsFactor || targetIsFactor) { + if (!targetIsFactor) { + if (!isString(target) && !isNewList(target)) + error("Cannot assign 'factor' to '%s'. Factors can only be assigned to factor, character or list columns.", type2char(TYPEOF(target))); + // else assigning factor to character is left to later below, avoiding wasteful asCharacterFactor + } else if (!sourceIsFactor && !isString(source)) { + // target is factor + if (allNA(source, false)) { // return false for list and other types that allNA does not support + source = ScalarLogical(NA_LOGICAL); // a global constant in R and won't allocate; fall through to regular zero-copy coerce + } else if (isInteger(source) || isReal(source)) { + // allow assigning level numbers to factor columns; test 425, 426, 429 and 1945 + const int nlevel = length(getAttrib(target, R_LevelsSymbol)); + if (isInteger(source)) { + const int *sd = INTEGER(source); + for (int i=0; inlevel) { + error("Assigning factor numbers to column %d named '%s'. But %d is outside the level range [1,%d]", colnum, colname, val, nlevel); + } + } + } else { + const double *sd = REAL(source); + for (int i=0; inlevel)) { + error("Assigning factor numbers to column %d named '%s'. But %f is outside the level range [1,%d], or is not a whole number.", colnum, colname, val, nlevel); + } + } } + // Now just let the valid level numbers fall through to regular assign by BODY below + } else { + error("Cannot assign '%s' to 'factor'. Factor columns can be assigned factor, character, NA in any type, or level numbers.", type2char(TYPEOF(source))); } - const int nSource = length(source); - const int *sourceD = INTEGER(source); - int *newSourceD = INTEGER(newSource); - for (int i=0; i0) { + savetl(s); + } else if (tl<0) { + // # nocov start + for (int j=0; j=0) { + if (!sourceIsFactor && s==NA_STRING) continue; // don't create NA factor level when assigning character to factor; test 2117 + if (tl>0) savetl(s); + SET_TRUELENGTH(s, -nTargetLevels-(++nAdd)); + } // else, when sourceIsString, it's normal for there to be duplicates here } - free(temp); - } else { - // all source levels were already in target levels, but not with the same integers; we're done - savetl_end(); - } - // now continue, but with the mapped integers in the (new) source - } - } - if (!length(where)) { // e.g. called from rbindlist with where=R_NilValue - switch (TYPEOF(target)) { - case RAWSXP: - if (TYPEOF(source)!=RAWSXP) { source = PROTECT(coerceVector(source, RAWSXP)); protecti++; } - if (slen==1) { - // recycle single items - Rbyte *td = RAW(target)+start; - const Rbyte val = RAW(source)[0]; - for (int i=0; i255, "%d", "taken as 0") + case REALSXP: if (sourceIsI64) + CHECK_RANGE(long long, REAL, val<0 || val>255, "%lld", "taken as 0") + else CHECK_RANGE(double, REAL, !R_FINITE(val) || val<0.0 || val>256.0 || (int)val!=val, "%f", "either truncated (precision lost) or taken as 0") + } break; + case INTSXP: + if (TYPEOF(source)==REALSXP) { + if (sourceIsI64) + CHECK_RANGE(long long, REAL, val!=NA_INTEGER64 && (val<=NA_INTEGER || val>INT_MAX), "%lld", "out-of-range (NA)") + else CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "%f", "truncated (precision lost)") + } break; + case REALSXP: + if (targetIsI64 && isReal(source) && !sourceIsI64) { + CHECK_RANGE(double, REAL, !ISNAN(val) && (!R_FINITE(val) || (int)val!=val), "%f", "truncated (precision lost)") } - } break; - case CPLXSXP: { - Rcomplex *td = COMPLEX(target); - const Rcomplex *sd = COMPLEX(source); - for (int i=0; i255 || val<0) ? 0 : val, td[i]=cval) + case REALSXP: if (sourceIsI64) + BODY(int64_t, REAL, Rbyte, (val>255 || val<0) ? 0 : val, td[i]=cval) + else BODY(double, REAL, Rbyte, (ISNAN(val)||val>255||val<0) ? 0 : val, td[i]=cval) + default: COERCE_ERROR("raw"); + } + } break; + case LGLSXP: { + int *td = LOGICAL(target) + off; + switch (TYPEOF(source)) { + case RAWSXP: BODY(Rbyte, RAW, int, val!=0, td[i]=cval) + case LGLSXP: if (mem) { + memcpy(td, LOGICAL(source), slen*sizeof(Rboolean)); break; + } else BODY(int, LOGICAL, int, val, td[i]=cval) + case INTSXP: BODY(int, INTEGER, int, val==NA_INTEGER ? NA_LOGICAL : val!=0, td[i]=cval) + case REALSXP: if (sourceIsI64) + BODY(int64_t, REAL, int, val==NA_INTEGER64 ? NA_LOGICAL : val!=0, td[i]=cval) + else BODY(double, REAL, int, ISNAN(val) ? NA_LOGICAL : val!=0.0, td[i]=cval) + default: COERCE_ERROR("logical"); + } + } break; + case INTSXP : { + int *td = INTEGER(target) + off; + switch (TYPEOF(source)) { + case RAWSXP: BODY(Rbyte, RAW, int, (int)val, td[i]=cval) + case LGLSXP: // same as INTSXP ... + case INTSXP: if (mem) { + memcpy(td, INTEGER(source), slen*sizeof(int)); break; + } else BODY(int, INTEGER, int, val, td[i]=cval) + case REALSXP: if (sourceIsI64) + BODY(int64_t, REAL, int, (val==NA_INTEGER64||val>INT_MAX||val<=NA_INTEGER) ? NA_INTEGER : (int)val, td[i]=cval) + else BODY(double, REAL, int, ISNAN(val) ? NA_INTEGER : (int)val, td[i]=cval) + default: COERCE_ERROR("integer"); // test 2005.4 + } + } break; + case REALSXP : { + if (targetIsI64) { + int64_t *td = (int64_t *)REAL(target) + off; + switch (TYPEOF(source)) { + case RAWSXP: BODY(Rbyte, RAW, int64_t, (int64_t)val, td[i]=cval) + case LGLSXP: // same as INTSXP + case INTSXP: BODY(int, INTEGER, int64_t, val==NA_INTEGER ? NA_INTEGER64 : val, td[i]=cval) + case REALSXP: + if (sourceIsI64) { + if(mem) { memcpy(td, (int64_t *)REAL(source), slen*sizeof(int64_t)); break; } + else BODY(int64_t, REAL, int64_t, val, td[i]=cval) + } else BODY(double, REAL, int64_t, R_FINITE(val) ? val : NA_INTEGER64, td[i]=cval) + default: COERCE_ERROR("integer64"); } - } break; - case STRSXP : { - const SEXP *sd = STRING_PTR(source); - for (int i=0; i1 && thislen!=maxn && grpn>0) { // grpn>0 for grouping empty tables; test 1986 error("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code.", thislen, j+1, i+1, maxn); } - memrecycle(target, R_NilValue, thisansloc, maxn, source); + memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, ""); } } ansloc += maxn; diff --git a/src/init.c b/src/init.c index 1147bff23a..4189db28d0 100644 --- a/src/init.c +++ b/src/init.c @@ -84,6 +84,7 @@ SEXP cj(); SEXP lock(); SEXP unlock(); SEXP islockedR(); +SEXP allNAR(); // .Externals SEXP fastmean(); @@ -174,6 +175,7 @@ R_CallMethodDef callMethods[] = { {"C_islocked", (DL_FUNC) &islockedR, -1}, {"CfrollapplyR", (DL_FUNC) &frollapplyR, -1}, {"CtestMsgR", (DL_FUNC) &testMsgR, -1}, +{"C_allNAR", (DL_FUNC) &allNAR, -1}, {NULL, NULL, 0} }; diff --git a/src/rbindlist.c b/src/rbindlist.c index a7b846535e..a3e64cb21a 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -276,7 +276,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) int longestLen=0, longestW=-1, longestI=-1; // just for ordered factor SEXP longestLevels=R_NilValue; // just for ordered factor bool int64=false; - bool foundName=false; + const char *foundName=NULL; bool anyNotStringOrFactor=false; SEXP firstCol=R_NilValue; int firsti=-1, firstw=-1; @@ -287,7 +287,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) if (w==-1) continue; // column j of final result has no input from this item (fill must be true) if (!foundName) { SEXP cn=PROTECT(getAttrib(li, R_NamesSymbol)); - if (length(cn)) { SET_STRING_ELT(ansNames, idcol+j, STRING_ELT(cn, w)); foundName=true; } + if (length(cn)) { SEXP tt; SET_STRING_ELT(ansNames, idcol+j, tt=STRING_ELT(cn, w)); foundName=CHAR(tt); } UNPROTECT(1); } SEXP thisCol = VECTOR_ELT(li, w); @@ -319,7 +319,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) } } - if (!foundName) { char buff[12]; sprintf(buff,"V%d",j+1), SET_STRING_ELT(ansNames, idcol+j, mkChar(buff)); } + if (!foundName) { static char buff[12]; sprintf(buff,"V%d",j+1), SET_STRING_ELT(ansNames, idcol+j, mkChar(buff)); foundName=buff; } if (factor) maxType=INTSXP; // if any items are factors then a factor is created (could be an option) if (int64 && maxType!=REALSXP) error("Internal error: column %d of result is determined to be integer64 but maxType=='%s' != REALSXP", j+1, type2char(maxType)); // # nocov @@ -442,19 +442,36 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) const int *id = INTEGER(thisCol); if (length(thisCol)<=1) { // recycle length-1, or NA-fill length-0 - const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER) ? -TRUELENGTH(thisColStrD[id[0]-1]) : NA_INTEGER; + SEXP lev; + const int val = (length(thisCol)==1 && id[0]!=NA_INTEGER && (lev=thisColStrD[id[0]-1])!=NA_STRING) ? -TRUELENGTH(lev) : NA_INTEGER; + // ^^ #3915 and tests 2015.2-5 for (int r=0; r