Rdatatable · mattdowle · Nov 15, 2022 · Nov 10, 2022 · Nov 10, 2022 · Nov 10, 2022
@@ -168,3 +168,275 @@ test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40"))
 
 # Add scaled-up non-ASCII forder test 1896
 
+# largest tests by ram usage moved out of tests.Rraw, #5517
+
+# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix)
+# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which
+# this tests. But it's well under 10 seconds.
+DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep="")))
+test(301, nrow(DT[,sum(B),by=C])==100010)
+DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep=""))
+test(301.1, nrow(DT[,sum(B),by=C])==100010)
+
+# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too.
+options(datatable.optimize=0L)
+set.seed(1)
+DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a")
+test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a"))
+test(637.2, key(DT[J(43L),a:=99L]), NULL)
+setkey(DT,a)
+test(637.3, key(DT[,a:=99L,by=a]), NULL)
+options(datatable.optimize=2L)
+set.seed(1)
+DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a")
+test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a"))
+test(638.2, key(DT[J(43L),a:=99L]), NULL)
+setkey(DT,a)
+test(638.3, key(DT[,a:=99L,by=a]), NULL)
+
+# Test X[Y] slowdown, #2216
+# Many minutes in 1.8.2!  Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes
+# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw.
+X = CJ(a=seq_len(1e3),b=seq_len(1e3))
+Y = copy(X)
+X[4,b:=3L]     # create a dup group, to force allLen1=FALSE
+setkey(X)
+test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10)   # this system.time usage ok in this case
+test(820, system.time(X[Y,mult="first"])["user.self"] < 10)           # this system.time usage ok in this case
+
+# test uniqlengths
+set.seed(45)
+x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE)
+ox <- forderv(x)
+o1 <- uniqlist(list(x), ox)
+test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
+o1 <- uniqlist(list(x))
+test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
+rm(list=c("x","ox","o1"))
+gc()
+
+# Fix for (usually small) memory leak when grouping, #2648.
+# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row).
+DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001))
+before = gc()["Vcells",2]
+for (i in 1:50) DT[, sum(B), by=A]
+after = gc()["Vcells",2]
+test(1157, after < before+3)  # +3 = 3MB
+# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case.
+
+# Similar for when dogroups writes less rows than allocated, #2648.
+DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4))
+before = gc()["Vcells",2]
+for (i in 1:50) DT[ , unlist(.SD), by = 'k']
+after = gc()["Vcells",2]
+test(1158, after < before+3)  # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024
+
+#  fix DT[TRUE, :=] using too much working memory for i, #1249
+if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) {  # in case R not compiled with memory profiling enabled
+  f = tempfile()
+  N = 1000000           # or any large number of rows
+  DT = data.table(A=1:N, B=rnorm(N))
+  DT[TRUE, B := B * 2] # stabilize with initial dummy update
+  Rprofmem(f)
+  DT[TRUE, B := B * 2] # or some in-place update
+  Rprofmem(NULL)
+  test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L)  # one allocation for the RHS only
+  unlink(f)
+}
+
+if (FALSE) {
+  # Full range takes too long for CRAN.
+  dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day")
+  dtsCh = as.character(dts)   # 36s
+  dtsCh = gsub(" ","0",sprintf("%10s",dtsCh))  # R does not 0 pad years < 1000
+  test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31")))
+} else {
+  # test on CRAN a reduced but important range
+  dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day")
+  dtsCh = as.character(dts)
+  test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01")))
+}
+DT = data.table(A=dts, B=as.IDate(dts))
+test(1739.3, sapply(DT,typeof), c(A="double",B="integer"))
+test(1739.4, typeof(dts), "double")
+f = tempfile()
+g = tempfile()                               # Full range
+fwrite(DT,f)                                 #     0.092s
+write.csv(DT,g,row.names=FALSE,quote=FALSE)  #    65.250s
+test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=",")))
+test(1739.6, readLines(f), readLines(g))
+unlink(f)
+unlink(g)
+rm(list=c("dtsCh","dts"))
+gc()
+
+# catch malformed factor in rbindlist, #3315
+set.seed(32940)
+NN=7e5; KK=4e4; TT=25
+DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) )
+test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites
+
+# print.data.table row id in non-scientific notation, #1167
+DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5))
+test(1549, capture.output(print(DT)), c("         a b", "      1: 1 a", "      2: 2 b", "      3: 3 c", "      4: 4 a", "      5: 5 b", "     ---    ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c"))
+rm(DT)
+
+# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because
+# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not.
+# Would need a very complicated construction of embedded new lines in quoted fields, to test that.
+# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN.
+DT = as.data.table(CO2)
+f = tempfile()
+for (i in 0:1000) {
+  start = nrow(CO2)*i
+  fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE)
+  if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE)
+}
+test(1835, fread(f, verbose=TRUE),
+  output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped",
+  warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>")
+unlink(f)
+
+# test no memory leak, #2191 and #2284
+# These take a few seconds each, and it's important to run these on CRAN to check no leak
+gc(); before = gc()["Vcells","(Mb)"]
+for (i in 1:2000) { DT = data.table(1:3); rm(DT) }  # in 1.8.2 would leak 3MB
+gc(); after = gc()["Vcells","(Mb)"]
+test(861, after < before+0.5)   # close to 0.0 difference, but 0.5 for safe margin
+gc(); before = gc()["Vcells","(Mb)"]
+DF = data.frame(x=1:20, y=runif(20))
+for (i in 1:2000) { DT = as.data.table(DF); rm(DT) }
+gc(); after = gc()["Vcells","(Mb)"]
+test(862, after < before+0.5)
+gc(); before = gc()["Vcells","(Mb)"]
+DT = data.table(x=1:20, y=runif(20))
+for (i in 1:2000) { x <- DT[1:5,]; rm(x) }
+gc(); after = gc()["Vcells","(Mb)"]
+test(863, after < before+0.5)
+
+# fread should use multiple threads on single column input.
+# tests 2 threads; the very reasonable limit on CRAN
+# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently)
+if (getDTthreads() == 1L) {
+  cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n")
+} else {
+  N = if (TRUE) 2e6 else 1e9   # offline speed check
+  fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile())
+  test(1760.1, file.info(f)$size > 4*1024*1024)
+  test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads")
+  unlink(f)
+}
+
+# segfault of unprotected var caught with the help of address sanitizer; was test 1509
+# in #5517 I figured this test shouldn't be reduced in size due to its nature
+set.seed(1)
+val = sample(c(1:5, NA), 1e4L, TRUE)
+dt <- setDT(replicate(100L, val, simplify=FALSE))
+## to ensure there's no segfault...
+ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE)
+test(1035.21, ans, ans)
+
+# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882
+# This runs with 2 threads in the test suite on CRAN and AppVeyor etc.
+# 2 threads are sufficient to fail before the fix.
+N = 20
+DF = data.frame(a=rnorm(N),
+                b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]),
+                c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]))
+DT = setDT(DF)  # setDT required since data.table() already expanded altrep's
+before = sum(gc()[, 2])
+fff = function(aref) {
+  ff = lapply(1:5, function(i) {
+    DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
+  })
+  return(rbindlist(ff))
+}
+for(i in 1:100) {
+  f = fff("a")
+  rm("f")
+}
+gc()  # extra gc() (i.e. two including the one on next line) seems to reduce `after`
+      # from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm.
+after = sum(gc()[, 2])
+test(1912.1, after < before + 10)  # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
+#
+before = sum(gc()[, 2])
+fff = function(aref) {
+  DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race.
+  lapply(1:5, function(i) {
+    DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
+  })
+}
+for(i in 1:100) {
+  fff("a")
+}
+gc()
+after = sum(gc()[, 2])
+test(1912.2, after < before + 10)
+
+DT = data.table(A=seq(1, 1000000), B="x", C=TRUE)
+fwrite(DT, f<-tempfile())
+test(1815, fread(f, nrows=5), DT[1:5])  #2243: nrows small vs large nrow(DT)
+
+# Better jump sync and run-on in PR#2627
+#
+# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627
+# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly
+x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184)
+x[51094]=""
+cat(x, file=f<-tempfile(), sep="\n")
+test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),],
+           data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")),
+           output="jumps=[0..2)")  # ensure jump 1 happened
+#
+# out-of-sample short lines in the first jump, not near the jump point
+x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
+x[5021:5041] = "small,batch,short,lines"   # 4 fields not 5
+cat(x, file=f, sep="\n")
+test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020),
+             warning="Stopped early on line 5021.*<<small,batch,short,lines>>")
+test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),],
+             data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
+                        V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
+                        V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
+                        V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
+                        V5=c(1L,5020L,NA,NA,5042L,102184L)),
+             output="jumps=[0..2)")
+#
+# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267
+# confirmed fails in master with that error before PR#2627
+x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
+x[51094:51150] = "small,batch,short,lines"   # 4 fields not 5
+cat(x, file=f, sep="\n")
+test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093),
+             warning="Stopped early on line 51094.*<<small,batch,short,lines>>",
+             output="jumps=[0..2)")
+test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),],
+             data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
+                        V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
+                        V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
+                        V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
+                        V5=c(1L,51093L,NA,NA,51151L,102184L)),
+             output="jumps=[0..2)")
+#
+# jump inside a quoted field containing many new lines, to simulate a dirty jump
+# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too.
+# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps.
+x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
+x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093"
+cat(x, file=f, sep="\n")
+test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)],  # gsub since R on Windows replaces \n with \r\n
+             data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"),
+                        V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)),
+             output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)")
+# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's
+#   still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth)
+unlink(f)
+
+# chmatchdup test from benchmark at the bottom of chmatch.c
+set.seed(45L)
+x = sample(letters, 1e5, TRUE)
+y = sample(letters, 1e6, TRUE)
+test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406))
+rm(list=c("x","y"))
+
@@ -690,4 +690,32 @@ if (loaded[["nanotime"]]) {
 
 }
 
+# that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517
+DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 )
+test(28.1, DT[,plot(b,f)], NULL)
+test(28.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0
+test(28.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer()))
+try(graphics.off(),silent=TRUE)
+
+# test DT$.<- in a data.table-unaware package
+# moved from tests.Rraw 1890 to here to save ram of loading stats package and plot, #5517
+DT = data.table(A=1:5)
+test(29.1, stats::ts.plot(gpars=DT), error="object must have one or more observations")
+# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col()
+test(29.2, DT, data.table(A=1:5))
+
+if (FALSE) {  # moved from tests.Rraw in #5517 and not yet back on; wasn't sure we need to still test reshape2
+  # test dispatch for non-data.table objects, #4864.
+  if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) {
+    test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6),
+                   error="The melt generic in data.table has been passed a data.frame")
+  } else {
+    #  1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2
+    #  2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded)
+    #  3) in dev locally I have reshape2 installed to run caret in other.Rraw
+    test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6),
+                   as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)),
+                   warning="The melt generic in data.table has been passed a data.frame")
+  }
+}