Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
272 changes: 272 additions & 0 deletions inst/tests/benchmark.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,275 @@ test(1742.5, substr(x, nchar(x)-10L, nchar(x)), c("50,28,95,76","62,87,23,40"))

# Add scaled-up non-ASCII forder test 1896

# largest tests by ram usage moved out of tests.Rraw, #5517

# Test ad hoc by of more than 100,000 levels, see 2nd part of bug #1387 (100,000 from the limit of base::sort.list radix)
# This does need to be this large, like this in CRAN checks, because sort.list(method="radix") has this limit, which
# this tests. But it's well under 10 seconds.
DT = data.table(A=1:10,B=rnorm(10),C=factor(paste("a",1:100010,sep="")))
test(301, nrow(DT[,sum(B),by=C])==100010)
DT = data.table(A=1:10,B=rnorm(10),C=paste("a",1:100010,sep=""))
test(301.1, nrow(DT[,sum(B),by=C])==100010)

# Test := by key, and that := to the key by key unsets the key. Make it non-trivial in size too.
options(datatable.optimize=0L)
set.seed(1)
DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a")
test(637.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a"))
test(637.2, key(DT[J(43L),a:=99L]), NULL)
setkey(DT,a)
test(637.3, key(DT[,a:=99L,by=a]), NULL)
options(datatable.optimize=2L)
set.seed(1)
DT = data.table(a=sample(1:100,1e6,replace=TRUE),b=sample(1:1000,1e6,replace=TRUE),key="a")
test(638.1, DT[,m:=sum(b),by=a][1:3], data.table(a=1L,b=c(156L,808L,848L),m=DT[J(1),sum(b)],key="a"))
test(638.2, key(DT[J(43L),a:=99L]), NULL)
setkey(DT,a)
test(638.3, key(DT[,a:=99L,by=a]), NULL)

# Test X[Y] slowdown, #2216
# Many minutes in 1.8.2! Now well under 1s, but 10s for very wide tolerance for CRAN. We'd like CRAN to tell us if any changes
# in R or elsewhere cause the 2 minute (!) bug to return. Hence not moving out to benmark.Rraw.
X = CJ(a=seq_len(1e3),b=seq_len(1e3))
Y = copy(X)
X[4,b:=3L] # create a dup group, to force allLen1=FALSE
setkey(X)
test(819, system.time(X[Y,allow.cartesian=TRUE])["user.self"] < 10) # this system.time usage ok in this case
test(820, system.time(X[Y,mult="first"])["user.self"] < 10) # this system.time usage ok in this case

# test uniqlengths
set.seed(45)
x <- sample(c(NA_integer_, 1:1e4), 1e6, TRUE)
ox <- forderv(x)
o1 <- uniqlist(list(x), ox)
test(1151.1, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
o1 <- uniqlist(list(x))
test(1151.2, c(diff(o1), length(x)-tail(o1, 1L)+1L), uniqlengths(o1, length(x)))
rm(list=c("x","ox","o1"))
gc()

# Fix for (usually small) memory leak when grouping, #2648.
# Deliberate worst case: largest group (100000 rows) followed last by a small group (1 row).
DT = data.table(A=rep(1:2,c(100000,1)), B=runif(100001))
before = gc()["Vcells",2]
for (i in 1:50) DT[, sum(B), by=A]
after = gc()["Vcells",2]
test(1157, after < before+3) # +3 = 3MB
# Before the patch, Vcells grew dramatically from 6MB to 60MB. Now stable at 6MB. Increase 50 to 1000 and it grew to over 1GB for this case.

# Similar for when dogroups writes less rows than allocated, #2648.
DT = data.table(k = 1:50, g = 1:20, val = rnorm(1e4))
before = gc()["Vcells",2]
for (i in 1:50) DT[ , unlist(.SD), by = 'k']
after = gc()["Vcells",2]
test(1158, after < before+3) # 177.6MB => 179.2MB. Needs to be +3 now from v1.9.8 with alloccol up from 100 to 1024

# fix DT[TRUE, :=] using too much working memory for i, #1249
if (!inherits(try(Rprofmem(NULL), silent=TRUE), "try-error")) { # in case R not compiled with memory profiling enabled
f = tempfile()
N = 1000000 # or any large number of rows
DT = data.table(A=1:N, B=rnorm(N))
DT[TRUE, B := B * 2] # stabilize with initial dummy update
Rprofmem(f)
DT[TRUE, B := B * 2] # or some in-place update
Rprofmem(NULL)
test(1542, length(grep("000",readLines(f, warn=FALSE))), 1L) # one allocation for the RHS only
unlink(f)
}

if (FALSE) {
# Full range takes too long for CRAN.
dts = seq(as.Date("0000-03-01"), as.Date("9999-12-31"), by="day")
dtsCh = as.character(dts) # 36s
dtsCh = gsub(" ","0",sprintf("%10s",dtsCh)) # R does not 0 pad years < 1000
test(1739.1, length(dtsCh)==3652365 && identical(dtsCh[c(1,3652365)],c("0000-03-01","9999-12-31")))
} else {
# test on CRAN a reduced but important range
dts = seq(as.Date("1899-12-31"), as.Date("2100-01-01"), by="day")
dtsCh = as.character(dts)
test(1739.2, length(dtsCh)==73051 && identical(dtsCh[c(1,73051)],c("1899-12-31","2100-01-01")))
}
DT = data.table(A=dts, B=as.IDate(dts))
test(1739.3, sapply(DT,typeof), c(A="double",B="integer"))
test(1739.4, typeof(dts), "double")
f = tempfile()
g = tempfile() # Full range
fwrite(DT,f) # 0.092s
write.csv(DT,g,row.names=FALSE,quote=FALSE) # 65.250s
test(1739.5, readLines(f), c("A,B",paste(dtsCh,dtsCh,sep=",")))
test(1739.6, readLines(f), readLines(g))
unlink(f)
unlink(g)
rm(list=c("dtsCh","dts"))
gc()

# catch malformed factor in rbindlist, #3315
set.seed(32940)
NN=7e5; KK=4e4; TT=25
DT = data.table( id = sample(KK, NN, TRUE), tt = sample(TT, NN, TRUE), ff = factor(sample(3, NN, TRUE)) )
test(1978, print(DT[ , diff(ff), by = id]), error="Column 2 of item 1 has type 'factor' but has no levels; i.e. malformed.") # the print invokes rbindlist which bites

# print.data.table row id in non-scientific notation, #1167
DT <- data.table(a = rep(1:5,3*1e5), b = rep(letters[1:3],5*1e5))
test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "1499996: 1 b", "1499997: 2 c", "1499998: 3 a", "1499999: 4 b", "1500000: 5 c"))
rm(DT)

# Create a file to test a sample jump being skipped due to format error. It will fail later in the read step because
# this is a real error. Currently have not constructed an error for which nextGoodLine looks good, but in fact is not.
# Would need a very complicated construction of embedded new lines in quoted fields, to test that.
# This test size with default buffMB results in 2 threads being used. 2 is important to pass on CRAN.
DT = as.data.table(CO2)
f = tempfile()
for (i in 0:1000) {
start = nrow(CO2)*i
fwrite(DT[,Plant:=start:(start+nrow(CO2)-1)], f, append=TRUE, col.names=FALSE)
if (i==502) write("-999,Bad,Line,0.0,0.0,extra\n", f, append=TRUE)
}
test(1835, fread(f, verbose=TRUE),
output = "A line with too-many.*jump 50.*jump landed awkwardly.*skipped",
warning = "Stopped.*line 42253. Expected 5 fields but found 6.*discarded.*<<-999,Bad,Line,0.0,0.0,extra>>")
unlink(f)

# test no memory leak, #2191 and #2284
# These take a few seconds each, and it's important to run these on CRAN to check no leak
gc(); before = gc()["Vcells","(Mb)"]
for (i in 1:2000) { DT = data.table(1:3); rm(DT) } # in 1.8.2 would leak 3MB
gc(); after = gc()["Vcells","(Mb)"]
test(861, after < before+0.5) # close to 0.0 difference, but 0.5 for safe margin
gc(); before = gc()["Vcells","(Mb)"]
DF = data.frame(x=1:20, y=runif(20))
for (i in 1:2000) { DT = as.data.table(DF); rm(DT) }
gc(); after = gc()["Vcells","(Mb)"]
test(862, after < before+0.5)
gc(); before = gc()["Vcells","(Mb)"]
DT = data.table(x=1:20, y=runif(20))
for (i in 1:2000) { x <- DT[1:5,]; rm(x) }
gc(); after = gc()["Vcells","(Mb)"]
test(863, after < before+0.5)

# fread should use multiple threads on single column input.
# tests 2 threads; the very reasonable limit on CRAN
# file needs to be reasonably large for threads to kick in (minimum chunkSize is 1MB currently)
if (getDTthreads() == 1L) {
cat("Test 1760 not run because this session either has no OpenMP or has been limited to one thread (e.g. under UBSAN and ASAN)\n")
} else {
N = if (TRUE) 2e6 else 1e9 # offline speed check
fwrite(data.table(A=sample(10,N,replace=TRUE)), f<-tempfile())
test(1760.1, file.info(f)$size > 4*1024*1024)
test(1760.2, fread(f, verbose=TRUE, nThread=2), output="using 2 threads")
unlink(f)
}

# segfault of unprotected var caught with the help of address sanitizer; was test 1509
# in #5517 I figured this test shouldn't be reduced in size due to its nature
set.seed(1)
val = sample(c(1:5, NA), 1e4L, TRUE)
dt <- setDT(replicate(100L, val, simplify=FALSE))
## to ensure there's no segfault...
ans <- melt(dt, measure.vars=names(dt), na.rm=TRUE)
test(1035.21, ans, ans)

# gc race with altrep in R-devel May 2018, #2866 & #2767, PR#2882
# This runs with 2 threads in the test suite on CRAN and AppVeyor etc.
# 2 threads are sufficient to fail before the fix.
N = 20
DF = data.frame(a=rnorm(N),
b=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]),
c=factor(rbinom(N,5,prob=0.5),1:5,letters[1:5]))
DT = setDT(DF) # setDT required since data.table() already expanded altrep's
before = sum(gc()[, 2])
fff = function(aref) {
ff = lapply(1:5, function(i) {
DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
})
return(rbindlist(ff))
}
for(i in 1:100) {
f = fff("a")
rm("f")
}
gc() # extra gc() (i.e. two including the one on next line) seems to reduce `after`
# from 29.7 to 27.2 (exactly `before`). Keeping the extra gc() as no harm.
after = sum(gc()[, 2])
test(1912.1, after < before + 10) # 10MB very wide margin. With the gc race, heap usage grew much more which is all we're testing here (no blow up).
#
before = sum(gc()[, 2])
fff = function(aref) {
DT = setDT(data.frame(a=1:N, b=1:N, c=1:N, d=1:N, e=1:N, f=1:N, g=1:N, h=1:N)) # 1:N creates altrep. A few of them too to tickle (the fixed) race.
lapply(1:5, function(i) {
DT[,list(sumA=sum(get(aref))),by=b][,c:=letters[i]]
})
}
for(i in 1:100) {
fff("a")
}
gc()
after = sum(gc()[, 2])
test(1912.2, after < before + 10)

DT = data.table(A=seq(1, 1000000), B="x", C=TRUE)
fwrite(DT, f<-tempfile())
test(1815, fread(f, nrows=5), DT[1:5]) #2243: nrows small vs large nrow(DT)

# Better jump sync and run-on in PR#2627
#
# Reproduces error 'did not finish exactly where jump 1 found ...' in #2561 in master before PR #2627
# the jump point is just before an empty line and the nextGoodLine() wasn't sync'd properly
x = sprintf("ABCDEFGHIJKLMNOPQRST%06d", 1:102184)
x[51094]=""
cat(x, file=f<-tempfile(), sep="\n")
test(1874.1, fread(f,header=FALSE,verbose=TRUE)[c(1,51094,.N),],
data.table(V1=c("ABCDEFGHIJKLMNOPQRST000001","","ABCDEFGHIJKLMNOPQRST102184")),
output="jumps=[0..2)") # ensure jump 1 happened
#
# out-of-sample short lines in the first jump, not near the jump point
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[5021:5041] = "small,batch,short,lines" # 4 fields not 5
cat(x, file=f, sep="\n")
test(1874.2, fread(f), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:5020),
warning="Stopped early on line 5021.*<<small,batch,short,lines>>")
test(1874.3, fread(f,fill=TRUE,verbose=TRUE)[c(1,5020,5021,5041,5042,.N),],
data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
V5=c(1L,5020L,NA,NA,5042L,102184L)),
output="jumps=[0..2)")
#
# jump just before a set of 30 or more too-few lines, to reproduce "No good line could be found" error in #2267
# confirmed fails in master with that error before PR#2627
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[51094:51150] = "small,batch,short,lines" # 4 fields not 5
cat(x, file=f, sep="\n")
test(1874.4, fread(f,verbose=TRUE), data.table(V1="ABCD", V2="FGHI", V3="KLMN", V4="PQRS", V5=1:51093),
warning="Stopped early on line 51094.*<<small,batch,short,lines>>",
output="jumps=[0..2)")
test(1874.5, fread(f,fill=TRUE,verbose=TRUE)[c(1,51093,51094,51150,51151,.N),],
data.table(V1=c("ABCD","ABCD","small","small","ABCD","ABCD"),
V2=c("FGHI","FGHI","batch","batch","FGHI","FGHI"),
V3=c("KLMN","KLMN","short","short","KLMN","KLMN"),
V4=c("PQRS","PQRS","lines","lines","PQRS","PQRS"),
V5=c(1L,51093L,NA,NA,51151L,102184L)),
output="jumps=[0..2)")
#
# jump inside a quoted field containing many new lines, to simulate a dirty jump
# we'll make this jump landing even harder for nextGoodLine() by making the lines resemble the number and types of the true lines, too.
# Rather than needing to make nextGoodLine() better and better (at some point it's impossible), in these rare cases we'll just sweep dirty jumps.
x = sprintf("ABCD,FGHI,KLMN,PQRS,%06d", 1:102184)
x[51093] = "\"A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n\",FGHI,KLMN,PQRS,51093"
cat(x, file=f, sep="\n")
test(1875.6, fread(f,verbose=TRUE)[c(1,51092:51094,.N),][3,V1:=gsub("\r","",V1)], # gsub since R on Windows replaces \n with \r\n
data.table(V1=c("ABCD","ABCD", "A,B,C,D,1\nA,B,C,D,2\nA,B,C,D,3\nA,B,C,D,4\nA,B,C,D,5\nA,B,C,D,6\nA,B,C,D,7\nA,B,C,D,8\n", "ABCD","ABCD"),
V2="FGHI", V3="KLMN", V4="PQRS", V5=c(1L,51092:51094,102184L)),
output = "too-few.*sample jump 50.*jump landed awkwardly.*skipped.*Read the data.*jumps=\\[0..2\\).*jumps=\\[1..2\\).*Reading 2 chunks \\(1 swept\\)")
# Aside: although the file (with over 100,000 lines) is big enough for 100 sampling jumps (of which just 1, the middle sample jump, skipped), it's
# still too small for more than 2 reading chunks to be worth it which is correct (based on buffMB not nth)
unlink(f)

# chmatchdup test from benchmark at the bottom of chmatch.c
set.seed(45L)
x = sample(letters, 1e5, TRUE)
y = sample(letters, 1e6, TRUE)
test(2000, c(head(ans<-chmatchdup(x,y,0L)),tail(ans)), INT(7,49,11,20,69,25,99365,100750,97596,99671,103320,99406))
rm(list=c("x","y"))

28 changes: 28 additions & 0 deletions inst/tests/other.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -690,4 +690,32 @@ if (loaded[["nanotime"]]) {

}

# that plot works; moved from tests.Rraw 167 to here to save ram of loading graphics package and possible screen device issues on overloaded servers, #5517
DT = data.table( a=1:5, b=11:50, d=c("A","B","C","D"), f=1:5, grp=1:5 )
test(28.1, DT[,plot(b,f)], NULL)
test(28.2, as.integer(DT[,hist(b)]$breaks), seq.int(10L,50L,by=5L)) # as.integer needed for R 3.1.0
test(28.3, DT[,plot(b,f),by=.(grp)], data.table(grp=integer()))
try(graphics.off(),silent=TRUE)

# test DT$.<- in a data.table-unaware package
# moved from tests.Rraw 1890 to here to save ram of loading stats package and plot, #5517
DT = data.table(A=1:5)
test(29.1, stats::ts.plot(gpars=DT), error="object must have one or more observations")
# Inside ts.plot is a gpars$ylab<- which happens before its error. That dispatches to our $<- which does the alloc.col()
test(29.2, DT, data.table(A=1:5))

if (FALSE) { # moved from tests.Rraw in #5517 and not yet back on; wasn't sure we need to still test reshape2
# test dispatch for non-data.table objects, #4864.
if (inherits(try(getNamespace("reshape2"), silent=TRUE),"try-error")) {
test(1038.001, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6),
error="The melt generic in data.table has been passed a data.frame")
} else {
# 1) GLCI rel-cran has reshape2 installed because caret in other.Rraw depends on reshape2
# 2) a user running test.data.table() with reshape2 installed (doesn't have to be loaded)
# 3) in dev locally I have reshape2 installed to run caret in other.Rraw
test(1038.002, melt(as.data.frame(DT), id.vars=1:2, measure.vars=5:6),
as.data.frame(melt(DT, id.vars=1:2, measure.vars=5:6)),
warning="The melt generic in data.table has been passed a data.frame")
}
}

Loading