From 515de90a6068911a148e54343a3503043b8bb87c Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 8 Jan 2020 15:36:35 -0800 Subject: [PATCH 01/22] removed last DATAPTR from assign.c; easy as it was only in a comment --- src/assign.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/assign.c b/src/assign.c index d339cbfe15..38935ec39c 100644 --- a/src/assign.c +++ b/src/assign.c @@ -685,8 +685,7 @@ const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source, *memrecycle_message = '\0'; int protecti=0; if (isNewList(source)) { - // A list() column; i.e. target is a column of pointers to SEXPs rather than the much more common case - // where memrecycle copies the DATAPTR data to the atomic target from the atomic source. + // A list() column; i.e. target is a column of pointers to SEXPs rather than the more common case of numbers in an atomic vector. // If any item within the list is NAMED then take a fresh copy. So far this has occurred from dogroups.c when // j returns .BY or similar specials as-is within a list(). Those specials are static inside // dogroups so if we don't copy now the last value written to them by dogroups becomes repeated in the result; From abec7f7f488b7b5b5b7b2bbf0a0d4aac29a6cc76 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Wed, 8 Jan 2020 16:39:29 -0800 Subject: [PATCH 02/22] freadR.c --- src/freadR.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/freadR.c b/src/freadR.c index 7824e366f1..843f49a893 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -510,7 +510,7 @@ void pushBuffer(ThreadLocalFreadParsingContext *ctx) // While the string columns are happening other threads before me can be copying their non-string buffers to the // final DT and other threads after me can be filling their buffers too. // rowSize is passed in because it will be different (much smaller) on the reread covering any type exception columns - // locals passed in on stack so openmp knows that no synchonization is required + // locals passed in on stack so openmp knows that no synchronization is required // the byte position of this column in the first row of the row-major buffer if (nStringCols) { @@ -561,29 +561,29 @@ void pushBuffer(ThreadLocalFreadParsingContext *ctx) resj++; if (type[j]!=CT_STRING && type[j]>0) { if (thisSize == 8) { - char *dest = (char *)DATAPTR(VECTOR_ELT(DT, resj)) + DTi*8; - char *src8 = (char*)buff8 + off8; - for (int i=0; i2638000) printf("freadR.c:460: thisSize==4, resj=%d, %"PRIu64", %d, %d, j=%d, done=%d\n", resj, (uint64_t)DTi, off4, rowSize4, j, done); - for (int i=0; i CT_BOOL8_L) STOP(_("Field size is 1 but the field is of type %d\n"), type[j]); - Rboolean *dest = (Rboolean *)((char *)DATAPTR(VECTOR_ELT(DT, resj)) + DTi*sizeof(Rboolean)); - char *src1 = (char*)buff1 + off1; - for (int i=0; i Date: Wed, 8 Jan 2020 18:34:52 -0800 Subject: [PATCH 03/22] fwriteR.c using DATAPTR_RO --- src/Makevars.in | 2 +- src/data.table.h | 1 + src/fwrite.c | 16 ++++++++-------- src/fwrite.h | 8 ++++---- src/fwriteR.c | 18 ++++++++---------- src/utils.c | 12 ++++++++++++ 6 files changed, 34 insertions(+), 23 deletions(-) diff --git a/src/Makevars.in b/src/Makevars.in index 8f20d70560..491b0afa0d 100644 --- a/src/Makevars.in +++ b/src/Makevars.in @@ -2,5 +2,5 @@ PKG_CFLAGS = @openmp_cflags@ PKG_LIBS = @openmp_cflags@ -lz all: $(SHLIB) - mv $(SHLIB) datatable$(SHLIB_EXT) + if [ "$(SHLIB)" != "datatable$(SHLIB_EXT)" ]; then mv $(SHLIB) datatable$(SHLIB_EXT); fi if [ "$(OS)" != "Windows_NT" ] && [ `uname -s` = 'Darwin' ]; then install_name_tool -id datatable$(SHLIB_EXT) datatable$(SHLIB_EXT); fi diff --git a/src/data.table.h b/src/data.table.h index 3c7176d6bc..34490c4a6f 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -230,6 +230,7 @@ bool islocked(SEXP x); SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); +//const void *DATAPTR_RO(SEXP x); // types.c char *end(char *start); diff --git a/src/fwrite.c b/src/fwrite.c index 1f09d431fa..33d84086bd 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -41,12 +41,12 @@ static int scipen; static bool squashDateTime=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) static bool verbose=false; -extern const char *getString(void *, int64_t); -extern int getStringLen(void *, int64_t); -extern int getMaxStringLen(void *, int64_t); -extern int getMaxCategLen(void *); -extern int getMaxListItemLen(void *, int64_t); -extern const char *getCategString(void *, int64_t); +extern const char *getString(const void *, int64_t); +extern int getStringLen(const void *, int64_t); +extern int getMaxStringLen(const void *, int64_t); +extern int getMaxCategLen(const void *); +extern int getMaxListItemLen(const void *, int64_t); +extern const char *getCategString(const void *, int64_t); extern double wallclock(void); inline void write_chars(const char *x, char **pch) @@ -542,12 +542,12 @@ static inline void write_string(const char *x, char **pch) *pch = ch; } -void writeString(void *col, int64_t row, char **pch) +void writeString(const void *col, int64_t row, char **pch) { write_string(getString(col, row), pch); } -void writeCategString(void *col, int64_t row, char **pch) +void writeCategString(const void *col, int64_t row, char **pch) { write_string(getCategString(col, row), pch); } diff --git a/src/fwrite.h b/src/fwrite.h index b03bb571aa..1ce1809503 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -8,7 +8,7 @@ #define DTPRINT Rprintf #endif -typedef void (*writer_fun_t)(void *, int64_t, char **); +typedef void (*writer_fun_t)(const void *, int64_t, char **); // in the order of writer_fun_t in fwriteR.c void writeBool8(); @@ -74,7 +74,7 @@ typedef struct fwriteMainArgs int ncol; int64_t nrow; // a vector of pointers to all-same-length column vectors - void **columns; + const void **columns; writer_fun_t *funs; // a vector of writer_fun_t function pointers // length ncol vector containing which fun[] to use for each column @@ -82,9 +82,9 @@ typedef struct fwriteMainArgs // A limit of 256 writers seems more than sufficient uint8_t *whichFun; - void *colNames; // NULL means no header, otherwise ncol strings + const void *colNames; // NULL means no header, otherwise ncol strings bool doRowNames; // optional, likely false - void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. + const void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. char sep; char sep2; char dec; diff --git a/src/fwriteR.c b/src/fwriteR.c index 5b95dae3ae..c121e736cf 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -76,7 +76,7 @@ void writeList(SEXP *col, int64_t row, char **pch) { } char *ch = *pch; write_chars(sep2start, &ch); - void *data = (void *)DATAPTR(v); + const void *data = DATAPTR_RO(v); writer_fun_t fun = funs[wf]; for (int j=0; j Date: Thu, 9 Jan 2020 00:45:44 -0800 Subject: [PATCH 04/22] interim: removed USE_RINTERNALS; work needed in dogroups.c --- src/assign.c | 2 +- src/cj.c | 2 +- src/data.table.h | 3 ++- src/fifelse.c | 14 +++++++------- src/subset.c | 13 ++++--------- src/transpose.c | 2 +- src/utils.c | 4 ++-- 7 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/assign.c b/src/assign.c index 38935ec39c..ebf825f491 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1062,7 +1062,7 @@ const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source, if (TYPEOF(source)!=VECSXP && TYPEOF(source)!=EXPRSXP) BODY(SEXP, &, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) else - BODY(SEXP, VECTOR_PTR, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) + BODY(SEXP, SEXPPTR_RO, SEXP, val, SET_VECTOR_ELT(target, off+i, cval)) default : error(_("Unsupported column type in assign.c:memrecycle '%s'"), type2char(TYPEOF(target))); // # nocov } diff --git a/src/cj.c b/src/cj.c index c312c43b6e..6205ca5174 100644 --- a/src/cj.c +++ b/src/cj.c @@ -74,7 +74,7 @@ SEXP cj(SEXP base_list) { } } break; case VECSXP: { - const SEXP *sourceP = VECTOR_PTR(source); + const SEXP *sourceP = SEXPPTR_RO(source); int start = 0; for (int i=0; i -#define USE_RINTERNALS #include // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); @@ -65,6 +64,8 @@ typedef R_xlen_t RLEN; #define ALTREP(x) 0 // for R<3.5.0, see issue #2866 and grep for "ALTREP" to see comments where it's used #endif +#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) + // init.c extern SEXP char_integer64; extern SEXP char_ITime; diff --git a/src/fifelse.c b/src/fifelse.c index c322e25cf0..efae03a6b0 100644 --- a/src/fifelse.c +++ b/src/fifelse.c @@ -122,9 +122,9 @@ SEXP fifelseR(SEXP l, SEXP a, SEXP b, SEXP na) { } } break; case VECSXP : { - const SEXP *restrict pa = VECTOR_PTR(a); - const SEXP *restrict pb = VECTOR_PTR(b); - const SEXP *restrict pna = VECTOR_PTR(na); + const SEXP *restrict pa = SEXPPTR_RO(a); + const SEXP *restrict pb = SEXPPTR_RO(b); + const SEXP *restrict pna = SEXPPTR_RO(na); for (int64_t i=0; i Date: Thu, 9 Jan 2020 14:42:46 -0800 Subject: [PATCH 05/22] interim: added sourceItem to memrecycle and replaced one block in dogroups.c using it as per todo comment --- src/assign.c | 29 +++++++++++++++++------------ src/data.table.h | 2 +- src/dogroups.c | 25 +++++-------------------- src/rbindlist.c | 2 +- 4 files changed, 24 insertions(+), 34 deletions(-) diff --git a/src/assign.c b/src/assign.c index ebf825f491..f05904619b 100644 --- a/src/assign.c +++ b/src/assign.c @@ -502,7 +502,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } else { // existing column targetcol = VECTOR_ELT(dt,coln); } - const char *ret = memrecycle(targetcol, rows, 0, targetlen, thisvalue, coln+1, CHAR(STRING_ELT(names, coln))); + const char *ret = memrecycle(targetcol, rows, 0, targetlen, thisvalue, -1, coln+1, CHAR(STRING_ELT(names, coln))); if (ret) warning(ret); } @@ -668,14 +668,19 @@ static bool anyNamed(SEXP x) { #define MSGSIZE 1000 static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects -const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source, int colnum, const char *colname) +const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceItem, const int colnum, const char *colname) // like memcpy but recycles single-item source // 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer() // assigns to target[start:start+len-1] or target[where[start:start+len-1]] where start is 0-based +// if sourceStart==-1 then all of source is used (if it is 1 item then it is recycled, or its length must match), +// otherwise sourceStart>=0 is the single item from source to recycle (used in dogroups to recycle the group values) { if (len<1) return NULL; - int slen = length(source); - if (slen==0) return NULL; + if (length(source)==0) return NULL; + if (sourceItem>=0 && sourceItem>=length(source)) + error(_("Internal error: sourceItem=%d length(source)=%d"), sourceItem, length(source)); // # nocov + const int soff = sourceItem>=0 ? sourceItem : 0; + const int slen = sourceItem>=0 ? 1 : length(source); if (slen>1 && slen!=len && (!isNewList(target) || isNewList(source))) error(_("Internal error: recycle length error not caught earlier. slen=%d len=%d"), slen, len); // # nocov // Internal error because the column has already been added to the DT, so length mismatch should have been caught before adding the column. @@ -718,7 +723,7 @@ const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source, if (isInteger(source)) { const int *sd = INTEGER(source); for (int i=0; inlevel) { error(_("Assigning factor numbers to column %d named '%s'. But %d is outside the level range [1,%d]"), colnum, colname, val, nlevel); } @@ -726,7 +731,7 @@ const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source, } else { const double *sd = REAL(source); for (int i=0; inlevel)) { error(_("Assigning factor numbers to column %d named '%s'. But %f is outside the level range [1,%d], or is not a whole number."), colnum, colname, val, nlevel); } @@ -849,7 +854,7 @@ const char *memrecycle(SEXP target, SEXP where, int start, int len, SEXP source, #define CHECK_RANGE(STYPE, RFUN, COND, FMT, TO) {{ \ const STYPE *sd = (const STYPE *)RFUN(source); \ for (int i=0; i0 && slen==len && soff==0; // mc=memcpy; only if types match and not for single items (a single assign faster than these non-const memcpy calls) const int *wd = length(where) ? INTEGER(where)+start : NULL; switch (TYPEOF(target)) { case RAWSXP: { diff --git a/src/data.table.h b/src/data.table.h index b3f0f1932f..3f754f91c2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -156,7 +156,7 @@ SEXP dt_na(SEXP x, SEXP cols); // assign.c SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); -const char *memrecycle(SEXP target, SEXP where, int r, int len, SEXP source, int coln, const char *colname); +const char *memrecycle(const SEXP target, const SEXP where, const int r, const int len, SEXP source, const int sourceItem, const int coln, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, diff --git a/src/dogroups.c b/src/dogroups.c index 0cf2d8bca8..f28f4ec13e 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -314,7 +314,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX SET_STRING_ELT(dtnames, colj, STRING_ELT(newnames, colj-origncol)); copyMostAttrib(RHS, target); // attributes of first group dominate; e.g. initial factor levels come from first group } - const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, 0, ""); + const char *warn = memrecycle(target, order, INTEGER(starts)[i]-1, grpn, RHS, -1, 0, ""); // can't error here because length mismatch already checked for all jval columns before starting to add any new columns if (warn) warning(_("Group %d column '%s': %s"), i+1, CHAR(STRING_ELT(dtnames, colj)), warn); @@ -388,26 +388,11 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX for (int j=0; j1 && thislen!=maxn && grpn>0) { // grpn>0 for grouping empty tables; test 1986 error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn); } - memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, ""); + memrecycle(target, R_NilValue, thisansloc, maxn, source, -1, 0, ""); } } ansloc += maxn; diff --git a/src/rbindlist.c b/src/rbindlist.c index 614d3bf0fc..2eca74ab3d 100644 --- a/src/rbindlist.c +++ b/src/rbindlist.c @@ -519,7 +519,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg) thisCol = PROTECT(coerceVector(thisCol, TYPEOF(target))); nprotect++; } // else coerces if needed within memrecycle; with a no-alloc direct coerce from 1.12.4 (PR #3909) - const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, idcol+j+1, foundName); + const char *ret = memrecycle(target, R_NilValue, ansloc, thisnrow, thisCol, -1, idcol+j+1, foundName); if (ret) warning(_("Column %d of item %d: %s"), w+1, i+1, ret); // e.g. when precision is lost like assigning 3.4 to integer64; test 2007.2 // TODO: but maxType should handle that and this should never warn From c7264336633868d58072f8a076548d4506dea894 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 9 Jan 2020 15:07:52 -0800 Subject: [PATCH 06/22] LOGICAL() should have been INTEGER() but was working ok since LOGICAL is int currently; found by removing USE_RINTERNALS which invokes extra checks --- src/gsumm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gsumm.c b/src/gsumm.c index 210b6d7bf5..ef63519a3c 100644 --- a/src/gsumm.c +++ b/src/gsumm.c @@ -1189,9 +1189,9 @@ SEXP gnthvalue(SEXP x, SEXP valArg) { } break; case INTSXP: { - const int *ix = LOGICAL(x); + const int *ix = INTEGER(x); ans = PROTECT(allocVector(INTSXP, ngrp)); - int *ians = LOGICAL(ans); + int *ians = INTEGER(ans); for (i=0; i grpsize[i]) { INTEGER(ans)[i] = NA_INTEGER; continue; } k = ff[i]+val-2; From f812963747b018379d9092c71930e36c24652333 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Thu, 9 Jan 2020 16:10:51 -0800 Subject: [PATCH 07/22] localized dogroups.c:rownum to help compiler, and reinstated USE_RINTERNALS for now to pass --- src/data.table.h | 1 + src/dogroups.c | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 3f754f91c2..8fcfe80a61 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,5 +1,6 @@ #include "dt_stdio.h" // PRId64 and PRIu64 #include +#define USE_RINTERNALS #include // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); diff --git a/src/dogroups.c b/src/dogroups.c index f28f4ec13e..b3f18ddcb7 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -5,7 +5,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verbose) { - R_len_t rownum, ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp, origIlen=0, origSDnrow=0; + R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp, origIlen=0, origSDnrow=0; int nprotect=0; SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; @@ -189,7 +189,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX SETLENGTH(I, grpn); int *iI = INTEGER(I); if (LENGTH(order)==0) { - if (grpn) rownum = istarts[i]-1; else rownum = -1; // not ternary to pass strict-barrier + const int rownum = grpn ? istarts[i]-1 : -1; for (int j=0; j=0) { for (int j=0; j Date: Sat, 11 Jan 2020 17:15:50 -0800 Subject: [PATCH 08/22] memrecyle sourceItem => sourceStart & sourceLen and used it in dogroups to replace DATAPTR --- src/assign.c | 22 +++--- src/data.table.h | 3 +- src/dogroups.c | 181 ++++++++++++++++++----------------------------- src/rbindlist.c | 2 +- 4 files changed, 83 insertions(+), 125 deletions(-) diff --git a/src/assign.c b/src/assign.c index f05904619b..7fb5effa83 100644 --- a/src/assign.c +++ b/src/assign.c @@ -502,7 +502,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } else { // existing column targetcol = VECTOR_ELT(dt,coln); } - const char *ret = memrecycle(targetcol, rows, 0, targetlen, thisvalue, -1, coln+1, CHAR(STRING_ELT(names, coln))); + const char *ret = memrecycle(targetcol, rows, 0, targetlen, thisvalue, 0, -1, coln+1, CHAR(STRING_ELT(names, coln))); if (ret) warning(ret); } @@ -668,19 +668,22 @@ static bool anyNamed(SEXP x) { #define MSGSIZE 1000 static char memrecycle_message[MSGSIZE+1]; // returned to rbindlist so it can prefix with which one of the list of data.table-like objects -const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceItem, const int colnum, const char *colname) +const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname) // like memcpy but recycles single-item source // 'where' a 1-based INTEGER vector subset of target to assign to, or NULL or integer() // assigns to target[start:start+len-1] or target[where[start:start+len-1]] where start is 0-based -// if sourceStart==-1 then all of source is used (if it is 1 item then it is recycled, or its length must match), -// otherwise sourceStart>=0 is the single item from source to recycle (used in dogroups to recycle the group values) +// if sourceLen==-1 then all of source is used (if it is 1 item then it is recycled, or its length must match) for convenience to avoid +// having to use length(source) (repeating source expression) in each call +// sourceLen==1 is used in dogroups to recycle the group values into ans to match the nrow of each group's result; sourceStart is set to each group value row. { if (len<1) return NULL; - if (length(source)==0) return NULL; - if (sourceItem>=0 && sourceItem>=length(source)) - error(_("Internal error: sourceItem=%d length(source)=%d"), sourceItem, length(source)); // # nocov - const int soff = sourceItem>=0 ? sourceItem : 0; - const int slen = sourceItem>=0 ? 1 : length(source); + const int slen = sourceLen>=0 ? sourceLen : length(source); + if (slen==0) return NULL; + if (sourceStart<0 || sourceStart+slen>length(source)) + error(_("Internal error memrecycle: sourceStart=%d sourceLen=%d length(source)=%d"), sourceStart, sourceLen, length(source)); // # nocov + if (!length(where) && start+len>length(target)) + error(_("Internal error memrecycle: start=%d len=%d length(target)=%d"), start, len, length(target)); // # nocov + const int soff = sourceStart; if (slen>1 && slen!=len && (!isNewList(target) || isNewList(source))) error(_("Internal error: recycle length error not caught earlier. slen=%d len=%d"), slen, len); // # nocov // Internal error because the column has already been added to the DT, so length mismatch should have been caught before adding the column. @@ -701,6 +704,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con // If source is already not named (because j already created a fresh unnamed vector within a list()) we don't want to // duplicate unnecessarily, hence checking for named rather than duplicating always. // See #481, #1270 and tests 1341.* fail without this copy. + // ********** This might go away now that we copy properly in dogroups.c ********** if (anyNamed(source)) { source = PROTECT(copyAsPlain(source)); protecti++; } diff --git a/src/data.table.h b/src/data.table.h index 8fcfe80a61..eaa76cd851 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -66,6 +66,7 @@ typedef R_xlen_t RLEN; #endif #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) +// for convenient [] since DATAPTR_RO is untyped (const void *) and we wish to avoid overhead of looped STRING_ELT and VECTOR_ELT // init.c extern SEXP char_integer64; @@ -157,7 +158,7 @@ SEXP dt_na(SEXP x, SEXP cols); // assign.c SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); -const char *memrecycle(const SEXP target, const SEXP where, const int r, const int len, SEXP source, const int sourceItem, const int coln, const char *colname); +const char *memrecycle(const SEXP target, const SEXP where, const int r, const int len, SEXP source, const int sourceStart, const int sourceLen, const int coln, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, diff --git a/src/dogroups.c b/src/dogroups.c index b3f18ddcb7..1ed5a37085 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -113,76 +113,44 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX INTEGER(N)[0] = istarts[i] == NA_INTEGER ? 0 : grpn; // .N is number of rows matched to ( 0 even when nomatch is NA) INTEGER(GRP)[0] = i+1; // group counter exposed as .GRP + INTEGER(rownames)[1] = -grpn; // the .set_row_names() of .SD. Not .N when nomatch=NA and this is a nomatch + for (int j=0; j=0 && nrowgroups) for (int j=0; j=0) { - for (int j=0; j1 && thislen!=maxn && grpn>0) { // grpn>0 for grouping empty tables; test 1986 error(_("Supplied %d items for column %d of group %d which has %d rows. The RHS length must either be 1 (single values are ok) or match the LHS length exactly. If you wish to 'recycle' the RHS please use rep() explicitly to make this intent clear to readers of your code."), thislen, j+1, i+1, maxn); } - memrecycle(target, R_NilValue, thisansloc, maxn, source, -1, 0, ""); + memrecycle(target, R_NilValue, thisansloc, maxn, source, 0, -1, 0, ""); } } ansloc += maxn; @@ -455,30 +393,45 @@ SEXP keepattr(SEXP to, SEXP from) return to; } -SEXP growVector(SEXP x, R_len_t newlen) +SEXP growVector(SEXP x, const R_len_t newlen) { // Similar to EnlargeVector in src/main/subassign.c, with the following changes : // * replaced switch and loops with one memcpy for INTEGER and REAL, but need to age CHAR and VEC. // * no need to cater for names // * much shorter and faster SEXP newx; - R_len_t i, len = length(x); + R_len_t len = length(x); if (isNull(x)) error(_("growVector passed NULL")); PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here? if (newlen < len) len=newlen; // i.e. shrink switch (TYPEOF(x)) { - case STRSXP : - for (i=0; i Date: Sat, 11 Jan 2020 17:20:50 -0800 Subject: [PATCH 09/22] vertical code align to spot mistakes and help the eye --- src/dogroups.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index 1ed5a37085..46b7865c38 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -405,21 +405,11 @@ SEXP growVector(SEXP x, const R_len_t newlen) PROTECT(newx = allocVector(TYPEOF(x), newlen)); // TO DO: R_realloc(?) here? if (newlen < len) len=newlen; // i.e. shrink switch (TYPEOF(x)) { - case RAWSXP: - memcpy(RAW(newx), RAW(x), len*SIZEOF(x)); - break; - case LGLSXP: - memcpy(LOGICAL(newx), LOGICAL(x), len*SIZEOF(x)); - break; - case INTSXP: - memcpy(INTEGER(newx), INTEGER(x), len*SIZEOF(x)); - break; - case REALSXP: - memcpy(REAL(newx), REAL(x), len*SIZEOF(x)); - break; - case CPLXSXP: - memcpy(COMPLEX(newx), COMPLEX(x), len*SIZEOF(x)); - break; + case RAWSXP: memcpy(RAW(newx), RAW(x), len*SIZEOF(x)); break; + case LGLSXP: memcpy(LOGICAL(newx), LOGICAL(x), len*SIZEOF(x)); break; + case INTSXP: memcpy(INTEGER(newx), INTEGER(x), len*SIZEOF(x)); break; + case REALSXP: memcpy(REAL(newx), REAL(x), len*SIZEOF(x)); break; + case CPLXSXP: memcpy(COMPLEX(newx), COMPLEX(x), len*SIZEOF(x)); break; case STRSXP : { const SEXP *xd = SEXPPTR_RO(x); for (int i=0; i Date: Sat, 11 Jan 2020 17:54:32 -0800 Subject: [PATCH 10/22] subsetVectorRaw used in dogroups to replace final DATAPTR and simplify it, also parallel for non-SEXP types --- src/data.table.h | 1 + src/dogroups.c | 23 +++++------------------ src/subset.c | 4 ++-- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index eaa76cd851..98ad723738 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -129,6 +129,7 @@ int getNumericRounding_C(); SEXP reorder(SEXP x, SEXP order); // subset.c +void subsetVectorRaw(SEXP ans, SEXP source, SEXP idx, const bool anyNA); SEXP subsetVector(SEXP x, SEXP idx); // fcast.c diff --git a/src/dogroups.c b/src/dogroups.c index 46b7865c38..bed4a65a39 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -138,13 +138,14 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX if (istarts[i] == NA_INTEGER || (LENGTH(order) && iorder[ istarts[i]-1 ]==NA_INTEGER)) { for (int j=0; j Date: Sat, 11 Jan 2020 18:05:25 -0800 Subject: [PATCH 11/22] const bool verbose --- src/dogroups.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/dogroups.c b/src/dogroups.c index bed4a65a39..35e112efc4 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,13 +3,14 @@ #include #include -SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verbose) +SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp, origIlen=0, origSDnrow=0; int nprotect=0; SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; clock_t tstart=0, tblock[10]={0}; int nblock[10]={0}; + const bool verbose = LOGICAL(verboseArg)[0]==1; if (!isInteger(order)) error(_("Internal error: order not integer vector")); // # nocov if (TYPEOF(starts) != INTSXP) error(_("Internal error: starts not integer")); // # nocov @@ -154,7 +155,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX writeNA(VECTOR_ELT(xSD, j), 0, 1); } } else { - if (LOGICAL(verbose)[0]) tstart = clock(); + if (verbose) tstart = clock(); SETLENGTH(I, grpn); int *iI = INTEGER(I); if (LENGTH(order)==0) { @@ -166,21 +167,21 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX for (int j=0; j0; Rprintf(_("\n %s took %.3fs for %d groups\n"), w ? "collecting discontiguous groups" : "memcpy contiguous groups", From 456fbae35afb72f314f8a6deb8fe513084b36eff Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sat, 11 Jan 2020 18:46:00 -0800 Subject: [PATCH 12/22] coverage --- inst/tests/tests.Rraw | 12 +++++++++--- src/dogroups.c | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 03152f270f..84f49b2c23 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -483,10 +483,16 @@ test(171.2, DT[B=="b"][A==85,C], numeric()) test(171.3, DT[ , data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ], by=B ], data.table(B=c("a","c"),V1=c(67,905))) test(172, DT[ , list(3,data.table( A, C )[ A==25, C ] + data.table( A, C )[ A==85, C ]), by=B ], data.table(B=c("a","b","c"),V1=3,V2=c(67,NA,905))) -# Test growing result in memory. Usually the guess is good though. -# This example returns no rows for first group so guess for up-front allocate needs a reallocate +# Test growing result in memory (growVector). Usually the guess is good though. DT = data.table(A=c(1L,1L,2L,2L,3L,3L), B=1:6) -test(173, DT[,B[B>3],by=A][,V1], c(4L,5L,6L)) +# no rows for first group so guess for up-front allocate needs a reallocate ... +test(173.1, DT[,B[B>3],by=A][,V1], c(4L,5L,6L)) +# cover raw type with the 3rd group result being unexpectedly larger ... +test(173.2, DT[, if(.GRP==3) as.raw(3:5) else as.raw(.GRP), by=A], + data.table(A=INT(1,2,3,3,3), V1=as.raw(1:5))) +# 2nd group returns more than the number of the rows in that group, and cover list type too ... +test(173.3, DT[, .(if(.GRP==2) list(3:5, "b", 0:1, mean) else list("a",1:2)), by=A], + data.table(A=INT(1,1,2,2,2,2,3,3), V1=list("a",1:2,3:5,"b",0:1,mean,"a",1:2))) # Example taken from Harish post to datatable-help on 11 July DT <- data.table( diff --git a/src/dogroups.c b/src/dogroups.c index 35e112efc4..956e822a76 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -409,7 +409,7 @@ SEXP growVector(SEXP x, const R_len_t newlen) SET_VECTOR_ELT(newx, i, xd[i]); } break; default : - error(_("Internal error: growVector doesn't support type '%s'"), type2char(TYPEOF(x))); + error(_("Internal error: growVector doesn't support type '%s'"), type2char(TYPEOF(x))); // # nocov } // if (verbose) Rprintf(_("Growing vector from %d to %d items of type '%s'\n"), len, newlen, type2char(TYPEOF(x))); // Would print for every column if here. Now just up in dogroups (one msg for each table grow). From 9c43d6bb5a22b72621eb2aff21ee391958f32a65 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 12 Jan 2020 15:02:48 -0800 Subject: [PATCH 13/22] reorder.c --- src/reorder.c | 71 ++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 38 deletions(-) diff --git a/src/reorder.c b/src/reorder.c index 7b5093dd55..3799f544d3 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -3,9 +3,9 @@ SEXP reorder(SEXP x, SEXP order) { // For internal use only by setkey(). - // 'order' must strictly be a permutation of 1:n (i.e. no repeats, zeros or NAs) - // If only a small subset in the middle is reordered the ends are moved in: [start,end]. - // x may be a vector, or a list of same-length vectors such as data.table + // 'order' must be a strict permutation of 1:n; i.e. no repeats, zeros, NAs. Also known as a shuffle. + // If only a small subset in the middle is reordered, the ends are moved in to avoid wasteful work. + // x may be a vector, or a list of same-length vectors (typically a data.table). R_len_t nrow, ncol; size_t maxSize = 0; if (isNewList(x)) { @@ -37,66 +37,61 @@ SEXP reorder(SEXP x, SEXP order) const int *restrict idx = INTEGER(order); int i=0; - while (iend) error(_("order is not a permutation of 1:nrow[%d]"), nrow); + // This file is for internal use (so we should get the input right), but this check seems sensible to still perform since it should + // run in negligible time (sequential with prefetch). It is not sufficient though; e.g. a non-permutation such as a duplicate will not be caught. + // But checking for dups would incur too much cost given that i) this is for internal use only and we only use this function internally + // when we're sure we're passing in a strict permutation, and ii) this operation is fundamental to setkey and data.table. } - // Creorder is for internal use (so we should get the input right!), but the check above seems sensible. The for loop above should run - // in neglible time (sequential with prefetch). It will catch NAs anywhere but won't catch duplicates. But doing so would be going too - // far given this is for internal use only and we only use this function internally when we're sure it's a permutation. - char *TMP = malloc(nrow * maxSize); - // enough RAM for a copy of one column (of largest type). Writes into the [start,end] subset. Outside [start,end] is wasted in that rarer case - // to save a "-start" in the deep loop below in all cases. + char *TMP = malloc((end-start+1)*maxSize); if (!TMP) error(_("Unable to allocate %d * %d bytes of working memory for reordering data.table"), end-start+1, maxSize); - for (int i=0; i Date: Sun, 12 Jan 2020 17:20:39 -0800 Subject: [PATCH 14/22] pass repeated cc() in dev; unrelated to this PR --- inst/tests/tests.Rraw | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 84f49b2c23..63b5adcc29 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8513,19 +8513,19 @@ test(1613.602, all.equal(data.table(a=1), data.frame(a=1), check.attributes = FA test(1613.603, all.equal(data.table(a=1), list(a=1), check.attributes = FALSE)) test(1613.604, all.equal(data.table(a=1), 1, check.attributes = FALSE)) test(1613.605, all.equal(data.table(a=1), try(stop('this wont work'), silent = TRUE), check.attributes = FALSE), "target is data.table but current is not and failed to be coerced to it") -L1 <- list(a = data.table(1), b = setattr("foo", "tbl", data.table(1))) -L2 <- list(a = 1, b = setattr("foo", "tbl", 1)) +L1 = list(a = data.table(1), b = setattr("foo1613", "tbl", data.table(1))) +L2 = list(a = 1, b = setattr("foo1613", "tbl", 1)) test(1613.606, all(grepl("target is data.table, current is numeric", all.equal(L1, L2)))) -as.data.table.foo = function(x) { # test as.data.table coerce of 'current' argument - if (!length(x)) warning("empty foo") - as.data.table(unclass(foo)) +as.data.table.foo1613 = function(x) { # test as.data.table coerce of 'current' argument + if (!length(x)) warning("empty foo1613") + as.data.table(unclass(foo1613)) } -registerS3method("as.data.table", "foo", as.data.table.foo) -foo = structure(list(NULL), class="foo") -test(1613.607, all.equal(data.table(), foo, check.attributes=FALSE)) -foo = structure(list(), class="foo") -test(1613.608, all.equal(data.table(), foo, check.attributes=FALSE), warning="empty") -rm(as.data.table.foo, foo) +registerS3method("as.data.table", "foo1613", as.data.table.foo1613) +foo1613 = structure(list(NULL), class="foo1613") +test(1613.607, all.equal(data.table(), foo1613, check.attributes=FALSE)) +foo1613 = structure(list(), class="foo1613") +test(1613.608, all.equal(data.table(), foo1613, check.attributes=FALSE), warning="empty") +rm(as.data.table.foo1613, foo1613) DT1 <- data.table(a = 1:4, b = letters[1:4], .seqn = 5L) DT2 <- data.table(a = 4:1, b = letters[4:1], .seqn = 5L) @@ -16762,13 +16762,13 @@ test(2131, lapply(x[ , list(dt = list(.SD)), by = a]$dt, attr, '.data.table.lock list(NULL, NULL, NULL)) # S4 object not suported in fifelse and fcase, #4135 -setClass("rdate", slots=list(date="numeric")) -setMethod("show","rdate", function(object) print(object@date)) -s1 = new("rdate",date=20191231) -s2 = new("rdate",date=20191230) +class2132 = setClass("class2132", slots=list(x="numeric")) +s1 = class2132(x=20191231) +s2 = class2132(x=20191230) test(2132.1, fifelse(TRUE, s1, s2), error = "S4 class objects (except nanotime) are not supported.") test(2132.2, fifelse(TRUE, 1, s2), error = "S4 class objects (except nanotime) are not supported.") test(2132.3, fcase(TRUE, s1, FALSE, s2), error = "S4 class objects (except nanotime) are not supported. Please see https://github.com/Rdatatable/data.table/issues/4131.") +rm(s1, s2, class2132) ######################## From c7b01748d9f386d727b16f77a344b6ca97149c51 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Sun, 12 Jan 2020 20:19:23 -0800 Subject: [PATCH 15/22] malloc=>R_alloc, thanks @jangorecki --- src/reorder.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/reorder.c b/src/reorder.c index 3799f544d3..0073e5857f 100644 --- a/src/reorder.c +++ b/src/reorder.c @@ -52,8 +52,7 @@ SEXP reorder(SEXP x, SEXP order) // when we're sure we're passing in a strict permutation, and ii) this operation is fundamental to setkey and data.table. } - char *TMP = malloc((end-start+1)*maxSize); - if (!TMP) error(_("Unable to allocate %d * %d bytes of working memory for reordering data.table"), end-start+1, maxSize); + char *TMP = (char *)R_alloc(end-start+1, maxSize); for (int i=0; i Date: Sun, 12 Jan 2020 22:33:30 -0800 Subject: [PATCH 16/22] VECTOR_PTR & STRING_PTR removed from setcolorder by moving it to reorder.c and reusing reorder; also added strict-permutaton check to that central place --- R/data.table.R | 2 +- inst/tests/tests.Rraw | 2 +- src/assign.c | 30 ------------------------------ src/data.table.h | 2 +- src/reorder.c | 41 ++++++++++++++++++++++++++++++----------- 5 files changed, 33 insertions(+), 44 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 9d1c8adc48..af4f14de0b 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2536,7 +2536,7 @@ setcolorder = function(x, neworder=key(x)) if (is.character(neworder) && anyDuplicated(names(x))) stop("x has some duplicated column name(s): ", paste(names(x)[duplicated(names(x))], collapse=","), ". Please remove or rename the duplicate(s) and try again.") # if (!is.data.table(x)) stop("x is not a data.table") - neworder = colnamesInt(x, neworder, check_dups=TRUE) + neworder = colnamesInt(x, neworder, check_dups=FALSE) # dups are now checked inside Csetcolorder below if (length(neworder) != length(x)) { #if shorter than length(x), pad by the missing # elements (checks below will catch other mistakes) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 63b5adcc29..f6d17a0762 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -13704,7 +13704,7 @@ test(1967.61, setnames(x, 1+3i, 'cplx'), error = "'old' is type complex") test(1967.62, setnames(x, 1, c('d', 'e')), error = "'old' is length 1 but 'new'") test(1967.621, setnames(x, 1:2, c("a","a")), data.table(a=1:5, a=6:10)) test(1967.622, setnames(x, 1:2, c("a",NA)), error = "NA in 'new' at positions [2]") -test(1967.63, setcolorder(x, c(1, 1)), error = 'specify duplicated column') +test(1967.63, setcolorder(x, c(1, 1)), error = 'Item 2 of order (1) is either NA, out of range [1,2], or is duplicated. The new order must be a strict permutation of 1:n') test(1967.64, setcolorder(x, 1+3i), error = 'must be character or numeric') test(1967.65, setcolorder(x, 300), error = 'specify non existing column*.*300') diff --git a/src/assign.c b/src/assign.c index 7fb5effa83..0d5c67369e 100644 --- a/src/assign.c +++ b/src/assign.c @@ -1216,33 +1216,3 @@ SEXP setcharvec(SEXP x, SEXP which, SEXP newx) return R_NilValue; } -SEXP setcolorder(SEXP x, SEXP o) -{ - SEXP names = getAttrib(x, R_NamesSymbol); - const int *od = INTEGER(o), ncol=LENGTH(x); - if (isNull(names)) error(_("dt passed to setcolorder has no names")); - if (ncol != LENGTH(names)) - error(_("Internal error: dt passed to setcolorder has %d columns but %d names"), ncol, LENGTH(names)); // # nocov - - // Double-check here at C level that o[] is a strict permutation of 1:ncol. Reordering columns by reference makes no - // difference to generations/refcnt so we can write behind barrier in this very special case of strict permutation. - bool *seen = Calloc(ncol, bool); - for (int i=0; incol) - error(_("Internal error: o passed to Csetcolorder contains an NA or out-of-bounds")); // # nocov - if (seen[od[i]-1]) - error(_("Internal error: o passed to Csetcolorder contains a duplicate")); // # nocov - seen[od[i]-1] = true; - } - Free(seen); - - SEXP *tmp = Calloc(ncol, SEXP); - SEXP *xd = VECTOR_PTR(x), *namesd = STRING_PTR(names); - for (int i=0; iend) error(_("order is not a permutation of 1:nrow[%d]"), nrow); - // This file is for internal use (so we should get the input right), but this check seems sensible to still perform since it should - // run in negligible time (sequential with prefetch). It is not sufficient though; e.g. a non-permutation such as a duplicate will not be caught. - // But checking for dups would incur too much cost given that i) this is for internal use only and we only use this function internally - // when we're sure we're passing in a strict permutation, and ii) this operation is fundamental to setkey and data.table. + if (idx[i]==NA_INTEGER || idx[i]-1end || seen[idx[i]-1-start]++) + error(_("Item %d of order (%d) is either NA, out of range [1,%d], or is duplicated. The new order must be a strict permutation of 1:n"), + i+1, idx[i], length(order)); + // This should run in reasonable time because although 'seen' is random write, it is writing to just 1 byte * nrow + // which is relatively small and has a good chance of fitting in cache. + // A worry mitigated by this check is a user passing their own incorrect ordering using ::: to reach this internal. + // This check is once up front, and then idx is applied to all the columns which is where the most time is spent. } - char *TMP = (char *)R_alloc(end-start+1, maxSize); + char *TMP = (char *)R_alloc(nmid, maxSize); for (int i=0; i Date: Mon, 13 Jan 2020 16:50:52 -0800 Subject: [PATCH 17/22] REAL() not on INTSXP (even when doesn't matter) in frank.c to pass stronger checks under non-USE_RINTERNALS --- src/data.table.h | 2 +- src/frank.c | 46 ++++++++++++++++++++++------------------------ 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 33a38810e2..59a550179e 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,6 +1,6 @@ #include "dt_stdio.h" // PRId64 and PRIu64 #include -#define USE_RINTERNALS +//#define USE_RINTERNALS #include // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); diff --git a/src/frank.c b/src/frank.c index 387838ee5f..2557d40b3c 100644 --- a/src/frank.c +++ b/src/frank.c @@ -3,8 +3,6 @@ // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); -extern SEXP char_integer64; - SEXP dt_na(SEXP x, SEXP cols) { int n=0, elem; @@ -70,8 +68,7 @@ SEXP dt_na(SEXP x, SEXP cols) { } SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) { - int i=0, j=0, k=0, n; - int *xstart = INTEGER(xstartArg), *xlen = INTEGER(xlenArg), *xorder = INTEGER(xorderArg); + const int *xstart = INTEGER(xstartArg), *xlen = INTEGER(xlenArg), *xorder = INTEGER(xorderArg); enum {MEAN, MAX, MIN, DENSE, SEQUENCE} ties = MEAN; // RUNLENGTH if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "average")) ties = MEAN; @@ -81,42 +78,43 @@ SEXP frank(SEXP xorderArg, SEXP xstartArg, SEXP xlenArg, SEXP ties_method) { else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "sequence")) ties = SEQUENCE; // else if (!strcmp(CHAR(STRING_ELT(ties_method, 0)), "runlength")) ties = RUNLENGTH; else error(_("Internal error: invalid ties.method for frankv(), should have been caught before. please report to data.table issue tracker")); // # nocov - n = length(xorderArg); - SEXP ans = (ties == MEAN) ? PROTECT(allocVector(REALSXP, n)) : PROTECT(allocVector(INTSXP, n)); - int *ians = INTEGER(ans); - double *dans = REAL(ans); - if (n > 0) { + const int n = length(xorderArg); + SEXP ans = PROTECT(allocVector(ties==MEAN ? REALSXP : INTSXP, n)); + int *ians=NULL; + double *dans=NULL; + if (ties==MEAN) dans=REAL(ans); else ians=INTEGER(ans); + if (n>0) { switch (ties) { case MEAN : - for (i = 0; i < length(xstartArg); i++) { - for (j = xstart[i]-1; j < xstart[i]+xlen[i]-1; j++) + for (int i=0; i Date: Mon, 13 Jan 2020 17:17:31 -0800 Subject: [PATCH 18/22] INTEGER used on REAL fill_d found by removing USE_RINTERNALS --- src/fcast.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/fcast.c b/src/fcast.c index ff8c98fc7b..b8d6fa4c2a 100644 --- a/src/fcast.c +++ b/src/fcast.c @@ -8,7 +8,6 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil int nrows=INTEGER(nrowArg)[0], ncols=INTEGER(ncolArg)[0]; int nlhs=length(lhs), nval=length(val), *idx = INTEGER(idxArg); SEXP target; - bool isfill = true; SEXP ans = PROTECT(allocVector(VECSXP, nlhs + (nval * ncols))); // set lhs cols @@ -21,13 +20,12 @@ SEXP fcast(SEXP lhs, SEXP val, SEXP nrowArg, SEXP ncolArg, SEXP idxArg, SEXP fil SEXP thisfill = fill; int nprotect = 0; if (isNull(fill)) { - isfill = false; if (LOGICAL(is_agg)[0]) { thisfill = PROTECT(allocNAVector(TYPEOF(thiscol), 1)); nprotect++; } else thisfill = VECTOR_ELT(fill_d, i); } - if (isfill && TYPEOF(fill) != TYPEOF(thiscol)) { - thisfill = PROTECT(coerceVector(fill, TYPEOF(thiscol))); nprotect++; + if (TYPEOF(thisfill) != TYPEOF(thiscol)) { + thisfill = PROTECT(coerceVector(thisfill, TYPEOF(thiscol))); nprotect++; } switch (TYPEOF(thiscol)) { case INTSXP: From f83f82e15067de02a15e7006f2c3c06510662bce Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 13 Jan 2020 17:34:45 -0800 Subject: [PATCH 19/22] use_rinternals back on for now just to pass --- src/data.table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data.table.h b/src/data.table.h index 59a550179e..33a38810e2 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,6 +1,6 @@ #include "dt_stdio.h" // PRId64 and PRIu64 #include -//#define USE_RINTERNALS +#define USE_RINTERNALS #include // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); From 20bd1d1cb02cb1744e138c64660aaa2a2bde9488 Mon Sep 17 00:00:00 2001 From: mattdowle Date: Mon, 13 Jan 2020 19:12:26 -0800 Subject: [PATCH 20/22] Removed USE_RINTERNALS, passing main tests, not nafill tests yet. Not #ifndef(ALTREP) because ALTREP is now a function when USE_RINTERNALS is off and not defined at this point --- src/data.table.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 33a38810e2..9181992c7a 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,6 +1,5 @@ #include "dt_stdio.h" // PRId64 and PRIu64 #include -#define USE_RINTERNALS #include // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); @@ -61,8 +60,9 @@ typedef R_xlen_t RLEN; #define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) #define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8)) -#ifndef ALTREP -#define ALTREP(x) 0 // for R<3.5.0, see issue #2866 and grep for "ALTREP" to see comments where it's used +#include +#if !defined(R_VERSION) || R_VERSION Date: Mon, 13 Jan 2020 23:08:03 -0800 Subject: [PATCH 21/22] nafill.c tweaked to pass non-USE_RINTERNALS: REAL can't be used on INTSXP even when result not used --- src/nafill.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/nafill.c b/src/nafill.c index a3b9789b8f..eb4e5c0e20 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -121,16 +121,26 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S SEXP ans = R_NilValue; ans_t *vans = (ans_t *)R_alloc(nx, sizeof(ans_t)); for (R_len_t i=0; i Date: Tue, 14 Jan 2020 00:27:04 -0800 Subject: [PATCH 22/22] pass R 3.1.0 --- src/data.table.h | 18 +++++++++--------- src/utils.c | 11 ----------- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/src/data.table.h b/src/data.table.h index 9181992c7a..766ed4fd78 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -1,6 +1,15 @@ #include "dt_stdio.h" // PRId64 and PRIu64 #include + +#include +#if !defined(R_VERSION) || R_VERSION < R_Version(3, 5, 0) // R-exts$6.14 +#define ALTREP(x) 0 // #2866 +#define USE_RINTERNALS // #4164 +#define DATAPTR_RO(x) ((const void *)DATAPTR(x)) +#endif #include +#define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT + // #include // the debugging machinery + breakpoint aidee // raise(SIGINT); #include // for uint64_t rather than unsigned long long @@ -60,14 +69,6 @@ typedef R_xlen_t RLEN; #define NEED2UTF8(s) !(IS_ASCII(s) || (s)==NA_STRING || IS_UTF8(s)) #define ENC2UTF8(s) (!NEED2UTF8(s) ? (s) : mkCharCE(translateCharUTF8(s), CE_UTF8)) -#include -#if !defined(R_VERSION) || R_VERSION