From f8ea51aff3d31e5ed2f9341d0141afe25f0b1d4a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 13 Sep 2017 11:07:55 -0700 Subject: [PATCH 01/16] Initial agno fwrite --- src/fwrite.c | 84 +++++----- src/fwriteR.c | 419 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 460 insertions(+), 43 deletions(-) create mode 100644 src/fwriteR.c diff --git a/src/fwrite.c b/src/fwrite.c index c72ef1a5ff..f804123877 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -1,4 +1,3 @@ -#include "data.table.h" #include "fwriteLookups.h" #include #include // for access() @@ -19,16 +18,16 @@ #define SIZE_SF 1000000000000000ULL // 10^NUM_SF // Globals for this file only. Written once to hold parameters passed from R level. -static const char *na; // by default "" or if set then usually "NA" +static const char *na; // by default "" or if set (not recommended) then usually "NA" static char sep; // comma in .csv files static char sep2; // '|' within list columns static const char *sep2start, *sep2end; static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 -static Rboolean verbose=FALSE; // be chatty? -static Rboolean quote=FALSE; // whether to surround fields with double quote ". NA means 'auto' (default) -static Rboolean qmethod_escape=TRUE; // when quoting fields, how to manage double quote in the field contents -static Rboolean logicalAsInt=FALSE; // logical as 0/1 or "TRUE"/"FALSE" -static Rboolean squash=FALSE; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) +static _Bool verbose=FALSE; // be chatty? +static _Bool quote=FALSE; // whether to surround fields with double quote ". 
NA means 'auto' (default) +static _Bool qmethod_escape=TRUE; // when quoting fields, how to manage double quote in the field contents +static _Bool logicalAsInt=FALSE; // logical as 0/1 or "TRUE"/"FALSE" +static _Bool squash=FALSE; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd), 2=epoch, 3=write.csv #define DATETIMEAS_EPOCH 2 #define DATETIMEAS_WRITECSV 3 @@ -96,7 +95,7 @@ SEXP genLookups() { return R_NilValue; } /* - FILE *f = fopen("/tmp/fwriteLookups.h", "w"); + FILE *f = fopen("/tmp/fwriteLookups.h", "w"); fprintf(f, "//\n\ // Generated by fwrite.c:genLookups()\n\ //\n\ @@ -158,7 +157,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) unsigned long long fraction = u.ull & 0xFFFFFFFFFFFFF; // (1ULL<<52)-1; int exponent = (int)((u.ull>>52) & 0x7FF); // [0,2047] - // Now sum the appropriate powers 2^-(1:52) of the fraction + // Now sum the appropriate powers 2^-(1:52) of the fraction // Important for accuracy to start with the smallest first; i.e. 
2^-52 // Exact powers of 2 (1.0, 2.0, 4.0, etc) are represented precisely with fraction==0 // Skip over tailing zeros for exactly representable numbers such 0.5, 0.75 @@ -167,7 +166,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) double acc = 0; // 'long double' not needed int i = 52; if (fraction) { - while ((fraction & 0xFF) == 0) { fraction >>= 8; i-=8; } + while ((fraction & 0xFF) == 0) { fraction >>= 8; i-=8; } while (fraction) { acc += sigparts[(((fraction&1u)^1u)-1u) & i]; i--; @@ -185,7 +184,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) unsigned long long l = y * SIZE_SF; // low magnitude mult 10^NUM_SF // l now contains NUM_SF+1 digits as integer where repeated /10 below is accurate - // if (verbose) Rprintf("\nTRACE: acc=%.20Le ; y=%.20Le ; l=%llu ; e=%d ", acc, y, l, exp); + // if (verbose) Rprintf("\nTRACE: acc=%.20Le ; y=%.20Le ; l=%llu ; e=%d ", acc, y, l, exp); if (l%10 >= 5) l+=10; // use the last digit to round l /= 10; @@ -198,7 +197,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) while (l%10 == 0) { l /= 10; trailZero++; } int sf = NUM_SF - trailZero; if (sf==0) {sf=1; exp++;} // e.g. l was 9999999[5-9] rounded to 10000000 which added 1 digit - + // l is now an unsigned long that doesn't start or end with 0 // sf is the number of digits now in l // exp is e were l to be written with the decimal sep after the first digit @@ -231,7 +230,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) // scientific ... 
ch += sf; // sf-1 + 1 for dec for (int i=sf; i>1; i--) { - *ch-- = '0' + l%10; + *ch-- = '0' + l%10; l /= 10; } if (sf == 1) ch--; else *ch-- = dec; @@ -317,7 +316,7 @@ static inline void write_time(int x, char **thisCh) { char *ch = *thisCh; if (x<0) { // <0 covers NA_INTEGER too (==INT_MIN checked in init.c) - write_chars(na, &ch); + write_chars(na, &ch); } else { int hh = x/3600; int mm = (x - hh*3600) / 60; @@ -369,7 +368,7 @@ static inline void write_date(int x, char **thisCh) int z = x - y*365 - y/4 + y/100 - y/400 + 1; // days from March 1st in year y int md = monthday[z]; // See fwriteLookups.h for how the 366 item lookup 'monthday' is arranged y += z && (md/100)<3; // The +1 above turned z=-1 to 0 (meaning Feb29 of year y not Jan or Feb of y+1) - + ch += 7 + 2*!squash; *ch-- = '0'+md%10; md/=10; *ch-- = '0'+md%10; md/=10; @@ -402,7 +401,7 @@ static void writePOSIXct(SEXP column, int i, char **thisCh) // Aside: an often overlooked option for users is to start R in UTC: $ TZ='UTC' R // All positive integers up to 2^53 (9e15) are exactly representable by double which is relied // on in the ops here; number of seconds since epoch. - + double x = REAL(column)[i]; char *ch = *thisCh; if (!R_FINITE(x)) { @@ -594,17 +593,17 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. 
const Rboolean showProgress = LOGICAL(showProgress_Arg)[0]; time_t start_time = time(NULL); time_t next_time = start_time+2; // start printing progress meter in 2 sec if not completed by then - + verbose = LOGICAL(verbose_Arg)[0]; - + sep = *CHAR(STRING_ELT(sep_Arg, 0)); // DO NOT DO: allow multichar separator (bad idea) sep2start = CHAR(STRING_ELT(sep2_Arg, 0)); sep2 = *CHAR(STRING_ELT(sep2_Arg, 1)); sep2end = CHAR(STRING_ELT(sep2_Arg, 2)); - + const char *eol = CHAR(STRING_ELT(eol_Arg, 0)); // someone might want a trailer on every line so allow any length string as eol - + na = CHAR(STRING_ELT(na_Arg, 0)); dec = *CHAR(STRING_ELT(dec_Arg,0)); quote = LOGICAL(quote_Arg)[0]; @@ -643,7 +642,7 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. UNPROTECT(1); // s, not DF } } - + // Allocate lookup vector to writer function for each column. For simplicity and robustness via many fewer lines // of code and less logic need. Secondly, for efficiency to save deep switch and branches later. // Don't use a VLA as ncol could be > 1e6 columns @@ -664,9 +663,9 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. if (verbose) Rprintf("If quote='auto', fields will be quoted if the field contains either sep ('%c') or sep2[2] ('%c') because column %d is a list column.\n", sep, sep2, firstListColumn ); if (dec==sep) error("Internal error: dec != sep was checked at R level"); if (dec==sep2 || sep==sep2) - error("sep ('%c'), sep2[2L] ('%c') and dec ('%c') must all be different when list columns are present. Column %d is a list column.", sep, sep2, dec, firstListColumn); + error("sep ('%c'), sep2[2L] ('%c') and dec ('%c') must all be different when list columns are present. 
Column %d is a list column.", sep, sep2, dec, firstListColumn); } - + // user may want row names even when they don't exist (implied row numbers as row names) Rboolean doRowNames = LOGICAL(row_names)[0]; SEXP rowNames = NULL; @@ -674,7 +673,7 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. rowNames = getAttrib(DFin, R_RowNamesSymbol); if (!isString(rowNames)) rowNames=NULL; } - + // Estimate max line length of a 1000 row sample (100 rows in 10 places). // 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. // Buffers will be resized later if there are too many line lengths outside the sample, anyway. @@ -718,10 +717,10 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. if (fun==NULL) error("Column %d is a list column but on row %d is type '%s' - not yet implemented. fwrite() can write list columns containing atomic vectors of type logical, integer, integer64, double, character and factor, currently.", j+1, i+1, type2char(TYPEOF(v))); for (int k=0; k=1 because 0-columns was caught earlier. write_chars(eol, &ch); // replace it with the newline. - + // Track longest line seen so far. If we start to see longer lines than we saw in the // sample, we'll realloc the buffer. The rowsPerBatch chosen based on the (very good) sample, // must fit in the buffer. Can't early write and reset buffer because the @@ -929,7 +928,7 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. } } // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. - // Something like: + // Something like: // if (me==0) { // failed = TRUE; // inside ordered here; the slaves are before ordered and not looking at 'failed' // R_CheckUserInterrupt(); @@ -980,4 +979,3 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. 
return(R_NilValue); } - diff --git a/src/fwriteR.c b/src/fwriteR.c new file mode 100644 index 0000000000..f73e6e33ec --- /dev/null +++ b/src/fwriteR.c @@ -0,0 +1,419 @@ + +#include "data.table.h" + +SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. data.frame, data.table + SEXP filename_Arg, + SEXP sep_Arg, + SEXP sep2_Arg, + SEXP eol_Arg, + SEXP na_Arg, + SEXP dec_Arg, + SEXP quote_Arg, // 'auto'=NA_LOGICAL|TRUE|FALSE + SEXP qmethod_escapeArg, // TRUE|FALSE + SEXP append, // TRUE|FALSE + SEXP row_names, // TRUE|FALSE + SEXP col_names, // TRUE|FALSE + SEXP logicalAsInt_Arg, // TRUE|FALSE + SEXP dateTimeAs_Arg, // 0=ISO(yyyy-mm-dd),1=squash(yyyymmdd),2=epoch,3=write.csv + SEXP buffMB_Arg, // [1-1024] default 8MB + SEXP nThread, + SEXP showProgress_Arg, + SEXP verbose_Arg) +{ + if (!isNewList(DFin)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table"); + RLEN ncol = length(DFin); + if (ncol==0) { + warning("fwrite was passed an empty list of no columns. 
Nothing to write."); + return R_NilValue; + } + RLEN nrow = length(VECTOR_ELT(DFin, 0)); + + const Rboolean showProgress = LOGICAL(showProgress_Arg)[0]; + time_t start_time = time(NULL); + time_t next_time = start_time+2; // start printing progress meter in 2 sec if not completed by then + + verbose = LOGICAL(verbose_Arg)[0]; + + sep = *CHAR(STRING_ELT(sep_Arg, 0)); // DO NOT DO: allow multichar separator (bad idea) + sep2start = CHAR(STRING_ELT(sep2_Arg, 0)); + sep2 = *CHAR(STRING_ELT(sep2_Arg, 1)); + sep2end = CHAR(STRING_ELT(sep2_Arg, 2)); + + const char *eol = CHAR(STRING_ELT(eol_Arg, 0)); + // someone might want a trailer on every line so allow any length string as eol + + na = CHAR(STRING_ELT(na_Arg, 0)); + dec = *CHAR(STRING_ELT(dec_Arg,0)); + quote = LOGICAL(quote_Arg)[0]; + // When NA is a non-empty string, then we must quote all string fields + if (*na != '\0' && quote == NA_LOGICAL) quote = TRUE; + qmethod_escape = LOGICAL(qmethod_escapeArg)[0]; + const char *filename = CHAR(STRING_ELT(filename_Arg, 0)); + logicalAsInt = LOGICAL(logicalAsInt_Arg)[0]; + dateTimeAs = INTEGER(dateTimeAs_Arg)[0]; + squash = (dateTimeAs==1); + int nth = INTEGER(nThread)[0]; + int firstListColumn = 0; + clock_t t0=clock(); + + SEXP DF = DFin; + int protecti = 0; + if (dateTimeAs == DATETIMEAS_WRITECSV) { + int j=0; while(j 1e6 columns + writer_fun_t *fun = (writer_fun_t *)R_alloc(ncol, sizeof(writer_fun_t)); + for (int j=0; j maxLineLen) maxLineLen = thisLineLen; + } + } + maxLineLen += strlen(eol); + if (verbose) Rprintf("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(clock()-t0)/CLOCKS_PER_SEC); + + int f; + if (*filename=='\0') { + f=-1; // file="" means write to standard output + eol = "\n"; // We'll use Rprintf(); it knows itself about \r\n on Windows + } else { +#ifdef WIN32 + f = _open(filename, _O_WRONLY | _O_BINARY | _O_CREAT | (LOGICAL(append)[0] ? 
_O_APPEND : _O_TRUNC), _S_IWRITE); + // eol must be passed from R level as '\r\n' on Windows since write() only auto-converts \n to \r\n in + // _O_TEXT mode. We use O_BINARY for full control and perhaps speed since O_TEXT must have to deep branch an if('\n') +#else + f = open(filename, O_WRONLY | O_CREAT | (LOGICAL(append)[0] ? O_APPEND : O_TRUNC), 0666); +#endif + if (f == -1) { + int erropen = errno; + if( access( filename, F_OK ) != -1 ) + error("%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?", strerror(erropen), filename); + else + error("%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), filename); + } + } + t0=clock(); + + if (verbose) { + Rprintf("Writing column names ... "); + if (f==-1) Rprintf("\n"); + } + if (LOGICAL(col_names)[0]) { + SEXP names = getAttrib(DFin, R_NamesSymbol); + if (names!=R_NilValue) { + if (LENGTH(names) != ncol) error("Internal error: length of column names is not equal to the number of columns. Please report."); + // allow for quoting even when not. + int buffSize = 2/*""*/ +1/*,*/; + for (int j=0; j1024) error("buffMB=%d outside [1,1024]", buffMB); // check it again even so + size_t buffSize = 1024*1024*buffMB; + if (maxLineLen > buffSize) buffSize=2*maxLineLen; // A very long line; at least 1,048,576 characters + rowsPerBatch = + (10*maxLineLen > buffSize) ? 1 : // very long lines (100,000 characters+) we'll just do one row at a time. + 0.5 * buffSize/maxLineLen; // Aim for 50% buffer usage. See checkBuffer for comments. + if (rowsPerBatch > nrow) rowsPerBatch=nrow; + int numBatches = (nrow-1)/rowsPerBatch + 1; + if (numBatches < nth) nth = numBatches; + if (verbose) { + Rprintf("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... 
", + nrow, numBatches, rowsPerBatch, buffMB, showProgress, nth); + if (f==-1) Rprintf("\n"); + } + t0 = clock(); + + failed=0; // static global so checkBuffer can set it. -errno for malloc or realloc fails, +errno for write fail + Rboolean hasPrinted=FALSE; + Rboolean anyBufferGrown=FALSE; + int maxBuffUsedPC=0; + + #pragma omp parallel num_threads(nth) + { + char *ch, *buffer; // local to each thread + ch = buffer = malloc(buffSize); // each thread has its own buffer + // Don't use any R API alloc here (e.g. R_alloc); they are + // not thread-safe as per last sentence of R-exts 6.1.1. + + if (buffer==NULL) {failed=-errno;} + // Do not rely on availability of '#omp cancel' new in OpenMP v4.0 (July 2013). + // OpenMP v4.0 is in gcc 4.9+ (https://gcc.gnu.org/wiki/openmp) but + // not yet in clang as of v3.8 (http://openmp.llvm.org/) + // If not-me failed, I'll see shared 'failed', fall through loop, free my buffer + // and after parallel section, single thread will call R API error() safely. + + size_t myAlloc = buffSize; + size_t myMaxLineLen = maxLineLen; + // so we can realloc(). Should only be needed if there are very long single CHARSXP + // much longer than occurred in the sample for maxLineLen. Or for list() columns + // contain vectors which are much longer than occurred in the sample. + + #pragma omp single + { + nth = omp_get_num_threads(); // update nth with the actual nth (might be different than requested) + } + int me = omp_get_thread_num(); + + #pragma omp for ordered schedule(dynamic) + for(RLEN start=0; start=1 because 0-columns was caught earlier. + write_chars(eol, &ch); // replace it with the newline. + + // Track longest line seen so far. If we start to see longer lines than we saw in the + // sample, we'll realloc the buffer. The rowsPerBatch chosen based on the (very good) sample, + // must fit in the buffer. Can't early write and reset buffer because the + // file output would be out-of-order. 
Can't change rowsPerBatch after the 'parallel for' started. + size_t thisLineLen = ch-lineStart; + if (thisLineLen > myMaxLineLen) myMaxLineLen=thisLineLen; + checkBuffer(&buffer, &myAlloc, &ch, myMaxLineLen); + if (failed) break; // this thread stop writing rows; fall through to clear up and error() below + } + #pragma omp ordered + { + if (!failed) { // a thread ahead of me could have failed below while I was working or waiting above + if (f==-1) { + *ch='\0'; // standard C string end marker so Rprintf knows where to stop + Rprintf("%s", buffer); + // nth==1 at this point since when file=="" (f==-1 here) fwrite.R calls setDTthreads(1) + // Although this ordered section is one-at-a-time it seems that calling Rprintf() here, even with a + // R_FlushConsole() too, causes corruptions on Windows but not on Linux. At least, as observed so + // far using capture.output(). Perhaps Rprintf() updates some state or allocation that cannot be done + // by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console + // to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast. + } else { + if (WRITE(f, buffer, (int)(ch-buffer)) == -1) { + failed=errno; + } + if (myAlloc > buffSize) anyBufferGrown = TRUE; + int used = 100*((double)(ch-buffer))/buffSize; // percentage of original buffMB + if (used > maxBuffUsedPC) maxBuffUsedPC = used; + time_t now; + if (me==0 && showProgress && (now=time(NULL))>=next_time && !failed) { + // See comments above inside the f==-1 clause. + // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the + // master thread (me==0) and hopefully this will work on Windows. If not, user should set + // showProgress=FALSE until this can be fixed or removed. 
+ int ETA = (int)((nrow-end)*(((double)(now-start_time))/end)); + if (hasPrinted || ETA >= 2) { + if (verbose && !hasPrinted) Rprintf("\n"); + Rprintf("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " + "anyBufferGrown=%s; maxBuffUsed=%d%%. ETA %d secs. ", + (100.0*end)/nrow, nrow, (int)(now-start_time), nth, nth==1?"":"s", + anyBufferGrown?"yes":"no", maxBuffUsedPC, ETA); + R_FlushConsole(); // for Windows + next_time = now+1; + hasPrinted = TRUE; + } + } + // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. + // Something like: + // if (me==0) { + // failed = TRUE; // inside ordered here; the slaves are before ordered and not looking at 'failed' + // R_CheckUserInterrupt(); + // failed = FALSE; // no user interrupt so return state + // } + // But I fear the slaves will hang waiting for the master (me==0) to complete the ordered + // section which may not happen if the master thread has been interrupted. Rather than + // seeing failed=TRUE and falling through to free() and close() as intended. + // Could register a finalizer to free() and close() perhaps : + // http://r.789695.n4.nabble.com/checking-user-interrupts-in-C-code-tp2717528p2717722.html + // Conclusion for now: do not provide ability to interrupt. + // write() errors and malloc() fails will be caught and cleaned up properly, however. + } + ch = buffer; // back to the start of my buffer ready to fill it up again + } + } + } + free(buffer); + // all threads will call this free on their buffer, even if one or more threads had malloc + // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. + } + // Finished parallel region and can call R API safely now. 
+ if (hasPrinted) { + if (!failed) { + // clear the progress meter + Rprintf("\r " + " \r"); + R_FlushConsole(); // for Windows + } else { + // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC + Rprintf("\n"); + } + } + if (f!=-1 && CLOSE(f) && !failed) + error("%s: '%s'", strerror(errno), filename); + // quoted '%s' in case of trailing spaces in the filename + // If a write failed, the line above tries close() to clean up, but that might fail as well. So the + // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail + // from the original error. + if (failed<0) { + error("%s. One or more threads failed to malloc or realloc their private buffer. nThread=%d and initial buffMB per thread was %d.\n", strerror(-failed), nth, buffMB); + } else if (failed>0) { + error("%s: '%s'", strerror(failed), filename); + } + if (verbose) Rprintf("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", + nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); + UNPROTECT(protecti); + return(R_NilValue); +} + From a213dcbdfbd11119e76d52a9315d2868174a0512 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Thu, 14 Sep 2017 14:49:43 -0700 Subject: [PATCH 02/16] Interim --- src/fwrite.c | 215 ++++++++++++++++---------------------------------- src/fwriteR.c | 110 ++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 147 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index f804123877..10dc2e569d 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -1,8 +1,12 @@ #include "fwriteLookups.h" #include -#include // for access() +#include // for access() #include #include +#include // true and false +#include // INT32_MIN +#include // isfinite, isnan +#include // abs #ifdef WIN32 #include #include @@ -23,15 +27,15 @@ static char sep; // comma in .csv files static char sep2; // '|' within list columns static const char *sep2start, *sep2end; static char dec; // the '.' in the number 3.1416. 
In Europe often: 3,1416 -static _Bool verbose=FALSE; // be chatty? -static _Bool quote=FALSE; // whether to surround fields with double quote ". NA means 'auto' (default) -static _Bool qmethod_escape=TRUE; // when quoting fields, how to manage double quote in the field contents -static _Bool logicalAsInt=FALSE; // logical as 0/1 or "TRUE"/"FALSE" -static _Bool squash=FALSE; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) +static _Bool verbose=false; // be chatty? +static _Bool quote=false; // whether to surround fields with double quote ". NA means 'auto' (default) +static _Bool qmethod_escape=false; // when quoting fields, how to manage double quote in the field contents +static _Bool logicalAsInt=false; // logical as 0/1 or "TRUE"/"FALSE" +static _Bool squash=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd), 2=epoch, 3=write.csv #define DATETIMEAS_EPOCH 2 #define DATETIMEAS_WRITECSV 3 -typedef void (*writer_fun_t)(SEXP, int, char **); +typedef void (*writer_fun_t)(void *, int, char **); static inline void write_chars(const char *x, char **thisCh) { @@ -41,13 +45,13 @@ static inline void write_chars(const char *x, char **thisCh) *thisCh = ch; } -static void writeLogical(SEXP column, int i, char **thisCh) +static void writeLogical(int *col, int row, char **thisCh) { - Rboolean x = LOGICAL(column)[i]; + int x = col[row]; char *ch = *thisCh; - if (x == NA_LOGICAL) { + if (x == INT32_MIN) { write_chars(na, &ch); - } else if (logicalAsInt) { + } else if (logicalAsInt) { // TODO raise this up to use different processor, default TRUE *ch++ = '0'+x; } else if (x) { *ch++='T'; *ch++='R'; *ch++='U'; *ch++='E'; @@ -57,7 +61,7 @@ static void writeLogical(SEXP column, int i, char **thisCh) *thisCh = ch; } -static inline void write_positive_int(long long x, char **thisCh) +static inline void write_positive_int(int64_t x, char **thisCh) { // Avoid log() for speed. Write backwards then reverse when we know how long. 
// Separate function just because it's used if row numbers are asked for, too @@ -73,14 +77,13 @@ static inline void write_positive_int(long long x, char **thisCh) *thisCh = ch; } -static void writeInteger(SEXP column, int i, char **thisCh) +static void writeInt32(int32_t *col, int row, char **thisCh) { - long long x = (TYPEOF(column)!=REALSXP) ? INTEGER(column)[i] : DtoLL(REAL(column)[i]); - // != REALSXP rather than ==INTSXP to cover LGLSXP when logicalAsInt==TRUE char *ch = *thisCh; + int32_t x = col[row]; if (x == 0) { *ch++ = '0'; - } else if (x == ((TYPEOF(column)==INTSXP) ? NA_INTEGER : NA_INT64_LL)) { + } else if (x == INT32_MIN) { write_chars(na, &ch); } else { if (x<0) { *ch++ = '-'; x=-x; } @@ -89,12 +92,28 @@ static void writeInteger(SEXP column, int i, char **thisCh) *thisCh = ch; } -SEXP genLookups() { - Rprintf("genLookups commented out of the package so it's clear it isn't needed to build. The hooks are left in so it's easy to put back in development should we need to.\n"); - // e.g. ldexpl may not be available on some platforms, or if it is it may not be accurate. - return R_NilValue; +static void writeInt64(int64_t *col, int row, char **thisCh) +{ + char *ch = *thisCh; + int64_t x = col[row]; + if (x == 0) { + *ch++ = '0'; + } else if (x == INT64_MIN) { + write_chars(na, &ch); + } else { + if (x<0) { *ch++ = '-'; x=-x; } + write_positive_int(x, &ch); + } + *thisCh = ch; } + /* + * Generate fwriteLookup.h which defines sigparts, expsig and exppow that writeNumeric() that follows uses. + * It was run once a long time ago in dev and we don't need to generate it again unless we change it. + * Commented out and left here in the file where its result is used, in case we need it in future. + * Reason: ldexpl may not be available on all platforms and is slower than a direct lookup when it is. 
+ * +void genLookups() { FILE *f = fopen("/tmp/fwriteLookups.h", "w"); fprintf(f, "//\n\ // Generated by fwrite.c:genLookups()\n\ //\n\ @@ -128,7 +147,7 @@ SEXP genLookups() { } */ -static void writeNumeric(SEXP column, int i, char **thisCh) +static void writeNumeric(double *col, int row, char **thisCh) { // hand-rolled / specialized for speed // *thisCh is safely the output destination with enough space (ensured via calculating maxLineLen up front) @@ -137,25 +156,24 @@ static void writeNumeric(SEXP column, int i, char **thisCh) // i) no buffers. writes straight to the final file buffer passed to write() // ii) no C libary calls such as sprintf() where the fmt string has to be interpretted over and over // iii) no need to return variables or flags. Just writes. - // iv) shorter, easier to read and reason with. In one self contained place. - double x = REAL(column)[i]; + // iv) shorter, easier to read and reason with in one self contained place. + double x = col[row]; char *ch = *thisCh; - if (!R_FINITE(x)) { - if (ISNAN(x)) { + if (!isfinite(x)) { + if (isnan(x)) { write_chars(na, &ch); - } else if (x>0) { - *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f'; } else { - *ch++ = '-'; *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f'; + if (x<0) *ch++ = '-'; + *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f'; } } else if (x == 0.0) { *ch++ = '0'; // and we're done. so much easier rather than passing back special cases } else { if (x < 0.0) { *ch++ = '-'; x = -x; } // and we're done on sign, already written. no need to pass back sign - union { double d; unsigned long long ull; } u; + union { double d; uint64_t l; } u; u.d = x; - unsigned long long fraction = u.ull & 0xFFFFFFFFFFFFF; // (1ULL<<52)-1; - int exponent = (int)((u.ull>>52) & 0x7FF); // [0,2047] + uint64_t fraction = u.l & 0xFFFFFFFFFFFFF; // (1<<52)-1; + uint32_t exponent = (int32_t)((u.l>>52) & 0x7FF); // [0,2047] // Now sum the appropriate powers 2^-(1:52) of the fraction // Important for accuracy to start with the smallest first; i.e. 
2^-52 @@ -168,7 +186,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) if (fraction) { while ((fraction & 0xFF) == 0) { fraction >>= 8; i-=8; } while (fraction) { - acc += sigparts[(((fraction&1u)^1u)-1u) & i]; + acc += sigparts[(((fraction & 1u)^1u)-1u) & i]; i--; fraction >>= 1; } @@ -181,7 +199,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) double y = (1.0+acc) * expsig[exponent]; // low magnitude mult int exp = exppow[exponent]; if (y>=9.99999999999999) { y /= 10; exp++; } - unsigned long long l = y * SIZE_SF; // low magnitude mult 10^NUM_SF + uint64_t l = y * SIZE_SF; // low magnitude mult 10^NUM_SF // l now contains NUM_SF+1 digits as integer where repeated /10 below is accurate // if (verbose) Rprintf("\nTRACE: acc=%.20Le ; y=%.20Le ; l=%llu ; e=%d ", acc, y, l, exp); @@ -253,66 +271,12 @@ static void writeNumeric(SEXP column, int i, char **thisCh) *thisCh = ch; } -static void writeString(SEXP column, int i, char **thisCh) -{ - SEXP x = STRING_ELT(column, i); - char *ch = *thisCh; - if (x == NA_STRING) { - // NA is not quoted by write.csv even when quote=TRUE to distinguish from "NA" - write_chars(na, &ch); - } else { - Rboolean q = quote; - if (q==NA_LOGICAL) { // quote="auto" - const char *tt = CHAR(x); - if (*tt == '\0') { - // Empty strings are always quoted: this distinguishes them from NAs - *ch = '"'; ch[1] = '"'; - *thisCh += 2; - return; - } - while (*tt!='\0' && *tt!=sep && *tt!=sep2 && *tt!='\n' && *tt!='"') *ch++ = *tt++; - // Windows includes \n in its \r\n so looking for \n only is sufficient - // sep2 is set to '\0' when no list columns are present - if (*tt=='\0') { - // most common case: no sep, newline or " contained in string - *thisCh = ch; // advance caller over the field already written - return; - } - ch = *thisCh; // rewind the field written since it needs to be quoted - q = TRUE; - } - if (q==FALSE) { - write_chars(CHAR(x), &ch); - } else { - *ch++ = '"'; - const char *tt = CHAR(x); - if 
(qmethod_escape) { - while (*tt!='\0') { - if (*tt=='"' || *tt=='\\') *ch++ = '\\'; - *ch++ = *tt++; - } - } else { - // qmethod='double' - while (*tt!='\0') { - if (*tt=='"') *ch++ = '"'; - *ch++ = *tt++; - } - } - *ch++ = '"'; - } - } - *thisCh = ch; -} -static void writeFactor(SEXP column, int i, char **thisCh) { - char *ch = *thisCh; - if (INTEGER(column)[i]==NA_INTEGER) write_chars(na, &ch); - else writeString(getAttrib(column, R_LevelsSymbol), INTEGER(column)[i]-1, &ch); - *thisCh = ch; -} // DATE/TIME + static inline void write_time(int x, char **thisCh) +// just a helper called below by the real writers (time-only and datetime) { char *ch = *thisCh; if (x<0) { // <0 covers NA_INTEGER too (==INT_MIN checked in init.c) @@ -334,11 +298,13 @@ static inline void write_time(int x, char **thisCh) } *thisCh = ch; } -static void writeITime(SEXP column, int i, char **thisCh) { - write_time(INTEGER(column)[i], thisCh); + +static void writeITime(int *col, int row, char **thisCh) { + write_time(col[row], thisCh); } static inline void write_date(int x, char **thisCh) +// just a helper called below by the two real writers (date-only and datetime) { // From base ?Date : // " Dates are represented as the number of days since 1970-01-01, with negative values @@ -386,14 +352,14 @@ static inline void write_date(int x, char **thisCh) } *thisCh = ch; } -static void writeDateInt(SEXP column, int i, char **thisCh) { - write_date(INTEGER(column)[i], thisCh); +static void writeDateInt(int *col, int row, char **thisCh) { + write_date(col[row], thisCh); } -static void writeDateReal(SEXP column, int i, char **thisCh) { - write_date(R_FINITE(REAL(column)[i]) ? (int)REAL(column)[i] : NA_INTEGER, thisCh); +static void writeDateReal(double *col, int row, char **thisCh) { + write_date(isfinite(col[row]) ? 
(int)(col[row]) : INT32_MIN, thisCh); } -static void writePOSIXct(SEXP column, int i, char **thisCh) +static void writePOSIXct(double *col, int row, char **thisCh) { // Write ISO8601 UTC by default to encourage ISO standards, stymie ambiguity and for speed. // R internally represents POSIX datetime in UTC always. Its 'tzone' attribute can be ignored. @@ -402,9 +368,9 @@ static void writePOSIXct(SEXP column, int i, char **thisCh) // All positive integers up to 2^53 (9e15) are exactly representable by double which is relied // on in the ops here; number of seconds since epoch. - double x = REAL(column)[i]; + double x = col[row]; char *ch = *thisCh; - if (!R_FINITE(x)) { + if (!isfinite(x)) { write_chars(na, &ch); } else { int xi, d, t; @@ -453,11 +419,11 @@ static void writePOSIXct(SEXP column, int i, char **thisCh) *thisCh = ch; } -static void writeNanotime(SEXP column, int i, char **thisCh) +static void writeNanotime(int64_t *col, int row, char **thisCh) { - long long x = DtoLL(REAL(column)[i]); + int64_t x = col[row]; char *ch = *thisCh; - if (x == NA_INT64_LL) { + if (x == INT64_MIN) { write_chars(na, &ch); } else { int d/*days*/, s/*secs*/, n/*nanos*/; @@ -486,51 +452,6 @@ static void writeNanotime(SEXP column, int i, char **thisCh) *thisCh = ch; } -static void writeList(SEXP, int, char **); // prototype needed because it calls back to whichWriter too - -static writer_fun_t whichWriter(SEXP column) { - switch(TYPEOF(column)) { - case LGLSXP: - return logicalAsInt ? 
writeInteger : writeLogical; - case INTSXP: - if (isFactor(column)) return writeFactor; - if (dateTimeAs==DATETIMEAS_EPOCH) return writeInteger; - if (INHERITS(column, char_ITime)) return writeITime; - if (INHERITS(column, char_Date)) return writeDateInt; - return writeInteger; - case REALSXP: - if (INHERITS(column, char_nanotime) && dateTimeAs!=DATETIMEAS_EPOCH) return writeNanotime; - if (INHERITS(column, char_integer64))return writeInteger; - if (dateTimeAs==DATETIMEAS_EPOCH) return writeNumeric; - if (INHERITS(column, char_Date)) return writeDateReal; - if (INHERITS(column, char_POSIXct)) return writePOSIXct; - return writeNumeric; - case STRSXP: - return writeString; - case VECSXP: - return writeList; - default: - return NULL; - } -} - -static void writeList(SEXP column, int i, char **thisCh) { - SEXP v = VECTOR_ELT(column,i); - writer_fun_t fun = whichWriter(v); - if (TYPEOF(v)==VECSXP || fun==NULL) { - error("Row %d of list column is type '%s' - not yet implemented. fwrite() can write list columns containing atomic vectors of type logical, integer, integer64, double, character and factor, currently.", i+1, type2char(TYPEOF(v))); - } - char *ch = *thisCh; - write_chars(sep2start, &ch); - for (int j=0; j=1 because 0-columns was caught earlier. diff --git a/src/fwriteR.c b/src/fwriteR.c index f73e6e33ec..e18c34f8ba 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -1,6 +1,116 @@ #include "data.table.h" +/* Non-agnostic writers + * Where possible we use common agnostic writers. But in these cases there are unavoidable differences + * in the structure of the data being written. 
+ */ + +static void writeString(SEXP col, int row, char **thisCh) +{ + SEXP x = STRING_ELT(col, row); + char *ch = *thisCh; + if (x == NA_STRING) { + // NA is not quoted by write.csv even when quote=TRUE to distinguish from "NA" + write_chars(na, &ch); + } else { + Rboolean q = quote; + if (q==NA_LOGICAL) { // quote="auto" + const char *tt = CHAR(x); + if (*tt == '\0') { + // Empty strings are always quoted: this distinguishes them from NAs + *ch = '"'; ch[1] = '"'; + *thisCh += 2; + return; + } + while (*tt!='\0' && *tt!=sep && *tt!=sep2 && *tt!='\n' && *tt!='"') *ch++ = *tt++; + // Windows includes \n in its \r\n so looking for \n only is sufficient + // sep2 is set to '\0' when no list columns are present + if (*tt=='\0') { + // most common case: no sep, newline or " contained in string + *thisCh = ch; // advance caller over the field already written + return; + } + ch = *thisCh; // rewind the field written since it needs to be quoted + q = TRUE; + } + if (q==FALSE) { + write_chars(CHAR(x), &ch); + } else { + *ch++ = '"'; + const char *tt = CHAR(x); + if (qmethod_escape) { + while (*tt!='\0') { + if (*tt=='"' || *tt=='\\') *ch++ = '\\'; + *ch++ = *tt++; + } + } else { + // qmethod='double' + while (*tt!='\0') { + if (*tt=='"') *ch++ = '"'; + *ch++ = *tt++; + } + } + *ch++ = '"'; + } + } + *thisCh = ch; +} + +static void writeFactor(SEXP column, int i, char **thisCh) { + char *ch = *thisCh; + if (INTEGER(column)[i]==NA_INTEGER) write_chars(na, &ch); + else writeString(getAttrib(column, R_LevelsSymbol), INTEGER(column)[i]-1, &ch); + *thisCh = ch; +} + +static void writeList(SEXP, int, char **); // prototype needed because it calls back to whichWriter too + +static writer_fun_t whichWriter(SEXP column) { + switch(TYPEOF(column)) { + case LGLSXP: + return logicalAsInt ? 
writeInteger : writeLogical; + case INTSXP: + if (isFactor(column)) return writeFactor; + if (dateTimeAs==DATETIMEAS_EPOCH) return writeInteger; + if (INHERITS(column, char_ITime)) return writeITime; + if (INHERITS(column, char_Date)) return writeDateInt; + return writeInteger; + case REALSXP: + if (INHERITS(column, char_nanotime) && dateTimeAs!=DATETIMEAS_EPOCH) return writeNanotime; + if (INHERITS(column, char_integer64))return writeInteger; + if (dateTimeAs==DATETIMEAS_EPOCH) return writeNumeric; + if (INHERITS(column, char_Date)) return writeDateReal; + if (INHERITS(column, char_POSIXct)) return writePOSIXct; + return writeNumeric; + case STRSXP: + return writeString; + case VECSXP: + return writeList; + default: + return NULL; + } +} + +static void writeList(SEXP column, int i, char **thisCh) { + SEXP v = VECTOR_ELT(column,i); + writer_fun_t fun = whichWriter(v); + if (TYPEOF(v)==VECSXP || fun==NULL) { + error("Row %d of list column is type '%s' - not yet implemented. fwrite() can write list columns containing atomic vectors of type logical, integer, integer64, double, character and factor, currently.", i+1, type2char(TYPEOF(v))); + } + char *ch = *thisCh; + write_chars(sep2start, &ch); + for (int j=0; j Date: Fri, 15 Sep 2017 15:29:14 -0700 Subject: [PATCH 03/16] Interim --- R/fwrite.R | 8 +++- src/fwrite.c | 116 ++++++++++---------------------------------------- src/fwriteR.c | 1 - 3 files changed, 29 insertions(+), 96 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index f70074f278..9bd5c90b92 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -2,7 +2,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", na="", dec=".", row.names=FALSE, col.names=TRUE, qmethod=c("double","escape"), - logicalAsInt=FALSE, dateTimeAs = c("ISO","squash","epoch","write.csv"), + logical01=TRUE, logicalAsInt=logical01, dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, 
nThread=getDTthreads(), showProgress=interactive(), verbose=getOption("datatable.verbose")) { @@ -10,6 +10,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] if (missing(dateTimeAs)) dateTimeAs = dateTimeAs[1L] + if (!missing(logical01) && !missing(logicalAsInt)) + stop("logicalAsInt has been renamed logical01. Use logical01 only, not both.") + if (!missing(logicalAsInt)) { + warning("logicalAsInt has been renamed logical01 for consistency with fread. It will work fine but please change to logical01 at your convenience so we can remove logicalAsInt in future.") + logical01 = logicalAsInt + } else if (length(dateTimeAs)>1) stop("dateTimeAs must be a single string") dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L if (is.na(dateTimeAs)) stop("dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") diff --git a/src/fwrite.c b/src/fwrite.c index 10dc2e569d..3ecff56949 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -484,108 +484,36 @@ static inline void checkBuffer( } } -SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. data.frame, data.table - SEXP filename_Arg, - SEXP sep_Arg, - SEXP sep2_Arg, - SEXP eol_Arg, - SEXP na_Arg, - SEXP dec_Arg, - SEXP quote_Arg, // 'auto'=NA_LOGICAL|TRUE|FALSE - SEXP qmethod_escapeArg, // TRUE|FALSE - SEXP append, // TRUE|FALSE - SEXP row_names, // TRUE|FALSE - SEXP col_names, // TRUE|FALSE - SEXP logicalAsInt_Arg, // TRUE|FALSE - SEXP dateTimeAs_Arg, // 0=ISO(yyyy-mm-dd),1=squash(yyyymmdd),2=epoch,3=write.csv - SEXP buffMB_Arg, // [1-1024] default 8MB - SEXP nThread, - SEXP showProgress_Arg, - SEXP verbose_Arg) +void writefile( + const char *filename, + void **DFin, // any list of same length vectors; e.g. 
data.frame, data.table + int ncol, + int64_t nrow, + writer_fun_t *fun, // a unique set of writer_fun_t function pointers + int8_t *whichFun, // length ncol vector containing which fun[] to use for each column + char sep, + const char *eol, + const char *na, + char dec, + char quote, + _Bool quotes_doubled, // FALSE means escape quotes using backslash + _Bool append, + void *row_names, + void *col_names, + _Bool logical01, // TRUE|FALSE + int dateTimeAs, // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv + int buffMB, // [1-1024] default 8MB + int nThread, + _Bool showProgress, + _Bool verbose) { - if (!isNewList(DFin)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table"); - RLEN ncol = length(DFin); - if (ncol==0) { - warning("fwrite was passed an empty list of no columns. Nothing to write."); - return R_NilValue; - } - RLEN nrow = length(VECTOR_ELT(DFin, 0)); - - const Rboolean showProgress = LOGICAL(showProgress_Arg)[0]; time_t start_time = time(NULL); time_t next_time = start_time+2; // start printing progress meter in 2 sec if not completed by then - verbose = LOGICAL(verbose_Arg)[0]; - - sep = *CHAR(STRING_ELT(sep_Arg, 0)); // DO NOT DO: allow multichar separator (bad idea) - sep2start = CHAR(STRING_ELT(sep2_Arg, 0)); - sep2 = *CHAR(STRING_ELT(sep2_Arg, 1)); - sep2end = CHAR(STRING_ELT(sep2_Arg, 2)); - - const char *eol = CHAR(STRING_ELT(eol_Arg, 0)); - // someone might want a trailer on every line so allow any length string as eol - - na = CHAR(STRING_ELT(na_Arg, 0)); - dec = *CHAR(STRING_ELT(dec_Arg,0)); - quote = LOGICAL(quote_Arg)[0]; - // When NA is a non-empty string, then we must quote all string fields - if (*na != '\0' && quote == NA_LOGICAL) quote = TRUE; - qmethod_escape = LOGICAL(qmethod_escapeArg)[0]; - const char *filename = CHAR(STRING_ELT(filename_Arg, 0)); - logicalAsInt = LOGICAL(logicalAsInt_Arg)[0]; - dateTimeAs = INTEGER(dateTimeAs_Arg)[0]; squash = (dateTimeAs==1); - int nth = 
INTEGER(nThread)[0]; int firstListColumn = 0; clock_t t0=clock(); - SEXP DF = DFin; - int protecti = 0; - if (dateTimeAs == DATETIMEAS_WRITECSV) { - int j=0; while(j 1e6 columns - writer_fun_t *fun = (writer_fun_t *)R_alloc(ncol, sizeof(writer_fun_t)); - for (int j=0; j Date: Sun, 17 Sep 2017 11:17:10 -0700 Subject: [PATCH 04/16] Interim --- src/fwrite.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 3ecff56949..1b77dc7739 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -159,7 +159,7 @@ static void writeNumeric(double *col, int row, char **thisCh) // iv) shorter, easier to read and reason with in one self contained place. double x = col[row]; char *ch = *thisCh; - if (isfinite(x)) { + if (!isfinite(x)) { if (isnan(x)) { write_chars(na, &ch); } else { @@ -514,19 +514,25 @@ void writefile( int firstListColumn = 0; clock_t t0=clock(); - // user may want row names even when they don't exist (implied row numbers as row names) - Rboolean doRowNames = LOGICAL(row_names)[0]; - SEXP rowNames = NULL; - if (doRowNames) { - rowNames = getAttrib(DFin, R_RowNamesSymbol); - if (!isString(rowNames)) rowNames=NULL; - } + _Bool doRowNames = false; // Estimate max line length of a 1000 row sample (100 rows in 10 places). // 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. // Buffers will be resized later if there are too many line lengths outside the sample, anyway. // maxLineLen is required to determine a reasonable rowsPerBatch. + + + // alloc one buffMB here. Keep rewriting each field to it, to sum up the size. Restriction: one field can't be + // greater that minimumum buffMB (1MB = 1 million characters). Otherwise unbounded overwrite. Possible with very + // very long single strings, or very long list column values. + // The caller guarantees no field with be longer than this. If so, it can set buffMB larger. 
It might know + // due to some stats it has maintained on each column or in the environment generally. + // However, a single field being longer than 1 million characters is considered a very reasonable restriction. + // Once we have a good line length estimate, we may increase the buffer size a lot anyway. + // The default buffMB is 8MB, so it's really 8 million character limit by default. 1MB is because user might set + // buffMB to 1, say if they have 512 CPUs or more, perhaps. + int maxLineLen = 0; int na_len = strlen(na); int step = nrow<1000 ? 100 : nrow/10; From 23586052fd1ef54386ccec8b7dc8c7569ad5df00 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 17 Sep 2017 15:16:01 -0700 Subject: [PATCH 05/16] Interim --- src/fwrite.c | 306 +++++++++++++++++++++++---------------------------- 1 file changed, 135 insertions(+), 171 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 1b77dc7739..8dfba99cf9 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -2,11 +2,11 @@ #include #include // for access() #include -#include #include // true and false #include // INT32_MIN #include // isfinite, isnan #include // abs +#include // strlen, strerror #ifdef WIN32 #include #include @@ -18,21 +18,19 @@ #define CLOSE close #endif +extern double wallclock(void); + #define NUM_SF 15 #define SIZE_SF 1000000000000000ULL // 10^NUM_SF // Globals for this file only. Written once to hold parameters passed from R level. -static const char *na; // by default "" or if set (not recommended) then usually "NA" -static char sep; // comma in .csv files -static char sep2; // '|' within list columns -static const char *sep2start, *sep2end; -static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 -static _Bool verbose=false; // be chatty? -static _Bool quote=false; // whether to surround fields with double quote ". 
NA means 'auto' (default) -static _Bool qmethod_escape=false; // when quoting fields, how to manage double quote in the field contents -static _Bool logicalAsInt=false; // logical as 0/1 or "TRUE"/"FALSE" -static _Bool squash=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) -static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd), 2=epoch, 3=write.csv +static const char *na; // by default "" or if set (not recommended) then usually "NA" +static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 +static bool verbose=false; // be chatty? +static bool quote=false; // whether to surround fields with double quote ". NA means 'auto' (default) +static bool qmethod_escape=false; // when quoting fields, how to manage double quote in the field contents +static bool squash=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) +static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd), 2=epoch, 3=write.csv #define DATETIMEAS_EPOCH 2 #define DATETIMEAS_WRITECSV 3 typedef void (*writer_fun_t)(void *, int, char **); @@ -486,36 +484,34 @@ static inline void checkBuffer( void writefile( const char *filename, - void **DFin, // any list of same length vectors; e.g. 
data.frame, data.table + void **columns, // a vector of pointers to all-same-length column vectors int ncol, int64_t nrow, - writer_fun_t *fun, // a unique set of writer_fun_t function pointers - int8_t *whichFun, // length ncol vector containing which fun[] to use for each column + void *colNames, // NULL means no header, otherwise ncol strings + writer_fun_t *fun, // a vector of writer_fun_t function pointers + int8_t *whichFun, // length ncol vector containing which fun[] to use for each column char sep, const char *eol, const char *na, char dec, char quote, - _Bool quotes_doubled, // FALSE means escape quotes using backslash - _Bool append, - void *row_names, - void *col_names, - _Bool logical01, // TRUE|FALSE - int dateTimeAs, // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv - int buffMB, // [1-1024] default 8MB - int nThread, - _Bool showProgress, - _Bool verbose) + bool quotesDoubled, // false means escape quotes using backslash + bool append, + bool doRowNames, // optional, likely false + void *rowNames, // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. + int dateTimeAs, // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv. TODO: raise to caller choosing fun[] + int buffMB, // [1-1024] default 8MB + int nth, + bool showProgress, + bool verbose) { - time_t start_time = time(NULL); - time_t next_time = start_time+2; // start printing progress meter in 2 sec if not completed by then + + double startTime = wallclock(); + double nextTime = startTime+2; // start printing progress meter in 2 sec if not completed by then + double t0 = startTime; squash = (dateTimeAs==1); int firstListColumn = 0; - clock_t t0=clock(); - - // user may want row names even when they don't exist (implied row numbers as row names) - _Bool doRowNames = false; // Estimate max line length of a 1000 row sample (100 rows in 10 places). 
// 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. @@ -533,127 +529,101 @@ void writefile( // The default buffMB is 8MB, so it's really 8 million character limit by default. 1MB is because user might set // buffMB to 1, say if they have 512 CPUs or more, perhaps. + // Cold section as only 1,000 rows. Speed not an issue issue here. + // Overestimating line length is ok. + int eolLen = strlen(eol); + if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); + + if (buffMB<1 || buffMB>1024) STOP("buffMB=%d outside [1,1024]", buffMB); + size_t buffSize = (size_t)1024*1024*buffMB; + char *buff = malloc(buffSize); + if (!buff) STOP("Unable to allocate %dMB for line length estimation: %s", buffMB, strerror(errno)); + int maxLineLen = 0; - int na_len = strlen(na); int step = nrow<1000 ? 100 : nrow/10; for (int start=0; start 1 million bytes + fun[whichFun[j]]( columns[j], i, &ch ); + thisLineLen += (int)(ch-buff) + 1/*sep*/; // see comments above about restrictions/guarantees/contracts + } + thisLineLen += eolLen; if (thisLineLen > maxLineLen) maxLineLen = thisLineLen; } } maxLineLen += strlen(eol); - if (verbose) Rprintf("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(clock()-t0)/CLOCKS_PER_SEC); + if (verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); int f; if (*filename=='\0') { f=-1; // file="" means write to standard output - eol = "\n"; // We'll use Rprintf(); it knows itself about \r\n on Windows + eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows } else { #ifdef WIN32 - f = _open(filename, _O_WRONLY | _O_BINARY | _O_CREAT | (LOGICAL(append)[0] ? _O_APPEND : _O_TRUNC), _S_IWRITE); - // eol must be passed from R level as '\r\n' on Windows since write() only auto-converts \n to \r\n in - // _O_TEXT mode. 
We use O_BINARY for full control and perhaps speed since O_TEXT must have to deep branch an if('\n') + f = _open(filename, _O_WRONLY | _O_BINARY | _O_CREAT | (append ? _O_APPEND : _O_TRUNC), _S_IWRITE); + // O_BINARY rather than O_TEXT for explicit control and speed since it seems that write() has a branch inside it + // to convert \n to \r\n on Windows when in text mode not not when in binary mode. #else - f = open(filename, O_WRONLY | O_CREAT | (LOGICAL(append)[0] ? O_APPEND : O_TRUNC), 0666); + f = open(filename, O_WRONLY | O_CREAT | (append ? O_APPEND : O_TRUNC), 0666); + // There is no binary/text mode distinction on Linux and Mac #endif if (f == -1) { int erropen = errno; if( access( filename, F_OK ) != -1 ) - error("%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?", strerror(erropen), filename); + STOP("%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?", strerror(erropen), filename); else - error("%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), filename); + STOP("%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), filename); } } - t0=clock(); + t0=wallclock(); if (verbose) { - Rprintf("Writing column names ... "); - if (f==-1) Rprintf("\n"); + DTPRINT("Writing column names ... "); + if (f==-1) DTPRINT("\n"); } - if (LOGICAL(col_names)[0]) { - SEXP names = getAttrib(DFin, R_NamesSymbol); - if (names!=R_NilValue) { - if (LENGTH(names) != ncol) error("Internal error: length of column names is not equal to the number of columns. 
Please report."); - // allow for quoting even when not. - int buffSize = 2/*""*/ +1/*,*/; - for (int j=0; j 1 million bytes long + *ch++ = sep; // this sep after the last column name won't be written to the file } + WRITE(f, eol, eolLen); // TODO: move error check above inside WRITE } - if (verbose) Rprintf("done in %.3fs\n", 1.0*(clock()-t0)/CLOCKS_PER_SEC); + free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above + if (verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); if (nrow == 0) { - if (verbose) Rprintf("No data rows present (nrow==0)\n"); - if (f!=-1 && CLOSE(f)) error("%s: '%s'", strerror(errno), filename); - UNPROTECT(protecti); - return(R_NilValue); + if (verbose) DTPRINT("No data rows present (nrow==0)\n"); + if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), filename); + return; } // Decide buffer size and rowsPerBatch for each thread @@ -661,50 +631,44 @@ void writefile( // turn out to be longer than estimated from the sample. // buffSize large enough to fit many lines to i) reduce calls to write() and ii) reduce thread sync points // It doesn't need to be small in cache because it's written contiguously. - // If we don't use all the buffer for any reasons that's ok as OS will only page in the pages touched. + // If we don't use all the buffer for any reasons that's ok as OS will only getch the cache lines touched. // So, generally the larger the better up to max filesize/nth to use all the threads. A few times // smaller than that though, to achieve some load balancing across threads since schedule(dynamic). 
- int buffMB = INTEGER(buffMB_Arg)[0]; // checked at R level between 1 and 1024 - if (buffMB<1 || buffMB>1024) error("buffMB=%d outside [1,1024]", buffMB); // check it again even so - size_t buffSize = 1024*1024*buffMB; - if (maxLineLen > buffSize) buffSize=2*maxLineLen; // A very long line; at least 1,048,576 characters + if (maxLineLen > buffSize) buffSize=2*maxLineLen; // A very long line; at least 1,048,576 characters (since min(buffMB)==1) rowsPerBatch = - (10*maxLineLen > buffSize) ? 1 : // very long lines (100,000 characters+) we'll just do one row at a time. + (10*maxLineLen > buffSize) ? 1 : // very very long lines (100,000 characters+) each thread will just do one row at a time. 0.5 * buffSize/maxLineLen; // Aim for 50% buffer usage. See checkBuffer for comments. if (rowsPerBatch > nrow) rowsPerBatch=nrow; int numBatches = (nrow-1)/rowsPerBatch + 1; if (numBatches < nth) nth = numBatches; if (verbose) { - Rprintf("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... ", - nrow, numBatches, rowsPerBatch, buffMB, showProgress, nth); - if (f==-1) Rprintf("\n"); + DTPRINT("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... ", + nrow, numBatches, rowsPerBatch, buffMB, showProgress, nth); + if (f==-1) DTPRINT("\n"); } - t0 = clock(); + t0 = wallclock(); failed=0; // static global so checkBuffer can set it. -errno for malloc or realloc fails, +errno for write fail - Rboolean hasPrinted=FALSE; - Rboolean anyBufferGrown=FALSE; + bool hasPrinted=false; + bool anyBufferGrown=false; int maxBuffUsedPC=0; #pragma omp parallel num_threads(nth) { - char *ch, *buffer; // local to each thread - ch = buffer = malloc(buffSize); // each thread has its own buffer - // Don't use any R API alloc here (e.g. R_alloc); they are - // not thread-safe as per last sentence of R-exts 6.1.1. 
- - if (buffer==NULL) {failed=-errno;} + char *ch, *myBuff; // local to each thread + ch = myBuff = malloc(buffSize); // each thread has its own buffer. malloc and errno are thread-safe. + if (myBuff==NULL) {failed=-errno;} // Do not rely on availability of '#omp cancel' new in OpenMP v4.0 (July 2013). // OpenMP v4.0 is in gcc 4.9+ (https://gcc.gnu.org/wiki/openmp) but // not yet in clang as of v3.8 (http://openmp.llvm.org/) // If not-me failed, I'll see shared 'failed', fall through loop, free my buffer - // and after parallel section, single thread will call R API error() safely. + // and after parallel section, single thread will call STOP() safely. size_t myAlloc = buffSize; size_t myMaxLineLen = maxLineLen; - // so we can realloc(). Should only be needed if there are very long single CHARSXP - // much longer than occurred in the sample for maxLineLen. Or for list() columns - // contain vectors which are much longer than occurred in the sample. + // so we can realloc(). Should only be needed if there are very long lines that are + // much longer than occurred in the sample for maxLineLen; e.g. unusally long string values + // that didn't occur in the sample, or list columns with some very long vectors in some cells. #pragma omp single { @@ -713,28 +677,30 @@ void writefile( int me = omp_get_thread_num(); #pragma omp for ordered schedule(dynamic) - for(RLEN start=0; start=1 because 0-columns was caught earlier. - write_chars(eol, &ch); // replace it with the newline. + write_chars(eol, &ch); // overwrite last sep with eol instead // Track longest line seen so far. If we start to see longer lines than we saw in the // sample, we'll realloc the buffer. The rowsPerBatch chosen based on the (very good) sample, @@ -742,15 +708,15 @@ void writefile( // file output would be out-of-order. Can't change rowsPerBatch after the 'parallel for' started. 
size_t thisLineLen = ch-lineStart; if (thisLineLen > myMaxLineLen) myMaxLineLen=thisLineLen; - checkBuffer(&buffer, &myAlloc, &ch, myMaxLineLen); - if (failed) break; // this thread stop writing rows; fall through to clear up and error() below + checkBuffer(&myBuff, &myAlloc, &ch, myMaxLineLen); + if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below } #pragma omp ordered { if (!failed) { // a thread ahead of me could have failed below while I was working or waiting above if (f==-1) { - *ch='\0'; // standard C string end marker so Rprintf knows where to stop - Rprintf(buffer); + *ch='\0'; // standard C string end marker so DTPRINT knows where to stop + DTPRINT(myBuff); // nth==1 at this point since when file=="" (f==-1 here) fwrite.R calls setDTthreads(1) // Although this ordered section is one-at-a-time it seems that calling Rprintf() here, even with a // R_FlushConsole() too, causes corruptions on Windows but not on Linux. At least, as observed so @@ -758,27 +724,27 @@ void writefile( // by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console // to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast. } else { - if (WRITE(f, buffer, (int)(ch-buffer)) == -1) { + if (WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { failed=errno; } - if (myAlloc > buffSize) anyBufferGrown = TRUE; - int used = 100*((double)(ch-buffer))/buffSize; // percentage of original buffMB + if (myAlloc > buffSize) anyBufferGrown = true; + int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB if (used > maxBuffUsedPC) maxBuffUsedPC = used; - time_t now; - if (me==0 && showProgress && (now=time(NULL))>=next_time && !failed) { + double now; + if (me==0 && showProgress && (now=wallclock())>=nextTime && !failed) { // See comments above inside the f==-1 clause. 
// Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the // master thread (me==0) and hopefully this will work on Windows. If not, user should set // showProgress=FALSE until this can be fixed or removed. - int ETA = (int)((nrow-end)*(((double)(now-start_time))/end)); + int ETA = (int)((nrow-end)*((now-startTime)/end)); if (hasPrinted || ETA >= 2) { - if (verbose && !hasPrinted) Rprintf("\n"); - Rprintf("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " + if (verbose && !hasPrinted) DTPRINT("\n"); + DTPRINT("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " "anyBufferGrown=%s; maxBuffUsed=%d%%. ETA %d secs. ", - (100.0*end)/nrow, nrow, (int)(now-start_time), nth, nth==1?"":"s", + (100.0*end)/nrow, nrow, (int)(now-startTime), nth, nth==1?"":"s", anyBufferGrown?"yes":"no", maxBuffUsedPC, ETA); - R_FlushConsole(); // for Windows - next_time = now+1; + // TODO: use progress() as in fread + nextTime = now+1; hasPrinted = TRUE; } } @@ -797,11 +763,11 @@ void writefile( // Conclusion for now: do not provide ability to interrupt. // write() errors and malloc() fails will be caught and cleaned up properly, however. } - ch = buffer; // back to the start of my buffer ready to fill it up again + ch = myBuff; // back to the start of my buffer ready to fill it up again } } } - free(buffer); + free(myBuff); // all threads will call this free on their buffer, even if one or more threads had malloc // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. 
} @@ -809,28 +775,26 @@ void writefile( if (hasPrinted) { if (!failed) { // clear the progress meter - Rprintf("\r " + DTPRINT("\r " " \r"); - R_FlushConsole(); // for Windows } else { // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC - Rprintf("\n"); + DTPRINT("\n"); } } if (f!=-1 && CLOSE(f) && !failed) - error("%s: '%s'", strerror(errno), filename); + STOP("%s: '%s'", strerror(errno), filename); // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail // from the original error. if (failed<0) { - error("%s. One or more threads failed to malloc or realloc their private buffer. nThread=%d and initial buffMB per thread was %d.\n", strerror(-failed), nth, buffMB); + STOP("%s. One or more threads failed to malloc or realloc their private buffer. nThread=%d and initial buffMB per thread was %d.\n", strerror(-failed), nth, buffMB); } else if (failed>0) { - error("%s: '%s'", strerror(failed), filename); + STOP("%s: '%s'", strerror(failed), filename); } - if (verbose) Rprintf("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", + if (verbose) DTPRINT("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); - UNPROTECT(protecti); - return(R_NilValue); + return; } From 979012f671b4bc4f4f86a6cf32d6f3eb34c4f01c Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 17 Sep 2017 16:06:19 -0700 Subject: [PATCH 06/16] Interim --- src/fwrite.c | 97 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 39 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 8dfba99cf9..3d09d80ca2 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -35,36 +35,55 @@ static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd), 2 #define DATETIMEAS_WRITECSV 3 typedef void 
(*writer_fun_t)(void *, int, char **); -static inline void write_chars(const char *x, char **thisCh) +static inline void write_chars(const char *x, char **thCh) { // similar to C's strcpy but i) doesn't include trailing \0 and ii) moves destination along - char *ch = *thisCh; + char *ch = *thCh; while (*x) *ch++=*x++; - *thisCh = ch; + *thCh = ch; } -static void writeLogical(int *col, int row, char **thisCh) +static void writeLogicalBest(int8_t *col, int row, char **thCh) { - int x = col[row]; - char *ch = *thisCh; + int8_t x = col[row]; + if (x==INT8_MIN) return; // na empty field + *(*thCh++) = '0'+x; +} + + +static void writeLogical01(int32_t *col, int row, char **thCh) +{ + int32_t x = col[row]; + char *ch = *thCh; if (x == INT32_MIN) { write_chars(na, &ch); - } else if (logicalAsInt) { // TODO raise this up to use different processor, default TRUE + } else { *ch++ = '0'+x; + } + *thCh = ch; +} + + +static void writeLogicalLong(int32_t *col, int row, char **thCh) +{ + int32_t x = col[row]; + char *ch = *thCh; + if (x == INT32_MIN) { + write_chars(na, &ch); } else if (x) { *ch++='T'; *ch++='R'; *ch++='U'; *ch++='E'; } else { *ch++='F'; *ch++='A'; *ch++='L'; *ch++='S'; *ch++='E'; } - *thisCh = ch; + *thCh = ch; } -static inline void write_positive_int(int64_t x, char **thisCh) +static inline void write_positive_int(int64_t x, char **thCh) { // Avoid log() for speed. Write backwards then reverse when we know how long. 
// Separate function just because it's used if row numbers are asked for, too // x >= 1 - char *ch = *thisCh; + char *ch = *thCh; int width = 0; while (x>0) { *ch++ = '0'+x%10; x /= 10; width++; } for (int i=width/2; i>0; i--) { @@ -72,12 +91,12 @@ static inline void write_positive_int(int64_t x, char **thisCh) *(ch-i) = *(ch-width+i-1); *(ch-width+i-1) = tmp; } - *thisCh = ch; + *thCh = ch; } -static void writeInt32(int32_t *col, int row, char **thisCh) +static void writeInt32(int32_t *col, int row, char **thCh) { - char *ch = *thisCh; + char *ch = *thCh; int32_t x = col[row]; if (x == 0) { *ch++ = '0'; @@ -87,12 +106,12 @@ static void writeInt32(int32_t *col, int row, char **thisCh) if (x<0) { *ch++ = '-'; x=-x; } write_positive_int(x, &ch); } - *thisCh = ch; + *thCh = ch; } -static void writeInt64(int64_t *col, int row, char **thisCh) +static void writeInt64(int64_t *col, int row, char **thCh) { - char *ch = *thisCh; + char *ch = *thCh; int64_t x = col[row]; if (x == 0) { *ch++ = '0'; @@ -102,7 +121,7 @@ static void writeInt64(int64_t *col, int row, char **thisCh) if (x<0) { *ch++ = '-'; x=-x; } write_positive_int(x, &ch); } - *thisCh = ch; + *thCh = ch; } /* @@ -145,10 +164,10 @@ void genLookups() { } */ -static void writeNumeric(double *col, int row, char **thisCh) +static void writeNumeric(double *col, int row, char **thCh) { // hand-rolled / specialized for speed - // *thisCh is safely the output destination with enough space (ensured via calculating maxLineLen up front) + // *thCh is safely the output destination with enough space (ensured via calculating maxLineLen up front) // technique similar to base R (format.c:formatReal and printutils.c:EncodeReal0) // differences/tricks : // i) no buffers. writes straight to the final file buffer passed to write() @@ -156,7 +175,7 @@ static void writeNumeric(double *col, int row, char **thisCh) // iii) no need to return variables or flags. Just writes. 
// iv) shorter, easier to read and reason with in one self contained place. double x = col[row]; - char *ch = *thisCh; + char *ch = *thCh; if (!isfinite(x)) { if (isnan(x)) { write_chars(na, &ch); @@ -266,17 +285,17 @@ static void writeNumeric(double *col, int row, char **thisCh) } } } - *thisCh = ch; + *thCh = ch; } // DATE/TIME -static inline void write_time(int x, char **thisCh) +static inline void write_time(int x, char **thCh) // just a helper called below by the real writers (time-only and datetime) { - char *ch = *thisCh; + char *ch = *thCh; if (x<0) { // <0 covers NA_INTEGER too (==INT_MIN checked in init.c) write_chars(na, &ch); } else { @@ -294,14 +313,14 @@ static inline void write_time(int x, char **thisCh) *ch++ = '0'+ss/10; *ch++ = '0'+ss%10; } - *thisCh = ch; + *thCh = ch; } -static void writeITime(int *col, int row, char **thisCh) { - write_time(col[row], thisCh); +static void writeITime(int *col, int row, char **thCh) { + write_time(col[row], thCh); } -static inline void write_date(int x, char **thisCh) +static inline void write_date(int x, char **thCh) // just a helper called below by the two real writers (date-only and datetime) { // From base ?Date : @@ -322,7 +341,7 @@ static inline void write_date(int x, char **thisCh) // The end result is 5 lines of simple branch free integer math with no library calls. 
// as.integer(as.Date(c("0000-03-01","9999-12-31"))) == c(-719468,+2932896) - char *ch = *thisCh; + char *ch = *thCh; if (x< -719468 || x>2932896) { // NA_INTEGER<(-719468) (==INT_MIN checked in init.c) write_chars(na, &ch); @@ -348,16 +367,16 @@ static inline void write_date(int x, char **thisCh) *ch = '0'+y%10; y/=10; ch += 8 + 2*!squash; } - *thisCh = ch; + *thCh = ch; } -static void writeDateInt(int *col, int row, char **thisCh) { - write_date(col[row], thisCh); +static void writeDateInt(int *col, int row, char **thCh) { + write_date(col[row], thCh); } -static void writeDateReal(double *col, int row, char **thisCh) { - write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, thisCh); +static void writeDateReal(double *col, int row, char **thCh) { + write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, thCh); } -static void writePOSIXct(double *col, int row, char **thisCh) +static void writePOSIXct(double *col, int row, char **thCh) { // Write ISO8601 UTC by default to encourage ISO standards, stymie ambiguity and for speed. // R internally represents POSIX datetime in UTC always. Its 'tzone' attribute can be ignored. @@ -367,7 +386,7 @@ static void writePOSIXct(double *col, int row, char **thisCh) // on in the ops here; number of seconds since epoch. 
double x = col[row]; - char *ch = *thisCh; + char *ch = *thCh; if (!isfinite(x)) { write_chars(na, &ch); } else { @@ -414,13 +433,13 @@ static void writePOSIXct(double *col, int row, char **thisCh) *ch++ = 'Z'; ch -= squash; } - *thisCh = ch; + *thCh = ch; } -static void writeNanotime(int64_t *col, int row, char **thisCh) +static void writeNanotime(int64_t *col, int row, char **thCh) { int64_t x = col[row]; - char *ch = *thisCh; + char *ch = *thCh; if (x == INT64_MIN) { write_chars(na, &ch); } else { @@ -447,7 +466,7 @@ static void writeNanotime(int64_t *col, int row, char **thisCh) *ch++ = 'Z'; ch -= squash; } - *thisCh = ch; + *thCh = ch; } From 177b30ebfc8c819c832e1390fb82b7b926c0ac63 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 17 Sep 2017 19:52:19 -0700 Subject: [PATCH 07/16] Interim --- src/fwrite.c | 155 +++++++++++----- src/fwriteR.c | 483 +++++++++----------------------------------------- 2 files changed, 189 insertions(+), 449 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 3d09d80ca2..2c7f170f80 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -17,25 +17,26 @@ #define WRITE write #define CLOSE close #endif - -extern double wallclock(void); +#include +#include "freadR.h" // STOP, DTPRINT, DTWARN #define NUM_SF 15 #define SIZE_SF 1000000000000000ULL // 10^NUM_SF // Globals for this file only. Written once to hold parameters passed from R level. -static const char *na; // by default "" or if set (not recommended) then usually "NA" -static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 -static bool verbose=false; // be chatty? -static bool quote=false; // whether to surround fields with double quote ". 
NA means 'auto' (default) -static bool qmethod_escape=false; // when quoting fields, how to manage double quote in the field contents -static bool squash=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) -static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd), 2=epoch, 3=write.csv -#define DATETIMEAS_EPOCH 2 -#define DATETIMEAS_WRITECSV 3 -typedef void (*writer_fun_t)(void *, int, char **); - -static inline void write_chars(const char *x, char **thCh) +static char *na; // by default "" or if set (not recommended) then usually "NA" +static char sep; // comma in .csv files +char sep2; // '|' within list columns. Used here to know if field should be quoted and in freadR.c to write sep2 in list columns +static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 +static int8_t quote=INT8_MIN; // whether to surround fields with double quote ". NA means 'auto' (default) +static bool qmethod_escape=false; // when quoting fields, how to escape double quotes in the field contents (default false means to add another double quote) +static bool squash=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) + +extern const char *getString(void *, int); +extern const char *getCategString(void *, int); +extern double wallclock(void); + +inline void write_chars(const char *x, char **thCh) { // similar to C's strcpy but i) doesn't include trailing \0 and ii) moves destination along char *ch = *thCh; @@ -43,28 +44,21 @@ static inline void write_chars(const char *x, char **thCh) *thCh = ch; } -static void writeLogicalBest(int8_t *col, int row, char **thCh) +void writeBool8(int8_t *col, int row, char **thCh) { int8_t x = col[row]; - if (x==INT8_MIN) return; // na empty field + if (x==INT8_MIN) return; *(*thCh++) = '0'+x; } - -static void writeLogical01(int32_t *col, int row, char **thCh) +void writeBool32(int32_t *col, int row, char **thCh) { int32_t x = col[row]; - char *ch = *thCh; - if (x == INT32_MIN) { - write_chars(na, &ch); - } else { - *ch++ = '0'+x; - } - *thCh = 
ch; + if (x==INT32_MIN) return; + *(*thCh++) = '0'+x; } - -static void writeLogicalLong(int32_t *col, int row, char **thCh) +void writeBool32AsString(int32_t *col, int row, char **thCh) { int32_t x = col[row]; char *ch = *thCh; @@ -94,7 +88,7 @@ static inline void write_positive_int(int64_t x, char **thCh) *thCh = ch; } -static void writeInt32(int32_t *col, int row, char **thCh) +void writeInt32(int32_t *col, int row, char **thCh) { char *ch = *thCh; int32_t x = col[row]; @@ -104,12 +98,12 @@ static void writeInt32(int32_t *col, int row, char **thCh) write_chars(na, &ch); } else { if (x<0) { *ch++ = '-'; x=-x; } - write_positive_int(x, &ch); + write_positive_int((int64_t)x, &ch); } *thCh = ch; } -static void writeInt64(int64_t *col, int row, char **thCh) +void writeInt64(int64_t *col, int row, char **thCh) { char *ch = *thCh; int64_t x = col[row]; @@ -164,7 +158,7 @@ void genLookups() { } */ -static void writeNumeric(double *col, int row, char **thCh) +void writeFloat64(double *col, int row, char **thCh) { // hand-rolled / specialized for speed // *thCh is safely the output destination with enough space (ensured via calculating maxLineLen up front) @@ -288,11 +282,9 @@ static void writeNumeric(double *col, int row, char **thCh) *thCh = ch; } - - // DATE/TIME -static inline void write_time(int x, char **thCh) +static inline void write_time(int32_t x, char **thCh) // just a helper called below by the real writers (time-only and datetime) { char *ch = *thCh; @@ -316,11 +308,11 @@ static inline void write_time(int x, char **thCh) *thCh = ch; } -static void writeITime(int *col, int row, char **thCh) { +void writeITime(int32_t *col, int row, char **thCh) { write_time(col[row], thCh); } -static inline void write_date(int x, char **thCh) +static inline void write_date(int32_t x, char **thCh) // just a helper called below by the two real writers (date-only and datetime) { // From base ?Date : @@ -369,14 +361,16 @@ static inline void write_date(int x, char **thCh) } *thCh = 
ch; } -static void writeDateInt(int *col, int row, char **thCh) { + +void writeDateInt32(int32_t *col, int row, char **thCh) { write_date(col[row], thCh); } -static void writeDateReal(double *col, int row, char **thCh) { + +void writeDateFloat64(double *col, int row, char **thCh) { write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, thCh); } -static void writePOSIXct(double *col, int row, char **thCh) +void writePOSIXct(double *col, int row, char **thCh) { // Write ISO8601 UTC by default to encourage ISO standards, stymie ambiguity and for speed. // R internally represents POSIX datetime in UTC always. Its 'tzone' attribute can be ignored. @@ -436,7 +430,7 @@ static void writePOSIXct(double *col, int row, char **thCh) *thCh = ch; } -static void writeNanotime(int64_t *col, int row, char **thCh) +void writeNanotime(int64_t *col, int row, char **thCh) { int64_t x = col[row]; char *ch = *thCh; @@ -469,6 +463,66 @@ static void writeNanotime(int64_t *col, int row, char **thCh) *thCh = ch; } +static inline void write_string(const char *x, char **thCh) +{ + char *ch = *thCh; + if (x == NULL) { + // NA is not quoted even when quote=TRUE to distinguish from quoted "NA" value. 
But going forward: ,,==NA and ,"",==empty string + write_chars(na, &ch); + } else { + int8_t q = quote; + if (q==INT8_MIN) { // NA means quote="auto" + const char *tt = x; + if (*tt == '\0') { + // Empty strings are always quoted to distinguish from ,,==NA + *ch++='"'; *ch++='"'; + *thCh = ch; + return; + } + while (*tt!='\0' && *tt!=sep && *tt!=sep2 && *tt!='\n' && *tt!='\r' && *tt!='"') *ch++ = *tt++; + // Windows includes \n in its \r\n so looking for \n only is sufficient + // sep2 is set to '\0' when no list columns are present + if (*tt=='\0') { + // most common case: no sep, newline or " contained in string + *thCh = ch; // advance caller over the field already written + return; + } + ch = *thCh; // rewind the field written since it needs to be quoted + q = true; + } + if (q==false) { + write_chars(x, &ch); + } else { + *ch++ = '"'; + const char *tt = x; + if (qmethod_escape) { + while (*tt!='\0') { + if (*tt=='"' || *tt=='\\') *ch++ = '\\'; + *ch++ = *tt++; + } + } else { + // qmethod='double' + while (*tt!='\0') { + if (*tt=='"') *ch++ = '"'; + *ch++ = *tt++; + } + } + *ch++ = '"'; + } + } + *thCh = ch; +} + +void writeString(void *col, int row, char **thCh) +{ + write_string(getString(col, row), thCh); +} + +void writeCategString(void *col, int row, char **thCh) +{ + write_string(getCategString(col, row), thCh); +} + static int failed = 0; static int rowsPerBatch; @@ -501,7 +555,7 @@ static inline void checkBuffer( } } -void writefile( +void fwriteMain( const char *filename, void **columns, // a vector of pointers to all-same-length column vectors int ncol, @@ -518,7 +572,6 @@ void writefile( bool append, bool doRowNames, // optional, likely false void *rowNames, // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. - int dateTimeAs, // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv. 
TODO: raise to caller choosing fun[] int buffMB, // [1-1024] default 8MB int nth, bool showProgress, @@ -530,7 +583,6 @@ void writefile( double t0 = startTime; squash = (dateTimeAs==1); - int firstListColumn = 0; // Estimate max line length of a 1000 row sample (100 rows in 10 places). // 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. @@ -566,7 +618,7 @@ void writefile( int thisLineLen=0; if (doRowNames) { if (rowNames) { - const char *ch = buff; + char *ch = buff; writeString(rowNames, i, &ch); thisLineLen += (int)(ch-buff); // see comments above about restrictions/guarantees/contracts } else { @@ -626,16 +678,21 @@ void writefile( } for (int j=0; j 1 million bytes long *ch++ = sep; // this sep after the last column name won't be written to the file } - WRITE(f, eol, eolLen); // TODO: move error check above inside WRITE + if (WRITE(f, eol, eolLen)==-1) { + int errwrite=errno; + close(f); + free(buff); + STOP("%s: '%s'", strerror(errwrite), filename); + } } free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above if (verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); @@ -764,7 +821,7 @@ void writefile( anyBufferGrown?"yes":"no", maxBuffUsedPC, ETA); // TODO: use progress() as in fread nextTime = now+1; - hasPrinted = TRUE; + hasPrinted = true; } } // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. diff --git a/src/fwriteR.c b/src/fwriteR.c index be0ac334ba..811315db86 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -1,114 +1,78 @@ +#include #include "data.table.h" +#include "fwrite.h" -/* Non-agnostic writers - * Where possible we use common agnostic writers. But in these cases there are unavoidable differences - * in the structure of the data being written. 
- */ +#define DATETIMEAS_EPOCH 2 +#define DATETIMEAS_WRITECSV 3 -static void writeString(SEXP col, int row, char **thisCh) -{ + +extern char sep2; +static bool logical01=true; +static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv +static const char *sep2start, *sep2end; +// sep2 is in main fwrite.c so that writeString can quote other fields if sep2 is present in them +// if there are no list columns, set sep2=='\0' + +// Non-agnostic helpers ... + +inline const char *getString(SEXP col, int row) { SEXP x = STRING_ELT(col, row); - char *ch = *thisCh; - if (x == NA_STRING) { - // NA is not quoted by write.csv even when quote=TRUE to distinguish from "NA" - write_chars(na, &ch); - } else { - Rboolean q = quote; - if (q==NA_LOGICAL) { // quote="auto" - const char *tt = CHAR(x); - if (*tt == '\0') { - // Empty strings are always quoted: this distinguishes them from NAs - *ch = '"'; ch[1] = '"'; - *thisCh += 2; - return; - } - while (*tt!='\0' && *tt!=sep && *tt!=sep2 && *tt!='\n' && *tt!='"') *ch++ = *tt++; - // Windows includes \n in its \r\n so looking for \n only is sufficient - // sep2 is set to '\0' when no list columns are present - if (*tt=='\0') { - // most common case: no sep, newline or " contained in string - *thisCh = ch; // advance caller over the field already written - return; - } - ch = *thisCh; // rewind the field written since it needs to be quoted - q = TRUE; - } - if (q==FALSE) { - write_chars(CHAR(x), &ch); - } else { - *ch++ = '"'; - const char *tt = CHAR(x); - if (qmethod_escape) { - while (*tt!='\0') { - if (*tt=='"' || *tt=='\\') *ch++ = '\\'; - *ch++ = *tt++; - } - } else { - // qmethod='double' - while (*tt!='\0') { - if (*tt=='"') *ch++ = '"'; - *ch++ = *tt++; - } - } - *ch++ = '"'; - } - } - *thisCh = ch; + return x==NA_STRING ? 
NULL : CHAR(x); } -static void writeFactor(SEXP column, int i, char **thisCh) { - char *ch = *thisCh; - if (INTEGER(column)[i]==NA_INTEGER) write_chars(na, &ch); - else writeString(getAttrib(column, R_LevelsSymbol), INTEGER(column)[i]-1, &ch); - *thisCh = ch; +inline const char *getCategString(SEXP col, int row) { + int x = INTEGER(col)[row]; + return x==NA_INTEGER ? NULL : CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); } -static void writeList(SEXP, int, char **); // prototype needed because it calls back to whichWriter too +static writer_fun_t whichWriter(SEXP); + +void writeList123(void *col, int64_t row, char **thCh) { + SEXP v = VECTOR_ELT(col,row); + writer_fun_t fun = whichWriter(v); + if (TYPEOF(v)==VECSXP || fun==NULL) { + error("Row %d of list column is type '%s' - not yet implemented. fwrite() can write list columns containing atomic vectors of type logical, integer, integer64, double, character and factor, currently.", + row+1, type2char(TYPEOF(v))); + } + char *ch = *thCh; + write_chars(sep2start, &ch); + void *data = (void *)DATAPTR(v); + for (int j=0; j maxLineLen) maxLineLen = thisLineLen; - } - } - maxLineLen += strlen(eol); - if (verbose) Rprintf("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(clock()-t0)/CLOCKS_PER_SEC); - - int f; - if (*filename=='\0') { - f=-1; // file="" means write to standard output - eol = "\n"; // We'll use Rprintf(); it knows itself about \r\n on Windows - } else { -#ifdef WIN32 - f = _open(filename, _O_WRONLY | _O_BINARY | _O_CREAT | (LOGICAL(append)[0] ? _O_APPEND : _O_TRUNC), _S_IWRITE); - // eol must be passed from R level as '\r\n' on Windows since write() only auto-converts \n to \r\n in - // _O_TEXT mode. We use O_BINARY for full control and perhaps speed since O_TEXT must have to deep branch an if('\n') -#else - f = open(filename, O_WRONLY | O_CREAT | (LOGICAL(append)[0] ? 
O_APPEND : O_TRUNC), 0666); -#endif - if (f == -1) { - int erropen = errno; - if( access( filename, F_OK ) != -1 ) - error("%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?", strerror(erropen), filename); - else - error("%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), filename); - } - } - t0=clock(); - - if (verbose) { - Rprintf("Writing column names ... "); - if (f==-1) Rprintf("\n"); - } - if (LOGICAL(col_names)[0]) { - SEXP names = getAttrib(DFin, R_NamesSymbol); - if (names!=R_NilValue) { - if (LENGTH(names) != ncol) error("Internal error: length of column names is not equal to the number of columns. Please report."); - // allow for quoting even when not. - int buffSize = 2/*""*/ +1/*,*/; - for (int j=0; j1024) error("buffMB=%d outside [1,1024]", buffMB); // check it again even so - size_t buffSize = 1024*1024*buffMB; - if (maxLineLen > buffSize) buffSize=2*maxLineLen; // A very long line; at least 1,048,576 characters - rowsPerBatch = - (10*maxLineLen > buffSize) ? 1 : // very long lines (100,000 characters+) we'll just do one row at a time. - 0.5 * buffSize/maxLineLen; // Aim for 50% buffer usage. See checkBuffer for comments. - if (rowsPerBatch > nrow) rowsPerBatch=nrow; - int numBatches = (nrow-1)/rowsPerBatch + 1; - if (numBatches < nth) nth = numBatches; - if (verbose) { - Rprintf("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... 
", - nrow, numBatches, rowsPerBatch, buffMB, showProgress, nth); - if (f==-1) Rprintf("\n"); - } - t0 = clock(); + fwriteMain( + (const char *) filename, + (void **) columns, + (int) ncol, + (int64_t) nrow, + (void *) colNames, + (writer_fun_t *) fun, + (int8_t *) whichFun, + (char) sep, + (const char *) eol, + (const char *) na, + (char) dec, + (char) quote, + (bool) quotesDoubled, + (bool) squashDateTime, + (bool) append, + (bool) doRowNames, + (void *) rowNames, + (int) buffMB, + (int) nth, + (bool) showProgress, + (bool) verbose + ); - failed=0; // static global so checkBuffer can set it. -errno for malloc or realloc fails, +errno for write fail - Rboolean hasPrinted=FALSE; - Rboolean anyBufferGrown=FALSE; - int maxBuffUsedPC=0; - - #pragma omp parallel num_threads(nth) - { - char *ch, *buffer; // local to each thread - ch = buffer = malloc(buffSize); // each thread has its own buffer - // Don't use any R API alloc here (e.g. R_alloc); they are - // not thread-safe as per last sentence of R-exts 6.1.1. - - if (buffer==NULL) {failed=-errno;} - // Do not rely on availability of '#omp cancel' new in OpenMP v4.0 (July 2013). - // OpenMP v4.0 is in gcc 4.9+ (https://gcc.gnu.org/wiki/openmp) but - // not yet in clang as of v3.8 (http://openmp.llvm.org/) - // If not-me failed, I'll see shared 'failed', fall through loop, free my buffer - // and after parallel section, single thread will call R API error() safely. - - size_t myAlloc = buffSize; - size_t myMaxLineLen = maxLineLen; - // so we can realloc(). Should only be needed if there are very long single CHARSXP - // much longer than occurred in the sample for maxLineLen. Or for list() columns - // contain vectors which are much longer than occurred in the sample. 
- - #pragma omp single - { - nth = omp_get_num_threads(); // update nth with the actual nth (might be different than requested) - } - int me = omp_get_thread_num(); - - #pragma omp for ordered schedule(dynamic) - for(RLEN start=0; start=1 because 0-columns was caught earlier. - write_chars(eol, &ch); // replace it with the newline. - - // Track longest line seen so far. If we start to see longer lines than we saw in the - // sample, we'll realloc the buffer. The rowsPerBatch chosen based on the (very good) sample, - // must fit in the buffer. Can't early write and reset buffer because the - // file output would be out-of-order. Can't change rowsPerBatch after the 'parallel for' started. - size_t thisLineLen = ch-lineStart; - if (thisLineLen > myMaxLineLen) myMaxLineLen=thisLineLen; - checkBuffer(&buffer, &myAlloc, &ch, myMaxLineLen); - if (failed) break; // this thread stop writing rows; fall through to clear up and error() below - } - #pragma omp ordered - { - if (!failed) { // a thread ahead of me could have failed below while I was working or waiting above - if (f==-1) { - *ch='\0'; // standard C string end marker so Rprintf knows where to stop - Rprintf(buffer); - // nth==1 at this point since when file=="" (f==-1 here) fwrite.R calls setDTthreads(1) - // Although this ordered section is one-at-a-time it seems that calling Rprintf() here, even with a - // R_FlushConsole() too, causes corruptions on Windows but not on Linux. At least, as observed so - // far using capture.output(). Perhaps Rprintf() updates some state or allocation that cannot be done - // by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console - // to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast. 
- } else { - if (WRITE(f, buffer, (int)(ch-buffer)) == -1) { - failed=errno; - } - if (myAlloc > buffSize) anyBufferGrown = TRUE; - int used = 100*((double)(ch-buffer))/buffSize; // percentage of original buffMB - if (used > maxBuffUsedPC) maxBuffUsedPC = used; - time_t now; - if (me==0 && showProgress && (now=time(NULL))>=next_time && !failed) { - // See comments above inside the f==-1 clause. - // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the - // master thread (me==0) and hopefully this will work on Windows. If not, user should set - // showProgress=FALSE until this can be fixed or removed. - int ETA = (int)((nrow-end)*(((double)(now-start_time))/end)); - if (hasPrinted || ETA >= 2) { - if (verbose && !hasPrinted) Rprintf("\n"); - Rprintf("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " - "anyBufferGrown=%s; maxBuffUsed=%d%%. ETA %d secs. ", - (100.0*end)/nrow, nrow, (int)(now-start_time), nth, nth==1?"":"s", - anyBufferGrown?"yes":"no", maxBuffUsedPC, ETA); - R_FlushConsole(); // for Windows - next_time = now+1; - hasPrinted = TRUE; - } - } - // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. - // Something like: - // if (me==0) { - // failed = TRUE; // inside ordered here; the slaves are before ordered and not looking at 'failed' - // R_CheckUserInterrupt(); - // failed = FALSE; // no user interrupt so return state - // } - // But I fear the slaves will hang waiting for the master (me==0) to complete the ordered - // section which may not happen if the master thread has been interrupted. Rather than - // seeing failed=TRUE and falling through to free() and close() as intended. - // Could register a finalizer to free() and close() perhaps : - // http://r.789695.n4.nabble.com/checking-user-interrupts-in-C-code-tp2717528p2717722.html - // Conclusion for now: do not provide ability to interrupt. 
- // write() errors and malloc() fails will be caught and cleaned up properly, however. - } - ch = buffer; // back to the start of my buffer ready to fill it up again - } - } - } - free(buffer); - // all threads will call this free on their buffer, even if one or more threads had malloc - // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. - } - // Finished parallel region and can call R API safely now. - if (hasPrinted) { - if (!failed) { - // clear the progress meter - Rprintf("\r " - " \r"); - R_FlushConsole(); // for Windows - } else { - // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC - Rprintf("\n"); - } - } - if (f!=-1 && CLOSE(f) && !failed) - error("%s: '%s'", strerror(errno), filename); - // quoted '%s' in case of trailing spaces in the filename - // If a write failed, the line above tries close() to clean up, but that might fail as well. So the - // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail - // from the original error. - if (failed<0) { - error("%s. One or more threads failed to malloc or realloc their private buffer. 
nThread=%d and initial buffMB per thread was %d.\n", strerror(-failed), nth, buffMB); - } else if (failed>0) { - error("%s: '%s'", strerror(failed), filename); - } - if (verbose) Rprintf("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", - nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); UNPROTECT(protecti); return(R_NilValue); } From dab20b78edcf51208c5f7aadf45f90348dda1adb Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Sun, 17 Sep 2017 19:52:36 -0700 Subject: [PATCH 08/16] Interim --- src/fwrite.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 src/fwrite.h diff --git a/src/fwrite.h b/src/fwrite.h new file mode 100644 index 0000000000..5f25f37041 --- /dev/null +++ b/src/fwrite.h @@ -0,0 +1,22 @@ + +typedef void (*writer_fun_t)(void *, int64_t, char **); + +writer_fun_t writeBool8; +writer_fun_t writeBool32; +writer_fun_t writeBool32AsString; +writer_fun_t writeInt32; +writer_fun_t writeInt64; +writer_fun_t writeFloat64; +writer_fun_t writeITime; +writer_fun_t writeDateInt32; +writer_fun_t writeDateFloat64; +writer_fun_t writePOSIXct; +writer_fun_t writeNanotime; +writer_fun_t writeString; +writer_fun_t writeCategString; +//writer_fun_t writeList123; + +extern void write_chars(const char *source, char **dest); + +void fwriteMain(); + From b48344a016e84bdd0234b0f83fb4f693795d662b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Tue, 19 Sep 2017 13:51:21 -0700 Subject: [PATCH 09/16] Interim --- src/fwrite.c | 250 +++++++++++++++++++++++------------------------- src/fwrite.h | 100 ++++++++++++++++---- src/fwriteR.c | 257 +++++++++++++++++++++++++++----------------------- src/init.c | 6 +- 4 files changed, 340 insertions(+), 273 deletions(-) diff --git a/src/fwrite.c b/src/fwrite.c index 2c7f170f80..462fe094d4 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -18,7 +18,7 @@ #define CLOSE close #endif #include -#include "freadR.h" // STOP, DTPRINT, DTWARN +#include "fwrite.h" #define NUM_SF 15 #define SIZE_SF 1000000000000000ULL 
// 10^NUM_SF @@ -26,7 +26,7 @@ // Globals for this file only. Written once to hold parameters passed from R level. static char *na; // by default "" or if set (not recommended) then usually "NA" static char sep; // comma in .csv files -char sep2; // '|' within list columns. Used here to know if field should be quoted and in freadR.c to write sep2 in list columns +static char sep2; // '|' within list columns. Used here to know if field should be quoted and in freadR.c to write sep2 in list columns static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 static int8_t quote=INT8_MIN; // whether to surround fields with double quote ". NA means 'auto' (default) static bool qmethod_escape=false; // when quoting fields, how to escape double quotes in the field contents (default false means to add another double quote) @@ -36,32 +36,32 @@ extern const char *getString(void *, int); extern const char *getCategString(void *, int); extern double wallclock(void); -inline void write_chars(const char *x, char **thCh) +inline void write_chars(const char *x, char **pch) { // similar to C's strcpy but i) doesn't include trailing \0 and ii) moves destination along - char *ch = *thCh; + char *ch = *pch; while (*x) *ch++=*x++; - *thCh = ch; + *pch = ch; } -void writeBool8(int8_t *col, int row, char **thCh) +void writeBool8(int8_t *col, int row, char **pch) { int8_t x = col[row]; if (x==INT8_MIN) return; - *(*thCh++) = '0'+x; + *(*pch++) = '0'+x; } -void writeBool32(int32_t *col, int row, char **thCh) +void writeBool32(int32_t *col, int row, char **pch) { int32_t x = col[row]; if (x==INT32_MIN) return; - *(*thCh++) = '0'+x; + *(*pch++) = '0'+x; } -void writeBool32AsString(int32_t *col, int row, char **thCh) +void writeBool32AsString(int32_t *col, int row, char **pch) { int32_t x = col[row]; - char *ch = *thCh; + char *ch = *pch; if (x == INT32_MIN) { write_chars(na, &ch); } else if (x) { @@ -69,15 +69,15 @@ void writeBool32AsString(int32_t *col, int row, char **thCh) } 
else { *ch++='F'; *ch++='A'; *ch++='L'; *ch++='S'; *ch++='E'; } - *thCh = ch; + *pch = ch; } -static inline void write_positive_int(int64_t x, char **thCh) +static inline void write_positive_int(int64_t x, char **pch) { // Avoid log() for speed. Write backwards then reverse when we know how long. // Separate function just because it's used if row numbers are asked for, too // x >= 1 - char *ch = *thCh; + char *ch = *pch; int width = 0; while (x>0) { *ch++ = '0'+x%10; x /= 10; width++; } for (int i=width/2; i>0; i--) { @@ -85,12 +85,12 @@ static inline void write_positive_int(int64_t x, char **thCh) *(ch-i) = *(ch-width+i-1); *(ch-width+i-1) = tmp; } - *thCh = ch; + *pch = ch; } -void writeInt32(int32_t *col, int row, char **thCh) +void writeInt32(int32_t *col, int row, char **pch) { - char *ch = *thCh; + char *ch = *pch; int32_t x = col[row]; if (x == 0) { *ch++ = '0'; @@ -100,12 +100,12 @@ void writeInt32(int32_t *col, int row, char **thCh) if (x<0) { *ch++ = '-'; x=-x; } write_positive_int((int64_t)x, &ch); } - *thCh = ch; + *pch = ch; } -void writeInt64(int64_t *col, int row, char **thCh) +void writeInt64(int64_t *col, int row, char **pch) { - char *ch = *thCh; + char *ch = *pch; int64_t x = col[row]; if (x == 0) { *ch++ = '0'; @@ -115,7 +115,7 @@ void writeInt64(int64_t *col, int row, char **thCh) if (x<0) { *ch++ = '-'; x=-x; } write_positive_int(x, &ch); } - *thCh = ch; + *pch = ch; } /* @@ -158,10 +158,10 @@ void genLookups() { } */ -void writeFloat64(double *col, int row, char **thCh) +void writeFloat64(double *col, int row, char **pch) { // hand-rolled / specialized for speed - // *thCh is safely the output destination with enough space (ensured via calculating maxLineLen up front) + // *pch is safely the output destination with enough space (ensured via calculating maxLineLen up front) // technique similar to base R (format.c:formatReal and printutils.c:EncodeReal0) // differences/tricks : // i) no buffers. 
writes straight to the final file buffer passed to write() @@ -169,7 +169,7 @@ void writeFloat64(double *col, int row, char **thCh) // iii) no need to return variables or flags. Just writes. // iv) shorter, easier to read and reason with in one self contained place. double x = col[row]; - char *ch = *thCh; + char *ch = *pch; if (!isfinite(x)) { if (isnan(x)) { write_chars(na, &ch); @@ -279,15 +279,15 @@ void writeFloat64(double *col, int row, char **thCh) } } } - *thCh = ch; + *pch = ch; } // DATE/TIME -static inline void write_time(int32_t x, char **thCh) +static inline void write_time(int32_t x, char **pch) // just a helper called below by the real writers (time-only and datetime) { - char *ch = *thCh; + char *ch = *pch; if (x<0) { // <0 covers NA_INTEGER too (==INT_MIN checked in init.c) write_chars(na, &ch); } else { @@ -305,14 +305,14 @@ static inline void write_time(int32_t x, char **thCh) *ch++ = '0'+ss/10; *ch++ = '0'+ss%10; } - *thCh = ch; + *pch = ch; } -void writeITime(int32_t *col, int row, char **thCh) { - write_time(col[row], thCh); +void writeITime(int32_t *col, int row, char **pch) { + write_time(col[row], pch); } -static inline void write_date(int32_t x, char **thCh) +static inline void write_date(int32_t x, char **pch) // just a helper called below by the two real writers (date-only and datetime) { // From base ?Date : @@ -333,7 +333,7 @@ static inline void write_date(int32_t x, char **thCh) // The end result is 5 lines of simple branch free integer math with no library calls. 
// as.integer(as.Date(c("0000-03-01","9999-12-31"))) == c(-719468,+2932896) - char *ch = *thCh; + char *ch = *pch; if (x< -719468 || x>2932896) { // NA_INTEGER<(-719468) (==INT_MIN checked in init.c) write_chars(na, &ch); @@ -359,18 +359,18 @@ static inline void write_date(int32_t x, char **thCh) *ch = '0'+y%10; y/=10; ch += 8 + 2*!squash; } - *thCh = ch; + *pch = ch; } -void writeDateInt32(int32_t *col, int row, char **thCh) { - write_date(col[row], thCh); +void writeDateInt32(int32_t *col, int row, char **pch) { + write_date(col[row], pch); } -void writeDateFloat64(double *col, int row, char **thCh) { - write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, thCh); +void writeDateFloat64(double *col, int row, char **pch) { + write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, pch); } -void writePOSIXct(double *col, int row, char **thCh) +void writePOSIXct(double *col, int row, char **pch) { // Write ISO8601 UTC by default to encourage ISO standards, stymie ambiguity and for speed. // R internally represents POSIX datetime in UTC always. Its 'tzone' attribute can be ignored. @@ -380,7 +380,7 @@ void writePOSIXct(double *col, int row, char **thCh) // on in the ops here; number of seconds since epoch. 
double x = col[row]; - char *ch = *thCh; + char *ch = *pch; if (!isfinite(x)) { write_chars(na, &ch); } else { @@ -427,13 +427,13 @@ void writePOSIXct(double *col, int row, char **thCh) *ch++ = 'Z'; ch -= squash; } - *thCh = ch; + *pch = ch; } -void writeNanotime(int64_t *col, int row, char **thCh) +void writeNanotime(int64_t *col, int row, char **pch) { int64_t x = col[row]; - char *ch = *thCh; + char *ch = *pch; if (x == INT64_MIN) { write_chars(na, &ch); } else { @@ -460,12 +460,12 @@ void writeNanotime(int64_t *col, int row, char **thCh) *ch++ = 'Z'; ch -= squash; } - *thCh = ch; + *pch = ch; } -static inline void write_string(const char *x, char **thCh) +static inline void write_string(const char *x, char **pch) { - char *ch = *thCh; + char *ch = *pch; if (x == NULL) { // NA is not quoted even when quote=TRUE to distinguish from quoted "NA" value. But going forward: ,,==NA and ,"",==empty string write_chars(na, &ch); @@ -476,7 +476,7 @@ static inline void write_string(const char *x, char **thCh) if (*tt == '\0') { // Empty strings are always quoted to distinguish from ,,==NA *ch++='"'; *ch++='"'; - *thCh = ch; + *pch = ch; return; } while (*tt!='\0' && *tt!=sep && *tt!=sep2 && *tt!='\n' && *tt!='\r' && *tt!='"') *ch++ = *tt++; @@ -484,10 +484,10 @@ static inline void write_string(const char *x, char **thCh) // sep2 is set to '\0' when no list columns are present if (*tt=='\0') { // most common case: no sep, newline or " contained in string - *thCh = ch; // advance caller over the field already written + *pch = ch; // advance caller over the field already written return; } - ch = *thCh; // rewind the field written since it needs to be quoted + ch = *pch; // rewind the field written since it needs to be quoted q = true; } if (q==false) { @@ -510,17 +510,17 @@ static inline void write_string(const char *x, char **thCh) *ch++ = '"'; } } - *thCh = ch; + *pch = ch; } -void writeString(void *col, int row, char **thCh) +void writeString(void *col, int row, char **pch) 
{ - write_string(getString(col, row), thCh); + write_string(getString(col, row), pch); } -void writeCategString(void *col, int row, char **thCh) +void writeCategString(void *col, int row, char **pch) { - write_string(getCategString(col, row), thCh); + write_string(getCategString(col, row), pch); } @@ -555,34 +555,14 @@ static inline void checkBuffer( } } -void fwriteMain( - const char *filename, - void **columns, // a vector of pointers to all-same-length column vectors - int ncol, - int64_t nrow, - void *colNames, // NULL means no header, otherwise ncol strings - writer_fun_t *fun, // a vector of writer_fun_t function pointers - int8_t *whichFun, // length ncol vector containing which fun[] to use for each column - char sep, - const char *eol, - const char *na, - char dec, - char quote, - bool quotesDoubled, // false means escape quotes using backslash - bool append, - bool doRowNames, // optional, likely false - void *rowNames, // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. - int buffMB, // [1-1024] default 8MB - int nth, - bool showProgress, - bool verbose) +void fwriteMain(fwriteMainArgs args) { double startTime = wallclock(); double nextTime = startTime+2; // start printing progress meter in 2 sec if not completed by then double t0 = startTime; - squash = (dateTimeAs==1); + squash = args.squashDateTime; // Estimate max line length of a 1000 row sample (100 rows in 10 places). // 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. @@ -602,103 +582,103 @@ void fwriteMain( // Cold section as only 1,000 rows. Speed not an issue issue here. // Overestimating line length is ok. 
- int eolLen = strlen(eol); + int eolLen = strlen(args.eol); if (eolLen<=0) STOP("eol must be 1 or more bytes (usually either \\n or \\r\\n) but is length %d", eolLen); + int buffMB = args.buffMB; if (buffMB<1 || buffMB>1024) STOP("buffMB=%d outside [1,1024]", buffMB); size_t buffSize = (size_t)1024*1024*buffMB; char *buff = malloc(buffSize); if (!buff) STOP("Unable to allocate %dMB for line length estimation: %s", buffMB, strerror(errno)); int maxLineLen = 0; - int step = nrow<1000 ? 100 : nrow/10; - for (int start=0; start 1 million bytes - fun[whichFun[j]]( columns[j], i, &ch ); + args.funs[args.whichFun[j]]( args.columns[j], i, &ch ); thisLineLen += (int)(ch-buff) + 1/*sep*/; // see comments above about restrictions/guarantees/contracts } - thisLineLen += eolLen; if (thisLineLen > maxLineLen) maxLineLen = thisLineLen; } } - maxLineLen += strlen(eol); - if (verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); + maxLineLen += eolLen; + if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); int f; - if (*filename=='\0') { + if (*args.filename=='\0') { f=-1; // file="" means write to standard output - eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows + // eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows } else { #ifdef WIN32 - f = _open(filename, _O_WRONLY | _O_BINARY | _O_CREAT | (append ? _O_APPEND : _O_TRUNC), _S_IWRITE); + f = _open(args.filename, _O_WRONLY | _O_BINARY | _O_CREAT | (args.append ? _O_APPEND : _O_TRUNC), _S_IWRITE); // O_BINARY rather than O_TEXT for explicit control and speed since it seems that write() has a branch inside it // to convert \n to \r\n on Windows when in text mode not not when in binary mode. #else - f = open(filename, O_WRONLY | O_CREAT | (append ? O_APPEND : O_TRUNC), 0666); + f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? 
O_APPEND : O_TRUNC), 0666); // There is no binary/text mode distinction on Linux and Mac #endif if (f == -1) { int erropen = errno; - if( access( filename, F_OK ) != -1 ) - STOP("%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?", strerror(erropen), filename); - else - STOP("%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), filename); + STOP(access( args.filename, F_OK ) != -1 ? + "%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?" : + "%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", + strerror(erropen), args.filename); } } t0=wallclock(); - if (verbose) { + if (args.verbose) { DTPRINT("Writing column names ... "); if (f==-1) DTPRINT("\n"); } - if (colNames) { + if (args.colNames) { // We don't know how long this line will be. // It could be (much) longer than the data row line lengths // To keep things simple we'll reuse the same buffer used above for each field, and write each column name separately to the file. // If multiple calls to write() is ever an issue, we'll come back to this. But very unlikely. 
char *ch = buff; - if (doRowNames) { + if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column - if (quote!=false) { *ch++='"'; *ch++='"'; } // to match write.csv + if (args.doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv *ch++ = sep; } - for (int j=0; j 1 million bytes long - *ch++ = sep; // this sep after the last column name won't be written to the file + *ch++ = args.sep; // this sep after the last column name won't be written to the file } - if (WRITE(f, eol, eolLen)==-1) { + if (WRITE(f, args.eol, eolLen)==-1) { int errwrite=errno; close(f); free(buff); - STOP("%s: '%s'", strerror(errwrite), filename); + STOP("%s: '%s'", strerror(errwrite), args.filename); } } free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above - if (verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); - if (nrow == 0) { - if (verbose) DTPRINT("No data rows present (nrow==0)\n"); - if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), filename); + if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); + if (args.nrow == 0) { + if (args.verbose) DTPRINT("No data rows present (nrow==0)\n"); + if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename); return; } @@ -714,12 +694,13 @@ void fwriteMain( rowsPerBatch = (10*maxLineLen > buffSize) ? 1 : // very very long lines (100,000 characters+) each thread will just do one row at a time. 0.5 * buffSize/maxLineLen; // Aim for 50% buffer usage. See checkBuffer for comments. - if (rowsPerBatch > nrow) rowsPerBatch=nrow; - int numBatches = (nrow-1)/rowsPerBatch + 1; + if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; + int numBatches = (args.nrow-1)/rowsPerBatch + 1; + int nth = args.nth; if (numBatches < nth) nth = numBatches; - if (verbose) { + if (args.verbose) { DTPRINT("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... 
", - nrow, numBatches, rowsPerBatch, buffMB, showProgress, nth); + args.nrow, numBatches, rowsPerBatch, args.buffMB, args.showProgress, nth); if (f==-1) DTPRINT("\n"); } t0 = wallclock(); @@ -753,30 +734,30 @@ void fwriteMain( int me = omp_get_thread_num(); #pragma omp for ordered schedule(dynamic) - for(int start=0; start=1 because 0-columns was caught earlier. - write_chars(eol, &ch); // overwrite last sep with eol instead + write_chars(args.eol, &ch); // overwrite last sep with eol instead // Track longest line seen so far. If we start to see longer lines than we saw in the // sample, we'll realloc the buffer. The rowsPerBatch chosen based on the (very good) sample, @@ -807,17 +788,17 @@ void fwriteMain( int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB if (used > maxBuffUsedPC) maxBuffUsedPC = used; double now; - if (me==0 && showProgress && (now=wallclock())>=nextTime && !failed) { + if (me==0 && args.showProgress && (now=wallclock())>=nextTime && !failed) { // See comments above inside the f==-1 clause. // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the // master thread (me==0) and hopefully this will work on Windows. If not, user should set // showProgress=FALSE until this can be fixed or removed. - int ETA = (int)((nrow-end)*((now-startTime)/end)); + int ETA = (int)((args.nrow-end)*((now-startTime)/end)); if (hasPrinted || ETA >= 2) { - if (verbose && !hasPrinted) DTPRINT("\n"); + if (args.verbose && !hasPrinted) DTPRINT("\n"); DTPRINT("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " "anyBufferGrown=%s; maxBuffUsed=%d%%. ETA %d secs. 
", - (100.0*end)/nrow, nrow, (int)(now-startTime), nth, nth==1?"":"s", + (100.0*end)/args.nrow, args.nrow, (int)(now-startTime), nth, nth==1?"":"s", anyBufferGrown?"yes":"no", maxBuffUsedPC, ETA); // TODO: use progress() as in fread nextTime = now+1; @@ -859,18 +840,19 @@ void fwriteMain( } } if (f!=-1 && CLOSE(f) && !failed) - STOP("%s: '%s'", strerror(errno), filename); + STOP("%s: '%s'", strerror(errno), args.filename); // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail // from the original error. if (failed<0) { - STOP("%s. One or more threads failed to malloc or realloc their private buffer. nThread=%d and initial buffMB per thread was %d.\n", strerror(-failed), nth, buffMB); + STOP("%s. One or more threads failed to malloc or realloc their private buffer. nThread=%d and initial buffMB per thread was %d.\n", + strerror(-failed), nth, args.buffMB); } else if (failed>0) { - STOP("%s: '%s'", strerror(failed), filename); + STOP("%s: '%s'", strerror(failed), args.filename); } - if (verbose) DTPRINT("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", - nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); + if (args.verbose) DTPRINT("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", + nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); return; } diff --git a/src/fwrite.h b/src/fwrite.h index 5f25f37041..89f2d44697 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -1,22 +1,86 @@ +#ifdef DTPY + #include "py_fread.h" +#else + #include "freadR.h" // STOP, DTPRINT, DTWARN // TODO rename frw.h? 
+#endif + typedef void (*writer_fun_t)(void *, int64_t, char **); -writer_fun_t writeBool8; -writer_fun_t writeBool32; -writer_fun_t writeBool32AsString; -writer_fun_t writeInt32; -writer_fun_t writeInt64; -writer_fun_t writeFloat64; -writer_fun_t writeITime; -writer_fun_t writeDateInt32; -writer_fun_t writeDateFloat64; -writer_fun_t writePOSIXct; -writer_fun_t writeNanotime; -writer_fun_t writeString; -writer_fun_t writeCategString; -//writer_fun_t writeList123; - -extern void write_chars(const char *source, char **dest); - -void fwriteMain(); +void writeBool8(); +void writeBool32(); +void writeBool32AsString(); +void writeInt32(); +void writeInt64(); +void writeFloat64(); +void writeITime(); +void writeDateInt32(); +void writeDateFloat64(); +void writePOSIXct(); +void writeNanotime(); +void writeString(); +void writeCategString(); +void writeList(); + +void write_chars(const char *source, char **dest); + +typedef struct fwriteMainArgs +{ + // Name of the file to open (a \0-terminated C string). If the file name + // contains non-ASCII characters, it should be UTF-8 encoded (however fread + // will not validate the encoding). + const char *filename; + + // a vector of pointers to all-same-length column vectors + void **columns; + + int ncol; + + int64_t nrow; + + void *colNames; // NULL means no header, otherwise ncol strings + + writer_fun_t *funs; // a vector of writer_fun_t function pointers + + // length ncol vector containing which fun[] to use for each column + // one byte to use 8 times less cache lines than a vector of function pointers would do + // A limit of 256 writers seems more than sufficient + uint8_t *whichFun; + + char sep; + + char sep2; + + const char *eol; + + const char *na; + + char dec; + + // The quote character is always " (ascii 34) and cannot be changed since nobody on Earth uses a different quoting character, surely + // doQuote controls whether to quote fields or not. 
NA=="auto" (default) means the contents are inspected to see if sep, eol or quote + // is present and if so, quotes the filed. Else 1=quote all fields, 0=no quoting even when sep is present + int8_t doQuote; + + bool qmethodEscape; // true means escape quotes using backslash, else double-up double quotes. + + bool squashDateTime; + + bool append; + + bool doRowNames; // optional, likely false + + void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. + + int buffMB; // [1-1024] default 8MB + + int nth; + + bool showProgress; + + bool verbose; + +} fwriteMainArgs; + +void fwriteMain(fwriteMainArgs args); diff --git a/src/fwriteR.c b/src/fwriteR.c index 811315db86..7933b19090 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -6,9 +6,8 @@ #define DATETIMEAS_EPOCH 2 #define DATETIMEAS_WRITECSV 3 - -extern char sep2; -static bool logical01=true; +static char sep2; // '\0' if there are no list columns. Otherwise, the within-column separator. +static bool logical01=true; // should logicals be written as 0|1 or true|false. Needed by list column writer too in case a cell is a logical vector. static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv static const char *sep2start, *sep2end; // sep2 is in main fwrite.c so that writeString can quote other fields if sep2 is present in them @@ -16,194 +15,218 @@ static const char *sep2start, *sep2end; // Non-agnostic helpers ... -inline const char *getString(SEXP col, int row) { +const char *getString(SEXP col, int row) { // TODO: inline for use in fwrite.c SEXP x = STRING_ELT(col, row); return x==NA_STRING ? NULL : CHAR(x); } -inline const char *getCategString(SEXP col, int row) { +const char *getCategString(SEXP col, int row) { int x = INTEGER(col)[row]; return x==NA_INTEGER ? 
NULL : CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); } -static writer_fun_t whichWriter(SEXP); - -void writeList123(void *col, int64_t row, char **thCh) { +writer_fun_t funs[] = { + &writeBool8, + &writeBool32, + &writeBool32AsString, + &writeInt32, + &writeInt64, + &writeFloat64, + &writeITime, + &writeDateInt32, + &writeDateFloat64, + &writePOSIXct, + &writeNanotime, + &writeString, + &writeCategString, + &writeList +}; + +typedef enum { // same order as fun[] above + WF_Bool8, + WF_Bool32, + WF_Bool32AsString, + WF_Int32, + WF_Int64, + WF_Float64, + WF_ITime, + WF_DateInt32, + WF_DateFloat64, + WF_POSIXct, + WF_Nanotime, + WF_String, + WF_CategString, + WF_List +} WFs; + +static int32_t whichWriter(SEXP); + +void writeList(void *col, int64_t row, char **pch) { SEXP v = VECTOR_ELT(col,row); - writer_fun_t fun = whichWriter(v); - if (TYPEOF(v)==VECSXP || fun==NULL) { + int32_t wf = whichWriter(v); + if (TYPEOF(v)==VECSXP || wf==INT32_MIN) { error("Row %d of list column is type '%s' - not yet implemented. 
fwrite() can write list columns containing atomic vectors of type logical, integer, integer64, double, character and factor, currently.", row+1, type2char(TYPEOF(v))); } - char *ch = *thCh; + char *ch = *pch; write_chars(sep2start, &ch); void *data = (void *)DATAPTR(v); + writer_fun_t fun = funs[wf]; for (int j=0; j 1e6 columns - writer_fun_t *fun = (writer_fun_t *)R_alloc(ncol, sizeof(writer_fun_t)); - for (int j=0; j Date: Tue, 19 Sep 2017 19:01:35 -0700 Subject: [PATCH 10/16] Interim --- R/fwrite.R | 11 +++--- src/fwrite.c | 64 ++++++++++++++++++++-------------- src/fwrite.h | 21 ++++++------ src/fwriteR.c | 95 +++++++++++++++++++++++++++++---------------------- 4 files changed, 109 insertions(+), 82 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 9bd5c90b92..75d79417bd 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -15,6 +15,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", if (!missing(logicalAsInt)) { warning("logicalAsInt has been renamed logical01 for consistency with fread. 
It will work fine but please change to logical01 at your convenience so we can remove logicalAsInt in future.") logical01 = logicalAsInt + logicalAsInt=NULL } else if (length(dateTimeAs)>1) stop("dateTimeAs must be a single string") dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L @@ -32,7 +33,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", is.character(eol) && length(eol)==1L, length(qmethod) == 1L && qmethod %in% c("double", "escape"), isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names), - isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logicalAsInt), + isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logical01), length(na) == 1L, #1725, handles NULL or character(0) input is.character(file) && length(file)==1 && !is.na(file), length(buffMB)==1 && !is.na(buffMB) && 1<=buffMB && buffMB<=1024, @@ -48,11 +49,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", nThread=1L showProgress=FALSE } - .Call(Cwritefile, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, - row.names, col.names, logicalAsInt, dateTimeAs, buffMB, nThread, - showProgress, verbose) + .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, + row.names, col.names, logical01, dateTimeAs, buffMB, nThread, + showProgress, verbose) invisible() } -genLookups = function() invisible(.Call(CgenLookups)) - diff --git a/src/fwrite.c b/src/fwrite.c index 462fe094d4..b490fe5fda 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -24,13 +24,13 @@ #define SIZE_SF 1000000000000000ULL // 10^NUM_SF // Globals for this file only. Written once to hold parameters passed from R level. -static char *na; // by default "" or if set (not recommended) then usually "NA" +static const char *na; // by default "" or if set (not recommended) then usually "NA" static char sep; // comma in .csv files static char sep2; // '|' within list columns. 
Used here to know if field should be quoted and in freadR.c to write sep2 in list columns static char dec; // the '.' in the number 3.1416. In Europe often: 3,1416 -static int8_t quote=INT8_MIN; // whether to surround fields with double quote ". NA means 'auto' (default) -static bool qmethod_escape=false; // when quoting fields, how to escape double quotes in the field contents (default false means to add another double quote) -static bool squash=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) +static int8_t doQuote=INT8_MIN; // whether to surround fields with double quote ". NA means 'auto' (default) +static bool qmethodEscape=false; // when quoting fields, how to escape double quotes in the field contents (default false means to add another double quote) +static bool squashDateTime=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) extern const char *getString(void *, int); extern const char *getCategString(void *, int); @@ -297,11 +297,11 @@ static inline void write_time(int32_t x, char **pch) *ch++ = '0'+hh/10; *ch++ = '0'+hh%10; *ch++ = ':'; - ch -= squash; + ch -= squashDateTime; *ch++ = '0'+mm/10; *ch++ = '0'+mm%10; *ch++ = ':'; - ch -= squash; + ch -= squashDateTime; *ch++ = '0'+ss/10; *ch++ = '0'+ss%10; } @@ -344,20 +344,20 @@ static inline void write_date(int32_t x, char **pch) int md = monthday[z]; // See fwriteLookups.h for how the 366 item lookup 'monthday' is arranged y += z && (md/100)<3; // The +1 above turned z=-1 to 0 (meaning Feb29 of year y not Jan or Feb of y+1) - ch += 7 + 2*!squash; + ch += 7 + 2*!squashDateTime; *ch-- = '0'+md%10; md/=10; *ch-- = '0'+md%10; md/=10; *ch-- = '-'; - ch += squash; + ch += squashDateTime; *ch-- = '0'+md%10; md/=10; *ch-- = '0'+md%10; md/=10; *ch-- = '-'; - ch += squash; + ch += squashDateTime; *ch-- = '0'+y%10; y/=10; *ch-- = '0'+y%10; y/=10; *ch-- = '0'+y%10; y/=10; *ch = '0'+y%10; y/=10; - ch += 8 + 2*!squash; + ch += 8 + 2*!squashDateTime; } *pch = ch; } @@ -400,21 +400,21 @@ void writePOSIXct(double *col, int row, 
char **pch) m /= 10; write_date(d, &ch); *ch++ = 'T'; - ch -= squash; + ch -= squashDateTime; write_time(t, &ch); - if (squash || (m && m%1000==0)) { - // when squash always write 3 digits of milliseconds even if 000, for consistent scale of squash integer64 + if (squashDateTime || (m && m%1000==0)) { + // when squashDateTime always write 3 digits of milliseconds even if 000, for consistent scale of squash integer64 // don't use writeInteger() because it doesn't 0 pad which we need here // integer64 is big enough for squash with milli but not micro; trunc (not round) micro when squash m /= 1000; *ch++ = '.'; - ch -= squash; + ch -= squashDateTime; *(ch+2) = '0'+m%10; m/=10; *(ch+1) = '0'+m%10; m/=10; *ch = '0'+m; ch += 3; } else if (m) { - // microseconds are present and !squash + // microseconds are present and !squashDateTime *ch++ = '.'; *(ch+5) = '0'+m%10; m/=10; *(ch+4) = '0'+m%10; m/=10; @@ -425,7 +425,7 @@ void writePOSIXct(double *col, int row, char **pch) ch += 6; } *ch++ = 'Z'; - ch -= squash; + ch -= squashDateTime; } *pch = ch; } @@ -451,14 +451,14 @@ void writeNanotime(int64_t *col, int row, char **pch) } write_date(d, &ch); *ch++ = 'T'; - ch -= squash; + ch -= squashDateTime; write_time(s, &ch); *ch++ = '.'; - ch -= squash; + ch -= squashDateTime; for (int i=8; i>=0; i--) { *(ch+i) = '0'+n%10; n/=10; } // always 9 digits for nanoseconds ch += 9; *ch++ = 'Z'; - ch -= squash; + ch -= squashDateTime; } *pch = ch; } @@ -470,7 +470,7 @@ static inline void write_string(const char *x, char **pch) // NA is not quoted even when quote=TRUE to distinguish from quoted "NA" value. 
But going forward: ,,==NA and ,"",==empty string write_chars(na, &ch); } else { - int8_t q = quote; + int8_t q = doQuote; if (q==INT8_MIN) { // NA means quote="auto" const char *tt = x; if (*tt == '\0') { @@ -495,7 +495,7 @@ static inline void write_string(const char *x, char **pch) } else { *ch++ = '"'; const char *tt = x; - if (qmethod_escape) { + if (qmethodEscape) { while (*tt!='\0') { if (*tt=='"' || *tt=='\\') *ch++ = '\\'; *ch++ = *tt++; @@ -562,7 +562,13 @@ void fwriteMain(fwriteMainArgs args) double nextTime = startTime+2; // start printing progress meter in 2 sec if not completed by then double t0 = startTime; - squash = args.squashDateTime; + na = args.na; + sep = args.sep; + sep2 = args.sep2; + dec = args.dec; + doQuote = args.doQuote; + qmethodEscape = args.qmethodEscape; + squashDateTime = args.squashDateTime; // Estimate max line length of a 1000 row sample (100 rows in 10 places). // 'Estimate' even of this sample because quote='auto' may add quotes and escape embedded quotes. @@ -658,7 +664,10 @@ void fwriteMain(fwriteMainArgs args) } for (int j=0; j 1 million bytes long *ch++ = args.sep; // this sep after the last column name won't be written to the file } - if (WRITE(f, args.eol, eolLen)==-1) { + if (f==-1) { + DTPRINT(args.eol); + } else if (WRITE(f, args.eol, eolLen)==-1) { int errwrite=errno; close(f); free(buff); @@ -752,8 +763,11 @@ void fwriteMain(fwriteMainArgs args) } // Hot loop for (int j=0; j=1 because 0-columns was caught earlier. diff --git a/src/fwrite.h b/src/fwrite.h index 89f2d44697..6eacdc0846 100644 --- a/src/fwrite.h +++ b/src/fwrite.h @@ -1,4 +1,3 @@ - #ifdef DTPY #include "py_fread.h" #else @@ -31,14 +30,12 @@ typedef struct fwriteMainArgs // will not validate the encoding). 
const char *filename; - // a vector of pointers to all-same-length column vectors - void **columns; - int ncol; int64_t nrow; - void *colNames; // NULL means no header, otherwise ncol strings + // a vector of pointers to all-same-length column vectors + void **columns; writer_fun_t *funs; // a vector of writer_fun_t function pointers @@ -47,16 +44,22 @@ typedef struct fwriteMainArgs // A limit of 256 writers seems more than sufficient uint8_t *whichFun; + void *colNames; // NULL means no header, otherwise ncol strings + + bool doRowNames; // optional, likely false + + void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. + char sep; char sep2; + char dec; + const char *eol; const char *na; - char dec; - // The quote character is always " (ascii 34) and cannot be changed since nobody on Earth uses a different quoting character, surely // doQuote controls whether to quote fields or not. NA=="auto" (default) means the contents are inspected to see if sep, eol or quote // is present and if so, quotes the filed. Else 1=quote all fields, 0=no quoting even when sep is present @@ -68,10 +71,6 @@ typedef struct fwriteMainArgs bool append; - bool doRowNames; // optional, likely false - - void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. - int buffMB; // [1-1024] default 8MB int nth; diff --git a/src/fwriteR.c b/src/fwriteR.c index 7933b19090..c6a39bdb55 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -15,12 +15,13 @@ static const char *sep2start, *sep2end; // Non-agnostic helpers ... -const char *getString(SEXP col, int row) { // TODO: inline for use in fwrite.c - SEXP x = STRING_ELT(col, row); +const char *getString(SEXP *col, int row) { // TODO: inline for use in fwrite.c + SEXP x = col[row]; return x==NA_STRING ? 
NULL : CHAR(x); } const char *getCategString(SEXP col, int row) { + // the only writer that needs to have the header of the SEXP column, to get to the levels int x = INTEGER(col)[row]; return x==NA_INTEGER ? NULL : CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); } @@ -118,20 +119,21 @@ SEXP fwriteR( SEXP na_Arg, SEXP dec_Arg, SEXP quote_Arg, // 'auto'=NA_LOGICAL|TRUE|FALSE - SEXP qmethod_escapeArg, // TRUE|FALSE - SEXP append, // TRUE|FALSE - SEXP row_names, // TRUE|FALSE - SEXP col_names, // TRUE|FALSE + SEXP qmethodEscape_Arg, // TRUE|FALSE + SEXP append_Arg, // TRUE|FALSE + SEXP rowNames_Arg, // TRUE|FALSE + SEXP colNames_Arg, // TRUE|FALSE SEXP logical01_Arg, // TRUE|FALSE SEXP dateTimeAs_Arg, // 0=ISO(yyyy-mm-dd),1=squash(yyyymmdd),2=epoch,3=write.csv SEXP buffMB_Arg, // [1-1024] default 8MB - SEXP nThread, + SEXP nThread_Arg, SEXP showProgress_Arg, SEXP verbose_Arg) { if (!isNewList(DF)) error("fwrite must be passed an object of type list; e.g. data.frame, data.table"); fwriteMainArgs args; - + args.verbose = LOGICAL(verbose_Arg)[0]; + args.filename = CHAR(STRING_ELT(filename_Arg, 0)); args.ncol = length(DF); if (args.ncol==0) { warning("fwrite was passed an empty list of no columns. Nothing to write."); @@ -139,30 +141,9 @@ SEXP fwriteR( } args.nrow = length(VECTOR_ELT(DF, 0)); - args.showProgress = LOGICAL(showProgress_Arg)[0]; - args.verbose = LOGICAL(verbose_Arg)[0]; - - args.sep = *CHAR(STRING_ELT(sep_Arg, 0)); // DO NOT DO: allow multichar separator (bad idea) - sep2start = CHAR(STRING_ELT(sep2_Arg, 0)); - args.sep2 = sep2 = *CHAR(STRING_ELT(sep2_Arg, 1)); - sep2end = CHAR(STRING_ELT(sep2_Arg, 2)); - - args.eol = CHAR(STRING_ELT(eol_Arg, 0)); - // someone might want a trailer on every line so allow any length string as eol - - args.na = CHAR(STRING_ELT(na_Arg, 0)); - args.dec = *CHAR(STRING_ELT(dec_Arg,0)); - args.doQuote = LOGICAL(quote_Arg)[0] == NA_LOGICAL ? 
INT8_MIN : LOGICAL(quote_Arg)[0]==1; - args.qmethodEscape = (int8_t)(LOGICAL(qmethod_escapeArg)[0]==1); - args.filename = CHAR(STRING_ELT(filename_Arg, 0)); - logical01 = LOGICAL(logical01_Arg)[0]==1; - dateTimeAs = INTEGER(dateTimeAs_Arg)[0]; - args.squashDateTime = (dateTimeAs==1); - args.nth = INTEGER(nThread)[0]; - int firstListColumn = 0; - SEXP DFcoerced = DF; int protecti = 0; + dateTimeAs = INTEGER(dateTimeAs_Arg)[0]; if (dateTimeAs == DATETIMEAS_WRITECSV) { int j=0; while(j 1e6 columns + // allocate new `columns` vector. Although this could be DATAPTR(DFcoerced) directly, it can't + // because there's an offset on each column that points to (DATAPTR for each column) which fread.c + // would need to know. Rather than have the complication of a new offset variable, we just alloc a + // new vetcors of pointers directly. It won't make a difference to speed because only this new + // vector need be used by fread.c. It just uses a tiny bit more memory (ncol * 8 bytes). + args.columns = (void *)R_alloc(args.ncol, sizeof(SEXP)); + args.funs = funs; // funs declared statically at the top of this file + + // Allocate and populate lookup vector to writer function for each column, whichFun[] args.whichFun = (uint8_t *)R_alloc(args.ncol, sizeof(uint8_t)); + + int firstListColumn = 0; for (int j=0; j Date: Wed, 20 Sep 2017 11:15:14 -0700 Subject: [PATCH 11/16] Interim --- src/fwriteR.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fwriteR.c b/src/fwriteR.c index c6a39bdb55..b0085db516 100644 --- a/src/fwriteR.c +++ b/src/fwriteR.c @@ -195,9 +195,11 @@ SEXP fwriteR( if (TYPEOF(column)==VECSXP && firstListColumn==0) firstListColumn = j+1; } - args.colNames = LOGICAL(colNames_Arg)[0] ? (void *)DATAPTR(getAttrib(DF, R_NamesSymbol)) : NULL; + SEXP cn = getAttrib(DF, R_NamesSymbol); + args.colNames = (LOGICAL(colNames_Arg)[0] && isString(cn)) ? 
(void *)DATAPTR(cn) : NULL; // user may want row names even when they don't exist (implied row numbers as row names) + // so we need a separate boolean flag as well as the row names should they exist (rare) args.doRowNames = LOGICAL(rowNames_Arg)[0]; args.rowNames = NULL; if (args.doRowNames) { From 5481173ec7e1ed47202153d63e186aadcea6ae2a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 20 Sep 2017 13:43:38 -0700 Subject: [PATCH 12/16] Passing local tests --- R/fwrite.R | 6 +++--- inst/tests/tests.Rraw | 8 ++++---- man/fwrite.Rd | 6 ++++-- src/fwrite.c | 26 +++++++++++++++++++------- src/fwriteR.c | 14 +++++++------- 5 files changed, 37 insertions(+), 23 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 75d79417bd..dff1b75a05 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -10,6 +10,9 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] if (missing(dateTimeAs)) dateTimeAs = dateTimeAs[1L] + else if (length(dateTimeAs)>1) stop("dateTimeAs must be a single string") + dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L + if (is.na(dateTimeAs)) stop("dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") if (!missing(logical01) && !missing(logicalAsInt)) stop("logicalAsInt has been renamed logical01. Use logical01 only, not both.") if (!missing(logicalAsInt)) { @@ -17,9 +20,6 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", logical01 = logicalAsInt logicalAsInt=NULL } - else if (length(dateTimeAs)>1) stop("dateTimeAs must be a single string") - dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L - if (is.na(dateTimeAs)) stop("dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") buffMB = as.integer(buffMB) nThread = as.integer(nThread) # write.csv default is 'double' so fwrite follows suit. 
write.table's default is 'escape' diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4a7ce20e6e..d81be143d4 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8831,7 +8831,7 @@ test(1658.16, fwrite(data.table( factor1=as.factor(c('foo', 'bar')), factor2=as.factor(c(NA, "baz")), bool=c(TRUE,NA), - ints=as.integer(c(NA, 5))), na='na', quote=TRUE), + ints=as.integer(c(NA, 5))), na='na', quote=TRUE, logical01=FALSE), output='"factor1","factor2","bool","ints"\n"foo",na,TRUE,na\n"bar","baz",na,5\n') # empty data table (headers but no rows) @@ -9699,14 +9699,14 @@ set.seed(1) DT = data.table(A=1:4, B=list(1:10,15:18,7,9:10), C=list(letters[19:23],c(1.2,2.3,3.4,pi,-9),c("foo","bar"),c(TRUE,TRUE,FALSE))) -test(1736.1, capture.output(fwrite(DT)), c("A,B,C", "1,1|2|3|4|5|6|7|8|9|10,s|t|u|v|w", +test(1736.1, capture.output(fwrite(DT,logical01=FALSE)), c("A,B,C", "1,1|2|3|4|5|6|7|8|9|10,s|t|u|v|w", "2,15|16|17|18,1.2|2.3|3.4|3.14159265358979|-9", "3,7,foo|bar", "4,9|10,TRUE|TRUE|FALSE")) test(1736.2, fwrite(DT, sep2=","), error="length(sep2)") test(1736.3, fwrite(DT, sep2=c("",",","")), error="sep.*,.*sep2.*,.*must all be different") test(1736.4, fwrite(DT, sep2=c("","||","")), error="nchar.*sep2.*2") -test(1736.5, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"))), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)", +test(1736.5, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"), logical01=FALSE)), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)", "2|c(15,16,17,18)|c(1.2,2.3,3.4,3.14159265358979,-9)", "3|c(7)|c(foo,bar)", "4|c(9,10)|c(TRUE,TRUE,FALSE)")) -test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsInt=TRUE)), +test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logical01=TRUE)), c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}", "2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}")) DT = data.table(A=c("foo","ba|r","baz")) diff --git 
a/man/fwrite.Rd b/man/fwrite.Rd index 51414bfe7b..8214e8ddab 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -12,7 +12,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", eol = if (.Platform$OS.type=="windows") "\r\n" else "\n", na = "", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"), - logicalAsInt = FALSE, dateTimeAs = c("ISO","squash","epoch","write.csv"), + logical01 = TRUE, logicalAsInt = logical01, + dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(), showProgress = interactive(), verbose = getOption("datatable.verbose")) @@ -34,7 +35,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{"escape" - the quote character (as well as the backslash character) is escaped in C style by a backslash, or} \item{"double" (default, same as \code{write.csv}), in which case the double quote is doubled with another one.} }} - \item{logicalAsInt}{Should \code{logical} values be written as \code{1} and \code{0} rather than \code{"TRUE"} and \code{"FALSE"}?} + \item{logical01}{Should \code{logical} values be written as \code{1} and \code{0} rather than \code{"TRUE"} and \code{"FALSE"}?} + \item{logicalAsInt}{Deprecated. Old name for `logical01`. Name change for consistency with `fread` for which `logicalAsInt` would not make sense.} \item{dateTimeAs}{ How \code{Date}/\code{IDate}, \code{ITime} and \code{POSIXct} items are written. \itemize{ \item{"ISO" (default) - \code{2016-09-12}, \code{18:12:16} and \code{2016-09-12T18:12:16.999999Z}. 0, 3 or 6 digits of fractional seconds are printed if and when present for convenience, regardless of any R options such as \code{digits.secs}. The idea being that if milli and microseconds are present then you most likely want to retain them. R's internal UTC representation is written faithfully to encourage ISO standards, stymie timezone ambiguity and for speed. 
An option to consider is to start R in the UTC timezone simply with \code{"$ TZ='UTC' R"} at the shell (NB: it must be one or more spaces between \code{TZ='UTC'} and \code{R}, anything else will be silently ignored; this TZ setting applies just to that R process) or \code{Sys.setenv(TZ='UTC')} at the R prompt and then continue as if UTC were local time.} diff --git a/src/fwrite.c b/src/fwrite.c index b490fe5fda..b21b40d63d 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -48,14 +48,21 @@ void writeBool8(int8_t *col, int row, char **pch) { int8_t x = col[row]; if (x==INT8_MIN) return; - *(*pch++) = '0'+x; + char *ch = *pch; + *ch++ = '0'+x; + *pch = ch; } void writeBool32(int32_t *col, int row, char **pch) { int32_t x = col[row]; - if (x==INT32_MIN) return; - *(*pch++) = '0'+x; + char *ch = *pch; + if (x==INT32_MIN) { + write_chars(na, &ch); + } else { + *ch++ = '0'+x; + } + *pch = ch; } void writeBool32AsString(int32_t *col, int row, char **pch) @@ -567,6 +574,11 @@ void fwriteMain(fwriteMainArgs args) sep2 = args.sep2; dec = args.dec; doQuote = args.doQuote; + + // When NA is a non-empty string, then we must quote all string fields in case they contain the na string + // na is recommended to be empty, though + if (na[0]!='\0' && doQuote==INT8_MIN) doQuote = true; + qmethodEscape = args.qmethodEscape; squashDateTime = args.squashDateTime; @@ -611,7 +623,7 @@ void fwriteMain(fwriteMainArgs args) } else { thisLineLen += 1+(int)log10(args.nrow); // the width of the row number } - thisLineLen += 2*(args.doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; + thisLineLen += 2*(doQuote!=0/*NA('auto') or true*/) + 1/*sep*/; } for (int j=0; j 1 million bytes @@ -659,7 +671,7 @@ void fwriteMain(fwriteMainArgs args) char *ch = buff; if (args.doRowNames) { // Unusual: the extra blank column name when row_names are added as the first column - if (args.doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; *ch++='"'; } // to match write.csv + if (doQuote!=0/*'auto'(NA) or true*/) { *ch++='"'; 
*ch++='"'; } // to match write.csv *ch++ = sep; } for (int j=0; j Date: Wed, 20 Sep 2017 14:05:12 -0700 Subject: [PATCH 13/16] Added datatable.logical01 option and news item --- NEWS.md | 4 +++- R/fwrite.R | 6 ++++-- R/onLoad.R | 3 ++- man/fwrite.Rd | 5 +++-- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/NEWS.md b/NEWS.md index 8449f22e1c..8e33dda2c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,7 +25,9 @@ * Now handles floating-point NaN values in a wide variety of formats, including `NaN`, `sNaN`, `1.#QNAN`, `NaN1234`, `#NUM!` and others, [#1800](https://github.com/Rdatatable/data.table/issues/1800). Thanks to Jori Liesenborgs for highlighting and the PR. * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), 
[#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246) -2. `fwrite` now always quotes empty strings (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). +2. `fwrite()`: + * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). + * `logicalAsInt` has been renamed `logical01` and the default changed from `FALSE` to `TRUE`, both changes for consistency with `fread` (see item above). The old name `logicalAsInt` continues to work but is now deprecated. The previous default can easily be restored without any code changes by setting `options("datatable.logical01" = FALSE)`. 3. 
Added helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR. diff --git a/R/fwrite.R b/R/fwrite.R index dff1b75a05..17bb42bc94 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -2,10 +2,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", na="", dec=".", row.names=FALSE, col.names=TRUE, qmethod=c("double","escape"), - logical01=TRUE, logicalAsInt=logical01, dateTimeAs = c("ISO","squash","epoch","write.csv"), + logical01=getOption("datatable.logical01", TRUE), + logicalAsInt=logical01, + dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(), showProgress=interactive(), - verbose=getOption("datatable.verbose")) { + verbose=getOption("datatable.verbose", FALSE)) { isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] diff --git a/R/onLoad.R b/R/onLoad.R index 83b67fea63..e49df3b1e3 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -45,7 +45,8 @@ "datatable.use.index"="TRUE", # global switch to address #1422 "datatable.fread.datatable"="TRUE", "datatable.prettyprint.char" = NULL, # FR #1091 - "datatable.old.unique.by.key" = "FALSE" # TODO: warn 1 year, remove after 2 years + "datatable.old.unique.by.key" = "FALSE", # TODO: warn 1 year, remove after 2 years + "datatable.logical01" = "TRUE" # fwrite/fread to revert to FALSE. 
TODO: warn in next release and remove after 1 year ) for (i in setdiff(names(opts),names(options()))) { eval(parse(text=paste("options(",i,"=",opts[i],")",sep=""))) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 8214e8ddab..65da84e3a0 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -12,11 +12,12 @@ fwrite(x, file = "", append = FALSE, quote = "auto", eol = if (.Platform$OS.type=="windows") "\r\n" else "\n", na = "", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"), - logical01 = TRUE, logicalAsInt = logical01, + logical01 = getOption("datatable.logical01", TRUE), + logicalAsInt = logical01, # deprecated dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(), showProgress = interactive(), - verbose = getOption("datatable.verbose")) + verbose = getOption("datatable.verbose", FALSE)) } \arguments{ \item{x}{Any \code{list} of same length vectors; e.g. \code{data.frame} and \code{data.table}.} From 6ed4180eda0ee91ba326741049a516d2474eb29a Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 20 Sep 2017 14:37:20 -0700 Subject: [PATCH 14/16] Windows fix --- R/fwrite.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 17bb42bc94..096e0bb7d7 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -46,10 +46,10 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", col.names = FALSE # test 1658.16 checks this if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { - # console output (Rprintf) isn't thread safe. - # Perhaps more so on Windows (as experienced) than Linux - nThread=1L - showProgress=FALSE + # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time + nThread = 1L + showProgress = FALSE + eol = "\n" # Rprintf() is used at C level which knows inside it to output \r\n on Windows. Otherwise extra \r is output. 
} .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, row.names, col.names, logical01, dateTimeAs, buffMB, nThread, From a9029350e0c718fe2f8db7bc06598aefd178f12e Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 20 Sep 2017 15:49:58 -0700 Subject: [PATCH 15/16] Added 2 fwrite tests --- inst/tests/tests.Rraw | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index cb9856df1d..517014c79e 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8855,6 +8855,11 @@ unlink(f) ok_dt <- data.table(foo="bar") test(1658.22, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"\n') +# integer NA +DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5)) +test(1658.23, fwrite(DT), output='A,B2,,43,5') +test(1658.24, fwrite(DT, na="NA", verbose=TRUE), output='Writing column names.*"A","B".*2,NANA,43,5') + options(oldverbose) # wrong argument types From fccba083f74268f0690adba9870b2aa044d501e4 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Wed, 20 Sep 2017 16:34:46 -0700 Subject: [PATCH 16/16] Added test that fwrite logicalAsInt continues to work --- R/fwrite.R | 2 +- inst/tests/tests.Rraw | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/R/fwrite.R b/R/fwrite.R index 096e0bb7d7..f1bc34cf2f 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -18,7 +18,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", if (!missing(logical01) && !missing(logicalAsInt)) stop("logicalAsInt has been renamed logical01. Use logical01 only, not both.") if (!missing(logicalAsInt)) { - warning("logicalAsInt has been renamed logical01 for consistency with fread. It will work fine but please change to logical01 at your convenience so we can remove logicalAsInt in future.") + # TODO: warning("logicalAsInt has been renamed logical01 for consistency with fread. 
It will work fine but please change to logical01 at your convenience so we can remove logicalAsInt in future.") logical01 = logicalAsInt logicalAsInt=NULL } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 517014c79e..2658ed6ddf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9711,7 +9711,8 @@ test(1736.3, fwrite(DT, sep2=c("",",","")), error="sep.*,.*sep2.*,.*must all be test(1736.4, fwrite(DT, sep2=c("","||","")), error="nchar.*sep2.*2") test(1736.5, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"), logical01=FALSE)), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)", "2|c(15,16,17,18)|c(1.2,2.3,3.4,3.14159265358979,-9)", "3|c(7)|c(foo,bar)", "4|c(9,10)|c(TRUE,TRUE,FALSE)")) -test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logical01=TRUE)), +# Aside: logicalAsInt tested in 1736.6 to continue to work without warning, currently. TODO: warning, deprecate and remove +test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsInt=TRUE)), c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}", "2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}")) DT = data.table(A=c("foo","ba|r","baz"))