diff --git a/NEWS.md b/NEWS.md index 8449f22e1c..8e33dda2c0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,7 +25,9 @@ * Now handles floating-point NaN values in a wide variety of formats, including `NaN`, `sNaN`, `1.#QNAN`, `NaN1234`, `#NUM!` and others, [#1800](https://github.com/Rdatatable/data.table/issues/1800). Thanks to Jori Liesenborgs for highlighting and the PR. * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), 
[#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246) -2. `fwrite` now always quotes empty strings (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). +2. `fwrite()`: + * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). + * `logicalAsInt` has been renamed `logical01` and the default changed from `FALSE` to `TRUE`, both changes for consistency with `fread` (see item above). The old name `logicalAsInt` continues to work but is now deprecated. The previous default can easily be restored without any code changes by setting `options("datatable.logical01" = FALSE)`. 3. Added helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR. 
diff --git a/R/fwrite.R b/R/fwrite.R index f70074f278..f1bc34cf2f 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -2,10 +2,12 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", na="", dec=".", row.names=FALSE, col.names=TRUE, qmethod=c("double","escape"), - logicalAsInt=FALSE, dateTimeAs = c("ISO","squash","epoch","write.csv"), + logical01=getOption("datatable.logical01", TRUE), + logicalAsInt=logical01, + dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB=8, nThread=getDTthreads(), showProgress=interactive(), - verbose=getOption("datatable.verbose")) { + verbose=getOption("datatable.verbose", FALSE)) { isLOGICAL = function(x) isTRUE(x) || identical(FALSE, x) # it seems there is no isFALSE in R? na = as.character(na[1L]) # fix for #1725 if (missing(qmethod)) qmethod = qmethod[1L] @@ -13,6 +15,13 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", else if (length(dateTimeAs)>1) stop("dateTimeAs must be a single string") dateTimeAs = chmatch(dateTimeAs, c("ISO","squash","epoch","write.csv"))-1L if (is.na(dateTimeAs)) stop("dateTimeAs must be 'ISO','squash','epoch' or 'write.csv'") + if (!missing(logical01) && !missing(logicalAsInt)) + stop("logicalAsInt has been renamed logical01. Use logical01 only, not both.") + if (!missing(logicalAsInt)) { + # TODO: warning("logicalAsInt has been renamed logical01 for consistency with fread. It will work fine but please change to logical01 at your convenience so we can remove logicalAsInt in future.") + logical01 = logicalAsInt + logicalAsInt=NULL + } buffMB = as.integer(buffMB) nThread = as.integer(nThread) # write.csv default is 'double' so fwrite follows suit. 
write.table's default is 'escape' @@ -26,7 +35,7 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", is.character(eol) && length(eol)==1L, length(qmethod) == 1L && qmethod %in% c("double", "escape"), isLOGICAL(col.names), isLOGICAL(append), isLOGICAL(row.names), - isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logicalAsInt), + isLOGICAL(verbose), isLOGICAL(showProgress), isLOGICAL(logical01), length(na) == 1L, #1725, handles NULL or character(0) input is.character(file) && length(file)==1 && !is.na(file), length(buffMB)==1 && !is.na(buffMB) && 1<=buffMB && buffMB<=1024, @@ -37,16 +46,14 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", col.names = FALSE # test 1658.16 checks this if (identical(quote,"auto")) quote=NA # logical NA if (file=="") { - # console output (Rprintf) isn't thread safe. - # Perhaps more so on Windows (as experienced) than Linux - nThread=1L - showProgress=FALSE + # console output which it seems isn't thread safe on Windows even when one-batch-at-a-time + nThread = 1L + showProgress = FALSE + eol = "\n" # Rprintf() is used at C level which knows inside it to output \r\n on Windows. Otherwise extra \r is output. 
} - .Call(Cwritefile, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, - row.names, col.names, logicalAsInt, dateTimeAs, buffMB, nThread, - showProgress, verbose) + .Call(CfwriteR, x, file, sep, sep2, eol, na, dec, quote, qmethod=="escape", append, + row.names, col.names, logical01, dateTimeAs, buffMB, nThread, + showProgress, verbose) invisible() } -genLookups = function() invisible(.Call(CgenLookups)) - diff --git a/R/onLoad.R b/R/onLoad.R index 83b67fea63..e49df3b1e3 100644 --- a/R/onLoad.R +++ b/R/onLoad.R @@ -45,7 +45,8 @@ "datatable.use.index"="TRUE", # global switch to address #1422 "datatable.fread.datatable"="TRUE", "datatable.prettyprint.char" = NULL, # FR #1091 - "datatable.old.unique.by.key" = "FALSE" # TODO: warn 1 year, remove after 2 years + "datatable.old.unique.by.key" = "FALSE", # TODO: warn 1 year, remove after 2 years + "datatable.logical01" = "TRUE" # fwrite/fread to revert to FALSE. TODO: warn in next release and remove after 1 year ) for (i in setdiff(names(opts),names(options()))) { eval(parse(text=paste("options(",i,"=",opts[i],")",sep=""))) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4fd72d5bd5..2658ed6ddf 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8831,7 +8831,7 @@ test(1658.16, fwrite(data.table( factor1=as.factor(c('foo', 'bar')), factor2=as.factor(c(NA, "baz")), bool=c(TRUE,NA), - ints=as.integer(c(NA, 5))), na='na', quote=TRUE), + ints=as.integer(c(NA, 5))), na='na', quote=TRUE, logical01=FALSE), output='"factor1","factor2","bool","ints"\n"foo",na,TRUE,na\n"bar","baz",na,5\n') # empty data table (headers but no rows) @@ -8855,6 +8855,11 @@ unlink(f) ok_dt <- data.table(foo="bar") test(1658.22, fwrite(ok_dt, quote=TRUE), output='"foo"\n"bar"\n') +# integer NA +DT = data.table(A=c(2L,NA,3L), B=c(NA,4:5)) +test(1658.23, fwrite(DT), output='A,B2,,43,5') +test(1658.24, fwrite(DT, na="NA", verbose=TRUE), output='Writing column names.*"A","B".*2,NANA,43,5') + options(oldverbose) 
# wrong argument types @@ -9699,13 +9704,14 @@ set.seed(1) DT = data.table(A=1:4, B=list(1:10,15:18,7,9:10), C=list(letters[19:23],c(1.2,2.3,3.4,pi,-9),c("foo","bar"),c(TRUE,TRUE,FALSE))) -test(1736.1, capture.output(fwrite(DT)), c("A,B,C", "1,1|2|3|4|5|6|7|8|9|10,s|t|u|v|w", +test(1736.1, capture.output(fwrite(DT,logical01=FALSE)), c("A,B,C", "1,1|2|3|4|5|6|7|8|9|10,s|t|u|v|w", "2,15|16|17|18,1.2|2.3|3.4|3.14159265358979|-9", "3,7,foo|bar", "4,9|10,TRUE|TRUE|FALSE")) test(1736.2, fwrite(DT, sep2=","), error="length(sep2)") test(1736.3, fwrite(DT, sep2=c("",",","")), error="sep.*,.*sep2.*,.*must all be different") test(1736.4, fwrite(DT, sep2=c("","||","")), error="nchar.*sep2.*2") -test(1736.5, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"))), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)", +test(1736.5, capture.output(fwrite(DT, sep='|', sep2=c("c(",",",")"), logical01=FALSE)), c("A|B|C", "1|c(1,2,3,4,5,6,7,8,9,10)|c(s,t,u,v,w)", "2|c(15,16,17,18)|c(1.2,2.3,3.4,3.14159265358979,-9)", "3|c(7)|c(foo,bar)", "4|c(9,10)|c(TRUE,TRUE,FALSE)")) +# Aside: logicalAsInt tested in 1736.6 to continue to work without warning, currently. 
TODO: warning, deprecate and remove test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsInt=TRUE)), c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}", "2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}")) diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 51414bfe7b..65da84e3a0 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -12,10 +12,12 @@ fwrite(x, file = "", append = FALSE, quote = "auto", eol = if (.Platform$OS.type=="windows") "\r\n" else "\n", na = "", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"), - logicalAsInt = FALSE, dateTimeAs = c("ISO","squash","epoch","write.csv"), + logical01 = getOption("datatable.logical01", TRUE), + logicalAsInt = logical01, # deprecated + dateTimeAs = c("ISO","squash","epoch","write.csv"), buffMB = 8L, nThread = getDTthreads(), showProgress = interactive(), - verbose = getOption("datatable.verbose")) + verbose = getOption("datatable.verbose", FALSE)) } \arguments{ \item{x}{Any \code{list} of same length vectors; e.g. \code{data.frame} and \code{data.table}.} @@ -34,7 +36,8 @@ fwrite(x, file = "", append = FALSE, quote = "auto", \item{"escape" - the quote character (as well as the backslash character) is escaped in C style by a backslash, or} \item{"double" (default, same as \code{write.csv}), in which case the double quote is doubled with another one.} }} - \item{logicalAsInt}{Should \code{logical} values be written as \code{1} and \code{0} rather than \code{"TRUE"} and \code{"FALSE"}?} + \item{logical01}{Should \code{logical} values be written as \code{1} and \code{0} rather than \code{"TRUE"} and \code{"FALSE"}?} + \item{logicalAsInt}{Deprecated. Old name for \code{logical01}. Name change for consistency with \code{fread} for which \code{logicalAsInt} would not make sense.} \item{dateTimeAs}{ How \code{Date}/\code{IDate}, \code{ITime} and \code{POSIXct} items are written.
\itemize{ \item{"ISO" (default) - \code{2016-09-12}, \code{18:12:16} and \code{2016-09-12T18:12:16.999999Z}. 0, 3 or 6 digits of fractional seconds are printed if and when present for convenience, regardless of any R options such as \code{digits.secs}. The idea being that if milli and microseconds are present then you most likely want to retain them. R's internal UTC representation is written faithfully to encourage ISO standards, stymie timezone ambiguity and for speed. An option to consider is to start R in the UTC timezone simply with \code{"$ TZ='UTC' R"} at the shell (NB: it must be one or more spaces between \code{TZ='UTC'} and \code{R}, anything else will be silently ignored; this TZ setting applies just to that R process) or \code{Sys.setenv(TZ='UTC')} at the R prompt and then continue as if UTC were local time.} diff --git a/src/fwrite.c b/src/fwrite.c index dca9b40adf..b21b40d63d 100644 --- a/src/fwrite.c +++ b/src/fwrite.c @@ -1,9 +1,12 @@ -#include "data.table.h" #include "fwriteLookups.h" #include -#include // for access() +#include // for access() #include -#include +#include // true and false +#include // INT32_MIN +#include // isfinite, isnan +#include // abs +#include // strlen, strerror #ifdef WIN32 #include #include @@ -14,56 +17,74 @@ #define WRITE write #define CLOSE close #endif +#include +#include "fwrite.h" #define NUM_SF 15 #define SIZE_SF 1000000000000000ULL // 10^NUM_SF // Globals for this file only. Written once to hold parameters passed from R level. -static const char *na; // by default "" or if set then usually "NA" +static const char *na; // by default "" or if set (not recommended) then usually "NA" static char sep; // comma in .csv files -static char sep2; // '|' within list columns -static const char *sep2start, *sep2end; +static char sep2; // '|' within list columns. Used here to know if field should be quoted and in freadR.c to write sep2 in list columns static char dec; // the '.' in the number 3.1416. 
In Europe often: 3,1416 -static Rboolean verbose=FALSE; // be chatty? -static Rboolean quote=FALSE; // whether to surround fields with double quote ". NA means 'auto' (default) -static Rboolean qmethod_escape=TRUE; // when quoting fields, how to manage double quote in the field contents -static Rboolean logicalAsInt=FALSE; // logical as 0/1 or "TRUE"/"FALSE" -static Rboolean squash=FALSE; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) -static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd), 2=epoch, 3=write.csv -#define DATETIMEAS_EPOCH 2 -#define DATETIMEAS_WRITECSV 3 -typedef void (*writer_fun_t)(SEXP, int, char **); - -static inline void write_chars(const char *x, char **thisCh) +static int8_t doQuote=INT8_MIN; // whether to surround fields with double quote ". NA means 'auto' (default) +static bool qmethodEscape=false; // when quoting fields, how to escape double quotes in the field contents (default false means to add another double quote) +static bool squashDateTime=false; // 0=ISO(yyyy-mm-dd) 1=squash(yyyymmdd) + +extern const char *getString(void *, int); +extern const char *getCategString(void *, int); +extern double wallclock(void); + +inline void write_chars(const char *x, char **pch) { // similar to C's strcpy but i) doesn't include trailing \0 and ii) moves destination along - char *ch = *thisCh; + char *ch = *pch; while (*x) *ch++=*x++; - *thisCh = ch; + *pch = ch; +} + +void writeBool8(int8_t *col, int row, char **pch) +{ + int8_t x = col[row]; + if (x==INT8_MIN) return; + char *ch = *pch; + *ch++ = '0'+x; + *pch = ch; } -static void writeLogical(SEXP column, int i, char **thisCh) +void writeBool32(int32_t *col, int row, char **pch) { - Rboolean x = LOGICAL(column)[i]; - char *ch = *thisCh; - if (x == NA_LOGICAL) { + int32_t x = col[row]; + char *ch = *pch; + if (x==INT32_MIN) { write_chars(na, &ch); - } else if (logicalAsInt) { + } else { *ch++ = '0'+x; + } + *pch = ch; +} + +void writeBool32AsString(int32_t *col, int row, char **pch) +{ + 
int32_t x = col[row]; + char *ch = *pch; + if (x == INT32_MIN) { + write_chars(na, &ch); } else if (x) { *ch++='T'; *ch++='R'; *ch++='U'; *ch++='E'; } else { *ch++='F'; *ch++='A'; *ch++='L'; *ch++='S'; *ch++='E'; } - *thisCh = ch; + *pch = ch; } -static inline void write_positive_int(long long x, char **thisCh) +static inline void write_positive_int(int64_t x, char **pch) { // Avoid log() for speed. Write backwards then reverse when we know how long. // Separate function just because it's used if row numbers are asked for, too // x >= 1 - char *ch = *thisCh; + char *ch = *pch; int width = 0; while (x>0) { *ch++ = '0'+x%10; x /= 10; width++; } for (int i=width/2; i>0; i--) { @@ -71,31 +92,46 @@ static inline void write_positive_int(long long x, char **thisCh) *(ch-i) = *(ch-width+i-1); *(ch-width+i-1) = tmp; } - *thisCh = ch; + *pch = ch; } -static void writeInteger(SEXP column, int i, char **thisCh) +void writeInt32(int32_t *col, int row, char **pch) { - long long x = (TYPEOF(column)!=REALSXP) ? INTEGER(column)[i] : DtoLL(REAL(column)[i]); - // != REALSXP rather than ==INTSXP to cover LGLSXP when logicalAsInt==TRUE - char *ch = *thisCh; + char *ch = *pch; + int32_t x = col[row]; if (x == 0) { *ch++ = '0'; - } else if (x == ((TYPEOF(column)==INTSXP) ? NA_INTEGER : NA_INT64_LL)) { + } else if (x == INT32_MIN) { write_chars(na, &ch); } else { if (x<0) { *ch++ = '-'; x=-x; } - write_positive_int(x, &ch); + write_positive_int((int64_t)x, &ch); } - *thisCh = ch; + *pch = ch; } -SEXP genLookups() { - Rprintf("genLookups commented out of the package so it's clear it isn't needed to build. The hooks are left in so it's easy to put back in development should we need to.\n"); - // e.g. ldexpl may not be available on some platforms, or if it is it may not be accurate. 
- return R_NilValue; +void writeInt64(int64_t *col, int row, char **pch) +{ + char *ch = *pch; + int64_t x = col[row]; + if (x == 0) { + *ch++ = '0'; + } else if (x == INT64_MIN) { + write_chars(na, &ch); + } else { + if (x<0) { *ch++ = '-'; x=-x; } + write_positive_int(x, &ch); + } + *pch = ch; } + /* + * Generate fwriteLookups.h which defines sigparts, expsig and exppow that writeFloat64() below uses. + * It was run once a long time ago in dev and we don't need to generate it again unless we change it. + * Commented out and left here in the file where its result is used, in case we need it in future. + * Reason: ldexpl may not be available on all platforms and is slower than a direct lookup when it is. + * +void genLookups() { FILE *f = fopen("/tmp/fwriteLookups.h", "w"); fprintf(f, "//\n\ // Generated by fwrite.c:genLookups()\n\ @@ -129,34 +165,33 @@ SEXP genLookups() { } */ -static void writeNumeric(SEXP column, int i, char **thisCh) +void writeFloat64(double *col, int row, char **pch) { // hand-rolled / specialized for speed - // *thisCh is safely the output destination with enough space (ensured via calculating maxLineLen up front) + // *pch is safely the output destination with enough space (ensured via calculating maxLineLen up front) // technique similar to base R (format.c:formatReal and printutils.c:EncodeReal0) // differences/tricks : // i) no buffers. writes straight to the final file buffer passed to write() // ii) no C libary calls such as sprintf() where the fmt string has to be interpretted over and over // iii) no need to return variables or flags. Just writes. - // iv) shorter, easier to read and reason with. In one self contained place. + // iv) shorter, easier to read and reason with in one self contained place.
+ double x = col[row]; + char *ch = *pch; + if (!isfinite(x)) { + if (isnan(x)) { write_chars(na, &ch); - } else if (x>0) { - *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f'; } else { - *ch++ = '-'; *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f'; + if (x<0) *ch++ = '-'; + *ch++ = 'I'; *ch++ = 'n'; *ch++ = 'f'; } } else if (x == 0.0) { *ch++ = '0'; // and we're done. so much easier rather than passing back special cases } else { if (x < 0.0) { *ch++ = '-'; x = -x; } // and we're done on sign, already written. no need to pass back sign - union { double d; unsigned long long ull; } u; + union { double d; uint64_t l; } u; u.d = x; - unsigned long long fraction = u.ull & 0xFFFFFFFFFFFFF; // (1ULL<<52)-1; - int exponent = (int)((u.ull>>52) & 0x7FF); // [0,2047] + uint64_t fraction = u.l & 0xFFFFFFFFFFFFF; // (1<<52)-1; + uint32_t exponent = (int32_t)((u.l>>52) & 0x7FF); // [0,2047] // Now sum the appropriate powers 2^-(1:52) of the fraction // Important for accuracy to start with the smallest first; i.e. 2^-52 @@ -169,7 +204,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) if (fraction) { while ((fraction & 0xFF) == 0) { fraction >>= 8; i-=8; } while (fraction) { - acc += sigparts[(((fraction&1u)^1u)-1u) & i]; + acc += sigparts[(((fraction & 1u)^1u)-1u) & i]; i--; fraction >>= 1; } @@ -182,7 +217,7 @@ static void writeNumeric(SEXP column, int i, char **thisCh) double y = (1.0+acc) * expsig[exponent]; // low magnitude mult int exp = exppow[exponent]; if (y>=9.99999999999999) { y /= 10; exp++; } - unsigned long long l = y * SIZE_SF; // low magnitude mult 10^NUM_SF + uint64_t l = y * SIZE_SF; // low magnitude mult 10^NUM_SF // l now contains NUM_SF+1 digits as integer where repeated /10 below is accurate // if (verbose) Rprintf("\nTRACE: acc=%.20Le ; y=%.20Le ; l=%llu ; e=%d ", acc, y, l, exp); @@ -251,71 +286,15 @@ static void writeNumeric(SEXP column, int i, char **thisCh) } } } - *thisCh = ch; -} - -static void writeString(SEXP column, int i, char **thisCh) -{ - SEXP x = 
STRING_ELT(column, i); - char *ch = *thisCh; - if (x == NA_STRING) { - // NA is not quoted by write.csv even when quote=TRUE to distinguish from "NA" - write_chars(na, &ch); - } else { - Rboolean q = quote; - if (q==NA_LOGICAL) { // quote="auto" - const char *tt = CHAR(x); - if (*tt == '\0') { - // Empty strings are always quoted: this distinguishes them from NAs - *ch = '"'; ch[1] = '"'; - *thisCh += 2; - return; - } - while (*tt!='\0' && *tt!=sep && *tt!=sep2 && *tt!='\n' && *tt!='"') *ch++ = *tt++; - // Windows includes \n in its \r\n so looking for \n only is sufficient - // sep2 is set to '\0' when no list columns are present - if (*tt=='\0') { - // most common case: no sep, newline or " contained in string - *thisCh = ch; // advance caller over the field already written - return; - } - ch = *thisCh; // rewind the field written since it needs to be quoted - q = TRUE; - } - if (q==FALSE) { - write_chars(CHAR(x), &ch); - } else { - *ch++ = '"'; - const char *tt = CHAR(x); - if (qmethod_escape) { - while (*tt!='\0') { - if (*tt=='"' || *tt=='\\') *ch++ = '\\'; - *ch++ = *tt++; - } - } else { - // qmethod='double' - while (*tt!='\0') { - if (*tt=='"') *ch++ = '"'; - *ch++ = *tt++; - } - } - *ch++ = '"'; - } - } - *thisCh = ch; -} - -static void writeFactor(SEXP column, int i, char **thisCh) { - char *ch = *thisCh; - if (INTEGER(column)[i]==NA_INTEGER) write_chars(na, &ch); - else writeString(getAttrib(column, R_LevelsSymbol), INTEGER(column)[i]-1, &ch); - *thisCh = ch; + *pch = ch; } // DATE/TIME -static inline void write_time(int x, char **thisCh) + +static inline void write_time(int32_t x, char **pch) +// just a helper called below by the real writers (time-only and datetime) { - char *ch = *thisCh; + char *ch = *pch; if (x<0) { // <0 covers NA_INTEGER too (==INT_MIN checked in init.c) write_chars(na, &ch); } else { @@ -325,21 +304,23 @@ static inline void write_time(int x, char **thisCh) *ch++ = '0'+hh/10; *ch++ = '0'+hh%10; *ch++ = ':'; - ch -= squash; + ch -= 
squashDateTime; *ch++ = '0'+mm/10; *ch++ = '0'+mm%10; *ch++ = ':'; - ch -= squash; + ch -= squashDateTime; *ch++ = '0'+ss/10; *ch++ = '0'+ss%10; } - *thisCh = ch; + *pch = ch; } -static void writeITime(SEXP column, int i, char **thisCh) { - write_time(INTEGER(column)[i], thisCh); + +void writeITime(int32_t *col, int row, char **pch) { + write_time(col[row], pch); } -static inline void write_date(int x, char **thisCh) +static inline void write_date(int32_t x, char **pch) +// just a helper called below by the two real writers (date-only and datetime) { // From base ?Date : // " Dates are represented as the number of days since 1970-01-01, with negative values @@ -359,7 +340,7 @@ static inline void write_date(int x, char **thisCh) // The end result is 5 lines of simple branch free integer math with no library calls. // as.integer(as.Date(c("0000-03-01","9999-12-31"))) == c(-719468,+2932896) - char *ch = *thisCh; + char *ch = *pch; if (x< -719468 || x>2932896) { // NA_INTEGER<(-719468) (==INT_MIN checked in init.c) write_chars(na, &ch); @@ -370,31 +351,33 @@ static inline void write_date(int x, char **thisCh) int md = monthday[z]; // See fwriteLookups.h for how the 366 item lookup 'monthday' is arranged y += z && (md/100)<3; // The +1 above turned z=-1 to 0 (meaning Feb29 of year y not Jan or Feb of y+1) - ch += 7 + 2*!squash; + ch += 7 + 2*!squashDateTime; *ch-- = '0'+md%10; md/=10; *ch-- = '0'+md%10; md/=10; *ch-- = '-'; - ch += squash; + ch += squashDateTime; *ch-- = '0'+md%10; md/=10; *ch-- = '0'+md%10; md/=10; *ch-- = '-'; - ch += squash; + ch += squashDateTime; *ch-- = '0'+y%10; y/=10; *ch-- = '0'+y%10; y/=10; *ch-- = '0'+y%10; y/=10; *ch = '0'+y%10; y/=10; - ch += 8 + 2*!squash; + ch += 8 + 2*!squashDateTime; } - *thisCh = ch; + *pch = ch; } -static void writeDateInt(SEXP column, int i, char **thisCh) { - write_date(INTEGER(column)[i], thisCh); + +void writeDateInt32(int32_t *col, int row, char **pch) { + write_date(col[row], pch); } -static void 
writeDateReal(SEXP column, int i, char **thisCh) { - write_date(R_FINITE(REAL(column)[i]) ? (int)REAL(column)[i] : NA_INTEGER, thisCh); + +void writeDateFloat64(double *col, int row, char **pch) { + write_date(isfinite(col[row]) ? (int)(col[row]) : INT32_MIN, pch); } -static void writePOSIXct(SEXP column, int i, char **thisCh) +void writePOSIXct(double *col, int row, char **pch) { // Write ISO8601 UTC by default to encourage ISO standards, stymie ambiguity and for speed. // R internally represents POSIX datetime in UTC always. Its 'tzone' attribute can be ignored. @@ -403,9 +386,9 @@ static void writePOSIXct(SEXP column, int i, char **thisCh) // All positive integers up to 2^53 (9e15) are exactly representable by double which is relied // on in the ops here; number of seconds since epoch. - double x = REAL(column)[i]; - char *ch = *thisCh; - if (!R_FINITE(x)) { + double x = col[row]; + char *ch = *pch; + if (!isfinite(x)) { write_chars(na, &ch); } else { int xi, d, t; @@ -424,21 +407,21 @@ static void writePOSIXct(SEXP column, int i, char **thisCh) m /= 10; write_date(d, &ch); *ch++ = 'T'; - ch -= squash; + ch -= squashDateTime; write_time(t, &ch); - if (squash || (m && m%1000==0)) { - // when squash always write 3 digits of milliseconds even if 000, for consistent scale of squash integer64 + if (squashDateTime || (m && m%1000==0)) { + // when squashDateTime always write 3 digits of milliseconds even if 000, for consistent scale of squash integer64 // don't use writeInteger() because it doesn't 0 pad which we need here // integer64 is big enough for squash with milli but not micro; trunc (not round) micro when squash m /= 1000; *ch++ = '.'; - ch -= squash; + ch -= squashDateTime; *(ch+2) = '0'+m%10; m/=10; *(ch+1) = '0'+m%10; m/=10; *ch = '0'+m; ch += 3; } else if (m) { - // microseconds are present and !squash + // microseconds are present and !squashDateTime *ch++ = '.'; *(ch+5) = '0'+m%10; m/=10; *(ch+4) = '0'+m%10; m/=10; @@ -449,16 +432,16 @@ static void 
writePOSIXct(SEXP column, int i, char **thisCh) ch += 6; } *ch++ = 'Z'; - ch -= squash; + ch -= squashDateTime; } - *thisCh = ch; + *pch = ch; } -static void writeNanotime(SEXP column, int i, char **thisCh) +void writeNanotime(int64_t *col, int row, char **pch) { - long long x = DtoLL(REAL(column)[i]); - char *ch = *thisCh; - if (x == NA_INT64_LL) { + int64_t x = col[row]; + char *ch = *pch; + if (x == INT64_MIN) { write_chars(na, &ch); } else { int d/*days*/, s/*secs*/, n/*nanos*/; @@ -475,61 +458,76 @@ static void writeNanotime(SEXP column, int i, char **thisCh) } write_date(d, &ch); *ch++ = 'T'; - ch -= squash; + ch -= squashDateTime; write_time(s, &ch); *ch++ = '.'; - ch -= squash; + ch -= squashDateTime; for (int i=8; i>=0; i--) { *(ch+i) = '0'+n%10; n/=10; } // always 9 digits for nanoseconds ch += 9; *ch++ = 'Z'; - ch -= squash; + ch -= squashDateTime; } - *thisCh = ch; + *pch = ch; } -static void writeList(SEXP, int, char **); // prototype needed because it calls back to whichWriter too - -static writer_fun_t whichWriter(SEXP column) { - switch(TYPEOF(column)) { - case LGLSXP: - return logicalAsInt ? 
writeInteger : writeLogical; - case INTSXP: - if (isFactor(column)) return writeFactor; - if (dateTimeAs==DATETIMEAS_EPOCH) return writeInteger; - if (INHERITS(column, char_ITime)) return writeITime; - if (INHERITS(column, char_Date)) return writeDateInt; - return writeInteger; - case REALSXP: - if (INHERITS(column, char_nanotime) && dateTimeAs!=DATETIMEAS_EPOCH) return writeNanotime; - if (INHERITS(column, char_integer64))return writeInteger; - if (dateTimeAs==DATETIMEAS_EPOCH) return writeNumeric; - if (INHERITS(column, char_Date)) return writeDateReal; - if (INHERITS(column, char_POSIXct)) return writePOSIXct; - return writeNumeric; - case STRSXP: - return writeString; - case VECSXP: - return writeList; - default: - return NULL; +static inline void write_string(const char *x, char **pch) +{ + char *ch = *pch; + if (x == NULL) { + // NA is not quoted even when quote=TRUE to distinguish from quoted "NA" value. But going forward: ,,==NA and ,"",==empty string + write_chars(na, &ch); + } else { + int8_t q = doQuote; + if (q==INT8_MIN) { // NA means quote="auto" + const char *tt = x; + if (*tt == '\0') { + // Empty strings are always quoted to distinguish from ,,==NA + *ch++='"'; *ch++='"'; + *pch = ch; + return; + } + while (*tt!='\0' && *tt!=sep && *tt!=sep2 && *tt!='\n' && *tt!='\r' && *tt!='"') *ch++ = *tt++; + // Windows includes \n in its \r\n so looking for \n only is sufficient + // sep2 is set to '\0' when no list columns are present + if (*tt=='\0') { + // most common case: no sep, newline or " contained in string + *pch = ch; // advance caller over the field already written + return; + } + ch = *pch; // rewind the field written since it needs to be quoted + q = true; + } + if (q==false) { + write_chars(x, &ch); + } else { + *ch++ = '"'; + const char *tt = x; + if (qmethodEscape) { + while (*tt!='\0') { + if (*tt=='"' || *tt=='\\') *ch++ = '\\'; + *ch++ = *tt++; + } + } else { + // qmethod='double' + while (*tt!='\0') { + if (*tt=='"') *ch++ = '"'; + *ch++ 
= *tt++; + } + } + *ch++ = '"'; + } } + *pch = ch; } -static void writeList(SEXP column, int i, char **thisCh) { - SEXP v = VECTOR_ELT(column,i); - writer_fun_t fun = whichWriter(v); - if (TYPEOF(v)==VECSXP || fun==NULL) { - error("Row %d of list column is type '%s' - not yet implemented. fwrite() can write list columns containing atomic vectors of type logical, integer, integer64, double, character and factor, currently.", i+1, type2char(TYPEOF(v))); - } - char *ch = *thisCh; - write_chars(sep2start, &ch); - for (int j=0; j 1e6 columns - writer_fun_t *fun = (writer_fun_t *)R_alloc(ncol, sizeof(writer_fun_t)); - for (int j=0; j1024) STOP("buffMB=%d outside [1,1024]", buffMB); + size_t buffSize = (size_t)1024*1024*buffMB; + char *buff = malloc(buffSize); + if (!buff) STOP("Unable to allocate %dMB for line length estimation: %s", buffMB, strerror(errno)); + int maxLineLen = 0; - int na_len = strlen(na); - int step = nrow<1000 ? 100 : nrow/10; - for (int start=0; start 1 million bytes + args.funs[args.whichFun[j]]( args.columns[j], i, &ch ); + thisLineLen += (int)(ch-buff) + 1/*sep*/; // see comments above about restrictions/guarantees/contracts + } if (thisLineLen > maxLineLen) maxLineLen = thisLineLen; } } - maxLineLen += strlen(eol); - if (verbose) Rprintf("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(clock()-t0)/CLOCKS_PER_SEC); + maxLineLen += eolLen; + if (args.verbose) DTPRINT("maxLineLen=%d from sample. Found in %.3fs\n", maxLineLen, 1.0*(wallclock()-t0)); int f; - if (*filename=='\0') { + if (*args.filename=='\0') { f=-1; // file="" means write to standard output - eol = "\n"; // We'll use Rprintf(); it knows itself about \r\n on Windows + // eol = "\n"; // We'll use DTPRINT which converts \n to \r\n inside it on Windows } else { #ifdef WIN32 - f = _open(filename, _O_WRONLY | _O_BINARY | _O_CREAT | (LOGICAL(append)[0] ? 
_O_APPEND : _O_TRUNC), _S_IWRITE); - // eol must be passed from R level as '\r\n' on Windows since write() only auto-converts \n to \r\n in - // _O_TEXT mode. We use O_BINARY for full control and perhaps speed since O_TEXT must have to deep branch an if('\n') + f = _open(args.filename, _O_WRONLY | _O_BINARY | _O_CREAT | (args.append ? _O_APPEND : _O_TRUNC), _S_IWRITE); + // O_BINARY rather than O_TEXT for explicit control and speed since it seems that write() has a branch inside it + // to convert \n to \r\n on Windows when in text mode but not when in binary mode. #else - f = open(filename, O_WRONLY | O_CREAT | (LOGICAL(append)[0] ? O_APPEND : O_TRUNC), 0666); + f = open(args.filename, O_WRONLY | O_CREAT | (args.append ? O_APPEND : O_TRUNC), 0666); + // There is no binary/text mode distinction on Linux and Mac #endif if (f == -1) { int erropen = errno; - if( access( filename, F_OK ) != -1 ) - error("%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?", strerror(erropen), filename); - else - error("%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", strerror(erropen), filename); + STOP(access( args.filename, F_OK ) != -1 ? + "%s: '%s'. Failed to open existing file for writing. Do you have write permission to it? Is this Windows and does another process such as Excel have it open?" : + "%s: '%s'. Unable to create new file for writing (it does not exist already). Do you have permission to write here, is there space on the disk and does the path exist?", + strerror(erropen), args.filename); } } - t0=clock(); + t0=wallclock(); - if (verbose) { - Rprintf("Writing column names ... "); - if (f==-1) Rprintf("\n"); + if (args.verbose) { + DTPRINT("Writing column names ... 
"); + if (f==-1) DTPRINT("\n"); } - if (LOGICAL(col_names)[0]) { - SEXP names = getAttrib(DFin, R_NamesSymbol); - if (names!=R_NilValue) { - if (LENGTH(names) != ncol) error("Internal error: length of column names is not equal to the number of columns. Please report."); - // allow for quoting even when not. - int buffSize = 2/*""*/ +1/*,*/; - for (int j=0; j 1 million bytes long + *ch++ = args.sep; // this sep after the last column name won't be written to the file + } + if (f==-1) { + DTPRINT(args.eol); + } else if (WRITE(f, args.eol, eolLen)==-1) { + int errwrite=errno; + close(f); + free(buff); + STOP("%s: '%s'", strerror(errwrite), args.filename); } } - if (verbose) Rprintf("done in %.3fs\n", 1.0*(clock()-t0)/CLOCKS_PER_SEC); - if (nrow == 0) { - if (verbose) Rprintf("No data rows present (nrow==0)\n"); - if (f!=-1 && CLOSE(f)) error("%s: '%s'", strerror(errno), filename); - UNPROTECT(protecti); - return(R_NilValue); + free(buff); // TODO: also to be free'd in cleanup when there's an error opening file above + if (args.verbose) DTPRINT("done in %.3fs\n", 1.0*(wallclock()-t0)); + if (args.nrow == 0) { + if (args.verbose) DTPRINT("No data rows present (nrow==0)\n"); + if (f!=-1 && CLOSE(f)) STOP("%s: '%s'", strerror(errno), args.filename); + return; } // Decide buffer size and rowsPerBatch for each thread @@ -807,50 +710,45 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. // turn out to be longer than estimated from the sample. // buffSize large enough to fit many lines to i) reduce calls to write() and ii) reduce thread sync points // It doesn't need to be small in cache because it's written contiguously. - // If we don't use all the buffer for any reasons that's ok as OS will only page in the pages touched. + // If we don't use all the buffer for any reasons that's ok as OS will only getch the cache lines touched. // So, generally the larger the better up to max filesize/nth to use all the threads. 
A few times // smaller than that though, to achieve some load balancing across threads since schedule(dynamic). - int buffMB = INTEGER(buffMB_Arg)[0]; // checked at R level between 1 and 1024 - if (buffMB<1 || buffMB>1024) error("buffMB=%d outside [1,1024]", buffMB); // check it again even so - size_t buffSize = 1024*1024*buffMB; - if (maxLineLen > buffSize) buffSize=2*maxLineLen; // A very long line; at least 1,048,576 characters + if (maxLineLen > buffSize) buffSize=2*maxLineLen; // A very long line; at least 1,048,576 characters (since min(buffMB)==1) rowsPerBatch = - (10*maxLineLen > buffSize) ? 1 : // very long lines (100,000 characters+) we'll just do one row at a time. + (10*maxLineLen > buffSize) ? 1 : // very very long lines (100,000 characters+) each thread will just do one row at a time. 0.5 * buffSize/maxLineLen; // Aim for 50% buffer usage. See checkBuffer for comments. - if (rowsPerBatch > nrow) rowsPerBatch=nrow; - int numBatches = (nrow-1)/rowsPerBatch + 1; + if (rowsPerBatch > args.nrow) rowsPerBatch = args.nrow; + int numBatches = (args.nrow-1)/rowsPerBatch + 1; + int nth = args.nth; if (numBatches < nth) nth = numBatches; - if (verbose) { - Rprintf("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... ", - nrow, numBatches, rowsPerBatch, buffMB, showProgress, nth); - if (f==-1) Rprintf("\n"); + if (args.verbose) { + DTPRINT("Writing %d rows in %d batches of %d rows (each buffer size %dMB, showProgress=%d, nth=%d) ... ", + args.nrow, numBatches, rowsPerBatch, args.buffMB, args.showProgress, nth); + if (f==-1) DTPRINT("\n"); } - t0 = clock(); + t0 = wallclock(); failed=0; // static global so checkBuffer can set it. 
-errno for malloc or realloc fails, +errno for write fail - Rboolean hasPrinted=FALSE; - Rboolean anyBufferGrown=FALSE; + bool hasPrinted=false; + bool anyBufferGrown=false; int maxBuffUsedPC=0; #pragma omp parallel num_threads(nth) { - char *ch, *buffer; // local to each thread - ch = buffer = malloc(buffSize); // each thread has its own buffer - // Don't use any R API alloc here (e.g. R_alloc); they are - // not thread-safe as per last sentence of R-exts 6.1.1. - - if (buffer==NULL) {failed=-errno;} + char *ch, *myBuff; // local to each thread + ch = myBuff = malloc(buffSize); // each thread has its own buffer. malloc and errno are thread-safe. + if (myBuff==NULL) {failed=-errno;} // Do not rely on availability of '#omp cancel' new in OpenMP v4.0 (July 2013). // OpenMP v4.0 is in gcc 4.9+ (https://gcc.gnu.org/wiki/openmp) but // not yet in clang as of v3.8 (http://openmp.llvm.org/) // If not-me failed, I'll see shared 'failed', fall through loop, free my buffer - // and after parallel section, single thread will call R API error() safely. + // and after parallel section, single thread will call STOP() safely. size_t myAlloc = buffSize; size_t myMaxLineLen = maxLineLen; - // so we can realloc(). Should only be needed if there are very long single CHARSXP - // much longer than occurred in the sample for maxLineLen. Or for list() columns - // contain vectors which are much longer than occurred in the sample. + // so we can realloc(). Should only be needed if there are very long lines that are + // much longer than occurred in the sample for maxLineLen; e.g. unusually long string values + // that didn't occur in the sample, or list columns with some very long vectors in some cells. #pragma omp single { @@ -859,28 +757,33 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. int me = omp_get_thread_num(); #pragma omp for ordered schedule(dynamic) - for(RLEN start=0; start=1 because 0-columns was caught earlier. 
- write_chars(eol, &ch); // replace it with the newline. + write_chars(args.eol, &ch); // overwrite last sep with eol instead // Track longest line seen so far. If we start to see longer lines than we saw in the // sample, we'll realloc the buffer. The rowsPerBatch chosen based on the (very good) sample, @@ -888,15 +791,15 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. // file output would be out-of-order. Can't change rowsPerBatch after the 'parallel for' started. size_t thisLineLen = ch-lineStart; if (thisLineLen > myMaxLineLen) myMaxLineLen=thisLineLen; - checkBuffer(&buffer, &myAlloc, &ch, myMaxLineLen); - if (failed) break; // this thread stop writing rows; fall through to clear up and error() below + checkBuffer(&myBuff, &myAlloc, &ch, myMaxLineLen); + if (failed) break; // this thread stop writing rows; fall through to clear up and STOP() below } #pragma omp ordered { if (!failed) { // a thread ahead of me could have failed below while I was working or waiting above if (f==-1) { - *ch='\0'; // standard C string end marker so Rprintf knows where to stop - Rprintf(buffer); + *ch='\0'; // standard C string end marker so DTPRINT knows where to stop + DTPRINT(myBuff); // nth==1 at this point since when file=="" (f==-1 here) fwrite.R calls setDTthreads(1) // Although this ordered section is one-at-a-time it seems that calling Rprintf() here, even with a // R_FlushConsole() too, causes corruptions on Windows but not on Linux. At least, as observed so @@ -904,28 +807,28 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. // by slave threads, even when one-at-a-time. Anyway, made this single-threaded when output to console // to be safe (setDTthreads(1) in fwrite.R) since output to console doesn't need to be fast. 
} else { - if (WRITE(f, buffer, (int)(ch-buffer)) == -1) { + if (WRITE(f, myBuff, (int)(ch-myBuff)) == -1) { failed=errno; } - if (myAlloc > buffSize) anyBufferGrown = TRUE; - int used = 100*((double)(ch-buffer))/buffSize; // percentage of original buffMB + if (myAlloc > buffSize) anyBufferGrown = true; + int used = 100*((double)(ch-myBuff))/buffSize; // percentage of original buffMB if (used > maxBuffUsedPC) maxBuffUsedPC = used; - time_t now; - if (me==0 && showProgress && (now=time(NULL))>=next_time && !failed) { + double now; + if (me==0 && args.showProgress && (now=wallclock())>=nextTime && !failed) { // See comments above inside the f==-1 clause. // Not only is this ordered section one-at-a-time but we'll also Rprintf() here only from the // master thread (me==0) and hopefully this will work on Windows. If not, user should set // showProgress=FALSE until this can be fixed or removed. - int ETA = (int)((nrow-end)*(((double)(now-start_time))/end)); + int ETA = (int)((args.nrow-end)*((now-startTime)/end)); if (hasPrinted || ETA >= 2) { - if (verbose && !hasPrinted) Rprintf("\n"); - Rprintf("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " + if (args.verbose && !hasPrinted) DTPRINT("\n"); + DTPRINT("\rWritten %.1f%% of %d rows in %d secs using %d thread%s. " "anyBufferGrown=%s; maxBuffUsed=%d%%. ETA %d secs. ", - (100.0*end)/nrow, nrow, (int)(now-start_time), nth, nth==1?"":"s", + (100.0*end)/args.nrow, args.nrow, (int)(now-startTime), nth, nth==1?"":"s", anyBufferGrown?"yes":"no", maxBuffUsedPC, ETA); - R_FlushConsole(); // for Windows - next_time = now+1; - hasPrinted = TRUE; + // TODO: use progress() as in fread + nextTime = now+1; + hasPrinted = true; } } // May be possible for master thread (me==0) to call R_CheckUserInterrupt() here. @@ -943,11 +846,11 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. // Conclusion for now: do not provide ability to interrupt. 
// write() errors and malloc() fails will be caught and cleaned up properly, however. } - ch = buffer; // back to the start of my buffer ready to fill it up again + ch = myBuff; // back to the start of my buffer ready to fill it up again } } } - free(buffer); + free(myBuff); // all threads will call this free on their buffer, even if one or more threads had malloc // or realloc fail. If the initial malloc failed, free(NULL) is ok and does nothing. } @@ -955,28 +858,27 @@ SEXP writefile(SEXP DFin, // any list of same length vectors; e.g. if (hasPrinted) { if (!failed) { // clear the progress meter - Rprintf("\r " + DTPRINT("\r " " \r"); - R_FlushConsole(); // for Windows } else { // unless failed as we'd like to see anyBufferGrown and maxBuffUsedPC - Rprintf("\n"); + DTPRINT("\n"); } } if (f!=-1 && CLOSE(f) && !failed) - error("%s: '%s'", strerror(errno), filename); + STOP("%s: '%s'", strerror(errno), args.filename); // quoted '%s' in case of trailing spaces in the filename // If a write failed, the line above tries close() to clean up, but that might fail as well. So the // '&& !failed' is to not report the error as just 'closing file' but the next line for more detail // from the original error. if (failed<0) { - error("%s. One or more threads failed to malloc or realloc their private buffer. nThread=%d and initial buffMB per thread was %d.\n", strerror(-failed), nth, buffMB); + STOP("%s. One or more threads failed to malloc or realloc their private buffer. 
nThread=%d and initial buffMB per thread was %d.\n", + strerror(-failed), nth, args.buffMB); } else if (failed>0) { - error("%s: '%s'", strerror(failed), filename); + STOP("%s: '%s'", strerror(failed), args.filename); } - if (verbose) Rprintf("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", - nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); - UNPROTECT(protecti); - return(R_NilValue); + if (args.verbose) DTPRINT("done (actual nth=%d, anyBufferGrown=%s, maxBuffUsed=%d%%)\n", + nth, anyBufferGrown?"yes":"no", maxBuffUsedPC); + return; } diff --git a/src/fwrite.h b/src/fwrite.h new file mode 100644 index 0000000000..6eacdc0846 --- /dev/null +++ b/src/fwrite.h @@ -0,0 +1,85 @@ +#ifdef DTPY + #include "py_fread.h" +#else + #include "freadR.h" // STOP, DTPRINT, DTWARN // TODO rename frw.h? +#endif + +typedef void (*writer_fun_t)(void *, int64_t, char **); + +void writeBool8(); +void writeBool32(); +void writeBool32AsString(); +void writeInt32(); +void writeInt64(); +void writeFloat64(); +void writeITime(); +void writeDateInt32(); +void writeDateFloat64(); +void writePOSIXct(); +void writeNanotime(); +void writeString(); +void writeCategString(); +void writeList(); + +void write_chars(const char *source, char **dest); + +typedef struct fwriteMainArgs +{ + // Name of the file to open (a \0-terminated C string). If the file name + // contains non-ASCII characters, it should be UTF-8 encoded (however fread + // will not validate the encoding). 
+ const char *filename; + + int ncol; + + int64_t nrow; + + // a vector of pointers to all-same-length column vectors + void **columns; + + writer_fun_t *funs; // a vector of writer_fun_t function pointers + + // length ncol vector containing which fun[] to use for each column + // one byte to use 8 times less cache lines than a vector of function pointers would do + // A limit of 256 writers seems more than sufficient + uint8_t *whichFun; + + void *colNames; // NULL means no header, otherwise ncol strings + + bool doRowNames; // optional, likely false + + void *rowNames; // if doRowNames is true and rowNames is not NULL then they're used, otherwise row numbers are output. + + char sep; + + char sep2; + + char dec; + + const char *eol; + + const char *na; + + // The quote character is always " (ascii 34) and cannot be changed since nobody on Earth uses a different quoting character, surely + // doQuote controls whether to quote fields or not. NA=="auto" (default) means the contents are inspected to see if sep, eol or quote + // is present and if so, quotes the field. Else 1=quote all fields, 0=no quoting even when sep is present + int8_t doQuote; + + bool qmethodEscape; // true means escape quotes using backslash, else double-up double quotes. + + bool squashDateTime; + + bool append; + + int buffMB; // [1-1024] default 8MB + + int nth; + + bool showProgress; + + bool verbose; + +} fwriteMainArgs; + +void fwriteMain(fwriteMainArgs args); + diff --git a/src/fwriteR.c b/src/fwriteR.c new file mode 100644 index 0000000000..19b3912252 --- /dev/null +++ b/src/fwriteR.c @@ -0,0 +1,251 @@ + +#include +#include "data.table.h" +#include "fwrite.h" + +#define DATETIMEAS_EPOCH 2 +#define DATETIMEAS_WRITECSV 3 + +static char sep2; // '\0' if there are no list columns. Otherwise, the within-column separator. +static bool logical01=true; // should logicals be written as 0|1 or true|false. Needed by list column writer too in case a cell is a logical vector. 
+static int dateTimeAs=0; // 0=ISO(yyyy-mm-dd), 1=squash(yyyymmdd), 2=epoch, 3=write.csv +static const char *sep2start, *sep2end; +// sep2 is in main fwrite.c so that writeString can quote other fields if sep2 is present in them +// if there are no list columns, set sep2=='\0' + +// Non-agnostic helpers ... + +const char *getString(SEXP *col, int row) { // TODO: inline for use in fwrite.c + SEXP x = col[row]; + return x==NA_STRING ? NULL : CHAR(x); +} + +const char *getCategString(SEXP col, int row) { + // the only writer that needs to have the header of the SEXP column, to get to the levels + int x = INTEGER(col)[row]; + return x==NA_INTEGER ? NULL : CHAR(STRING_ELT(getAttrib(col, R_LevelsSymbol), x-1)); +} + +writer_fun_t funs[] = { + &writeBool8, + &writeBool32, + &writeBool32AsString, + &writeInt32, + &writeInt64, + &writeFloat64, + &writeITime, + &writeDateInt32, + &writeDateFloat64, + &writePOSIXct, + &writeNanotime, + &writeString, + &writeCategString, + &writeList +}; + +typedef enum { // same order as fun[] above + WF_Bool8, + WF_Bool32, + WF_Bool32AsString, + WF_Int32, + WF_Int64, + WF_Float64, + WF_ITime, + WF_DateInt32, + WF_DateFloat64, + WF_POSIXct, + WF_Nanotime, + WF_String, + WF_CategString, + WF_List +} WFs; + +static int32_t whichWriter(SEXP); + +void writeList(SEXP *col, int64_t row, char **pch) { + SEXP v = col[row]; + int32_t wf = whichWriter(v); + if (TYPEOF(v)==VECSXP || wf==INT32_MIN) { + error("Row %d of list column is type '%s' - not yet implemented. fwrite() can write list columns containing atomic vectors of type logical, integer, integer64, double, character and factor, currently.", + row+1, type2char(TYPEOF(v))); + } + char *ch = *pch; + write_chars(sep2start, &ch); + void *data = (void *)DATAPTR(v); + writer_fun_t fun = funs[wf]; + for (int j=0; j