diff --git a/NAMESPACE b/NAMESPACE index 207d50593a..ad306b4ce8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -148,7 +148,7 @@ if (getRversion() >= "3.6.0") { # IDateTime support: export(as.IDate,as.ITime,IDateTime) -export(second,minute,hour,yday,wday,mday,week,isoweek,month,quarter,year) +export(second,minute,hour,yday,wday,mday,week,isoweek,month,quarter,year,yearmon,yearqtr) S3method("[", ITime) S3method("+", IDate) diff --git a/NEWS.md b/NEWS.md index a3f2fe70cd..bf83288464 100644 --- a/NEWS.md +++ b/NEWS.md @@ -292,6 +292,8 @@ # 2: 2 10 ``` +40. New functions `yearmon()` and `yearqtr()` give a combined representation of `year()` and `month()`/`quarter()`. These and also `yday`, `wday`, `mday`, `week`, `month`, `quarter` and `year` are now optimized for memory and compute efficiency by removing the `POSIXlt` dependency, [#649](https://github.com/Rdatatable/data.table/issues/649). Thanks to Matt Dowle for the request, and Benjamin Schwendinger for the PR. + ## BUG FIXES 1. `by=.EACHI` when `i` is keyed but `on=` different columns than `i`'s key could create an invalidly keyed result, [#4603](https://github.com/Rdatatable/data.table/issues/4603) [#4911](https://github.com/Rdatatable/data.table/issues/4911). Thanks to @myoung3 and @adamaltmejd for reporting, and @ColeMiller1 for the PR. An invalid key is where a `data.table` is marked as sorted by the key columns but the data is not sorted by those columns, leading to incorrect results from subsequent queries.
diff --git a/R/IDateTime.R b/R/IDateTime.R
index 33d04b87c4..4e6adf55e3 100644
--- a/R/IDateTime.R
+++ b/R/IDateTime.R
@@ -338,10 +338,10 @@ hour = function(x) {
   if (inherits(x, 'ITime')) return(as.integer(x) %/% 3600L %% 24L)
   as.POSIXlt(x)$hour
 }
-yday = function(x) as.POSIXlt(x)$yday + 1L
-wday = function(x) (unclass(as.IDate(x)) + 4L) %% 7L + 1L
-mday = function(x) as.POSIXlt(x)$mday
-week = function(x) yday(x) %/% 7L + 1L
+yday = function(x) convertDate(as.IDate(x), "yday")
+wday = function(x) convertDate(as.IDate(x), "wday")
+mday = function(x) convertDate(as.IDate(x), "mday")
+week = function(x) convertDate(as.IDate(x), "week")
 isoweek = function(x) {
   # ISO 8601-conformant week, as described at
   # https://en.wikipedia.org/wiki/ISO_week_date
@@ -356,7 +356,14 @@ isoweek = function(x) {
   1L + (nearest_thurs - year_start) %/% 7L
 }
 
-month = function(x) as.POSIXlt(x)$mon + 1L
-quarter = function(x) as.POSIXlt(x)$mon %/% 3L + 1L
-year = function(x) as.POSIXlt(x)$year + 1900L
+month = function(x) convertDate(as.IDate(x), "month")
+quarter = function(x) convertDate(as.IDate(x), "quarter")
+year = function(x) convertDate(as.IDate(x), "year")
+yearmon = function(x) convertDate(as.IDate(x), "yearmon")
+yearqtr = function(x) convertDate(as.IDate(x), "yearqtr")
 
+# internal helper: validates 'type' and hands the IDate over to the C implementation
+convertDate = function(x, type) {
+  type = match.arg(type, c("yday", "wday", "mday", "week", "month", "quarter", "year", "yearmon", "yearqtr"))
+  .Call(CconvertDate, x, type)
+}
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 7ac83b66b5..a1b751859c 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -86,6 +86,19 @@ if (exists("test.data.table", .GlobalEnv, inherits=FALSE)) {
   last = data.table::last # xts
   first = data.table::first # xts, S4Vectors
   copy = data.table::copy # bit64 v4; bit64 offered to rename though so this is just in case bit64 unoffers
+  second = data.table::second # lubridate #1135
+  minute = data.table::minute # lubridate
+  hour = data.table::hour # lubridate
+  yday = data.table::yday # lubridate
+  wday = data.table::wday # lubridate
+  mday = data.table::mday # lubridate
+  week = data.table::week # lubridate
+  isoweek = data.table::isoweek # lubridate
+  month = data.table::month # lubridate
+  quarter = data.table::quarter # lubridate
+  year = data.table::year # lubridate
+  yearmon = data.table::yearmon # zoo
+  yearqtr = data.table::yearqtr # zoo
 }
 
 # Load optional Suggests packages, which are tested by Travis for code coverage, and on CRAN
@@ -10449,15 +10462,17 @@ test(1692, capture.output(as.data.table(structure(57600L, class = "ITime"))),
 
 # testing all time part extraction routines (subsumes #874)
 t <- "2016-08-03 01:02:03.45"
-test(1693.1, second(t), 3L)
-test(1693.2, minute(t), 2L)
-test(1693.3, hour(t), 1L)
-test(1693.4, yday(t), 216L)
-test(1693.5, wday(t), 4L)
-test(1693.6, week(t), 31L)
-test(1693.7, month(t), 8L)
-test(1693.8, quarter(t), 3L)
-test(1693.9, year(t), 2016L)
+test(1693.01, second(t), 3L)
+test(1693.02, minute(t), 2L)
+test(1693.03, hour(t), 1L)
+test(1693.04, yday(t), 216L)
+test(1693.05, wday(t), 4L)
+test(1693.06, week(t), 31L)
+test(1693.07, month(t), 8L)
+test(1693.08, quarter(t), 3L)
+test(1693.09, year(t), 2016L)
+test(1693.10, yearmon(t), 2016+7/12)
+test(1693.11, yearqtr(t), 2016.5)
 
 # fix for #1740 - sub-assigning NAs for factors
 dt = data.table(x = 1:5, y = factor(c("","a","b","a", "")), z = 5:9)
@@ -18769,3 +18784,19 @@ test(2234.9, DT[, min(.SD), by=somefun(.I)], error="by.*contains .I.*supported")
 DT = data.table(x = 1)
 test(2235.1, copy(DT)[, c("z", "x") := {x = NULL; list(2, NULL)}], data.table(z = 2))
 test(2235.2, copy(DT)[, c("z", "x") := {list(2, NULL)}], data.table(z = 2))
+
+# move IDate from POSIXlt to C, add yearquarter; #649
+x = c("1111-11-11", "2019-01-01", "2019-02-28", "2019-03-01", "2019-12-31", "2020-02-29", "2020-03-01", "2020-12-31", "2040-01-01", "2040-12-31", "2100-03-01")
+test(2236.01, yday(x), c(315L, 1L, 59L, 60L, 365L, 60L, 61L, 366L, 1L, 366L, 60L))
+test(2236.02, mday(x), c(11L, 1L, 28L, 1L, 31L, 29L, 1L, 31L, 1L, 31L, 1L))
+test(2236.03, wday(x), c(7L, 3L, 5L, 6L, 3L, 7L, 1L, 5L, 1L, 2L, 2L))
+test(2236.04, week(x), c(46L, 1L, 9L, 9L, 53L, 9L, 9L, 53L, 1L, 53L, 9L))
+test(2236.05, month(x), c(11L, 1L, 2L, 3L, 12L, 2L, 3L, 12L, 1L, 12L, 3L))
+test(2236.06, quarter(x), c(4L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 1L, 4L, 1L))
+test(2236.07, year(x), c(1111L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2040L, 2040L, 2100L))
+test(2236.08, yearmon(x), c(1111+10/12, 2019, 2019+1/12, 2019+2/12, 2019+11/12, 2020+1/12, 2020+2/12, 2020+11/12, 2040, 2040+11/12, 2100+2/12))
+test(2236.09, yearqtr(x), c(1111.75, 2019, 2019, 2019, 2019.75, 2020, 2020, 2020.75, 2040, 2040.75, 2100))
+test(2236.10, wday(NA_character_), NA_integer_) # missing dates stay missing, as they did with POSIXlt
+test(2236.11, year(NA_character_), NA_integer_)
+test(2236.12, yearmon(NA_character_), NA_real_)
+
diff --git a/man/IDateTime.Rd b/man/IDateTime.Rd
index 876b28b161..6854f59ae9 100644
--- a/man/IDateTime.Rd
+++ b/man/IDateTime.Rd
@@ -41,6 +41,8 @@
 \alias{month}
 \alias{quarter}
 \alias{year}
+\alias{yearmon}
+\alias{yearqtr}
 \alias{IDate-class}
 \alias{ITime-class}
 
@@ -93,6 +95,8 @@ isoweek(x)
 month(x)
 quarter(x)
 year(x)
+yearmon(x)
+yearqtr(x)
 
 }
 
@@ -164,11 +168,11 @@ functions \code{weekdays}, \code{months}, and \code{quarters} can also be
 used, but these return character values, so they must be converted to
 factors for use with data.table. \code{isoweek} is ISO 8601-consistent.
 
-The \code{round} method for IDate's is useful for grouping and plotting. 
+The \code{round} method for IDate's is useful for grouping and plotting.
 It can round to weeks, months, quarters, and years. Similarly, the \code{round}
 and \code{trunc} methods for ITime's are useful for grouping and plotting.
-They can round or truncate to hours and minutes. 
-Note for ITime's with 30 seconds, rounding is inconsistent due to rounding off a 5. 
+They can round or truncate to hours and minutes.
+Note for ITime's with 30 seconds, rounding is inconsistent due to rounding off a 5.
 See 'Details' in \code{\link{round}} for more information.
 
 }
@@ -188,9 +192,14 @@ See 'Details' in \code{\link{round}} for more information.
     and \code{year} return integer values
     for second, minute, hour, day of year, day of week,
     day of month, week, month, quarter, and year, respectively.
-    
-    These values are all taken directly from the \code{POSIXlt} representation 
-    of \code{x}, with the notable difference that while \code{yday}, \code{wday},
+    \code{yearmon} and \code{yearqtr} return double values representing
+    respectively \code{year + (month-1) / 12} and \code{year + (quarter-1) / 4}.
+
+    \code{second}, \code{minute}, \code{hour} are taken directly from
+    the \code{POSIXlt} representation.
+    All other values are computed from the underlying integer representation
+    and comparable with the values of their \code{POSIXlt} representation
+    of \code{x}, with the notable difference that while \code{yday}, \code{wday},
     and \code{mon} are all 0-based, here they are 1-based.
 }
 
@@ -253,7 +262,7 @@ round(seqdates, "months")
 (seqtimes <- seq(as.ITime("07:00"), as.ITime("08:00"), by = 20))
 round(seqtimes, "hours")
 trunc(seqtimes, "hours")
-  
+
 }
 \keyword{utilities}
 
diff --git a/src/idatetime.c b/src/idatetime.c
new file mode 100644
index 0000000000..c70df3b053
--- /dev/null
+++ b/src/idatetime.c
@@ -0,0 +1,183 @@
+#include "data.table.h"
+
+// lengths in days of the 400-, 100-, 4- and 1-year cycles of the Gregorian calendar
+#define YEARS400 146097
+#define YEARS100 36524
+#define YEARS4 1461
+#define YEARS1 365
+
+typedef enum { YDAY, WDAY, MDAY, WEEK, MONTH, QUARTER, YEAR, YEARMON, YEARQTR} datetype;
+
+static inline bool isLeapYear(int year) {
+  return (year % 100 != 0 || year % 400 == 0) && year % 4 == 0;
+}
+
+// Extract one calendar component from a single date and store it in *out.
+//   x    : day count as held by an IDate (days relative to 1970-01-01); may be NA_INTEGER
+//   type : the component to compute
+//   out  : must point to a double for YEARMON/YEARQTR and to an int for every other type
+// A missing input gives a missing output of the matching type.
+void convertSingleDate(int x, datetype type, void *out)
+{
+  // lengths of the months and quarters of a year that starts on 1 March
+  static const char months[] = {31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 31, 29};
+  static const int quarter[] = {31, 91, 92, 92, 60};
+
+  // NA has to be propagated before any arithmetic: NA_INTEGER is INT_MIN, so the
+  // code below would produce a meaningless result and 'x - 11017' would overflow
+  if (x == NA_INTEGER) {
+    if (type == YEARMON || type == YEARQTR)
+      *(double *)out = NA_REAL;
+    else
+      *(int *)out = NA_INTEGER;
+    return;
+  }
+
+  if (type == WDAY) {
+    // 1970-01-01 was a Thursday; the result is 1-based with Sunday == 1
+    int wday = (x + 4) % 7;
+    if (wday < 0) wday += 7;
+    *(int *)out = ++wday;
+    return;
+  }
+
+  // re-base on 2000-03-01, which is 11017 days after 1970-01-01
+  int days = x - 11017;
+
+  int years400 = days / YEARS400;
+  days %= YEARS400;
+  if (days < 0) {
+    days += YEARS400;
+    years400--;
+  }
+
+  int years100 = days / YEARS100;
+  days %= YEARS100;
+
+  int years4 = days / YEARS4;
+  days %= YEARS4;
+
+  int years1 = days / YEARS1;
+  days %= YEARS1;
+
+  // days is now the offset from 1 March; offsets above 305 are in January or February of the next calendar year
+  int year = 2000 + years1 + 4*years4 + 100*years100 + 400*years400;
+  if (days > 305)
+    ++year;
+
+  if (type == YEAR) {
+    *(int *)out = year;
+    return;
+  }
+
+  // for regular offsets: nonzero when the March-based year begins in a leap calendar year (a leap day itself gets 0)
+  int leap = !years1 && (years4 || !years100);
+
+  if (type == YDAY || type == WEEK) {
+    int yday = days + 31 + 28 + leap;
+    if (yday >= YEARS1 + leap)
+      yday -= YEARS1 + leap;
+    *(int *)out = ++yday;
+    if (type == WEEK)
+      *(int *)out = (*(int *)out / 7) + 1;
+    return;
+  }
+
+  if (type == MONTH || type == YEARMON) {
+    int i;
+    // 29 February ends up here with days == 0 (e.g. offset 1460 of a 4-year cycle gives years1 == 4),
+    // so it is told apart from 1 March explicitly; i is the 0-based calendar month
+    if (days==0 && !leap && isLeapYear(year)) {
+      i = 1;
+    } else {
+      i = 2;
+      while (months[i-2] <= days) {
+        days -= months[i-2];
+        i++;
+      }
+    }
+    if (i >= 12)
+      i -= 12;
+
+    if (type == MONTH) {
+      *(int *)out = i + 1;
+    } else {
+      *(double *)out = year + i / 12.0;
+    }
+    return;
+  }
+
+  if (type == MDAY) {
+    // same leap-day special case as for MONTH
+    if (days==0 && !leap && isLeapYear(year)) {
+      *(int *)out = 29;
+      return;
+    }
+    int i = 0;
+    while (months[i] <= days) {
+      days -= months[i];
+      i++;
+    }
+    *(int *)out = ++days;
+    return;
+  }
+
+  if (type == QUARTER || type == YEARQTR) {
+    int i = 0;
+    while (quarter[i] <= days) {
+      days -= quarter[i];
+      i++;
+    }
+    if (i >= 4)
+      i -= 4;
+    if (type == QUARTER) {
+      *(int *)out = i + 1;
+    } else {
+      *(double *)out = year + (i / 4.0);
+    }
+    return;
+  }
+}
+
+// .Call entry point behind the R-level convertDate() wrapper.
+//   x    : integer vector of day counts, as stored by IDate
+//   type : length-1 character vector naming the component to extract
+// Returns a double vector for "yearmon"/"yearqtr" and an integer vector otherwise, of the same length as x.
+SEXP convertDate(SEXP x, SEXP type)
+{
+  if (!isInteger(x)) error(_("x must be an integer vector"));
+  const int *ix = INTEGER(x);
+  const int n = length(x);
+  if (!isString(type) || length(type) != 1)
+    error(_("Internal error: invalid type for convertDate(), should have been caught before. please report to data.table issue tracker")); // # nocov
+  const char *stype = CHAR(STRING_ELT(type, 0));
+  datetype ctype;
+  bool ansint = true;
+  if (!strcmp(stype, "yday")) ctype = YDAY;
+  else if (!strcmp(stype, "wday")) ctype = WDAY;
+  else if (!strcmp(stype, "mday")) ctype = MDAY;
+  else if (!strcmp(stype, "week")) ctype = WEEK;
+  else if (!strcmp(stype, "month")) ctype = MONTH;
+  else if (!strcmp(stype, "quarter")) ctype = QUARTER;
+  else if (!strcmp(stype, "year")) ctype = YEAR;
+  else if (!strcmp(stype, "yearmon")) { ctype = YEARMON; ansint = false; }
+  else if (!strcmp(stype, "yearqtr")) { ctype = YEARQTR; ansint = false; }
+  else error(_("Internal error: invalid type for convertDate, should have been caught before. please report to data.table issue tracker")); // # nocov
+
+  SEXP ans;
+  if (ansint) {
+    ans = PROTECT(allocVector(INTSXP, n));
+    int *ansp = INTEGER(ans);
+    for (int i=0; i < n; ++i) {
+      convertSingleDate(ix[i], ctype, &ansp[i]);
+    }
+  } else {
+    ans = PROTECT(allocVector(REALSXP, n));
+    double *ansp = REAL(ans);
+    for (int i=0; i < n; ++i) {
+      convertSingleDate(ix[i], ctype, &ansp[i]);
+    }
+  }
+  UNPROTECT(1);
+  return ans;
+}
diff --git a/src/init.c b/src/init.c
index 814ada375d..fd43b956e5 100644
--- a/src/init.c
+++ b/src/init.c
@@ -130,6 +130,7 @@ SEXP allNAR();
 SEXP test_dt_win_snprintf();
 SEXP dt_zlib_version();
 SEXP startsWithAny();
+SEXP convertDate();
 
 // .Externals
 SEXP fastmean();
@@ -228,6 +229,7 @@ R_CallMethodDef callMethods[] = {
 {"Cdt_zlib_version", (DL_FUNC)&dt_zlib_version, -1},
 {"Csubstitute_call_arg_namesR", (DL_FUNC) &substitute_call_arg_namesR, -1},
 {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1},
+{"CconvertDate", (DL_FUNC)&convertDate, -1},
 {NULL, NULL, 0}
 };
 