From ca74947b6354d020cbae998047f2b53ed58f3466 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Fri, 7 Dec 2018 01:02:10 -0800
Subject: [PATCH 01/10] interim

---
 src/gsumm.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index 035cfe1950..0c14fddede 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -51,23 +51,32 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   grp = (int *)R_alloc(grpn, sizeof(int));
   // global grp because the g* functions (inside jsub) share this common memory
 
-  maxgrpn = 0;
+  const int *restrict fdp = INTEGER(f);
   if (LENGTH(o)) {
     isunsorted = 1; // for gmedian
-    for (int g=0, *od=INTEGER(o), *fd=INTEGER(f); g<ngrp; g++) {   // R API outside should help when very many small groups, pr#3045
-      int *elem = od + fd[g]-1;
+    const int *restrict odp = INTEGER(o);
+    #pragma omp parallel for num_threads(getDTthreads())
+    for (int g=0; g<ngrp; g++) {
+      const int *elem = odp + fdp[g]-1;
       for (int j=0; j<grpsize[g]; j++)  grp[ elem[j]-1 ] = g;
-      if (grpsize[g]>maxgrpn) maxgrpn = grpsize[g];  // recalculate (may as well since looping anyway) and check below
     }
   } else {
-    for (int g=0, *fd=INTEGER(f); g<ngrp; g++) {
-      int *elem = grp + fd[g]-1;
+    #pragma omp parallel for num_threads(getDTthreads())
+    for (int g=0; g<ngrp; g++) {
+      int *elem = grp + fdp[g]-1;
       for (int j=0; j<grpsize[g]; j++)  elem[j] = g;
-      if (grpsize[g]>maxgrpn) maxgrpn = grpsize[g];  // needed for #2046 and #2111 when maxgrpn attribute is not attached to empty o
     }
   }
   SEXP tt = getAttrib(o, install("maxgrpn"));
-  if (length(tt) && INTEGER(tt)[0]!=maxgrpn) error("Internal error: o's maxgrpn mismatches recalculated maxgrpn"); // # nocov
+  if (length(tt)!=1) {
+    // seems to happen, and finding the maxgrpn is needed, but not sure why (TODO - trace)
+    // old comment to be checked: 'needed for #2046 and #2111 when maxgrpn attribute is not attached to empty o'
+    maxgrpn = 0;
+    for (int g=0; g<ngrp; g++) if (grpsize[g]>maxgrpn) maxgrpn = grpsize[g];
+  } else {
+    // && INTEGER(tt)[0]!=maxgrpn) error("Internal error: o's maxgrpn mismatches recalculated maxgrpn"); // # nocov
+    maxgrpn = INTEGER(tt)[0];
+  }
   oo = INTEGER(o);
   ff = INTEGER(f);
 

From 60e6c4044805deab43a2c90f8f44de9f13273ee8 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Sat, 8 Dec 2018 01:42:01 -0800
Subject: [PATCH 02/10] interim

---
 src/gsumm.c | 239 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 173 insertions(+), 66 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index 0c14fddede..f0d33cce4d 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -1,12 +1,19 @@
 #include "data.table.h"
 //#include <time.h>
 
-static int *grp = NULL;      // the group of each x item, like a factor
 static int ngrp = 0;         // number of groups
 static int *grpsize = NULL;  // size of each group, used by gmean (and gmedian) not gsum
-static int grpn = 0;         // length of underlying x == length(grp)
+static int nrow = 0;         // length of underlying x; same as the length of high, low and grp
 static int *irows;           // GForce support for subsets in 'i' (TODO: joins in 'i')
 static int irowslen = -1;    // -1 is for irows = NULL
+static uint16_t *high=NULL, *low=NULL;  // each row's group id g held as two uint16_t parts: high = g>>shift (in row order), low = g&mask (in high-bucket order within each batch)
+static int *restrict grp;    // TODO: eventually this can be made local for gforce as won't be needed globally when all functions here use gather
+static size_t highSize;      // number of possible high values: ((ngrp-1)>>shift)+1
+static int shift, mask;      // shift = number of bits kept in low; mask = (1<<shift)-1
+static char *gx=NULL;        // nrow*sizeof(double) byte buffer which gather() fills and returns
+
+static size_t nBatch, batchSize, lastBatchSize;  // rows are split into nBatch contiguous batches of batchSize rows; the final batch has lastBatchSize
+static int *counts, *tmpcounts;  // counts: nBatch*highSize cumulated bucket offsets within each batch; tmpcounts: one highSize scratch copy per thread
 
 // for gmedian
 static int maxgrpn = 0;
@@ -43,13 +50,27 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   else error("irowsArg is neither an integer vector nor NULL");
   ngrp = LENGTH(l);
   if (LENGTH(f) != ngrp) error("length(f)=%d != length(l)=%d", LENGTH(f), ngrp);
-  grpn=0;
+  nrow=0;
   grpsize = INTEGER(l);
-  for (int i=0; i<ngrp; i++) grpn+=grpsize[i];
-  if (LENGTH(o) && LENGTH(o)!=grpn) error("o has length %d but sum(l)=%d", LENGTH(o), grpn);
+  maxgrpn = 0;
+  for (int i=0; i<ngrp; i++) {
+    nrow+=grpsize[i];
+    if (grpsize[i]>maxgrpn) maxgrpn = grpsize[i];  // old comment to be checked: 'needed for #2046 and #2111 when maxgrpn attribute is not attached to empty o'
+  }
+  if (LENGTH(o) && LENGTH(o)!=nrow) error("o has length %d but sum(l)=%d", LENGTH(o), nrow);
+  {
+    SEXP tt = getAttrib(o, install("maxgrpn"));
+    if (length(tt)==1 && INTEGER(tt)[0]!=maxgrpn) error("Internal error: o's maxgrpn attribute mismatches recalculated maxgrpn"); // # nocov
+  }
+
+  int nbit=0;
+  { int tt=ngrp-1; while (tt) { nbit++; tt>>=1; } }  // i.e. floor(log2(ngrp-1))+1
+  shift = nbit/2;
+  mask = (1<<shift)-1;
+  highSize = ((ngrp-1)>>shift) + 1;
 
-  grp = (int *)R_alloc(grpn, sizeof(int));
-  // global grp because the g* functions (inside jsub) share this common memory
+  grp = (int *)R_alloc(nrow, sizeof(int));   // TODO: use malloc and made this local as not needed globally when all functions here use gather
+                                             // maybe better to malloc to avoid R's heap. This grp isn't global, so it doesn't need to be R_alloc
 
   const int *restrict fdp = INTEGER(f);
   if (LENGTH(o)) {
@@ -67,16 +88,47 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
       for (int j=0; j<grpsize[g]; j++)  elem[j] = g;
     }
   }
-  SEXP tt = getAttrib(o, install("maxgrpn"));
-  if (length(tt)!=1) {
-    // seems to happen, and finding the maxgrpn is needed, but not sure why (TODO - trace)
-    // old comment to be checked: 'needed for #2046 and #2111 when maxgrpn attribute is not attached to empty o'
-    maxgrpn = 0;
-    for (int g=0; g<ngrp; g++) if (grpsize[g]>maxgrpn) maxgrpn = grpsize[g];
-  } else {
-    // && INTEGER(tt)[0]!=maxgrpn) error("Internal error: o's maxgrpn mismatches recalculated maxgrpn"); // # nocov
-    maxgrpn = INTEGER(tt)[0];
+
+  high = (uint16_t *)R_alloc(nrow, sizeof(uint16_t));  // maybe better to malloc to avoid R's heap, but safer to R_alloc since it's done via eval()
+  low  = (uint16_t *)R_alloc(nrow, sizeof(uint16_t));
+  // global ghigh and glow because the g* functions (inside jsub) share this common memory
+
+  gx = (char *)R_alloc(nrow, sizeof(double));  // enough for a copy of one column (or length(irows) if supplied)
+
+  nBatch = getDTthreads()*2;  // 2 to reduce last-thread-home. TODO: experiment. The higher this is though, the bigger is counts[]
+  batchSize = (nrow-1)/nBatch + 1;
+  lastBatchSize = nrow - (nBatch-1)*batchSize;
+  counts = (int *)S_alloc(nBatch*highSize, sizeof(int));  // (S_ zeros) TODO: cache-line align and make highSize a multiple of 64
+  tmpcounts = (int *)R_alloc(getDTthreads()*highSize, sizeof(int));
+
+  const int *restrict gp = grp;
+  #pragma omp parallel for num_threads(getDTthreads())   // schedule(dynamic,1)
+  for (int b=0; b<nBatch; b++) {
+    int *restrict my_counts = counts + b*highSize;
+    uint16_t *restrict my_high = high + b*batchSize;
+    const int *my_pg = gp + b*batchSize;
+    const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
+    for (int i=0; i<howMany; i++) {
+      const int w = my_pg[i] >> shift;
+      my_counts[w]++;
+      my_high[i] = (uint16_t)w;  // reduce 4 bytes to 2
+    }
+    for (int i=0, cum=0; i<highSize; i++) {
+      int tmp = my_counts[i];
+      my_counts[i] = cum;
+      cum += tmp;
+    }
+    uint16_t *restrict my_low = low + b*batchSize;
+    int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
+    memcpy(my_tmpcounts, my_counts, highSize*sizeof(int));
+    for (int i=0; i<howMany; i++) {
+      const int w = my_pg[i] >> shift;   // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too
+      my_low[my_tmpcounts[w]++] = (uint16_t)(my_pg[i] & mask);
+    }
+    // counts is now cumulated within batch (with ending values) and we leave it that way
+    // memcpy(counts + b*256, myCounts, 256*sizeof(int));  // save cumulate for later, first bucket contains position of next. For ease later in the very last batch.
   }
+
   oo = INTEGER(o);
   ff = INTEGER(f);
 
@@ -88,13 +140,49 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     SET_VECTOR_ELT(ans, 0, tt);
     UNPROTECT(1);
   }
-  ngrp = 0; maxgrpn = 0; irowslen = -1; isunsorted = 0;
+  ngrp = 0; maxgrpn=0; irowslen = -1; isunsorted = 0;
 
   // Rprintf("gforce took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
   UNPROTECT(1);
   return(ans);
 }
 
+void *gather(void *x, size_t size, bool *anyNA)
+{
+  if (size==4) {
+    const int *thisx = x;
+    //int *restrict thisgx = gx;
+    #pragma omp parallel for num_threads(getDTthreads())
+    for (int b=0; b<nBatch; b++) {   // each batch copies its rows of x into its own slice of gx, grouped by high bucket; the result is returned as gx
+      int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;   // per-thread scratch so the offsets saved in counts are left unmodified
+      memcpy(my_tmpcounts, counts + b*highSize, highSize*sizeof(int));   // start from this batch's cumulated bucket offsets (computed in gforce)
+      int *restrict my_gx = (int *)gx + b*batchSize;
+      const uint16_t *my_high = high + b*batchSize;
+      const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
+      bool my_anyNA = false;
+      if (irowslen==-1) {   // no subset in i: this batch reads its rows of x directly
+        const int *my_x = thisx + b*batchSize;
+        for (int i=0; i<howMany; i++) {
+          const int elem = my_x[i];
+          my_gx[ my_tmpcounts[my_high[i]]++ ] = elem;   // append elem to the bucket of row i's high value
+          if (elem==NA_INTEGER) my_anyNA = true;
+        }
+      } else {   // subset in i: irows supplies the row numbers of x to read
+        const int *my_x = irows + b*batchSize;
+        for (int i=0; i<howMany; i++) {
+          int elem = thisx[ my_x[i]-1 ];   // -1 because the row numbers in irows are 1-based
+          my_gx[ my_tmpcounts[my_high[i]]++ ] = elem;   // append elem to the bucket of row i's high value
+          if (elem==NA_INTEGER) my_anyNA = true;
+        }
+      }
+      if (my_anyNA) *anyNA = true;  // naked write ok since just bool and always writing true; and no performance issue as maximum nBatch writes
+    }
+  } else {
+    error("gather not yet implemented for size!=4");
+  }
+  return gx;
+}
+
 // long double usage here results in test 648 being failed when running with valgrind
 // http://valgrind.org/docs/manual/manual-core.html#manual-core.limits
 SEXP gsum(SEXP x, SEXP narmArg)
@@ -105,47 +193,66 @@ SEXP gsum(SEXP x, SEXP narmArg)
   if (inherits(x, "factor")) error("sum is not meaningful for factors.");
   const int n = (irowslen == -1) ? length(x) : irowslen;
   //clock_t start = clock();
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gsum", nrow, n);
+  long double *ldsum = calloc(ngrp, sizeof(long double));
+  if (!ldsum) error("Unable to allocate %d * %d bytes for gsum", ngrp, sizeof(long double));
+  bool anyNA=false;
   SEXP ans;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gsum", grpn, n);
-  long double *s = calloc(ngrp, sizeof(long double));
-  if (!s) error("Unable to allocate %d * %d bytes for gsum", ngrp, sizeof(long double));
   switch(TYPEOF(x)) {
   case LGLSXP: case INTSXP: {
-    int *xd = INTEGER(x);
-    if (irowslen==-1) {
-      for (int i=0, *g=grp; i<n; i++) {
-        if (*xd==NA_INTEGER) {
-          if (!narm) s[*g] = NA_REAL;  // Let NA_REAL propogate from here (this is gforce, so no break here). R_NaReal is IEEE
-          g++; xd++;
-          continue;
-        }
-        s[*g++] += *xd++;     // no under/overflow here, s is long double (like base)
-      }
-    } else {
-      for (int i=0, *g=grp; i<n; i++) {
-        int elem = xd[irows[i]-1];
-        if (elem==NA_INTEGER) {
-          if (!narm) s[*g] = NA_REAL;
-          g++;
-          continue;
+    // int *xd = INTEGER(x);
+    const int *restrict gx = gather(INTEGER(x), sizeof(int), &anyNA);  // TODO: could return anyNA too
+    #pragma omp parallel for num_threads(getDTthreads())  // schedule(dynamic,1)
+    for (int h=0; h<highSize; h++) {   // very important that high is first loop here
+      long double *restrict _ans = ldsum + (h<<shift);
+      for (int b=0; b<nBatch; b++) {
+        const int pos = counts[ b*highSize + h ];
+        const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
+        const int *my_gx = gx + b*batchSize + pos;
+        const uint16_t *my_low = low + b*batchSize + pos;
+        if (!anyNA) {   // TODO: take out before prallel loop, and repeat PARLOOP using macro, for completness just in case (e.g. K=2).
+          for (int i=0; i<howMany; i++) {
+            _ans[my_low[i]] += my_gx[i];  // naked by design; each thread does all of each h for all batches
+          }
+        } else {
+          for (int i=0; i<howMany; i++) {
+            int elem = my_gx[i];
+            if (elem==NA_INTEGER) {
+              if (!narm) _ans[my_low[i]] = NA_REAL;
+            } else {
+              _ans[my_low[i]] += elem;
+            }
+          }
         }
-        s[*g++] += elem;
       }
     }
-    ans = PROTECT(allocVector(INTSXP, ngrp));
-    xd = INTEGER(ans);
+    bool stop = false;
+    #pragma omp parallel for num_threads(getDTthreads())
     for (int i=0; i<ngrp; i++) {
-      if (s[i] > INT_MAX || s[i] < INT_MIN) {
-        warning("Group %d summed to more than type 'integer' can hold so the result has been coerced to 'numeric' automatically, for convenience.", i+1);
-        UNPROTECT(1);
-        ans = PROTECT(allocVector(REALSXP, ngrp));
-        double *tt = REAL(ans);
-        for (i=0; i<ngrp; i++) tt[i] = (double)s[i];
-        break;
-      } else if (ISNA(s[i])) {
-        xd[i] = NA_INTEGER;
+      if (stop) continue;
+      if (ldsum[i]>INT_MAX || ldsum[i]<INT_MIN) stop=true;
+    }
+    if (stop) {
+      warning("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience.");
+      ans = PROTECT(allocVector(REALSXP, ngrp));
+      double *restrict ansp = REAL(ans);
+      #pragma omp parallel for num_threads(getDTthreads())
+      for (int i=0; i<ngrp; i++) {
+        ansp[i] = (double)ldsum[i];
+      }
+    } else {
+      ans = PROTECT(allocVector(INTSXP, ngrp));
+      int *restrict ansp = INTEGER(ans);
+      if (anyNA) {
+        #pragma omp parallel for num_threads(getDTthreads())
+        for (int i=0; i<ngrp; i++) {
+          ansp[i] = ISNA(ldsum[i]) ? NA_INTEGER : (int)ldsum[i];
+        }
       } else {
-        xd[i] = (int)s[i];
+        #pragma omp parallel for num_threads(getDTthreads())
+        for (int i=0; i<ngrp; i++) {
+          ansp[i] = (int)ldsum[i];
+        }
       }
     }
   } break;
@@ -154,28 +261,28 @@ SEXP gsum(SEXP x, SEXP narmArg)
     if (irowslen==-1) {
       for (int i=0, *g=grp; i<n; i++) {
         if (narm && ISNAN(*xd)) {g++; xd++; continue;}   // narm first and leave to branch prediction
-        s[*g++] += *xd++;                                // accumulate in long-double like base. Let NA propogate when !narm
+        ldsum[*g++] += *xd++;                            // accumulate in long-double like base. Let NA propogate when !narm
       }
     } else {
       for (int i=0, *g=grp; i<n; i++) {
         double elem = xd[irows[i]-1];
         if (narm && ISNAN(elem)) {g++; continue;}
-        s[*g++] += elem;
+        ldsum[*g++] += elem;
       }
     }
     ans = PROTECT(allocVector(REALSXP, ngrp));
     xd = REAL(ans);
     for (int i=0; i<ngrp; i++) {
-      if (s[i] > DBL_MAX) xd[i] = R_PosInf;
-      else if (s[i] < -DBL_MAX) xd[i] = R_NegInf;
-      else xd[i] = (double)s[i];
+      if (ldsum[i] > DBL_MAX) xd[i] = R_PosInf;
+      else if (ldsum[i] < -DBL_MAX) xd[i] = R_NegInf;
+      else xd[i] = (double)ldsum[i];
     }
   } break;
   default:
-    free(s);
+    free(ldsum);
     error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
   }
-  free(s);
+  free(ldsum);
   copyMostAttrib(x, ans);
   UNPROTECT(1);
   // Rprintf("this gsum took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
@@ -207,7 +314,7 @@ SEXP gmean(SEXP x, SEXP narm)
   }
   // na.rm=TRUE.  Similar to gsum, but we need to count the non-NA as well for the divisor
   const int n = (irowslen == -1) ? length(x) : irowslen;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gsum", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gsum", nrow, n);
 
   long double *s = calloc(ngrp, sizeof(long double));
   if (!s) error("Unable to allocate %d * %d bytes for sum in gmean na.rm=TRUE", ngrp, sizeof(long double));
@@ -263,7 +370,7 @@ SEXP gmin(SEXP x, SEXP narm)
   int n = (irowslen == -1) ? length(x) : irowslen;
   //clock_t start = clock();
   SEXP ans;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gmin", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gmin", nrow, n);
   int protecti=0;
   switch(TYPEOF(x)) {
   case LGLSXP: case INTSXP:
@@ -379,7 +486,7 @@ SEXP gmax(SEXP x, SEXP narm)
   int n = (irowslen == -1) ? length(x) : irowslen;
   //clock_t start = clock();
   SEXP ans;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gmax", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gmax", nrow, n);
 
   // TODO rework gmax in the same way as gmin and remove this *update
   char *update = (char *)R_alloc(ngrp, sizeof(char));
@@ -524,7 +631,7 @@ SEXP gmedian(SEXP x, SEXP narm) {
   SEXP ans, sub, klass;
   void *ptr;
   int n = (irowslen == -1) ? length(x) : irowslen;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gmedian", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gmedian", nrow, n);
   switch(TYPEOF(x)) {
   case REALSXP:
     klass = getAttrib(x, R_ClassSymbol);
@@ -693,7 +800,7 @@ SEXP glast(SEXP x) {
   R_len_t i,k;
   int n = (irowslen == -1) ? length(x) : irowslen;
   SEXP ans;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gtail", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gtail", nrow, n);
   switch(TYPEOF(x)) {
   case LGLSXP:
     ans = PROTECT(allocVector(LGLSXP, ngrp));
@@ -755,7 +862,7 @@ SEXP gfirst(SEXP x) {
   R_len_t i,k;
   int n = (irowslen == -1) ? length(x) : irowslen;
   SEXP ans;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in ghead", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in ghead", nrow, n);
   switch(TYPEOF(x)) {
   case LGLSXP:
     ans = PROTECT(allocVector(LGLSXP, ngrp));
@@ -826,7 +933,7 @@ SEXP gnthvalue(SEXP x, SEXP valArg) {
   R_len_t i,k, val=INTEGER(valArg)[0];
   int n = (irowslen == -1) ? length(x) : irowslen;
   SEXP ans;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in ghead", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in ghead", nrow, n);
   switch(TYPEOF(x)) {
   case LGLSXP:
     ans = PROTECT(allocVector(LGLSXP, ngrp));
@@ -895,7 +1002,7 @@ SEXP gvarsd1(SEXP x, SEXP narm, Rboolean isSD)
   if (inherits(x, "factor")) error("var/sd is not meaningful for factors.");
   long double m, s, v;
   R_len_t i, j, ix, thisgrpsize = 0, n = (irowslen == -1) ? length(x) : irowslen;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gvar", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gvar", nrow, n);
   SEXP sub, ans = PROTECT(allocVector(REALSXP, ngrp));
   Rboolean ans_na;
   switch(TYPEOF(x)) {
@@ -1037,7 +1144,7 @@ SEXP gprod(SEXP x, SEXP narm)
   int n = (irowslen == -1) ? length(x) : irowslen;
   //clock_t start = clock();
   SEXP ans;
-  if (grpn != n) error("grpn [%d] != length(x) [%d] in gprod", grpn, n);
+  if (nrow != n) error("nrow [%d] != length(x) [%d] in gprod", nrow, n);
   long double *s = malloc(ngrp * sizeof(long double));
   if (!s) error("Unable to allocate %d * %d bytes for gprod", ngrp, sizeof(long double));
   for (i=0; i<ngrp; i++) s[i] = 1.0;

From daada95976fa74cd9715eacd50196b70e66f3794 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Sat, 8 Dec 2018 15:15:01 -0800
Subject: [PATCH 03/10] interim

---
 src/gsumm.c | 58 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index f0d33cce4d..d2218dd32c 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -68,6 +68,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   shift = nbit/2;
   mask = (1<<shift)-1;
   highSize = ((ngrp-1)>>shift) + 1;
+  Rprintf("ngrp=%d nbit=%d shift=%d highSize=%d\n", ngrp, nbit, shift, highSize);
 
   grp = (int *)R_alloc(nrow, sizeof(int));   // TODO: use malloc and made this local as not needed globally when all functions here use gather
                                              // maybe better to malloc to avoid R's heap. This grp isn't global, so it doesn't need to be R_alloc
@@ -194,17 +195,21 @@ SEXP gsum(SEXP x, SEXP narmArg)
   const int n = (irowslen == -1) ? length(x) : irowslen;
   //clock_t start = clock();
   if (nrow != n) error("nrow [%d] != length(x) [%d] in gsum", nrow, n);
-  long double *ldsum = calloc(ngrp, sizeof(long double));
-  if (!ldsum) error("Unable to allocate %d * %d bytes for gsum", ngrp, sizeof(long double));
   bool anyNA=false;
   SEXP ans;
   switch(TYPEOF(x)) {
   case LGLSXP: case INTSXP: {
     // int *xd = INTEGER(x);
     const int *restrict gx = gather(INTEGER(x), sizeof(int), &anyNA);  // TODO: could return anyNA too
-    #pragma omp parallel for num_threads(getDTthreads())  // schedule(dynamic,1)
+    ans = PROTECT(allocVector(INTSXP, ngrp));
+    int *restrict ansp = INTEGER(ans);
+    memset(ansp, 0, ngrp*sizeof(int));
+    //int64_t *i64sum = calloc(ngrp, sizeof(int64_t));
+    //if (!i64sum) error("Unable to allocate %d * %d bytes for gsum i64", ngrp, sizeof(int64_t));
+    bool overflow=false;
+    #pragma omp parallel for num_threads(getDTthreads()) schedule(dynamic,1)
     for (int h=0; h<highSize; h++) {   // very important that high is first loop here
-      long double *restrict _ans = ldsum + (h<<shift);
+      int *restrict _ans = ansp + (h<<shift);
       for (int b=0; b<nBatch; b++) {
         const int pos = counts[ b*highSize + h ];
         const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
@@ -212,33 +217,40 @@ SEXP gsum(SEXP x, SEXP narmArg)
         const uint16_t *my_low = low + b*batchSize + pos;
         if (!anyNA) {   // TODO: take out before prallel loop, and repeat PARLOOP using macro, for completness just in case (e.g. K=2).
           for (int i=0; i<howMany; i++) {
-            _ans[my_low[i]] += my_gx[i];  // naked by design; each thread does all of each h for all batches
+            const int a = _ans[my_low[i]];
+            const int b = my_gx[i];
+            if ((a>0 && b>INT_MAX-a) || (a<0 && b<NA_INTEGER+1-a)) overflow=true;
+            else _ans[my_low[i]] += b;  // naked by design; each thread does all of each h for all batches
           }
         } else {
           for (int i=0; i<howMany; i++) {
-            int elem = my_gx[i];
-            if (elem==NA_INTEGER) {
-              if (!narm) _ans[my_low[i]] = NA_REAL;
-            } else {
-              _ans[my_low[i]] += elem;
+            const int a = _ans[my_low[i]];
+            if (a==NA_INTEGER) continue;
+            const int b = my_gx[i];
+            if (b==NA_INTEGER) {
+              if (!narm) _ans[my_low[i]]=NA_INTEGER;
+              continue;
             }
+            if ((a>0 && b>INT_MAX-a) || (a<0 && b<NA_INTEGER+1-a)) overflow=true;
+            else _ans[my_low[i]] += b;  // naked by design; each thread does all of each h for all batches
           }
         }
       }
     }
-    bool stop = false;
-    #pragma omp parallel for num_threads(getDTthreads())
-    for (int i=0; i<ngrp; i++) {
-      if (stop) continue;
-      if (ldsum[i]>INT_MAX || ldsum[i]<INT_MIN) stop=true;
-    }
-    if (stop) {
+    if (overflow) error("overflow summing integer not yet auto-coerce");
+/*    bool stop = false;
+      #pragma omp parallel for num_threads(getDTthreads())
+      for (int i=0; i<ngrp; i++) {
+        if (stop) continue;
+        if (i64sum[i]>INT32_MAX || (i64sum[i]<=NA_INTEGER && i64sum[i]!=INT64_MIN)) stop=true;
+      }
+      if (stop) {
       warning("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience.");
       ans = PROTECT(allocVector(REALSXP, ngrp));
       double *restrict ansp = REAL(ans);
       #pragma omp parallel for num_threads(getDTthreads())
       for (int i=0; i<ngrp; i++) {
-        ansp[i] = (double)ldsum[i];
+        ansp[i] = i64sum[i]==INT64_MIN ? NA_REAL : (double)i64sum[i];
       }
     } else {
       ans = PROTECT(allocVector(INTSXP, ngrp));
@@ -246,17 +258,20 @@ SEXP gsum(SEXP x, SEXP narmArg)
       if (anyNA) {
         #pragma omp parallel for num_threads(getDTthreads())
         for (int i=0; i<ngrp; i++) {
-          ansp[i] = ISNA(ldsum[i]) ? NA_INTEGER : (int)ldsum[i];
+          ansp[i] = i64sum[i]==INT64_MIN ? NA_INTEGER : (int)i64sum[i];
         }
       } else {
         #pragma omp parallel for num_threads(getDTthreads())
         for (int i=0; i<ngrp; i++) {
-          ansp[i] = (int)ldsum[i];
+          ansp[i] = (int)i64sum[i];
         }
       }
     }
+    free(i64sum);*/
   } break;
   case REALSXP: {
+    long double *ldsum = calloc(ngrp, sizeof(long double));
+    if (!ldsum) error("Unable to allocate %d * %d bytes for gsum ld", ngrp, sizeof(long double));
     double *xd = REAL(x);                                // now-slower R API with altrep, outside
     if (irowslen==-1) {
       for (int i=0, *g=grp; i<n; i++) {
@@ -277,12 +292,11 @@ SEXP gsum(SEXP x, SEXP narmArg)
       else if (ldsum[i] < -DBL_MAX) xd[i] = R_NegInf;
       else xd[i] = (double)ldsum[i];
     }
+    free(ldsum);
   } break;
   default:
-    free(ldsum);
     error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));
   }
-  free(ldsum);
   copyMostAttrib(x, ans);
   UNPROTECT(1);
   // Rprintf("this gsum took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);

From 3e1af9ce0c4610853b0478e3991df4b4b3ebbcab Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Mon, 10 Dec 2018 18:51:44 -0800
Subject: [PATCH 04/10] interim

---
 src/gsumm.c | 109 +++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 90 insertions(+), 19 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index d2218dd32c..90dfedaffc 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -32,8 +32,18 @@ static union {
 # define SQRTL sqrt
 #endif
 
+static int nbit(int n)
+{
+  // returns position of biggest bit; i.e. floor(log2(n))+1 without using fpa; returns 0 when n<=0
+  // loops while n>0 (not n!=0) so that a negative n, e.g. ngrp-1 when ngrp==0, cannot spin forever under an arithmetic right shift. Not needed to be fast.
+  int nb=0;
+  while (n>0) { nb++; n>>=1; }
+  return nb;
+}
+
 SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   // clock_t start = clock();
+  double started = wallclock();
   if (TYPEOF(env) != ENVSXP) error("env is not an environment");
   // The type of jsub is pretty flexbile in R, so leave checking to eval() below.
   if (!isInteger(o)) error("o is not an integer vector");
@@ -63,31 +73,87 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     if (length(tt)==1 && INTEGER(tt)[0]!=maxgrpn) error("Internal error: o's maxgrpn attribute mismatches recalculated maxgrpn"); // # nocov
   }
 
-  int nbit=0;
-  { int tt=ngrp-1; while (tt) { nbit++; tt>>=1; } }  // i.e. floor(log2(ngrp-1))+1
-  shift = nbit/2;
+  int nb = nbit(ngrp-1);
+  //shift = nb/2;
+  shift = MAX(nb-8,0);
   mask = (1<<shift)-1;
   highSize = ((ngrp-1)>>shift) + 1;
-  Rprintf("ngrp=%d nbit=%d shift=%d highSize=%d\n", ngrp, nbit, shift, highSize);
 
   grp = (int *)R_alloc(nrow, sizeof(int));   // TODO: use malloc and made this local as not needed globally when all functions here use gather
                                              // maybe better to malloc to avoid R's heap. This grp isn't global, so it doesn't need to be R_alloc
-
   const int *restrict fdp = INTEGER(f);
+
+  nBatch = MIN((nrow+1)/2, getDTthreads()*2);  // 2 to reduce last-thread-home. TODO: experiment. The higher this is though, the bigger is counts[]
+  batchSize = (nrow-1)/nBatch + 1;
+  lastBatchSize = nrow - (nBatch-1)*batchSize;
+
+  Rprintf("ngrp=%d  nbit=%d  shift=%d  highSize=%d  nBatch=%d  batchSize=%d  lastBatchSize=%d\n", ngrp, nb, shift, highSize, nBatch, batchSize, lastBatchSize);
+
+  // initial population of g:
+  #pragma omp parallel for num_threads(getDTthreads())
+  for (int g=0; g<ngrp; g++) {
+    int *elem = grp + fdp[g]-1;
+    for (int j=0; j<grpsize[g]; j++)  elem[j] = g;
+  }
+  Rprintf("gforce initial population of grp took %.3f\n", wallclock()-started); started=wallclock();
   if (LENGTH(o)) {
     isunsorted = 1; // for gmedian
-    const int *restrict odp = INTEGER(o);
-    #pragma omp parallel for num_threads(getDTthreads())
-    for (int g=0; g<ngrp; g++) {
-      const int *elem = odp + fdp[g]-1;
-      for (int j=0; j<grpsize[g]; j++)  grp[ elem[j]-1 ] = g;
+
+    const int *restrict op = INTEGER(o);  // o is a permutation of 1:nrow
+    // What follows is more cache-efficient version of this scattered assign :
+    // for (int g=0; g<ngrp; g++) {
+    //  const int *elem = odp + fdp[g]-1;
+    //  for (int j=0; j<grpsize[g]; j++)  grp[ elem[j]-1 ] = g;
+    //}
+    int nb = nbit(nrow-1);
+    int shift = MAX(nb-8, 0);
+    int highSize = ((nrow-1)>>shift) + 1;
+    Rprintf("When assigning grp[o] = g, highSize=%d  nb=%d  shift=%d  nBatch=%d\n", highSize, nb, shift, nBatch);
+    int *counts = calloc(nBatch*highSize, sizeof(int));  // (S_ zeros) TODO: cache-line align and make highSize a multiple of 64.  This +1 is for easier diff later
+    int *tmpO   = malloc(nrow*sizeof(int));
+    int *tmpG   = malloc(nrow*sizeof(int));
+    if (!counts || !tmpO || !tmpG) error("Internal error: Failed to allocate counts, tmpO or tmpG when assigning g in gforce");
+    #pragma omp parallel for num_threads(getDTthreads())   // schedule(dynamic,1)
+    for (int b=0; b<nBatch; b++) {
+      const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
+      const int *my_o = op + b*batchSize;
+      int *restrict my_counts = counts + b*highSize;
+      for (int i=0; i<howMany; i++) {
+        const int w = (my_o[i]-1) >> shift;
+        my_counts[w]++;
+      }
+      for (int i=0, cum=0; i<highSize; i++) {
+        int tmp = my_counts[i];
+        my_counts[i] = cum;
+        cum += tmp;
+      }
+      const int *restrict my_g = grp + b*batchSize;
+      int *restrict my_tmpO = tmpO + b*batchSize;
+      int *restrict my_tmpG = tmpG + b*batchSize;
+      for (int i=0; i<howMany; i++) {
+        const int w = (my_o[i]-1) >> shift;   // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too
+        const int p = my_counts[w]++;
+        my_tmpO[p] = (int)(my_o[i]-1);
+        my_tmpG[p] = (int)(my_g[i]);
+      }
     }
-  } else {
+    Rprintf("gforce assign tmpO and tmpG took %.3f\n", wallclock()-started); started=wallclock();
     #pragma omp parallel for num_threads(getDTthreads())
-    for (int g=0; g<ngrp; g++) {
-      int *elem = grp + fdp[g]-1;
-      for (int j=0; j<grpsize[g]; j++)  elem[j] = g;
+    for (int h=0; h<highSize; h++) {  // very important that high is first loop here
+      for (int b=0; b<nBatch; b++) {
+        const int start = h==0 ? 0 : counts[ b*highSize + h - 1 ];
+        const int end   = counts[ b*highSize + h ];
+        const int *restrict my_tmpO = tmpO + b*batchSize;
+        const int *restrict my_tmpG = tmpG + b*batchSize;
+        for (int k=start; k<end; k++) {
+          grp[ my_tmpO[k] ] = my_tmpG[k];  // TODO: could write high here, and initial low.   ** If so, same in initial population when o is missing **
+        }
+      }
     }
+    free(counts);
+    free(tmpO);
+    free(tmpG);
+    Rprintf("gforce assign tmpO and tmpG back to grp took %.3f\n", wallclock()-started); started=wallclock();
   }
 
   high = (uint16_t *)R_alloc(nrow, sizeof(uint16_t));  // maybe better to malloc to avoid R's heap, but safer to R_alloc since it's done via eval()
@@ -96,9 +162,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
 
   gx = (char *)R_alloc(nrow, sizeof(double));  // enough for a copy of one column (or length(irows) if supplied)
 
-  nBatch = getDTthreads()*2;  // 2 to reduce last-thread-home. TODO: experiment. The higher this is though, the bigger is counts[]
-  batchSize = (nrow-1)/nBatch + 1;
-  lastBatchSize = nrow - (nBatch-1)*batchSize;
+
   counts = (int *)S_alloc(nBatch*highSize, sizeof(int));  // (S_ zeros) TODO: cache-line align and make highSize a multiple of 64
   tmpcounts = (int *)R_alloc(getDTthreads()*highSize, sizeof(int));
 
@@ -129,11 +193,14 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     // counts is now cumulated within batch (with ending values) and we leave it that way
     // memcpy(counts + b*256, myCounts, 256*sizeof(int));  // save cumulate for later, first bucket contains position of next. For ease later in the very last batch.
   }
+  Rprintf("gforce assign high and low took %.3f\n", wallclock()-started); started=wallclock();
 
   oo = INTEGER(o);
   ff = INTEGER(f);
+  Rprintf("gforce two INTEGERs took %.3f\n", wallclock()-started); started=wallclock();
 
   SEXP ans = PROTECT( eval(jsub, env) );
+  Rprintf("gforce eval took %.3f\n", wallclock()-started);
   // if this eval() fails with R error, R will release grp for us. Which is why we use R_alloc above.
   if (isVectorAtomic(ans)) {
     SEXP tt = ans;
@@ -143,13 +210,14 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   }
   ngrp = 0; maxgrpn=0; irowslen = -1; isunsorted = 0;
 
-  // Rprintf("gforce took %8.3f\n", 1.0*(clock()-start)/CLOCKS_PER_SEC);
+
   UNPROTECT(1);
   return(ans);
 }
 
 void *gather(void *x, size_t size, bool *anyNA)
 {
+  double started = wallclock();
   if (size==4) {
     const int *thisx = x;
     //int *restrict thisgx = gx;
@@ -181,6 +249,7 @@ void *gather(void *x, size_t size, bool *anyNA)
   } else {
     error("gather not yet implemented for size!=4");
   }
+  Rprintf("gather took %.3fs\n", wallclock()-started);
   return gx;
 }
 
@@ -207,7 +276,8 @@ SEXP gsum(SEXP x, SEXP narmArg)
     //int64_t *i64sum = calloc(ngrp, sizeof(int64_t));
     //if (!i64sum) error("Unable to allocate %d * %d bytes for gsum i64", ngrp, sizeof(int64_t));
     bool overflow=false;
-    #pragma omp parallel for num_threads(getDTthreads()) schedule(dynamic,1)
+    double started = wallclock();
+    #pragma omp parallel for num_threads(getDTthreads()) //schedule(dynamic,1)
     for (int h=0; h<highSize; h++) {   // very important that high is first loop here
       int *restrict _ans = ansp + (h<<shift);
       for (int b=0; b<nBatch; b++) {
@@ -237,6 +307,7 @@ SEXP gsum(SEXP x, SEXP narmArg)
         }
       }
     }
+    Rprintf("gsum int took %.3f\n", wallclock()-started);
     if (overflow) error("overflow summing integer not yet auto-coerce");
 /*    bool stop = false;
       #pragma omp parallel for num_threads(getDTthreads())

From 4c472a6879dde356f40c51f2a1fc5ddb90de8b0c Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Mon, 10 Dec 2018 22:13:57 -0800
Subject: [PATCH 05/10] assigning g from each batch didn't work quite as well

---
 src/gsumm.c | 98 +++++++++++++++++++++++++++++------------------------
 1 file changed, 53 insertions(+), 45 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index 90dfedaffc..42cffc79ca 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -105,54 +105,62 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     //  const int *elem = odp + fdp[g]-1;
     //  for (int j=0; j<grpsize[g]; j++)  grp[ elem[j]-1 ] = g;
     //}
-    int nb = nbit(nrow-1);
-    int shift = MAX(nb-8, 0);
-    int highSize = ((nrow-1)>>shift) + 1;
-    Rprintf("When assigning grp[o] = g, highSize=%d  nb=%d  shift=%d  nBatch=%d\n", highSize, nb, shift, nBatch);
-    int *counts = calloc(nBatch*highSize, sizeof(int));  // (S_ zeros) TODO: cache-line align and make highSize a multiple of 64.  This +1 is for easier diff later
-    int *tmpO   = malloc(nrow*sizeof(int));
-    int *tmpG   = malloc(nrow*sizeof(int));
-    if (!counts || !tmpO || !tmpG) error("Internal error: Failed to allocate counts, tmpO or tmpG when assigning g in gforce");
-    #pragma omp parallel for num_threads(getDTthreads())   // schedule(dynamic,1)
-    for (int b=0; b<nBatch; b++) {
-      const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
-      const int *my_o = op + b*batchSize;
-      int *restrict my_counts = counts + b*highSize;
-      for (int i=0; i<howMany; i++) {
-        const int w = (my_o[i]-1) >> shift;
-        my_counts[w]++;
-      }
-      for (int i=0, cum=0; i<highSize; i++) {
-        int tmp = my_counts[i];
-        my_counts[i] = cum;
-        cum += tmp;
-      }
-      const int *restrict my_g = grp + b*batchSize;
-      int *restrict my_tmpO = tmpO + b*batchSize;
-      int *restrict my_tmpG = tmpG + b*batchSize;
-      for (int i=0; i<howMany; i++) {
-        const int w = (my_o[i]-1) >> shift;   // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too
-        const int p = my_counts[w]++;
-        my_tmpO[p] = (int)(my_o[i]-1);
-        my_tmpG[p] = (int)(my_g[i]);
-      }
-    }
-    Rprintf("gforce assign tmpO and tmpG took %.3f\n", wallclock()-started); started=wallclock();
-    #pragma omp parallel for num_threads(getDTthreads())
-    for (int h=0; h<highSize; h++) {  // very important that high is first loop here
-      for (int b=0; b<nBatch; b++) {
-        const int start = h==0 ? 0 : counts[ b*highSize + h - 1 ];
-        const int end   = counts[ b*highSize + h ];
-        const int *restrict my_tmpO = tmpO + b*batchSize;
-        const int *restrict my_tmpG = tmpG + b*batchSize;
-        for (int k=start; k<end; k++) {
-          grp[ my_tmpO[k] ] = my_tmpG[k];  // TODO: could write high here, and initial low.   ** If so, same in initial population when o is missing **
+
+    int _nBatch =    nBatch;             // (nrow-1)/_batchSize + 1;
+
+    int _batchSize = (nrow-1)/_nBatch + 1;          // MIN(65535, nrow/2);
+    int _lastBatchSize = nrow - (_nBatch-1)*_batchSize;
+
+    int _nb = nbit(nrow-1);
+    int _shift = _nb/2;   //MAX(_nb-8, 0);   // TODO: try more than 8, and try _nb/2 again
+    int _highSize = ((nrow-1)>>_shift) + 1;
+    int _nth = MIN(_nBatch, getDTthreads());
+
+    int *_counts = malloc(_nth*_highSize*sizeof(int));  // TODO: cache-line align and make highSize a multiple of 64.  This +1 is for easier diff later
+    int *_tmpO   = malloc(_nth*_batchSize*sizeof(int));
+    int *_tmpG   = malloc(_nth*_batchSize*sizeof(int));
+    int *restrict _newG   = malloc(nrow*sizeof(int));
+    if (!_counts || !_tmpO || !_tmpG || !_newG) error("Internal error: Failed to allocate counts, tmpO, tmpG or newG when assigning g in gforce");
+    Rprintf("When assigning grp[o]=g, _highSize=%d  _nb=%d  _shift=%d  _nBatch=%d, _batchSize=%d  _lastBatchSize=%d  _nth=%d\n",
+            _highSize, _nb, _shift, _nBatch, _batchSize, _lastBatchSize, _nth);
+
+    #pragma omp parallel num_threads(_nth)      // TODO: could loop through g and avoid needing newG ?
+    {
+      const int me = omp_get_thread_num();
+      int *restrict my_tmpO = _tmpO + me*_batchSize;
+      int *restrict my_tmpG = _tmpG + me*_batchSize;
+      int *restrict my_counts = _counts + me*_highSize;
+      #pragma omp for  // schedule(dynamic,1)
+      for (int b=0; b<_nBatch; b++) {
+        memset(my_counts, 0, _highSize*sizeof(int));
+        const int howMany = b==_nBatch-1 ? _lastBatchSize : _batchSize;
+        const int *my_o = op + b*_batchSize;
+        const int *restrict my_g = grp + b*_batchSize;
+        for (int i=0; i<howMany; i++) {
+          const int w = (my_o[i]-1) >> _shift;
+          my_counts[w]++;
+        }
+        for (int i=0, cum=0; i<_highSize; i++) {
+          int tmp = my_counts[i];
+          my_counts[i] = cum;
+          cum += tmp;
+        }
+        for (int i=0; i<howMany; i++) {
+          const int w = (my_o[i]-1) >> _shift;   // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too
+          const int p = my_counts[w]++;
+          my_tmpO[p] = (int)(my_o[i]-1);
+          my_tmpG[p] = (int)(my_g[i]);
+        }
+        for (int i=0; i<howMany; i++) {
+          _newG[ my_tmpO[i] ] = my_tmpG[i];  // TODO: could write high here, and initial low.   ** If so, same in initial population when o is missing **
         }
       }
     }
-    free(counts);
-    free(tmpO);
-    free(tmpG);
+    memcpy(grp, _newG, nrow*sizeof(int));
+    free(_counts);
+    free(_tmpO);
+    free(_tmpG);
+    free(_newG);
     Rprintf("gforce assign tmpO and tmpG back to grp took %.3f\n", wallclock()-started); started=wallclock();
   }
 

From 9b9f1aa41d0cad7bfb6d57a74d60811b21920a0c Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Mon, 10 Dec 2018 23:00:28 -0800
Subject: [PATCH 06/10] (o,g) pairs better

---
 src/gsumm.c | 96 +++++++++++++++++++++++------------------------------
 1 file changed, 42 insertions(+), 54 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index 42cffc79ca..d7a313133e 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -105,63 +105,51 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     //  const int *elem = odp + fdp[g]-1;
     //  for (int j=0; j<grpsize[g]; j++)  grp[ elem[j]-1 ] = g;
     //}
-
-    int _nBatch =    nBatch;             // (nrow-1)/_batchSize + 1;
-
-    int _batchSize = (nrow-1)/_nBatch + 1;          // MIN(65535, nrow/2);
-    int _lastBatchSize = nrow - (_nBatch-1)*_batchSize;
-
-    int _nb = nbit(nrow-1);
-    int _shift = _nb/2;   //MAX(_nb-8, 0);   // TODO: try more than 8, and try _nb/2 again
-    int _highSize = ((nrow-1)>>_shift) + 1;
-    int _nth = MIN(_nBatch, getDTthreads());
-
-    int *_counts = malloc(_nth*_highSize*sizeof(int));  // TODO: cache-line align and make highSize a multiple of 64.  This +1 is for easier diff later
-    int *_tmpO   = malloc(_nth*_batchSize*sizeof(int));
-    int *_tmpG   = malloc(_nth*_batchSize*sizeof(int));
-    int *restrict _newG   = malloc(nrow*sizeof(int));
-    if (!_counts || !_tmpO || !_tmpG || !_newG) error("Internal error: Failed to allocate counts, tmpO, tmpG or newG when assigning g in gforce");
-    Rprintf("When assigning grp[o]=g, _highSize=%d  _nb=%d  _shift=%d  _nBatch=%d, _batchSize=%d  _lastBatchSize=%d  _nth=%d\n",
-            _highSize, _nb, _shift, _nBatch, _batchSize, _lastBatchSize, _nth);
-
-    #pragma omp parallel num_threads(_nth)      // TODO: could loop through g and avoid needing newG ?
-    {
-      const int me = omp_get_thread_num();
-      int *restrict my_tmpO = _tmpO + me*_batchSize;
-      int *restrict my_tmpG = _tmpG + me*_batchSize;
-      int *restrict my_counts = _counts + me*_highSize;
-      #pragma omp for  // schedule(dynamic,1)
-      for (int b=0; b<_nBatch; b++) {
-        memset(my_counts, 0, _highSize*sizeof(int));
-        const int howMany = b==_nBatch-1 ? _lastBatchSize : _batchSize;
-        const int *my_o = op + b*_batchSize;
-        const int *restrict my_g = grp + b*_batchSize;
-        for (int i=0; i<howMany; i++) {
-          const int w = (my_o[i]-1) >> _shift;
-          my_counts[w]++;
-        }
-        for (int i=0, cum=0; i<_highSize; i++) {
-          int tmp = my_counts[i];
-          my_counts[i] = cum;
-          cum += tmp;
-        }
-        for (int i=0; i<howMany; i++) {
-          const int w = (my_o[i]-1) >> _shift;   // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too
-          const int p = my_counts[w]++;
-          my_tmpO[p] = (int)(my_o[i]-1);
-          my_tmpG[p] = (int)(my_g[i]);
-        }
-        for (int i=0; i<howMany; i++) {
-          _newG[ my_tmpO[i] ] = my_tmpG[i];  // TODO: could write high here, and initial low.   ** If so, same in initial population when o is missing **
+    int nb = nbit(nrow-1);
+    int shift = MAX(nb-8, 0);
+    int highSize = ((nrow-1)>>shift) + 1;
+    Rprintf("When assigning grp[o] = g, highSize=%d  nb=%d  shift=%d  nBatch=%d\n", highSize, nb, shift, nBatch);
+    int *counts = calloc(nBatch*highSize, sizeof(int));  // (S_ zeros) TODO: cache-line align and make highSize a multiple of 64.  This +1 is for easier diff later
+    int *TMP   = malloc(nrow*2*sizeof(int));
+    if (!counts || !TMP ) error("Internal error: Failed to allocate counts, tmpO or tmpG when assigning g in gforce");
+    #pragma omp parallel for num_threads(getDTthreads())   // schedule(dynamic,1)
+    for (int b=0; b<nBatch; b++) {
+      const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
+      const int *my_o = op + b*batchSize;
+      int *restrict my_counts = counts + b*highSize;
+      for (int i=0; i<howMany; i++) {
+        const int w = (my_o[i]-1) >> shift;
+        my_counts[w]++;
+      }
+      for (int i=0, cum=0; i<highSize; i++) {
+        int tmp = my_counts[i];
+        my_counts[i] = cum;
+        cum += tmp;
+      }
+      const int *restrict my_g = grp + b*batchSize;
+      int *restrict my_tmp = TMP + b*2*batchSize;
+      for (int i=0; i<howMany; i++) {
+        const int w = (my_o[i]-1) >> shift;   // could use my_high but may as well use my_pg since we need my_pg anyway for the lower bits next too
+        int *p = my_tmp + 2*my_counts[w]++;
+        *p++ = my_o[i]-1;
+        *p   = my_g[i];
+      }
+    }
+    Rprintf("gforce assign TMP (o,g) pairs took %.3f\n", wallclock()-started); started=wallclock();
+    #pragma omp parallel for num_threads(getDTthreads())
+    for (int h=0; h<highSize; h++) {  // very important that high is first loop here
+      for (int b=0; b<nBatch; b++) {
+        const int start = h==0 ? 0 : counts[ b*highSize + h - 1 ];
+        const int end   = counts[ b*highSize + h ];
+        const int *restrict p = TMP + b*2*batchSize + start*2;
+        for (int k=start; k<end; k++, p+=2) {
+          grp[p[0]] = p[1];  // TODO: could write high here, and initial low.   ** If so, same in initial population when o is missing **
         }
       }
     }
-    memcpy(grp, _newG, nrow*sizeof(int));
-    free(_counts);
-    free(_tmpO);
-    free(_tmpG);
-    free(_newG);
-    Rprintf("gforce assign tmpO and tmpG back to grp took %.3f\n", wallclock()-started); started=wallclock();
+    free(counts);
+    free(TMP);
+    Rprintf("gforce assign TMP [ (o,g) pairs ] back to grp took %.3f\n", wallclock()-started); started=wallclock();
   }
 
   high = (uint16_t *)R_alloc(nrow, sizeof(uint16_t));  // maybe better to malloc to avoid R's heap, but safer to R_alloc since it's done via eval()

From 4d63fba5719c20ea9244baf48b68ff4c5eabb531 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Mon, 10 Dec 2018 23:39:48 -0800
Subject: [PATCH 07/10] tidy

---
 src/gsumm.c | 45 +++++++++++++++++++++------------------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/src/gsumm.c b/src/gsumm.c
index d7a313133e..ff2358e112 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -42,8 +42,7 @@ static int nbit(int n)
 }
 
 SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
-  // clock_t start = clock();
-  double started = wallclock();
+  // double started = wallclock();
   if (TYPEOF(env) != ENVSXP) error("env is not an environment");
   // The type of jsub is pretty flexbile in R, so leave checking to eval() below.
   if (!isInteger(o)) error("o is not an integer vector");
@@ -74,44 +73,44 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   }
 
   int nb = nbit(ngrp-1);
-  //shift = nb/2;
-  shift = MAX(nb-8,0);
+  shift = MAX(nb-8,0);   //shift = nb/2;
   mask = (1<<shift)-1;
   highSize = ((ngrp-1)>>shift) + 1;
 
   grp = (int *)R_alloc(nrow, sizeof(int));   // TODO: use malloc and made this local as not needed globally when all functions here use gather
                                              // maybe better to malloc to avoid R's heap. This grp isn't global, so it doesn't need to be R_alloc
-  const int *restrict fdp = INTEGER(f);
+  const int *restrict fp = INTEGER(f);
 
   nBatch = MIN((nrow+1)/2, getDTthreads()*2);  // 2 to reduce last-thread-home. TODO: experiment. The higher this is though, the bigger is counts[]
   batchSize = (nrow-1)/nBatch + 1;
   lastBatchSize = nrow - (nBatch-1)*batchSize;
 
-  Rprintf("ngrp=%d  nbit=%d  shift=%d  highSize=%d  nBatch=%d  batchSize=%d  lastBatchSize=%d\n", ngrp, nb, shift, highSize, nBatch, batchSize, lastBatchSize);
+  //Rprintf("ngrp=%d  nbit=%d  shift=%d  highSize=%d  nBatch=%d  batchSize=%d  lastBatchSize=%d\n", ngrp, nb, shift, highSize, nBatch, batchSize, lastBatchSize);
 
   // initial population of g:
   #pragma omp parallel for num_threads(getDTthreads())
   for (int g=0; g<ngrp; g++) {
-    int *elem = grp + fdp[g]-1;
+    int *elem = grp + fp[g]-1;
     for (int j=0; j<grpsize[g]; j++)  elem[j] = g;
   }
-  Rprintf("gforce initial population of grp took %.3f\n", wallclock()-started); started=wallclock();
+  //Rprintf("gforce initial population of grp took %.3f\n", wallclock()-started); started=wallclock();
   if (LENGTH(o)) {
     isunsorted = 1; // for gmedian
 
-    const int *restrict op = INTEGER(o);  // o is a permutation of 1:nrow
     // What follows is more cache-efficient version of this scattered assign :
     // for (int g=0; g<ngrp; g++) {
-    //  const int *elem = odp + fdp[g]-1;
+    //  const int *elem = op + fp[g]-1;
     //  for (int j=0; j<grpsize[g]; j++)  grp[ elem[j]-1 ] = g;
     //}
+
+    const int *restrict op = INTEGER(o);  // o is a permutation of 1:nrow
     int nb = nbit(nrow-1);
-    int shift = MAX(nb-8, 0);
+    int shift = MAX(nb-8, 0);  // TODO: experiment nb/2
     int highSize = ((nrow-1)>>shift) + 1;
-    Rprintf("When assigning grp[o] = g, highSize=%d  nb=%d  shift=%d  nBatch=%d\n", highSize, nb, shift, nBatch);
-    int *counts = calloc(nBatch*highSize, sizeof(int));  // (S_ zeros) TODO: cache-line align and make highSize a multiple of 64.  This +1 is for easier diff later
+    //Rprintf("When assigning grp[o] = g, highSize=%d  nb=%d  shift=%d  nBatch=%d\n", highSize, nb, shift, nBatch);
+    int *counts = calloc(nBatch*highSize, sizeof(int));  // TODO: cache-line align and make highSize a multiple of 64
     int *TMP   = malloc(nrow*2*sizeof(int));
-    if (!counts || !TMP ) error("Internal error: Failed to allocate counts, tmpO or tmpG when assigning g in gforce");
+    if (!counts || !TMP ) error("Internal error: Failed to allocate counts or TMP when assigning g in gforce");
     #pragma omp parallel for num_threads(getDTthreads())   // schedule(dynamic,1)
     for (int b=0; b<nBatch; b++) {
       const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
@@ -135,7 +134,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
         *p   = my_g[i];
       }
     }
-    Rprintf("gforce assign TMP (o,g) pairs took %.3f\n", wallclock()-started); started=wallclock();
+    //Rprintf("gforce assign TMP (o,g) pairs took %.3f\n", wallclock()-started); started=wallclock();
     #pragma omp parallel for num_threads(getDTthreads())
     for (int h=0; h<highSize; h++) {  // very important that high is first loop here
       for (int b=0; b<nBatch; b++) {
@@ -149,7 +148,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     }
     free(counts);
     free(TMP);
-    Rprintf("gforce assign TMP [ (o,g) pairs ] back to grp took %.3f\n", wallclock()-started); started=wallclock();
+    //Rprintf("gforce assign TMP [ (o,g) pairs ] back to grp took %.3f\n", wallclock()-started); started=wallclock();
   }
 
   high = (uint16_t *)R_alloc(nrow, sizeof(uint16_t));  // maybe better to malloc to avoid R's heap, but safer to R_alloc since it's done via eval()
@@ -189,14 +188,13 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     // counts is now cumulated within batch (with ending values) and we leave it that way
     // memcpy(counts + b*256, myCounts, 256*sizeof(int));  // save cumulate for later, first bucket contains position of next. For ease later in the very last batch.
   }
-  Rprintf("gforce assign high and low took %.3f\n", wallclock()-started); started=wallclock();
+  //Rprintf("gforce assign high and low took %.3f\n", wallclock()-started); started=wallclock();
 
   oo = INTEGER(o);
   ff = INTEGER(f);
-  Rprintf("gforce two INTEGERs took %.3f\n", wallclock()-started); started=wallclock();
 
   SEXP ans = PROTECT( eval(jsub, env) );
-  Rprintf("gforce eval took %.3f\n", wallclock()-started);
+  //Rprintf("gforce eval took %.3f\n", wallclock()-started);
   // if this eval() fails with R error, R will release grp for us. Which is why we use R_alloc above.
   if (isVectorAtomic(ans)) {
     SEXP tt = ans;
@@ -206,14 +204,13 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   }
   ngrp = 0; maxgrpn=0; irowslen = -1; isunsorted = 0;
 
-
   UNPROTECT(1);
   return(ans);
 }
 
 void *gather(void *x, size_t size, bool *anyNA)
 {
-  double started = wallclock();
+  //double started = wallclock();
   if (size==4) {
     const int *thisx = x;
     //int *restrict thisgx = gx;
@@ -245,7 +242,7 @@ void *gather(void *x, size_t size, bool *anyNA)
   } else {
     error("gather not yet implemented for size!=4");
   }
-  Rprintf("gather took %.3fs\n", wallclock()-started);
+  //Rprintf("gather took %.3fs\n", wallclock()-started);
   return gx;
 }
 
@@ -272,7 +269,7 @@ SEXP gsum(SEXP x, SEXP narmArg)
     //int64_t *i64sum = calloc(ngrp, sizeof(int64_t));
     //if (!i64sum) error("Unable to allocate %d * %d bytes for gsum i64", ngrp, sizeof(int64_t));
     bool overflow=false;
-    double started = wallclock();
+    //double started = wallclock();
     #pragma omp parallel for num_threads(getDTthreads()) //schedule(dynamic,1)
     for (int h=0; h<highSize; h++) {   // very important that high is first loop here
       int *restrict _ans = ansp + (h<<shift);
@@ -303,7 +300,7 @@ SEXP gsum(SEXP x, SEXP narmArg)
         }
       }
     }
-    Rprintf("gsum int took %.3f\n", wallclock()-started);
+    //Rprintf("gsum int took %.3f\n", wallclock()-started);
     if (overflow) error("overflow summing integer not yet auto-coerce");
 /*    bool stop = false;
       #pragma omp parallel for num_threads(getDTthreads())

From d6f896890d55b94e60c90465a74a970c8431d6ea Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Tue, 11 Dec 2018 01:08:42 -0800
Subject: [PATCH 08/10] gsum real

---
 CRAN_Release.cmd |   2 +
 src/gsumm.c      | 202 +++++++++++++++++++++++++++++++----------------
 2 files changed, 138 insertions(+), 66 deletions(-)

diff --git a/CRAN_Release.cmd b/CRAN_Release.cmd
index e85e9a59fd..ada6baf4c6 100644
--- a/CRAN_Release.cmd
+++ b/CRAN_Release.cmd
@@ -236,6 +236,8 @@ print(Sys.time()); require(data.table); print(Sys.time()); started.at<-proc.time
 
 # Investigated and ignore :
 # Tests 648 and 1262 (see their comments) have single precision issues under valgrind that don't occur on CRAN, even Solaris.
+# Old comment from gsumm.c ...  // long double usage here used to result in test 648 failing when run under valgrind
+#                               // http://valgrind.org/docs/manual/manual-core.html#manual-core.limits
 # Ignore all "set address range perms" warnings :
 #   http://stackoverflow.com/questions/13558067/what-does-this-valgrind-warning-mean-warning-set-address-range-perms
 # Ignore heap summaries around test 1705 and 1707/1708 due to the fork() test opening/closing, I guess.
diff --git a/src/gsumm.c b/src/gsumm.c
index ff2358e112..d400bac63d 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -208,12 +208,12 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
   return(ans);
 }
 
-void *gather(void *x, size_t size, bool *anyNA)
+void *gather(SEXP x, bool *anyNA)
 {
   //double started = wallclock();
-  if (size==4) {
-    const int *thisx = x;
-    //int *restrict thisgx = gx;
+  switch (TYPEOF(x)) {
+  case LGLSXP: case INTSXP: {
+    const int *restrict thisx = INTEGER(x);
     #pragma omp parallel for num_threads(getDTthreads())
     for (int b=0; b<nBatch; b++) {
       int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
@@ -239,15 +239,71 @@ void *gather(void *x, size_t size, bool *anyNA)
       }
       if (my_anyNA) *anyNA = true;  // naked write ok since just bool and always writing true; and no performance issue as maximum nBatch writes
     }
-  } else {
-    error("gather not yet implemented for size!=4");
+  } break;
+  case REALSXP: {
+    if (!INHERITS(x, char_integer64)) {
+      const double *restrict thisx = REAL(x);
+      #pragma omp parallel for num_threads(getDTthreads())
+      for (int b=0; b<nBatch; b++) {
+        int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
+        memcpy(my_tmpcounts, counts + b*highSize, highSize*sizeof(int));
+        double *restrict my_gx = (double *)gx + b*batchSize;
+        const uint16_t *my_high = high + b*batchSize;
+        const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
+        bool my_anyNA = false;
+        if (irowslen==-1) {
+          const double *my_x = thisx + b*batchSize;
+          for (int i=0; i<howMany; i++) {
+            const double elem = my_x[i];
+            my_gx[ my_tmpcounts[my_high[i]]++ ] = elem;
+            if (ISNA(elem)) my_anyNA = true;
+          }
+        } else {
+          const int *my_x = irows + b*batchSize;
+          for (int i=0; i<howMany; i++) {
+            double elem = thisx[ my_x[i]-1 ];
+            my_gx[ my_tmpcounts[my_high[i]]++ ] = elem;
+            if (ISNA(elem)) my_anyNA = true;
+          }
+        }
+        if (my_anyNA) *anyNA = true;
+      }
+    } else {
+      const int64_t *restrict thisx = (int64_t *)REAL(x);
+      #pragma omp parallel for num_threads(getDTthreads())
+      for (int b=0; b<nBatch; b++) {
+        int *restrict my_tmpcounts = tmpcounts + omp_get_thread_num()*highSize;
+        memcpy(my_tmpcounts, counts + b*highSize, highSize*sizeof(int));
+        int64_t *restrict my_gx = (int64_t *)gx + b*batchSize;
+        const uint16_t *my_high = high + b*batchSize;
+        const int howMany = b==nBatch-1 ? lastBatchSize : batchSize;
+        bool my_anyNA = false;
+        if (irowslen==-1) {
+          const int64_t *my_x = thisx + b*batchSize;
+          for (int i=0; i<howMany; i++) {
+            const int64_t elem = my_x[i];
+            my_gx[ my_tmpcounts[my_high[i]]++ ] = elem;
+            if (elem==INT64_MIN) my_anyNA = true;
+          }
+        } else {
+          const int *my_x = irows + b*batchSize;
+          for (int i=0; i<howMany; i++) {
+            int64_t elem = thisx[ my_x[i]-1 ];
+            my_gx[ my_tmpcounts[my_high[i]]++ ] = elem;
+            if (elem==INT64_MIN) my_anyNA = true;
+          }
+        }
+        if (my_anyNA) *anyNA = true;
+      }
+    }
+  } break;
+  default :
+    error("gather implemented for INTSXP and REALSXP but not '%s'", type2char(TYPEOF(x)));
   }
   //Rprintf("gather took %.3fs\n", wallclock()-started);
   return gx;
 }
 
-// long double usage here results in test 648 being failed when running with valgrind
-// http://valgrind.org/docs/manual/manual-core.html#manual-core.limits
 SEXP gsum(SEXP x, SEXP narmArg)
 {
   if (!isLogical(narmArg) || LENGTH(narmArg)!=1 || LOGICAL(narmArg)[0]==NA_LOGICAL) error("na.rm must be TRUE or FALSE");
@@ -261,31 +317,38 @@ SEXP gsum(SEXP x, SEXP narmArg)
   SEXP ans;
   switch(TYPEOF(x)) {
   case LGLSXP: case INTSXP: {
-    // int *xd = INTEGER(x);
-    const int *restrict gx = gather(INTEGER(x), sizeof(int), &anyNA);  // TODO: could return anyNA too
+    const int *restrict gx = gather(x, &anyNA);
     ans = PROTECT(allocVector(INTSXP, ngrp));
     int *restrict ansp = INTEGER(ans);
     memset(ansp, 0, ngrp*sizeof(int));
-    //int64_t *i64sum = calloc(ngrp, sizeof(int64_t));
-    //if (!i64sum) error("Unable to allocate %d * %d bytes for gsum i64", ngrp, sizeof(int64_t));
     bool overflow=false;
     //double started = wallclock();
-    #pragma omp parallel for num_threads(getDTthreads()) //schedule(dynamic,1)
-    for (int h=0; h<highSize; h++) {   // very important that high is first loop here
-      int *restrict _ans = ansp + (h<<shift);
-      for (int b=0; b<nBatch; b++) {
-        const int pos = counts[ b*highSize + h ];
-        const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
-        const int *my_gx = gx + b*batchSize + pos;
-        const uint16_t *my_low = low + b*batchSize + pos;
-        if (!anyNA) {   // TODO: take out before prallel loop, and repeat PARLOOP using macro, for completness just in case (e.g. K=2).
+    if (!anyNA) {
+      #pragma omp parallel for num_threads(getDTthreads()) //schedule(dynamic,1)
+      for (int h=0; h<highSize; h++) {   // very important that high is first loop here
+        int *restrict _ans = ansp + (h<<shift);
+        for (int b=0; b<nBatch; b++) {
+          const int pos = counts[ b*highSize + h ];
+          const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
+          const int *my_gx = gx + b*batchSize + pos;
+          const uint16_t *my_low = low + b*batchSize + pos;
           for (int i=0; i<howMany; i++) {
             const int a = _ans[my_low[i]];
             const int b = my_gx[i];
             if ((a>0 && b>INT_MAX-a) || (a<0 && b<NA_INTEGER+1-a)) overflow=true;
             else _ans[my_low[i]] += b;  // naked by design; each thread does all of each h for all batches
           }
-        } else {
+        }
+      }
+    } else {
+      #pragma omp parallel for num_threads(getDTthreads())
+      for (int h=0; h<highSize; h++) {
+        int *restrict _ans = ansp + (h<<shift);
+        for (int b=0; b<nBatch; b++) {
+          const int pos = counts[ b*highSize + h ];
+          const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
+          const int *my_gx = gx + b*batchSize + pos;
+          const uint16_t *my_low = low + b*batchSize + pos;
           for (int i=0; i<howMany; i++) {
             const int a = _ans[my_low[i]];
             if (a==NA_INTEGER) continue;
@@ -295,68 +358,75 @@ SEXP gsum(SEXP x, SEXP narmArg)
               continue;
             }
             if ((a>0 && b>INT_MAX-a) || (a<0 && b<NA_INTEGER+1-a)) overflow=true;
-            else _ans[my_low[i]] += b;  // naked by design; each thread does all of each h for all batches
+            else _ans[my_low[i]] += b;
           }
         }
       }
     }
     //Rprintf("gsum int took %.3f\n", wallclock()-started);
-    if (overflow) error("overflow summing integer not yet auto-coerce");
-/*    bool stop = false;
-      #pragma omp parallel for num_threads(getDTthreads())
-      for (int i=0; i<ngrp; i++) {
-        if (stop) continue;
-        if (i64sum[i]>INT32_MAX || (i64sum[i]<=NA_INTEGER && i64sum[i]!=INT64_MIN)) stop=true;
-      }
-      if (stop) {
+    if (overflow) {
+      UNPROTECT(1); // discard the result with overflow
       warning("The sum of an integer column for a group was more than type 'integer' can hold so the result has been coerced to 'numeric' automatically for convenience.");
       ans = PROTECT(allocVector(REALSXP, ngrp));
       double *restrict ansp = REAL(ans);
+      memset(ansp, 0, ngrp*sizeof(double));
       #pragma omp parallel for num_threads(getDTthreads())
-      for (int i=0; i<ngrp; i++) {
-        ansp[i] = i64sum[i]==INT64_MIN ? NA_REAL : (double)i64sum[i];
-      }
-    } else {
-      ans = PROTECT(allocVector(INTSXP, ngrp));
-      int *restrict ansp = INTEGER(ans);
-      if (anyNA) {
-        #pragma omp parallel for num_threads(getDTthreads())
-        for (int i=0; i<ngrp; i++) {
-          ansp[i] = i64sum[i]==INT64_MIN ? NA_INTEGER : (int)i64sum[i];
-        }
-      } else {
-        #pragma omp parallel for num_threads(getDTthreads())
-        for (int i=0; i<ngrp; i++) {
-          ansp[i] = (int)i64sum[i];
+      for (int h=0; h<highSize; h++) {
+        double *restrict _ans = ansp + (h<<shift);
+        for (int b=0; b<nBatch; b++) {
+          const int pos = counts[ b*highSize + h ];
+          const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
+          const int *my_gx = gx + b*batchSize + pos;
+          const uint16_t *my_low = low + b*batchSize + pos;
+          // rare and slower so no need to switch on anyNA
+          for (int i=0; i<howMany; i++) {
+            const int elem = my_gx[i];
+            if (elem==NA_INTEGER) {
+              if (!narm) _ans[my_low[i]]=NA_REAL;
+              continue;
+            }
+            _ans[my_low[i]] += b;  // let NA_REAL propagate
+          }
         }
       }
     }
-    free(i64sum);*/
   } break;
   case REALSXP: {
-    long double *ldsum = calloc(ngrp, sizeof(long double));
-    if (!ldsum) error("Unable to allocate %d * %d bytes for gsum ld", ngrp, sizeof(long double));
-    double *xd = REAL(x);                                // now-slower R API with altrep, outside
-    if (irowslen==-1) {
-      for (int i=0, *g=grp; i<n; i++) {
-        if (narm && ISNAN(*xd)) {g++; xd++; continue;}   // narm first and leave to branch prediction
-        ldsum[*g++] += *xd++;                            // accumulate in long-double like base. Let NA propogate when !narm
+    const double *restrict gx = gather(x, &anyNA);
+    ans = PROTECT(allocVector(REALSXP, ngrp));
+    double *restrict ansp = REAL(ans);
+    memset(ansp, 0, ngrp*sizeof(double));
+    if (!narm || !anyNA) {
+      #pragma omp parallel for num_threads(getDTthreads())
+      for (int h=0; h<highSize; h++) {
+        double *restrict _ans = ansp + (h<<shift);
+        for (int b=0; b<nBatch; b++) {
+          const int pos = counts[ b*highSize + h ];
+          const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
+          const double *my_gx = gx + b*batchSize + pos;
+          const uint16_t *my_low = low + b*batchSize + pos;
+          for (int i=0; i<howMany; i++) {
+            _ans[my_low[i]] += my_gx[i];  // let NA propagate when !narm
+          }
+        }
       }
     } else {
-      for (int i=0, *g=grp; i<n; i++) {
-        double elem = xd[irows[i]-1];
-        if (narm && ISNAN(elem)) {g++; continue;}
-        ldsum[*g++] += elem;
+      // narm==true and anyNA==true
+      #pragma omp parallel for num_threads(getDTthreads())
+      for (int h=0; h<highSize; h++) {
+        double *restrict _ans = ansp + (h<<shift);
+        for (int b=0; b<nBatch; b++) {
+          const int pos = counts[ b*highSize + h ];
+          const int howMany = ((h==highSize-1) ? (b==nBatch-1?lastBatchSize:batchSize) : counts[ b*highSize + h + 1 ]) - pos;
+          const double *my_gx = gx + b*batchSize + pos;
+          const uint16_t *my_low = low + b*batchSize + pos;
+          for (int i=0; i<howMany; i++) {
+            const double elem = my_gx[i];
+            if (!ISNAN(elem)) _ans[my_low[i]] += elem;
+          }
+        }
       }
     }
-    ans = PROTECT(allocVector(REALSXP, ngrp));
-    xd = REAL(ans);
-    for (int i=0; i<ngrp; i++) {
-      if (ldsum[i] > DBL_MAX) xd[i] = R_PosInf;
-      else if (ldsum[i] < -DBL_MAX) xd[i] = R_NegInf;
-      else xd[i] = (double)ldsum[i];
-    }
-    free(ldsum);
   } break;
   default:
     error("Type '%s' not supported by GForce sum (gsum). Either add the prefix base::sum(.) or turn off GForce optimization using options(datatable.optimize=1)", type2char(TYPEOF(x)));

From 313e90d8be496111356d90466327dcccd4e69f64 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Tue, 11 Dec 2018 03:04:14 -0800
Subject: [PATCH 09/10] coverage; fix gsum overflow branch adding batch index instead of element

---
 inst/tests/tests.Rraw | 8 ++++++++
 src/gsumm.c           | 6 +++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 759c41c597..b9bfee365c 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -13029,6 +13029,14 @@ test(1967.75, x[ , .(v = sum(v)), by = i1:i4], x[-10L])
 test(1967.76, x[1:5, sum(v), by = list(i5 = 1:5 %% 2L), verbose = TRUE],
      data.table(i5 = 1:0, V1 = c(0, 0)), output = 'i clause present but columns used in by not detected')
 
+# GForce sum (gsum): unsupported type errors; integer overflow is coerced to double with a warning
+DT = data.table(A=1:5, B=-3i, C=2147483647L)
+test(1968.1, DT[, sum(B), by=A%%2L], error="Type 'complex' not supported by GForce sum (gsum). Either add the")
+test(1968.2, storage.mode(DT$C), "integer")
+test(1968.3, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294)),
+             warning="sum.*integer column.*more than type 'integer' can hold.*coerced to 'numeric'")
+
+
 ###################################
 #  Add new tests above this line  #
 ###################################
diff --git a/src/gsumm.c b/src/gsumm.c
index d400bac63d..0641af5d14 100644
--- a/src/gsumm.c
+++ b/src/gsumm.c
@@ -56,7 +56,7 @@ SEXP gforce(SEXP env, SEXP jsub, SEXP o, SEXP f, SEXP l, SEXP irowsArg) {
     irows = INTEGER(irowsArg);
     irowslen = LENGTH(irowsArg);
   }
-  else error("irowsArg is neither an integer vector nor NULL");
+  else error("irowsArg is neither an integer vector nor NULL");  // # nocov
   ngrp = LENGTH(l);
   if (LENGTH(f) != ngrp) error("length(f)=%d != length(l)=%d", LENGTH(f), ngrp);
   nrow=0;
@@ -298,7 +298,7 @@ void *gather(SEXP x, bool *anyNA)
     }
   } break;
   default :
-    error("gather implemented for INTSXP and REALSXP but not '%s'", type2char(TYPEOF(x)));
+    error("gather implemented for INTSXP and REALSXP but not '%s'", type2char(TYPEOF(x)));   // # nocov
   }
   //Rprintf("gather took %.3fs\n", wallclock()-started);
   return gx;
@@ -385,7 +385,7 @@ SEXP gsum(SEXP x, SEXP narmArg)
               if (!narm) _ans[my_low[i]]=NA_REAL;
               continue;
             }
-            _ans[my_low[i]] += b;  // let NA_REAL propagate
+            _ans[my_low[i]] += elem;  // let NA_REAL propagate
           }
         }
       }

From 0a955e2328d8de710a801482dfdd785e012f3d44 Mon Sep 17 00:00:00 2001
From: mattdowle <mattjdowle@gmail.com>
Date: Tue, 11 Dec 2018 03:30:45 -0800
Subject: [PATCH 10/10] coverage

---
 inst/tests/tests.Rraw | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index b9bfee365c..d23547b937 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -13035,6 +13035,13 @@ test(1968.1, DT[, sum(B), by=A%%2L], error="Type 'complex' not supported by GFor
 test(1968.2, storage.mode(DT$C), "integer")
 test(1968.3, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(6442450941, 4294967294)),
              warning="sum.*integer column.*more than type 'integer' can hold.*coerced to 'numeric'")
+DT[3,C:=NA]
+test(1968.4, DT[, sum(C), by=A%%2L], data.table(A=c(1L,0L), V1=c(NA, 4294967294)), warning="coerced to 'numeric'")
+test(1968.5, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 4294967294)), warning="coerced to 'numeric'")
+DT[4,C:=NA]
+test(1968.6, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 2147483647)), warning="coerced to 'numeric'")
+DT[2,C:=NA]
+test(1968.7, DT[, sum(C,na.rm=TRUE), by=A%%2L], data.table(A=c(1L,0L), V1=c(4294967294, 0)), warning="coerced to 'numeric'")
 
 
 ###################################