From 9ca7ab986d09905ee2ed928932060ea14a4b2c92 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 20 Jul 2021 14:06:21 +0200 Subject: [PATCH 1/3] Unified Memory in Random123. --- coreneuron/CMakeLists.txt | 10 ++ coreneuron/utils/randoms/nrnran123.cpp | 147 +---------------------- coreneuron/utils/randoms/nrnran123.cu | 157 ++++++++++++++----------- coreneuron/utils/randoms/nrnran123.h | 142 +++++++++++----------- 4 files changed, 168 insertions(+), 288 deletions(-) mode change 100644 => 120000 coreneuron/utils/randoms/nrnran123.cpp diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index a79c905e8..16116adc2 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -138,8 +138,18 @@ if(CORENRN_ENABLE_GPU) cuda_add_library("cudacoreneuron" ${CORENEURON_CUDA_FILES} OPTIONS ${cuda_arch_flags} -Xcompiler -fPIC) set(link_cudacoreneuron cudacoreneuron) + # nrnran123.cpp is a symlink to nrnran123.cu, in GPU builds we compile this as CUDA code (so we + # want to remove the .cpp here), while in non-GPU builds we compile it as plain C++ (so we want to + # remove the .cu below). Unfortunately CMake options based + # on the LANGUAGE property (https://cmake.org/cmake/help/latest/policy/CMP0119.html), so using a + # single .cu file and setting LANGUAGE=CXX in non-GPU builds does not work. + list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp") + list(REMOVE_ITEM CORENEURON_UTILS_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp") else() set(link_cudacoreneuron "") + # See above regarding CMake policy CMP0119. 
+ list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cu") + list(REMOVE_ITEM CORENEURON_UTILS_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cu") endif() # ============================================================================= diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp deleted file mode 100644 index ed206a4e3..000000000 --- a/coreneuron/utils/randoms/nrnran123.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* -# ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL -# -# See top-level LICENSE file for details. -# =============================================================================. -*/ - -#include -#include -#include "coreneuron/utils/randoms/nrnran123.h" -#include "coreneuron/nrnconf.h" -#include "coreneuron/utils/nrnmutdec.h" -#include "coreneuron/utils/nrnoc_aux.hpp" - -namespace coreneuron { -static const double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ - -static philox4x32_key_t k = {{0}}; - -static size_t instance_count_ = 0; -size_t nrnran123_instance_count() { - return instance_count_; -} - -size_t nrnran123_state_size() { - return sizeof(nrnran123_State); -} - -void nrnran123_set_globalindex(uint32_t gix) { - k.v[0] = gix; -#if (defined(__CUDACC__) || defined(_OPENACC)) - nrnran123_set_gpu_globalindex(gix); -#endif -} - -/* if one sets the global, one should reset all the stream sequences. 
*/ -uint32_t nrnran123_get_globalindex() { - return k.v[0]; -} - -#ifdef _OPENMP -static MUTDEC void nrnran123_mutconstruct() { - if (!mut_) { - MUTCONSTRUCT(1); - } -} -#else -void nrnran123_mutconstruct() {} -#endif - -nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { - return nrnran123_newstream3(id1, id2, 0); -} - -nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { - nrnran123_State* s = (nrnran123_State*) ecalloc(sizeof(nrnran123_State), 1); - s->c.v[1] = id3; - s->c.v[2] = id1; - s->c.v[3] = id2; - nrnran123_setseq(s, 0, 0); - MUTLOCK - ++instance_count_; - MUTUNLOCK - return s; -} - -void nrnran123_deletestream(nrnran123_State* s) { - MUTLOCK - --instance_count_; - MUTUNLOCK - free(s); -} - -void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { - *seq = s->c.v[0]; - *which = s->which_; -} - -void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { - if (which > 3) { - s->which_ = 0; - } else { - s->which_ = which; - } - s->c.v[0] = seq; - s->r = philox4x32(s->c, k); -} - -void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { - *id1 = s->c.v[2]; - *id2 = s->c.v[3]; -} - -void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) { - *id3 = s->c.v[1]; - *id1 = s->c.v[2]; - *id2 = s->c.v[3]; -} - -uint32_t nrnran123_ipick(nrnran123_State* s) { - uint32_t rval; - char which = s->which_; - assert(which < 4); - rval = s->r.v[which++]; - if (which > 3) { - which = 0; - s->c.v[0]++; - s->r = philox4x32(s->c, k); - } - s->which_ = which; - return rval; -} - -double nrnran123_dblpick(nrnran123_State* s) { - return nrnran123_uint2dbl(nrnran123_ipick(s)); -} - -double nrnran123_negexp(nrnran123_State* s) { - /* min 2.3283064e-10 to max 22.18071 */ - return -log(nrnran123_dblpick(s)); -} - -/* at cost of a cached value we could compute two at a time. 
*/ -double nrnran123_normal(nrnran123_State* s) { - double w, x, y; - double u1, u2; - do { - u1 = nrnran123_dblpick(s); - u2 = nrnran123_dblpick(s); - u1 = 2. * u1 - 1.; - u2 = 2. * u2 - 1.; - w = (u1 * u1) + (u2 * u2); - } while (w > 1); - - y = sqrt((-2. * log(w)) / w); - x = u1 * y; - return x; -} - -double nrnran123_uint2dbl(uint32_t u) { - /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ - /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ - return ((double) u + 1.0) * SHIFT32; -} -} // namespace coreneuron diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp new file mode 120000 index 000000000..3821c8cc1 --- /dev/null +++ b/coreneuron/utils/randoms/nrnran123.cpp @@ -0,0 +1 @@ +nrnran123.cu \ No newline at end of file diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index 568a87b13..a07bc7cdf 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -5,99 +5,107 @@ # See top-level LICENSE file for details. # =============================================================================. 
*/ +#include "coreneuron/utils/randoms/nrnran123.h" + +#ifdef _OPENMP +#include "coreneuron/utils/nrnmutdec.h" +#endif #include #include #include -#include "coreneuron/utils/randoms/nrnran123.h" -namespace coreneuron { -/* global data structure per process */ -__device__ static const double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ -__device__ static philox4x32_key_t k = {{0}}; -__device__ static unsigned int instance_count_ = 0; -__device__ size_t nrnran123_instance_count() { - return instance_count_; -} -__device__ size_t nrnran123_state_size() { - return sizeof(nrnran123_State); -} +#include +#include -__global__ void nrnran123_set_globalindex(uint32_t gix) { - k.v[0] = gix; -} +// In a GPU build this file will be compiled by NVCC as CUDA code +// In a CPU build this file will be compiled by a C++ compiler as C++ code +#ifdef __CUDACC__ +#define CORENRN_MANAGED __managed__ +#else +#define CORENRN_MANAGED +#endif -/* if one sets the global, one should reset all the stream sequences. */ -__device__ uint32_t nrnran123_get_globalindex() { - return k.v[0]; +namespace { +/* global data structure per process */ +CORENRN_MANAGED philox4x32_key_t g_k = {{0}}; +std::atomic instance_count_{}; +constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ +} // namespace + +namespace coreneuron { +std::size_t nrnran123_instance_count() { + return instance_count_; } -__global__ void nrnran123_setup_cuda_newstream(nrnran123_State* s, - uint32_t id1, - uint32_t id2, - uint32_t id3) { - s->c.v[0] = 0; - s->c.v[1] = id3; - s->c.v[2] = id1; - s->c.v[3] = id2; - nrnran123_setseq(s, 0, 0); - atomicAdd(&instance_count_, 1); +#ifdef _OPENMP +static MUTDEC void nrnran123_mutconstruct() { + if (!mut_) { + MUTCONSTRUCT(1); + } } +#else +void nrnran123_mutconstruct() {} +#endif -__global__ void nrnran123_cuda_deletestream(nrnran123_State* s) { - atomicSub(&instance_count_, 1); +/* if one sets the global, one should reset all the stream sequences. 
*/ +CORENRN_HOST_DEVICE uint32_t nrnran123_get_globalindex() { + return g_k.v[0]; } -__device__ void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { +CORENRN_HOST_DEVICE void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { *seq = s->c.v[0]; *which = s->which_; } -__device__ void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { +CORENRN_HOST_DEVICE void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { if (which > 3) { s->which_ = 0; } else { s->which_ = which; } s->c.v[0] = seq; - s->r = philox4x32(s->c, k); + s->r = philox4x32(s->c, g_k); } -__device__ void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { +CORENRN_HOST_DEVICE void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { *id1 = s->c.v[2]; *id2 = s->c.v[3]; } -__device__ void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) { +CORENRN_HOST_DEVICE void nrnran123_getids3(nrnran123_State* s, + uint32_t* id1, + uint32_t* id2, + uint32_t* id3) { *id3 = s->c.v[1]; *id1 = s->c.v[2]; *id2 = s->c.v[3]; } -__device__ uint32_t nrnran123_ipick(nrnran123_State* s) { +CORENRN_HOST_DEVICE uint32_t nrnran123_ipick(nrnran123_State* s) { uint32_t rval; char which = s->which_; rval = s->r.v[which++]; if (which > 3) { which = 0; s->c.v[0]++; - s->r = philox4x32(s->c, k); + s->r = philox4x32(s->c, g_k); } s->which_ = which; return rval; } -__device__ double nrnran123_dblpick(nrnran123_State* s) { +CORENRN_HOST_DEVICE double nrnran123_dblpick(nrnran123_State* s) { return nrnran123_uint2dbl(nrnran123_ipick(s)); } -__device__ double nrnran123_negexp(nrnran123_State* s) { +CORENRN_HOST_DEVICE double nrnran123_negexp(nrnran123_State* s) { /* min 2.3283064e-10 to max 22.18071 */ return -log(nrnran123_dblpick(s)); } /* at cost of a cached value we could compute two at a time. 
*/ -__device__ double nrnran123_normal(nrnran123_State* s) { +CORENRN_HOST_DEVICE double nrnran123_normal(nrnran123_State* s) { double w, x, y; double u1, u2; @@ -114,41 +122,54 @@ __device__ double nrnran123_normal(nrnran123_State* s) { return x; } -__device__ double nrnran123_uint2dbl(uint32_t u) { +CORENRN_HOST_DEVICE double nrnran123_uint2dbl(uint32_t u) { /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ - return ((double)u + 1.0) * SHIFT32; + return ((double) u + 1.0) * SHIFT32; } /* nrn123 streams are created from cpu launcher routine */ -nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { - return nrnran123_newstream3(id1, id2, 0); -} - -nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { - nrnran123_State* s; - - cudaMalloc((void**)&s, sizeof(nrnran123_State)); - cudaMemset((void*)s, 0, sizeof(nrnran123_State)); - - nrnran123_setup_cuda_newstream<<<1, 1>>>(s, id1, id2, id3); - cudaDeviceSynchronize(); - +void nrnran123_set_globalindex(uint32_t gix) { + g_k.v[0] = gix; +} + +namespace detail { +nrnran123_State* nrnran123_newstream3(uint32_t id1, + uint32_t id2, + uint32_t id3, + bool use_unified_memory) { + nrnran123_State* s{nullptr}; + if (use_unified_memory) { +#ifdef __CUDACC__ + assert(cudaMallocManaged(&s, sizeof(nrnran123_State)) == cudaSuccess); + assert(cudaMemset(s, 0, sizeof(nrnran123_State)) == cudaSuccess); +#else + throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build."); +#endif + } else { + s = new nrnran123_State{}; + } + s->c.v[0] = 0; + s->c.v[1] = id3; + s->c.v[2] = id1; + s->c.v[3] = id2; + nrnran123_setseq(s, 0, 0); + ++instance_count_; return s; } /* nrn123 streams are destroyed from cpu launcher routine */ -void nrnran123_deletestream(nrnran123_State* s) { - nrnran123_cuda_deletestream<<<1, 1>>>(s); - cudaDeviceSynchronize(); - - cudaFree(s); -} - -/* set global index for random123 stream on gpu */ 
-void nrnran123_set_gpu_globalindex(uint32_t gix) { - nrnran123_set_globalindex<<<1,1>>>(gix); - cudaDeviceSynchronize(); +void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { + --instance_count_; + if (use_unified_memory) { +#ifdef __CUDACC__ + cudaFree(s); +#else + throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build."); +#endif + } else { + delete s; + } } - -} //namespace coreneuron +} // namespace detail +} // namespace coreneuron diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index 4fb3f6c2b..d9afbd491 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -5,7 +5,8 @@ # See top-level LICENSE file for details. # =============================================================================. */ - +// Beware changing this to #pragma once, we rely on this file shadowing the +// equivalent file from NEURON. #ifndef nrnran123_h #define nrnran123_h @@ -40,111 +41,104 @@ of the full distribution available from #include #ifdef __CUDACC__ -#define DEVICE __device__ -#define GLOBAL __global__ +#define CORENRN_HOST_DEVICE __host__ __device__ #else -#define DEVICE -#define GLOBAL +#define CORENRN_HOST_DEVICE #endif +// Is there actually any harm leaving the pragma in when DISABLE_OPENACC is true? +#if defined(_OPENACC) && !defined(DISABLE_OPENACC) +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE _Pragma("acc routine seq") +#else +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE +#endif + +// Some files are compiled with DISABLE_OPENACC, and some builds have no GPU +// support at all. In these two cases, request that the random123 state is +// allocated using new/delete instead of CUDA unified memory. 
#if (defined(__CUDACC__) || defined(_OPENACC)) && !defined(DISABLE_OPENACC) -#define nrnran123_newstream cu_nrnran123_newstream -#define nrnran123_newstream3 cu_nrnran123_newstream3 -#define nrnran123_deletestream cu_nrnran123_deletestream -#define nrnran123_uint2dbl cu_nrnran123_uint2dbl -#define nrnran123_negexp cu_nrnran123_negexp -#define nrnran123_dblpick cu_nrnran123_dblpick -#define nrnran123_ipick cu_nrnran123_ipick -#define nrnran123_getids cu_nrnran123_getids -#define nrnran123_setseq cu_nrnran123_setseq -#define nrnran123_getseq cu_nrnran123_getseq -#define nrnran123_get_globalindex cu_nrnran123_get_globalindex -#define nrnran123_set_globalindex cu_nrnran123_set_globalindex -#define nrnran123_state_size cu_nrnran123_state_size -#define nrnran123_instance_count cu_nrnran123_instance_count -#define nrnran123_normal cu_nrnran123_normal -#define nrnran123_getids3 cu_nrnran123_getids3 +#define CORENRN_RAN123_USE_UNIFIED_MEMORY true +#else +#define CORENRN_RAN123_USE_UNIFIED_MEMORY false #endif namespace coreneuron { -typedef struct nrnran123_State { +struct nrnran123_State { philox4x32_ctr_t c; philox4x32_ctr_t r; char which_; -} nrnran123_State; +}; -typedef struct nrnran123_array4x32 { +struct nrnran123_array4x32 { uint32_t v[4]; -} nrnran123_array4x32; +}; /* do this on launch to make nrnran123_newstream threadsafe */ -extern DEVICE void nrnran123_mutconstruct(void); +void nrnran123_mutconstruct(); /* global index. eg. 
run number */ /* all generator instances share this global index */ -extern GLOBAL void nrnran123_set_globalindex(uint32_t gix); -extern DEVICE uint32_t nrnran123_get_globalindex(); - -extern DEVICE size_t nrnran123_instance_count(void); -extern DEVICE size_t nrnran123_state_size(void); - -/* routines for creating and deleteing streams are called from cpu */ -extern nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2); -extern nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3); -extern void nrnran123_deletestream(nrnran123_State*); - -/* routines for creating and deleteing streams are called from cpu but initializing/deleting gpu - * context */ -extern nrnran123_State* cu_nrnran123_newstream(uint32_t id1, uint32_t id2); -extern nrnran123_State* cu_nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3); -extern void cu_nrnran123_deletestream(nrnran123_State*); - -extern GLOBAL void nrnran123_setup_deletestream(nrnran123_State* s); -extern GLOBAL void nrnran123_setup_newstream(nrnran123_State* s, uint32_t id1, uint32_t id2); -extern GLOBAL void nrnran123_setup_newstream3(nrnran123_State* s, - uint32_t id1, - uint32_t id2, - uint32_t id3); +void nrnran123_set_globalindex(uint32_t gix); +CORENRN_HOST_DEVICE_ACC uint32_t nrnran123_get_globalindex(); + +// Utilities used for calculating model size, only called from the CPU. 
+std::size_t nrnran123_instance_count(); +inline std::size_t nrnran123_state_size() { + return sizeof(nrnran123_State); +} + +// Implementation functions that have the value of +// CORENRN_RAN123_USE_UNIFIED_MEMORY promoted into a runtime parameter, which is +// an easy way to avoid linking problems +namespace detail { +nrnran123_State* nrnran123_newstream3(uint32_t id1, + uint32_t id2, + uint32_t id3, + bool use_unified_memory); +void nrnran123_deletestream(nrnran123_State*, bool use_unified_memory); +} // namespace detail + +/* routines for creating and deleting streams are called from cpu */ +inline nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { + return detail::nrnran123_newstream3(id1, id2, id3, CORENRN_RAN123_USE_UNIFIED_MEMORY); +} +inline nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { + return nrnran123_newstream3(id1, id2, 0); +} +inline void nrnran123_deletestream(nrnran123_State* s) { + return detail::nrnran123_deletestream(s, CORENRN_RAN123_USE_UNIFIED_MEMORY); +} /* minimal data stream */ -extern DEVICE void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); -extern DEVICE void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); -extern DEVICE void nrnran123_getids3(nrnran123_State*, uint32_t* id1, uint32_t* id2, uint32_t* id3); -extern DEVICE uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */ +CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); +CORENRN_HOST_DEVICE_ACC void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); +CORENRN_HOST_DEVICE_ACC void nrnran123_getids3(nrnran123_State*, + uint32_t* id1, + uint32_t* id2, + uint32_t* id3); +CORENRN_HOST_DEVICE_ACC uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */ /* this could be called from openacc parallel construct */ -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE double 
nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/ +CORENRN_HOST_DEVICE_ACC double nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/ /* nrnran123_dblpick minimum value is 2.3283064e-10 and max value is 1-min */ /* this could be called from openacc parallel construct (in INITIAL block) */ -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE void nrnran123_setseq(nrnran123_State*, uint32_t seq, char which); +CORENRN_HOST_DEVICE_ACC void nrnran123_setseq(nrnran123_State*, uint32_t seq, char which); -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */ +CORENRN_HOST_DEVICE_ACC double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */ /* nrnran123_negexp min value is 2.3283064e-10, max is 22.18071 */ /* missing declaration in coreneuron */ -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE double nrnran123_normal(nrnran123_State*); +CORENRN_HOST_DEVICE_ACC double nrnran123_normal(nrnran123_State*); -extern DEVICE double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */ +CORENRN_HOST_DEVICE_ACC double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */ /* more fundamental (stateless) (though the global index is still used) */ -extern DEVICE nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); -extern DEVICE double nrnran123_uint2dbl(uint32_t); -extern void nrnran123_set_gpu_globalindex(uint32_t gix); - +CORENRN_HOST_DEVICE_ACC nrnran123_array4x32 nrnran123_iran(uint32_t seq, + uint32_t id1, + uint32_t id2); +CORENRN_HOST_DEVICE_ACC double nrnran123_uint2dbl(uint32_t); } // namespace coreneuron #endif From f8fd104a9f202996a4942e86bfbc38faa48b4459 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 21 Jul 2021 10:57:26 +0200 Subject: [PATCH 2/3] Use OMP_Mutex instead of std::atomic, drop detail. 
--- coreneuron/utils/randoms/nrnran123.cu | 25 +++++++++++++------------ coreneuron/utils/randoms/nrnran123.h | 26 +++++++++----------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index a07bc7cdf..8e56a1995 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -5,17 +5,13 @@ # See top-level LICENSE file for details. # =============================================================================. */ -#include "coreneuron/utils/randoms/nrnran123.h" - -#ifdef _OPENMP #include "coreneuron/utils/nrnmutdec.h" -#endif +#include "coreneuron/utils/randoms/nrnran123.h" +#include #include #include -#include -#include #include // In a GPU build this file will be compiled by NVCC as CUDA code @@ -29,13 +25,14 @@ namespace { /* global data structure per process */ CORENRN_MANAGED philox4x32_key_t g_k = {{0}}; -std::atomic instance_count_{}; +OMP_Mutex g_instance_count_mutex; +std::size_t g_instance_count{}; constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ } // namespace namespace coreneuron { std::size_t nrnran123_instance_count() { - return instance_count_; + return g_instance_count; } #ifdef _OPENMP @@ -133,7 +130,6 @@ void nrnran123_set_globalindex(uint32_t gix) { g_k.v[0] = gix; } -namespace detail { nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3, @@ -154,13 +150,19 @@ nrnran123_State* nrnran123_newstream3(uint32_t id1, s->c.v[2] = id1; s->c.v[3] = id2; nrnran123_setseq(s, 0, 0); - ++instance_count_; + { + std::lock_guard _{g_instance_count_mutex}; + ++g_instance_count; + } return s; } /* nrn123 streams are destroyed from cpu launcher routine */ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { - --instance_count_; + { + std::lock_guard _{g_instance_count_mutex}; + --g_instance_count; + } if (use_unified_memory) { #ifdef __CUDACC__ cudaFree(s); @@ -171,5 
+173,4 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { delete s; } } -} // namespace detail } // namespace coreneuron diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index d9afbd491..51df36917 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -88,27 +88,19 @@ inline std::size_t nrnran123_state_size() { return sizeof(nrnran123_State); } -// Implementation functions that have the value of -// CORENRN_RAN123_USE_UNIFIED_MEMORY promoted into a runtime parameter, which is -// an easy way to avoid linking problems -namespace detail { +/* routines for creating and deleting streams are called from cpu */ nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3, - bool use_unified_memory); -void nrnran123_deletestream(nrnran123_State*, bool use_unified_memory); -} // namespace detail - -/* routines for creating and deleting streams are called from cpu */ -inline nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { - return detail::nrnran123_newstream3(id1, id2, id3, CORENRN_RAN123_USE_UNIFIED_MEMORY); -} -inline nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { - return nrnran123_newstream3(id1, id2, 0); -} -inline void nrnran123_deletestream(nrnran123_State* s) { - return detail::nrnran123_deletestream(s, CORENRN_RAN123_USE_UNIFIED_MEMORY); + bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); +inline nrnran123_State* nrnran123_newstream( + uint32_t id1, + uint32_t id2, + bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY) { + return nrnran123_newstream3(id1, id2, 0, use_unified_memory); } +void nrnran123_deletestream(nrnran123_State* s, + bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); /* minimal data stream */ CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); From 769352af32c014947bc971eec726ec96fb6de305 Mon Sep 17 
00:00:00 2001
From: Olli Lupton
Date: Wed, 21 Jul 2021 13:34:36 +0200
Subject: [PATCH 3/3] Load gcc module for CUDA in Jenkins CI.

---
 tests/jenkins/install_coreneuron.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/jenkins/install_coreneuron.sh b/tests/jenkins/install_coreneuron.sh
index ac5008777..465c5f956 100755
--- a/tests/jenkins/install_coreneuron.sh
+++ b/tests/jenkins/install_coreneuron.sh
@@ -12,7 +12,8 @@ CORENRN_TYPE="$1"

 if [ "${CORENRN_TYPE}" = "GPU-non-unified" ] || [ "${CORENRN_TYPE}" = "GPU-unified" ]; then
     # PGI compiler issue in unstable : BSD-204
-    module load nvhpc cuda/11.0.2 hpe-mpi cmake boost
+    # load gcc so CUDA uses deployed gcc ~v9 not system ~v4
+    module load gcc nvhpc cuda/11.0.2 hpe-mpi cmake boost
     mkdir build_${CORENRN_TYPE}
 else
     module load boost intel hpe-mpi cmake