From 9ca7ab986d09905ee2ed928932060ea14a4b2c92 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Tue, 20 Jul 2021 14:06:21 +0200 Subject: [PATCH 1/3] Unified Memory in Random123. --- coreneuron/CMakeLists.txt | 10 ++ coreneuron/utils/randoms/nrnran123.cpp | 147 +---------------------- coreneuron/utils/randoms/nrnran123.cu | 157 ++++++++++++++----------- coreneuron/utils/randoms/nrnran123.h | 142 +++++++++++----------- 4 files changed, 168 insertions(+), 288 deletions(-) mode change 100644 => 120000 coreneuron/utils/randoms/nrnran123.cpp diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index a79c905e8..16116adc2 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -138,8 +138,18 @@ if(CORENRN_ENABLE_GPU) cuda_add_library("cudacoreneuron" ${CORENEURON_CUDA_FILES} OPTIONS ${cuda_arch_flags} -Xcompiler -fPIC) set(link_cudacoreneuron cudacoreneuron) + # nrnran123.cpp is a symlink to nrnran123.cu, in GPU builds we compile this as CUDA code (so we + # want to remove the .cpp here), while in non-GPU builds we compile it as plain C++ (so we want to + # remove the .cu below). Unfortunately CMake options based + # on the LANGUAGE property (https://cmake.org/cmake/help/latest/policy/CMP0119.html), so using a + # single .cu file and setting LANGUAGE=CXX in non-GPU builds does not work. + list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp") + list(REMOVE_ITEM CORENEURON_UTILS_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cpp") else() set(link_cudacoreneuron "") + # See above regarding CMake policy CMP0119. 
+ list(REMOVE_ITEM CORENEURON_CODE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cu") + list(REMOVE_ITEM CORENEURON_UTILS_FILES "${CMAKE_CURRENT_SOURCE_DIR}/utils/randoms/nrnran123.cu") endif() # ============================================================================= diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp deleted file mode 100644 index ed206a4e3..000000000 --- a/coreneuron/utils/randoms/nrnran123.cpp +++ /dev/null @@ -1,146 +0,0 @@ -/* -# ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL -# -# See top-level LICENSE file for details. -# =============================================================================. -*/ - -#include -#include -#include "coreneuron/utils/randoms/nrnran123.h" -#include "coreneuron/nrnconf.h" -#include "coreneuron/utils/nrnmutdec.h" -#include "coreneuron/utils/nrnoc_aux.hpp" - -namespace coreneuron { -static const double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ - -static philox4x32_key_t k = {{0}}; - -static size_t instance_count_ = 0; -size_t nrnran123_instance_count() { - return instance_count_; -} - -size_t nrnran123_state_size() { - return sizeof(nrnran123_State); -} - -void nrnran123_set_globalindex(uint32_t gix) { - k.v[0] = gix; -#if (defined(__CUDACC__) || defined(_OPENACC)) - nrnran123_set_gpu_globalindex(gix); -#endif -} - -/* if one sets the global, one should reset all the stream sequences. 
*/ -uint32_t nrnran123_get_globalindex() { - return k.v[0]; -} - -#ifdef _OPENMP -static MUTDEC void nrnran123_mutconstruct() { - if (!mut_) { - MUTCONSTRUCT(1); - } -} -#else -void nrnran123_mutconstruct() {} -#endif - -nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { - return nrnran123_newstream3(id1, id2, 0); -} - -nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { - nrnran123_State* s = (nrnran123_State*) ecalloc(sizeof(nrnran123_State), 1); - s->c.v[1] = id3; - s->c.v[2] = id1; - s->c.v[3] = id2; - nrnran123_setseq(s, 0, 0); - MUTLOCK - ++instance_count_; - MUTUNLOCK - return s; -} - -void nrnran123_deletestream(nrnran123_State* s) { - MUTLOCK - --instance_count_; - MUTUNLOCK - free(s); -} - -void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { - *seq = s->c.v[0]; - *which = s->which_; -} - -void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { - if (which > 3) { - s->which_ = 0; - } else { - s->which_ = which; - } - s->c.v[0] = seq; - s->r = philox4x32(s->c, k); -} - -void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { - *id1 = s->c.v[2]; - *id2 = s->c.v[3]; -} - -void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) { - *id3 = s->c.v[1]; - *id1 = s->c.v[2]; - *id2 = s->c.v[3]; -} - -uint32_t nrnran123_ipick(nrnran123_State* s) { - uint32_t rval; - char which = s->which_; - assert(which < 4); - rval = s->r.v[which++]; - if (which > 3) { - which = 0; - s->c.v[0]++; - s->r = philox4x32(s->c, k); - } - s->which_ = which; - return rval; -} - -double nrnran123_dblpick(nrnran123_State* s) { - return nrnran123_uint2dbl(nrnran123_ipick(s)); -} - -double nrnran123_negexp(nrnran123_State* s) { - /* min 2.3283064e-10 to max 22.18071 */ - return -log(nrnran123_dblpick(s)); -} - -/* at cost of a cached value we could compute two at a time. 
*/ -double nrnran123_normal(nrnran123_State* s) { - double w, x, y; - double u1, u2; - do { - u1 = nrnran123_dblpick(s); - u2 = nrnran123_dblpick(s); - u1 = 2. * u1 - 1.; - u2 = 2. * u2 - 1.; - w = (u1 * u1) + (u2 * u2); - } while (w > 1); - - y = sqrt((-2. * log(w)) / w); - x = u1 * y; - return x; -} - -double nrnran123_uint2dbl(uint32_t u) { - /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ - /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ - return ((double) u + 1.0) * SHIFT32; -} -} // namespace coreneuron diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp new file mode 120000 index 000000000..3821c8cc1 --- /dev/null +++ b/coreneuron/utils/randoms/nrnran123.cpp @@ -0,0 +1 @@ +nrnran123.cu \ No newline at end of file diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index 568a87b13..a07bc7cdf 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -5,99 +5,107 @@ # See top-level LICENSE file for details. # =============================================================================. 
*/ +#include "coreneuron/utils/randoms/nrnran123.h" + +#ifdef _OPENMP +#include "coreneuron/utils/nrnmutdec.h" +#endif #include #include #include -#include "coreneuron/utils/randoms/nrnran123.h" -namespace coreneuron { -/* global data structure per process */ -__device__ static const double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ -__device__ static philox4x32_key_t k = {{0}}; -__device__ static unsigned int instance_count_ = 0; -__device__ size_t nrnran123_instance_count() { - return instance_count_; -} -__device__ size_t nrnran123_state_size() { - return sizeof(nrnran123_State); -} +#include +#include -__global__ void nrnran123_set_globalindex(uint32_t gix) { - k.v[0] = gix; -} +// In a GPU build this file will be compiled by NVCC as CUDA code +// In a CPU build this file will be compiled by a C++ compiler as C++ code +#ifdef __CUDACC__ +#define CORENRN_MANAGED __managed__ +#else +#define CORENRN_MANAGED +#endif -/* if one sets the global, one should reset all the stream sequences. */ -__device__ uint32_t nrnran123_get_globalindex() { - return k.v[0]; +namespace { +/* global data structure per process */ +CORENRN_MANAGED philox4x32_key_t g_k = {{0}}; +std::atomic instance_count_{}; +constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ +} // namespace + +namespace coreneuron { +std::size_t nrnran123_instance_count() { + return instance_count_; } -__global__ void nrnran123_setup_cuda_newstream(nrnran123_State* s, - uint32_t id1, - uint32_t id2, - uint32_t id3) { - s->c.v[0] = 0; - s->c.v[1] = id3; - s->c.v[2] = id1; - s->c.v[3] = id2; - nrnran123_setseq(s, 0, 0); - atomicAdd(&instance_count_, 1); +#ifdef _OPENMP +static MUTDEC void nrnran123_mutconstruct() { + if (!mut_) { + MUTCONSTRUCT(1); + } } +#else +void nrnran123_mutconstruct() {} +#endif -__global__ void nrnran123_cuda_deletestream(nrnran123_State* s) { - atomicSub(&instance_count_, 1); +/* if one sets the global, one should reset all the stream sequences. 
*/ +CORENRN_HOST_DEVICE uint32_t nrnran123_get_globalindex() { + return g_k.v[0]; } -__device__ void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { +CORENRN_HOST_DEVICE void nrnran123_getseq(nrnran123_State* s, uint32_t* seq, char* which) { *seq = s->c.v[0]; *which = s->which_; } -__device__ void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { +CORENRN_HOST_DEVICE void nrnran123_setseq(nrnran123_State* s, uint32_t seq, char which) { if (which > 3) { s->which_ = 0; } else { s->which_ = which; } s->c.v[0] = seq; - s->r = philox4x32(s->c, k); + s->r = philox4x32(s->c, g_k); } -__device__ void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { +CORENRN_HOST_DEVICE void nrnran123_getids(nrnran123_State* s, uint32_t* id1, uint32_t* id2) { *id1 = s->c.v[2]; *id2 = s->c.v[3]; } -__device__ void nrnran123_getids3(nrnran123_State* s, uint32_t* id1, uint32_t* id2, uint32_t* id3) { +CORENRN_HOST_DEVICE void nrnran123_getids3(nrnran123_State* s, + uint32_t* id1, + uint32_t* id2, + uint32_t* id3) { *id3 = s->c.v[1]; *id1 = s->c.v[2]; *id2 = s->c.v[3]; } -__device__ uint32_t nrnran123_ipick(nrnran123_State* s) { +CORENRN_HOST_DEVICE uint32_t nrnran123_ipick(nrnran123_State* s) { uint32_t rval; char which = s->which_; rval = s->r.v[which++]; if (which > 3) { which = 0; s->c.v[0]++; - s->r = philox4x32(s->c, k); + s->r = philox4x32(s->c, g_k); } s->which_ = which; return rval; } -__device__ double nrnran123_dblpick(nrnran123_State* s) { +CORENRN_HOST_DEVICE double nrnran123_dblpick(nrnran123_State* s) { return nrnran123_uint2dbl(nrnran123_ipick(s)); } -__device__ double nrnran123_negexp(nrnran123_State* s) { +CORENRN_HOST_DEVICE double nrnran123_negexp(nrnran123_State* s) { /* min 2.3283064e-10 to max 22.18071 */ return -log(nrnran123_dblpick(s)); } /* at cost of a cached value we could compute two at a time. 
*/ -__device__ double nrnran123_normal(nrnran123_State* s) { +CORENRN_HOST_DEVICE double nrnran123_normal(nrnran123_State* s) { double w, x, y; double u1, u2; @@ -114,41 +122,54 @@ __device__ double nrnran123_normal(nrnran123_State* s) { return x; } -__device__ double nrnran123_uint2dbl(uint32_t u) { +CORENRN_HOST_DEVICE double nrnran123_uint2dbl(uint32_t u) { /* 0 to 2^32-1 transforms to double value in open (0,1) interval */ /* min 2.3283064e-10 to max (1 - 2.3283064e-10) */ - return ((double)u + 1.0) * SHIFT32; + return ((double) u + 1.0) * SHIFT32; } /* nrn123 streams are created from cpu launcher routine */ -nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { - return nrnran123_newstream3(id1, id2, 0); -} - -nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { - nrnran123_State* s; - - cudaMalloc((void**)&s, sizeof(nrnran123_State)); - cudaMemset((void*)s, 0, sizeof(nrnran123_State)); - - nrnran123_setup_cuda_newstream<<<1, 1>>>(s, id1, id2, id3); - cudaDeviceSynchronize(); - +void nrnran123_set_globalindex(uint32_t gix) { + g_k.v[0] = gix; +} + +namespace detail { +nrnran123_State* nrnran123_newstream3(uint32_t id1, + uint32_t id2, + uint32_t id3, + bool use_unified_memory) { + nrnran123_State* s{nullptr}; + if (use_unified_memory) { +#ifdef __CUDACC__ + assert(cudaMallocManaged(&s, sizeof(nrnran123_State)) == cudaSuccess); + assert(cudaMemset(s, 0, sizeof(nrnran123_State)) == cudaSuccess); +#else + throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build."); +#endif + } else { + s = new nrnran123_State{}; + } + s->c.v[0] = 0; + s->c.v[1] = id3; + s->c.v[2] = id1; + s->c.v[3] = id2; + nrnran123_setseq(s, 0, 0); + ++instance_count_; return s; } /* nrn123 streams are destroyed from cpu launcher routine */ -void nrnran123_deletestream(nrnran123_State* s) { - nrnran123_cuda_deletestream<<<1, 1>>>(s); - cudaDeviceSynchronize(); - - cudaFree(s); -} - -/* set global index for random123 stream on gpu */ 
-void nrnran123_set_gpu_globalindex(uint32_t gix) { - nrnran123_set_globalindex<<<1,1>>>(gix); - cudaDeviceSynchronize(); +void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { + --instance_count_; + if (use_unified_memory) { +#ifdef __CUDACC__ + cudaFree(s); +#else + throw std::runtime_error("Tried to use CUDA unified memory in a non-GPU build."); +#endif + } else { + delete s; + } } - -} //namespace coreneuron +} // namespace detail +} // namespace coreneuron diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index 4fb3f6c2b..d9afbd491 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -5,7 +5,8 @@ # See top-level LICENSE file for details. # =============================================================================. */ - +// Beware changing this to #pragma once, we rely on this file shadowing the +// equivalent file from NEURON. #ifndef nrnran123_h #define nrnran123_h @@ -40,111 +41,104 @@ of the full distribution available from #include #ifdef __CUDACC__ -#define DEVICE __device__ -#define GLOBAL __global__ +#define CORENRN_HOST_DEVICE __host__ __device__ #else -#define DEVICE -#define GLOBAL +#define CORENRN_HOST_DEVICE #endif +// Is there actually any harm leaving the pragma in when DISABLE_OPENACC is true? +#if defined(_OPENACC) && !defined(DISABLE_OPENACC) +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE _Pragma("acc routine seq") +#else +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE +#endif + +// Some files are compiled with DISABLE_OPENACC, and some builds have no GPU +// support at all. In these two cases, request that the random123 state is +// allocated using new/delete instead of CUDA unified memory. 
#if (defined(__CUDACC__) || defined(_OPENACC)) && !defined(DISABLE_OPENACC) -#define nrnran123_newstream cu_nrnran123_newstream -#define nrnran123_newstream3 cu_nrnran123_newstream3 -#define nrnran123_deletestream cu_nrnran123_deletestream -#define nrnran123_uint2dbl cu_nrnran123_uint2dbl -#define nrnran123_negexp cu_nrnran123_negexp -#define nrnran123_dblpick cu_nrnran123_dblpick -#define nrnran123_ipick cu_nrnran123_ipick -#define nrnran123_getids cu_nrnran123_getids -#define nrnran123_setseq cu_nrnran123_setseq -#define nrnran123_getseq cu_nrnran123_getseq -#define nrnran123_get_globalindex cu_nrnran123_get_globalindex -#define nrnran123_set_globalindex cu_nrnran123_set_globalindex -#define nrnran123_state_size cu_nrnran123_state_size -#define nrnran123_instance_count cu_nrnran123_instance_count -#define nrnran123_normal cu_nrnran123_normal -#define nrnran123_getids3 cu_nrnran123_getids3 +#define CORENRN_RAN123_USE_UNIFIED_MEMORY true +#else +#define CORENRN_RAN123_USE_UNIFIED_MEMORY false #endif namespace coreneuron { -typedef struct nrnran123_State { +struct nrnran123_State { philox4x32_ctr_t c; philox4x32_ctr_t r; char which_; -} nrnran123_State; +}; -typedef struct nrnran123_array4x32 { +struct nrnran123_array4x32 { uint32_t v[4]; -} nrnran123_array4x32; +}; /* do this on launch to make nrnran123_newstream threadsafe */ -extern DEVICE void nrnran123_mutconstruct(void); +void nrnran123_mutconstruct(); /* global index. eg. 
run number */ /* all generator instances share this global index */ -extern GLOBAL void nrnran123_set_globalindex(uint32_t gix); -extern DEVICE uint32_t nrnran123_get_globalindex(); - -extern DEVICE size_t nrnran123_instance_count(void); -extern DEVICE size_t nrnran123_state_size(void); - -/* routines for creating and deleteing streams are called from cpu */ -extern nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2); -extern nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3); -extern void nrnran123_deletestream(nrnran123_State*); - -/* routines for creating and deleteing streams are called from cpu but initializing/deleting gpu - * context */ -extern nrnran123_State* cu_nrnran123_newstream(uint32_t id1, uint32_t id2); -extern nrnran123_State* cu_nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3); -extern void cu_nrnran123_deletestream(nrnran123_State*); - -extern GLOBAL void nrnran123_setup_deletestream(nrnran123_State* s); -extern GLOBAL void nrnran123_setup_newstream(nrnran123_State* s, uint32_t id1, uint32_t id2); -extern GLOBAL void nrnran123_setup_newstream3(nrnran123_State* s, - uint32_t id1, - uint32_t id2, - uint32_t id3); +void nrnran123_set_globalindex(uint32_t gix); +CORENRN_HOST_DEVICE_ACC uint32_t nrnran123_get_globalindex(); + +// Utilities used for calculating model size, only called from the CPU. 
+std::size_t nrnran123_instance_count(); +inline std::size_t nrnran123_state_size() { + return sizeof(nrnran123_State); +} + +// Implementation functions that have the value of +// CORENRN_RAN123_USE_UNIFIED_MEMORY promoted into a runtime parameter, which is +// an easy way to avoid linking problems +namespace detail { +nrnran123_State* nrnran123_newstream3(uint32_t id1, + uint32_t id2, + uint32_t id3, + bool use_unified_memory); +void nrnran123_deletestream(nrnran123_State*, bool use_unified_memory); +} // namespace detail + +/* routines for creating and deleting streams are called from cpu */ +inline nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { + return detail::nrnran123_newstream3(id1, id2, id3, CORENRN_RAN123_USE_UNIFIED_MEMORY); +} +inline nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { + return nrnran123_newstream3(id1, id2, 0); +} +inline void nrnran123_deletestream(nrnran123_State* s) { + return detail::nrnran123_deletestream(s, CORENRN_RAN123_USE_UNIFIED_MEMORY); +} /* minimal data stream */ -extern DEVICE void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); -extern DEVICE void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); -extern DEVICE void nrnran123_getids3(nrnran123_State*, uint32_t* id1, uint32_t* id2, uint32_t* id3); -extern DEVICE uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */ +CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); +CORENRN_HOST_DEVICE_ACC void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); +CORENRN_HOST_DEVICE_ACC void nrnran123_getids3(nrnran123_State*, + uint32_t* id1, + uint32_t* id2, + uint32_t* id3); +CORENRN_HOST_DEVICE_ACC uint32_t nrnran123_ipick(nrnran123_State*); /* uniform 0 to 2^32-1 */ /* this could be called from openacc parallel construct */ -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE double 
nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/ +CORENRN_HOST_DEVICE_ACC double nrnran123_dblpick(nrnran123_State*); /* uniform open interval (0,1)*/ /* nrnran123_dblpick minimum value is 2.3283064e-10 and max value is 1-min */ /* this could be called from openacc parallel construct (in INITIAL block) */ -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE void nrnran123_setseq(nrnran123_State*, uint32_t seq, char which); +CORENRN_HOST_DEVICE_ACC void nrnran123_setseq(nrnran123_State*, uint32_t seq, char which); -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */ +CORENRN_HOST_DEVICE_ACC double nrnran123_negexp(nrnran123_State*); /* mean 1.0 */ /* nrnran123_negexp min value is 2.3283064e-10, max is 22.18071 */ /* missing declaration in coreneuron */ -#if !defined(DISABLE_OPENACC) -#pragma acc routine seq -#endif -extern DEVICE double nrnran123_normal(nrnran123_State*); +CORENRN_HOST_DEVICE_ACC double nrnran123_normal(nrnran123_State*); -extern DEVICE double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */ +CORENRN_HOST_DEVICE_ACC double nrnran123_gauss(nrnran123_State*); /* mean 0.0, std 1.0 */ /* more fundamental (stateless) (though the global index is still used) */ -extern DEVICE nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); -extern DEVICE double nrnran123_uint2dbl(uint32_t); -extern void nrnran123_set_gpu_globalindex(uint32_t gix); - +CORENRN_HOST_DEVICE_ACC nrnran123_array4x32 nrnran123_iran(uint32_t seq, + uint32_t id1, + uint32_t id2); +CORENRN_HOST_DEVICE_ACC double nrnran123_uint2dbl(uint32_t); } // namespace coreneuron #endif From f8fd104a9f202996a4942e86bfbc38faa48b4459 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 21 Jul 2021 10:57:26 +0200 Subject: [PATCH 2/3] Use OMP_Mutex instead of std::atomic, drop detail. 
--- coreneuron/utils/randoms/nrnran123.cu | 25 +++++++++++++------------ coreneuron/utils/randoms/nrnran123.h | 26 +++++++++----------------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index a07bc7cdf..8e56a1995 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -5,17 +5,13 @@ # See top-level LICENSE file for details. # =============================================================================. */ -#include "coreneuron/utils/randoms/nrnran123.h" - -#ifdef _OPENMP #include "coreneuron/utils/nrnmutdec.h" -#endif +#include "coreneuron/utils/randoms/nrnran123.h" +#include #include #include -#include -#include #include // In a GPU build this file will be compiled by NVCC as CUDA code @@ -29,13 +25,14 @@ namespace { /* global data structure per process */ CORENRN_MANAGED philox4x32_key_t g_k = {{0}}; -std::atomic instance_count_{}; +OMP_Mutex g_instance_count_mutex; +std::size_t g_instance_count{}; constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */ } // namespace namespace coreneuron { std::size_t nrnran123_instance_count() { - return instance_count_; + return g_instance_count; } #ifdef _OPENMP @@ -133,7 +130,6 @@ void nrnran123_set_globalindex(uint32_t gix) { g_k.v[0] = gix; } -namespace detail { nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3, @@ -154,13 +150,19 @@ nrnran123_State* nrnran123_newstream3(uint32_t id1, s->c.v[2] = id1; s->c.v[3] = id2; nrnran123_setseq(s, 0, 0); - ++instance_count_; + { + std::lock_guard _{g_instance_count_mutex}; + ++g_instance_count; + } return s; } /* nrn123 streams are destroyed from cpu launcher routine */ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { - --instance_count_; + { + std::lock_guard _{g_instance_count_mutex}; + --g_instance_count; + } if (use_unified_memory) { #ifdef __CUDACC__ cudaFree(s); @@ -171,5 
+173,4 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { delete s; } } -} // namespace detail } // namespace coreneuron diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index d9afbd491..51df36917 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -88,27 +88,19 @@ inline std::size_t nrnran123_state_size() { return sizeof(nrnran123_State); } -// Implementation functions that have the value of -// CORENRN_RAN123_USE_UNIFIED_MEMORY promoted into a runtime parameter, which is -// an easy way to avoid linking problems -namespace detail { +/* routines for creating and deleting streams are called from cpu */ nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3, - bool use_unified_memory); -void nrnran123_deletestream(nrnran123_State*, bool use_unified_memory); -} // namespace detail - -/* routines for creating and deleting streams are called from cpu */ -inline nrnran123_State* nrnran123_newstream3(uint32_t id1, uint32_t id2, uint32_t id3) { - return detail::nrnran123_newstream3(id1, id2, id3, CORENRN_RAN123_USE_UNIFIED_MEMORY); -} -inline nrnran123_State* nrnran123_newstream(uint32_t id1, uint32_t id2) { - return nrnran123_newstream3(id1, id2, 0); -} -inline void nrnran123_deletestream(nrnran123_State* s) { - return detail::nrnran123_deletestream(s, CORENRN_RAN123_USE_UNIFIED_MEMORY); + bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); +inline nrnran123_State* nrnran123_newstream( + uint32_t id1, + uint32_t id2, + bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY) { + return nrnran123_newstream3(id1, id2, 0, use_unified_memory); } +void nrnran123_deletestream(nrnran123_State* s, + bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); /* minimal data stream */ CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); From 769352af32c014947bc971eec726ec96fb6de305 Mon Sep 17 
00:00:00 2001
From: Olli Lupton
Date: Wed, 21 Jul 2021 13:34:36 +0200
Subject: [PATCH 3/3] Load gcc module for CUDA in Jenkins CI.

---
 tests/jenkins/install_coreneuron.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/jenkins/install_coreneuron.sh b/tests/jenkins/install_coreneuron.sh
index ac5008777..465c5f956 100755
--- a/tests/jenkins/install_coreneuron.sh
+++ b/tests/jenkins/install_coreneuron.sh
@@ -12,7 +12,8 @@ CORENRN_TYPE="$1"

 if [ "${CORENRN_TYPE}" = "GPU-non-unified" ] || [ "${CORENRN_TYPE}" = "GPU-unified" ]; then
     # PGI compiler issue in unstable : BSD-204
-    module load nvhpc cuda/11.0.2 hpe-mpi cmake boost
+    # load gcc so CUDA uses deployed gcc ~v9 not system ~v4
+    module load gcc nvhpc cuda/11.0.2 hpe-mpi cmake boost
     mkdir build_${CORENRN_TYPE}
 else
     module load boost intel hpe-mpi cmake