diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 5bea0569a..60bd2b370 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -284,6 +284,15 @@ target_include_directories(coreneuron SYSTEM
 target_include_directories(coreneuron SYSTEM
                            PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include)
 
+if(CORENRN_ENABLE_GPU)
+  # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available.
+  find_package(Boost QUIET)
+  if(Boost_FOUND)
+    target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS})
+    target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL)
+  endif()
+endif()
+
 set_target_properties(
   coreneuron scopmath
   PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib
diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu
index b13dad7eb..9a3d205a3 100644
--- a/coreneuron/utils/randoms/nrnran123.cu
+++ b/coreneuron/utils/randoms/nrnran123.cu
@@ -15,6 +15,11 @@
 #include
 #include
 
+#ifdef CORENEURON_USE_BOOST_POOL
+#include <boost/pool/pool_alloc.hpp>
+#include <unordered_map>
+#endif
+
 // In a GPU build this file will be compiled by NVCC as CUDA code
 // In a CPU build this file will be compiled by a C++ compiler as C++ code
 #ifdef __CUDACC__
@@ -24,6 +29,48 @@
 #endif
 
 namespace {
+#ifdef CORENEURON_USE_BOOST_POOL
+/** Tag type for use with boost::fast_pool_allocator that forwards to
+ * coreneuron::[de]allocate_unified(). Using a Random123-specific type here
+ * makes sure that allocations do not come from the same global pool as other
+ * usage of boost pools for objects with sizeof == sizeof(nrnran123_State).
+ *
+ * The messy m_block_sizes map is just because `deallocate_unified` uses sized
+ * deallocations, but the Boost pool allocators don't. Because this is hidden
+ * behind the pool mechanism, these methods are not called very often and the
+ * overhead is minimal.
+ */
+struct random123_allocate_unified {
+    using size_type = std::size_t;
+    using difference_type = std::size_t;
+    static char* malloc(const size_type bytes) {
+        std::lock_guard<std::mutex> const lock{m_mutex};
+        static_cast<void>(lock);
+        auto* buffer = coreneuron::allocate_unified(bytes);
+        m_block_sizes[buffer] = bytes;
+        return reinterpret_cast<char*>(buffer);
+    }
+    static void free(char* const block) {
+        std::lock_guard<std::mutex> const lock{m_mutex};
+        static_cast<void>(lock);
+        auto const iter = m_block_sizes.find(block);
+        assert(iter != m_block_sizes.end());
+        auto const size = iter->second;
+        m_block_sizes.erase(iter);
+        return coreneuron::deallocate_unified(block, size);
+    }
+    static std::mutex m_mutex;
+    static std::unordered_map<void*, std::size_t> m_block_sizes;
+};
+
+std::mutex random123_allocate_unified::m_mutex{};
+std::unordered_map<void*, std::size_t> random123_allocate_unified::m_block_sizes{};
+
+using random123_allocator =
+    boost::fast_pool_allocator<coreneuron::nrnran123_State, random123_allocate_unified>;
+#else
+using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_State>;
+#endif
 /* Global data structure per process. Using a unique_ptr here causes [minor]
  * problems because its destructor can be called very late during application
  * shutdown. If the destructor calls cudaFree and the CUDA runtime has already
@@ -212,9 +259,7 @@ nrnran123_State* nrnran123_newstream3(uint32_t id1,
 #endif
     nrnran123_State* s{nullptr};
     if (use_unified_memory) {
-        s = coreneuron::allocate_unique<nrnran123_State>(
-                coreneuron::unified_allocator<nrnran123_State>{})
-                .release();
+        s = coreneuron::allocate_unique<nrnran123_State>(random123_allocator{}).release();
     } else {
         s = new nrnran123_State{};
     }
@@ -244,9 +289,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) {
         --g_instance_count;
     }
     if (use_unified_memory) {
-        std::unique_ptr<nrnran123_State,
-                        coreneuron::alloc_deleter<coreneuron::unified_allocator<nrnran123_State>>>
-            _{s};
+        std::unique_ptr<nrnran123_State, coreneuron::alloc_deleter<random123_allocator>> _{s};
     } else {
         delete s;
     }
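
Note on the Boost.Pool hook used above: boost::fast_pool_allocator takes a "UserAllocator" tag as its second template parameter (a type providing size_type/difference_type typedefs plus static malloc/free), and the singleton pool behind the allocator is keyed on that tag together with the element size, which is why a Random123-specific tag keeps these blocks out of any other pool of equally sized objects. The following sketch shows the same pattern in isolation; it is not CoreNEURON code: it backs the pool with std::malloc/std::free instead of coreneuron::[de]allocate_unified(), and malloc_backed_user_allocator, stream_state and stream_allocator are names invented for the example.

#include <boost/pool/pool_alloc.hpp>

#include <cstddef>
#include <cstdlib>
#include <new>

// UserAllocator tag: Boost's pool requests large blocks through these static
// hooks and then hands out fixed-size chunks from them itself.
struct malloc_backed_user_allocator {
    using size_type = std::size_t;
    using difference_type = std::ptrdiff_t;
    static char* malloc(const size_type bytes) {
        return static_cast<char*>(std::malloc(bytes));
    }
    static void free(char* const block) {
        std::free(block);
    }
};

// Stand-in element type; any fixed-size struct works.
struct stream_state {
    unsigned counter[4]{};
    unsigned result[4]{};
    char which{};
};

// Element type + dedicated tag type => a singleton pool reserved for
// stream_state allocations made through this allocator.
using stream_allocator = boost::fast_pool_allocator<stream_state, malloc_backed_user_allocator>;

int main() {
    stream_allocator alloc{};
    stream_state* s = alloc.allocate(1);  // raw storage carved out of the pool
    new (s) stream_state{};               // construct in place
    s->~stream_state();
    alloc.deallocate(s, 1);               // chunk goes back to the pool, not to free()
}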
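
The last two hunks only swap the allocator type passed to CoreNEURON's allocate_unique/alloc_deleter helpers. For readers unfamiliar with that pattern, a generic equivalent built on std::allocator_traits looks roughly like the sketch below; alloc_delete and allocate_unique here are local stand-ins written for illustration, not the coreneuron:: utilities themselves.

#include <memory>
#include <utility>

// Deleter that destroys the object and returns its storage to the allocator,
// so unique_ptr cleanup goes back through the pool / unified-memory path.
template <typename Alloc>
struct alloc_delete {
    Alloc alloc;
    void operator()(typename std::allocator_traits<Alloc>::pointer p) {
        std::allocator_traits<Alloc>::destroy(alloc, p);
        std::allocator_traits<Alloc>::deallocate(alloc, p, 1);
    }
};

// Allocate and construct one object, wrapping it in a unique_ptr that knows
// how to hand the storage back to the allocator it came from.
template <typename T, typename Alloc, typename... Args>
std::unique_ptr<T, alloc_delete<Alloc>> allocate_unique(Alloc alloc, Args&&... args) {
    auto* p = std::allocator_traits<Alloc>::allocate(alloc, 1);
    std::allocator_traits<Alloc>::construct(alloc, p, std::forward<Args>(args)...);
    return std::unique_ptr<T, alloc_delete<Alloc>>{p, alloc_delete<Alloc>{alloc}};
}

int main() {
    // .release() mirrors nrnran123_newstream3: keep the raw pointer around.
    auto* raw = allocate_unique<int>(std::allocator<int>{}, 42).release();
    // Re-wrapping it mirrors nrnran123_deletestream: destruction via the deleter.
    std::unique_ptr<int, alloc_delete<std::allocator<int>>> _{raw};
    return 0;
}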