diff --git a/.clang-format.changes b/.clang-format.changes index 01b58702d..4c2b11b59 100644 --- a/.clang-format.changes +++ b/.clang-format.changes @@ -1,2 +1,3 @@ -SortIncludes: false IndentCaseLabels: true +SortIncludes: false +StatementMacros: [nrn_pragma_acc, nrn_pragma_omp] diff --git a/.cmake-format.changes.yaml b/.cmake-format.changes.yaml index 19ea9c084..2f20247f7 100644 --- a/.cmake-format.changes.yaml +++ b/.cmake-format.changes.yaml @@ -1,9 +1,4 @@ additional_commands: - cuda_add_library: - pargs: '*' - flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"] - kwargs: - OPTIONS: '*' cpp_cc_build_time_copy: flags: ['NO_TARGET'] kwargs: diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5e3967f7d..8b434bf81 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,7 +45,7 @@ spack_setup: variables: SPACK_PACKAGE: neuron SPACK_PACKAGE_REF: '' - SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit + SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit model_tests=channel-benchmark,olfactory .gpu_node: variables: bb5_constraint: volta @@ -93,7 +93,19 @@ build:coreneuron+nmodl:gpu: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +nmodl+gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report~sympy build_type=RelWithDebInfo + extends: + - .spack_build + - .spack_nvhpc + needs: ["build:nmodl:gpu"] + +build:coreneuron+nmodl~openmp:gpu: + variables: + SPACK_PACKAGE: coreneuron + # +report pulls in a lot of dependencies and the tests fail. + # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type + # Sympy + OpenMP target offload does not currently work with NVHPC + SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu+tests~legacy-unit~report+sympy build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc @@ -104,7 +116,7 @@ build:coreneuron:gpu: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +gpu+openmp+tests~legacy-unit~report build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc @@ -121,6 +133,10 @@ test:coreneuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron+nmodl:gpu"] +test:coreneuron+nmodl~openmp:gpu: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + test:coreneuron:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron:gpu"] @@ -153,6 +169,18 @@ build:neuron+nmodl:gpu: - !reference [.spack_build, before_script] needs: ["build:coreneuron+nmodl:gpu"] +build:neuron+nmodl~openmp:gpu: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_nvhpc + before_script: + # Build py-cython and py-numpy with GCC instead of NVHPC. 
+ - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" + - !reference [.spack_build, before_script] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + build:neuron:gpu: stage: build_neuron extends: @@ -180,6 +208,11 @@ test:neuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:neuron+nmodl:gpu"] +test:neuron+nmodl~openmp:gpu: + stage: test_neuron + extends: [.ctest, .gpu_node] + needs: ["build:neuron+nmodl~openmp:gpu"] + test:neuron:gpu: stage: test_neuron extends: [.ctest, .gpu_node] diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 7767a3672..5838742f8 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -10,6 +10,9 @@ if(CORENRN_ENABLE_GPU) # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) + # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and + # stopping profiling. This makes sure those headers can be found. + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # cuda unified memory support if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY) add_compile_definitions(CORENEURON_UNIFIED_MEMORY) @@ -47,25 +50,32 @@ if(CORENRN_ENABLE_GPU) endif() set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}") endif() - # -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be - # required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA - # device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which - # apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when - # compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences - # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for - # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same - # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}") - set(NVHPC_ACC_LINK_FLAGS "-acc -cuda") + # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code + # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code + # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in + # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP + # code is compiled with the same CUDA version as the explicit CUDA code. + set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the # same default compute capabilities as each other, particularly on GPU-less build machines. foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") endforeach() + if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") + # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available + # for a region then prefer OpenMP. 
+ add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu") + elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") + # Only enable OpenACC offload for GPU + string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") + else() + message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") + endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) - string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}") + string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}") # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors. # Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined @@ -81,7 +91,7 @@ if(CORENRN_ENABLE_GPU) GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS - "${NVHPC_ACC_COMP_FLAGS} ${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" + "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" ) else() set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b3edf3a5..d3e1950d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11) # Build options # ============================================================================= option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON) +option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON) option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON) option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF) option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON) @@ -117,6 +118,7 @@ else() set(CORENRN_HAVE_NVHPC_COMPILER OFF) endif() +set(CORENRN_ACCELERATOR_OFFLOAD "Disabled") if(CORENRN_ENABLE_GPU) # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after # https://github.com/BlueBrain/CoreNeuron/pull/609. @@ -135,7 +137,7 @@ if(CORENRN_ENABLE_GPU) # Set some sensible default CUDA architectures. 
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 60 70 80) + set(CMAKE_CUDA_ARCHITECTURES 70 80) message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}") endif() @@ -185,6 +187,18 @@ if(CORENRN_ENABLE_GPU) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085" ) + + if(CORENRN_ENABLE_NMODL) + # NMODL supports both OpenACC and OpenMP target offload + if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP") + else() + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() + else() + # MOD2C only supports OpenACC offload + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() endif() # ============================================================================= @@ -526,6 +540,7 @@ message(STATUS "MOD2CPP PATH | ${CORENRN_MOD2CPP_BINARY}") message(STATUS "GPU Support | ${CORENRN_ENABLE_GPU}") if(CORENRN_ENABLE_GPU) message(STATUS " CUDA | ${CUDAToolkit_LIBRARY_DIR}") + message(STATUS " Offload | ${CORENRN_ACCELERATOR_OFFLOAD}") message(STATUS " Unified Memory | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}") endif() message(STATUS "Auto Timeout | ${CORENRN_ENABLE_TIMEOUT}") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 6fd5c98a8..d370df1df 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -284,6 +284,16 @@ target_include_directories(coreneuron SYSTEM target_include_directories(coreneuron SYSTEM PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) +if(CORENRN_ENABLE_GPU) + # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available. + find_package(Boost QUIET) + if(Boost_FOUND) + message(STATUS "Boost found, enabling use of memory pools for Random123...") + target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) + target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL) + endif() +endif() + set_target_properties( coreneuron scopmath PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib diff --git a/coreneuron/apps/corenrn_parameters.cpp b/coreneuron/apps/corenrn_parameters.cpp index c0aa02ab0..4403f44db 100644 --- a/coreneuron/apps/corenrn_parameters.cpp +++ b/coreneuron/apps/corenrn_parameters.cpp @@ -47,7 +47,12 @@ corenrn_parameters::corenrn_parameters() { "Print number of instances of each mechanism and detailed memory stats."); auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU."); - sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true) + sub_gpu + ->add_option("-W, --nwarp", + this->nwarp, + "Number of warps to execute in parallel the Hines solver. Each warp solves a " + "group of cells. (Only used with cell permute 2)", + true) ->check(CLI::Range(0, 1'000'000)); sub_gpu ->add_option("-R, --cell-permute", diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp index ea7ef8aba..e22cf348d 100644 --- a/coreneuron/apps/corenrn_parameters.hpp +++ b/coreneuron/apps/corenrn_parameters.hpp @@ -46,8 +46,8 @@ struct corenrn_parameters { unsigned ms_subint = 2; /// Number of multisend interval. 
1 or 2 unsigned spkcompress = 0; /// Spike Compression unsigned cell_interleave_permute = 0; /// Cell interleaving permutation - unsigned nwarp = 0; /// Number of warps to balance for cell_interleave_permute == 2 - unsigned num_gpus = 0; /// Number of gpus to use per node + unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2 + unsigned num_gpus = 0; /// Number of gpus to use per node unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer. int seed = -1; /// Initialization seed for random number generator (int) diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index 0fdaa509b..5bfda9421 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -193,10 +193,11 @@ void nrn_init_and_load_data(int argc, // precedence is: set by user, globals.dat, 34.0 celsius = corenrn_param.celsius; -#if _OPENACC +#if CORENEURON_ENABLE_GPU if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) { fprintf(stderr, - "compiled with _OPENACC does not allow the combination of --cell-permute=2 and " + "compiled with CORENEURON_ENABLE_GPU does not allow the combination of " + "--cell-permute=2 and " "missing --gpu\n"); exit(1); } @@ -497,7 +498,7 @@ extern "C" void mk_mech_init(int argc, char** argv) { } #endif -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (corenrn_param.gpu) { init_gpu(); } @@ -558,10 +559,8 @@ extern "C" int run_solve_core(int argc, char** argv) { #endif bool compute_gpu = corenrn_param.gpu; - // clang-format off - - #pragma acc update device(celsius, secondorder, pi) if (compute_gpu) - // clang-format on + nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu)) + nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu)) { double v = corenrn_param.voltage; double dt = corenrn_param.dt; diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index ac98f5420..d5e723527 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -24,13 +24,14 @@ #include "coreneuron/mpi/nrnmpidec.h" #include "coreneuron/utils/utils.hpp" -#ifdef _OPENACC -#include -#endif - #ifdef CRAYPAT #include #endif + +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#include +#endif + namespace coreneuron { extern InterleaveInfo* interleave_info; void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div); @@ -40,9 +41,227 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); +int cnrn_target_get_num_devices() { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + // choose nvidia GPU by default + acc_device_t device_type = acc_device_nvidia; + // check how many gpu devices available per node + return acc_get_num_devices(device_type); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + return omp_get_num_devices(); +#else + throw std::runtime_error( + "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +void cnrn_target_set_default_device(int device_num) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_set_device_num(device_num, acc_device_nvidia); +#elif defined(CORENEURON_ENABLE_GPU) && 
defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_set_default_device(device_num); + // It seems that with NVHPC 21.9 then only setting the default OpenMP device + // is not enough: there were errors on some nodes when not-the-0th GPU was + // used. These seemed to be related to the NMODL instance structs, which are + // allocated using cudaMallocManaged. + auto const cuda_code = cudaSetDevice(device_num); + assert(cuda_code == cudaSuccess); +#else + throw std::runtime_error( + "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) { + // As we never run code for artificial cell inside GPU we don't copy it. + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return nullptr; + } + + auto d_ml = cnrn_target_copyin(ml); + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + double* dptr = cnrn_target_deviceptr(ml->data); + cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); + + + int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n); + cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); + + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + int* d_pdata = cnrn_target_copyin(ml->pdata, pcnt); + cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); + } + + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + ThreadDatum* td = cnrn_target_copyin(ml->_thread, ts); + cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); + } + + // net_receive buffer associated with mechanism + NetReceiveBuffer_t* nrb = ml->_net_receive_buffer; + + // if net receive buffer exist for mechanism + if (nrb) { + NetReceiveBuffer_t* d_nrb = cnrn_target_copyin(nrb); + cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); + + int* d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); + + int* d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); + + double* d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); + + double* d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); + + int* d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); + + int* d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); + } + + /* copy NetSendBuffer_t on to GPU */ + NetSendBuffer_t* nsb = ml->_net_send_buffer; + + if (nsb) { + NetSendBuffer_t* d_nsb; + int* d_iptr; + double* d_dptr; + + d_nsb = cnrn_target_copyin(nsb); + cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); + + d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); + + d_dptr = 
cnrn_target_copyin(nsb->_nsb_t, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); + + d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); + } + + return d_ml; +} + +static void update_ml_on_host(const Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + // Artificial mechanisms such as PatternStim and IntervalFire + // are not copied onto the GPU. They should not, therefore, be + // updated from the GPU. + return; + } + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; + + nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) + + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) + nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) + + auto nrb = ml->_net_receive_buffer; + + // clang-format off + nrn_pragma_acc(update self(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + nrn_pragma_omp(target update from(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + // clang-format on +} + +static void delete_ml_from_device(Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return; + } + // Cleanup the net send buffer if it exists + { + NetSendBuffer_t* nsb{ml->_net_send_buffer}; + if (nsb) { + cnrn_target_delete(nsb->_nsb_flag, nsb->_size); + cnrn_target_delete(nsb->_nsb_t, nsb->_size); + cnrn_target_delete(nsb->_weight_index, nsb->_size); + cnrn_target_delete(nsb->_pnt_index, nsb->_size); + cnrn_target_delete(nsb->_vdata_index, nsb->_size); + cnrn_target_delete(nsb->_sendtype, nsb->_size); + cnrn_target_delete(nsb); + } + } + // Cleanup the net receive buffer if it exists. 
+ { + NetReceiveBuffer_t* nrb{ml->_net_receive_buffer}; + if (nrb) { + cnrn_target_delete(nrb->_nrb_index, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb); + } + } + int n = ml->nodecount; + int szdp = corenrn.get_prop_dparam_size()[type]; + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + cnrn_target_delete(ml->_thread, ts); + } + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + cnrn_target_delete(ml->pdata, pcnt); + } + cnrn_target_delete(ml->nodeindices, n); + cnrn_target_delete(ml); +} + /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // initialize NrnThreads for gpu execution // empty thread or only artificial cells should be on cpu for (int i = 0; i < nthreads; i++) { @@ -58,13 +277,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { NrnThread* nt = threads + i; // NrnThread on host if (nt->n_presyn) { - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu, // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay // inside nt on cpu (due to unified memory). @@ -82,7 +301,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * find * corresponding NrnThread using Point_process in NET_RECEIVE block */ - NrnThread* d_threads = (NrnThread*) acc_copyin(threads, sizeof(NrnThread) * nthreads); + NrnThread* d_threads = cnrn_target_copyin(threads, nthreads); if (interleave_info == nullptr) { printf("\n Warning: No permutation data? 
Required for linear algebra!"); @@ -101,7 +320,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) acc_copyin(nt->_data, nt->_ndata * sizeof(double)); + d__data = cnrn_target_copyin(nt->_data, nt->_ndata); + /* Here is the example of using OpenACC data enter/exit * Remember that we are not allowed to use nt->_data but we have to use: @@ -111,7 +331,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { */ /*update d_nt._data to point to device copy */ - acc_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -120,36 +340,34 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); dptr = d__data + 0 * ne; - acc_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr)); dptr = d__data + 1 * ne; - acc_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr)); dptr = d__data + 2 * ne; - acc_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr)); dptr = d__data + 3 * ne; - acc_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr)); dptr = d__data + 4 * ne; - acc_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr)); dptr = d__data + 5 * ne; - acc_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr)); if (nt->_actual_diam) { dptr = d__data + 6 * ne; - acc_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr)); } - int* d_v_parent_index = (int*) acc_copyin(nt->_v_parent_index, nt->end * sizeof(int)); - acc_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end); + cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = (Memb_list**) acc_copyin(nt->_ml_list, - corenrn.get_memb_funcs().size() * - sizeof(Memb_list*)); - acc_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size()); + cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -160,119 +378,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? 
: I assume so!*/ - auto d_tml = (NrnThreadMembList*) acc_copyin(tml, sizeof(NrnThreadMembList)); + auto d_tml = cnrn_target_copyin(tml); /*first tml is pointed by nt */ if (first_tml) { - acc_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml); first_tml = false; } else { /*rest of tml forms linked list */ - acc_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml); } // book keeping for linked-list d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) acc_copyin(tml->ml, sizeof(Memb_list)); - acc_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); - + Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index); + cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml); /* setup nt._ml_list */ - acc_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); - - int type = tml->index; - int n = tml->ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // get device pointer for corresponding mechanism data - dptr = (double*) acc_deviceptr(tml->ml->data); - acc_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); - - - if (!is_art) { - int* d_nodeindices = (int*) acc_copyin(tml->ml->nodeindices, sizeof(int) * n); - acc_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); - } - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) acc_copyin(tml->ml->pdata, sizeof(int) * pcnt); - acc_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); - } - - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - ThreadDatum* td = (ThreadDatum*) acc_copyin(tml->ml->_thread, - ts * sizeof(ThreadDatum)); - acc_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); - } - - NetReceiveBuffer_t *nrb, *d_nrb; - int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; - double *d_nrb_t, *d_nrb_flag; - - // net_receive buffer associated with mechanism - nrb = tml->ml->_net_receive_buffer; - - // if net receive buffer exist for mechanism - if (nrb) { - d_nrb = (NetReceiveBuffer_t*) acc_copyin(nrb, sizeof(NetReceiveBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_receive_buffer), - &d_nrb, - sizeof(NetReceiveBuffer_t*)); - - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); - } - - /* copy NetSendBuffer_t on to GPU */ - NetSendBuffer_t* nsb; - nsb = tml->ml->_net_send_buffer; - - if (nsb) { - NetSendBuffer_t* d_nsb; - int* d_iptr; - double* d_dptr; 
- - d_nsb = (NetSendBuffer_t*) acc_copyin(nsb, sizeof(NetSendBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); - - d_iptr = (int*) acc_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); - - d_iptr = (int*) acc_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); - - d_iptr = (int*) acc_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); - - d_iptr = (int*) acc_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); - - d_dptr = (double*) acc_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); - - d_dptr = (double*) acc_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); - } + cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml); } if (nt->shadow_rhs_cnt) { @@ -281,50 +405,46 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_d, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { - auto* d_fast_imem = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); - acc_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); + NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem); + cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem); { - auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr); } { - auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr); } } if (nt->n_pntproc) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ - Point_process* pntptr = - (Point_process*) acc_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - acc_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); + Point_process* pntptr = cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); + cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* 
d_weights = (double*) acc_copyin(nt->weights, sizeof(double) * nt->n_weight); - acc_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight); + cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) acc_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); - acc_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata); + cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata); } if (nt->n_presyn) { @@ -333,25 +453,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * while updating PreSyn objects which has virtual base class. May be this is issue due * to * VTable and alignment */ - PreSynHelper* d_presyns_helper = - (PreSynHelper*) acc_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + PreSynHelper* d_presyns_helper = cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = (int*) acc_copyin(nt->_net_send_buffer, - sizeof(int) * nt->_net_send_buffer_size); - acc_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer, + nt->_net_send_buffer_size); + cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); - acc_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); + cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay); nrn_VecPlay_copyto_device(nt, d_vecplay); } @@ -360,41 +479,41 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (interleave_permute_type == 1) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; + InterleaveInfo* d_info = cnrn_target_copyin(info); - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * (info->nstride + 1)); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) acc_copyin(info->firstnode, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell); + 
cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = cnrn_target_copyin(info); int* d_ptr = nullptr; - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * info->nstride); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) acc_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * info->nwarp); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); abort(); @@ -408,38 +527,30 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. - auto* d_trajec_requests = reinterpret_cast( - acc_copyin(tr, sizeof(TrajectoryRequests))); - acc_memcpy_to_device(&(d_nt->trajec_requests), - &d_trajec_requests, - sizeof(TrajectoryRequests*)); + TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr); + cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests); // Initialise the double** gather member of the struct. - auto* d_tr_gather = reinterpret_cast( - acc_copyin(tr->gather, sizeof(double*) * tr->n_trajec)); - acc_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**)); + double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather); // Initialise the double** varrays member of the struct if it's // set. 
double** d_tr_varrays{nullptr}; if (tr->varrays) { - d_tr_varrays = reinterpret_cast( - acc_copyin(tr->varrays, sizeof(double*) * tr->n_trajec)); - acc_memcpy_to_device(&(d_trajec_requests->varrays), - &d_tr_varrays, - sizeof(double**)); + d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays); } for (int i = 0; i < tr->n_trajec; ++i) { if (tr->varrays) { // tr->varrays[i] is a buffer of tr->bsize doubles on the host, // make a device-side copy of it and store a pointer to it in // the device-side version of tr->varrays. - auto* d_buf_traj_i = reinterpret_cast( - acc_copyin(tr->varrays[i], tr->bsize * sizeof(double))); - acc_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*)); + double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize); + cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i); } // tr->gather[i] is a double* referring to (host) data in the // (host) _data block - auto* d_gather_i = acc_deviceptr(tr->gather[i]); - acc_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*)); + auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]); + cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i); } // TODO: other `double** scatter` and `void** vpr` members of // the TrajectoryRequests struct are not copied to the device. @@ -457,14 +568,14 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { } void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { -#ifdef _OPENACC - IvocVect* d_iv = (IvocVect*) acc_copyin((void*) &from, sizeof(IvocVect)); - acc_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*)); +#ifdef CORENEURON_ENABLE_GPU + /// by default `to` is desitionation pointer on a device + IvocVect* d_iv = &to; size_t n = from.size(); if (n) { - double* d_data = (double*) acc_copyin((void*) from.data(), sizeof(double) * n); - acc_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); + double* d_data = cnrn_target_copyin(from.data(), n); + cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data); } #else (void) from; @@ -473,12 +584,12 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { } void delete_ivoc_vect_from_device(IvocVect& vec) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU auto const n = vec.size(); if (n) { - acc_delete(vec.data(), sizeof(double) * n); + cnrn_target_delete(vec.data(), n); } - acc_delete(&vec, sizeof(IvocVect)); + cnrn_target_delete(&vec); #else (void) vec; #endif @@ -490,15 +601,15 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { return; } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { // free existing vectors in buffers on gpu - acc_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); - acc_delete(nrb->_weight_index, nrb->_size * sizeof(int)); - acc_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); - acc_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); - acc_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); - acc_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); } #endif @@ -511,34 +622,35 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { nrb->_displ = (int*) erealloc(nrb->_displ, (nrb->_size + 1) * sizeof(int)); 
nrb->_nrb_index = (int*) erealloc(nrb->_nrb_index, nrb->_size * sizeof(int)); -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; double *d_nrb_t, *d_nrb_flag; // update device copy - acc_update_device(nrb, sizeof(NetReceiveBuffer_t)); + nrn_pragma_acc(update device(nrb)); + nrn_pragma_omp(target update to(nrb)); - NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) acc_deviceptr(nrb); + NetReceiveBuffer_t* d_nrb = cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } #endif } @@ -597,6 +709,10 @@ static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) { void update_net_receive_buffer(NrnThread* nt) { Instrumentor::phase p_update_net_receive_buffer("update-net-receive-buf"); for (auto tml = nt->tml; tml; tml = tml->next) { + int is_art = corenrn.get_is_artificial()[tml->index]; + if (is_art) { + continue; + } // net_receive buffer to copy NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer; @@ -605,29 +721,39 @@ void update_net_receive_buffer(NrnThread* nt) { // instance order to avoid race. 
setup _displ and _nrb_index net_receive_buffer_order(nrb); -#ifdef _OPENACC if (nt->compute_gpu) { Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu"); // note that dont update nrb otherwise we lose pointers + // clang-format off + /* update scalar elements */ - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt); + nrn_pragma_acc(update device(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt]) + async(nt->stream_id)) + nrn_pragma_omp(target update to(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt])) + // clang-format on } -#endif } } + nrn_pragma_acc(wait(nt->stream_id)) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (!nt->compute_gpu) return; @@ -641,13 +767,23 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { if (nsb->_cnt) { Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu"); - acc_update_self(nsb->_sendtype, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_vdata_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_pnt_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_weight_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_nsb_t, sizeof(double) * nsb->_cnt); - acc_update_self(nsb->_nsb_flag, sizeof(double) * nsb->_cnt); } + // clang-format off + nrn_pragma_acc(update self(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + nrn_pragma_omp(target update from(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + // clang-format on #else (void) nt; (void) nsb; @@ -655,7 +791,7 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { } void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; @@ -665,197 +801,63 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - acc_update_self(nt->_actual_rhs, ne * sizeof(double)); - acc_update_self(nt->_actual_d, ne * sizeof(double)); - acc_update_self(nt->_actual_a, ne * sizeof(double)); - acc_update_self(nt->_actual_b, ne * sizeof(double)); - acc_update_self(nt->_actual_v, ne * sizeof(double)); - acc_update_self(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_self(nt->_actual_diam, ne * sizeof(double)); - } - - /* @todo: nt._ml_list[tml->index] = tml->ml; */ - - /* -- copy NrnThreadMembList list ml 
to host -- */ - for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - - acc_update_self(&tml->index, sizeof(int)); - acc_update_self(&ml->nodecount, sizeof(int)); - - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // Artificial mechanisms such as PatternStim and IntervalFire - // are not copied onto the GPU. They should not, therefore, be - // updated from the GPU. - if (is_art) { - continue; - } - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - acc_update_self(ml->data, pcnt * sizeof(double)); - acc_update_self(ml->nodeindices, n * sizeof(int)); - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_self(ml->pdata, pcnt * sizeof(int)); - } - - auto nrb = tml->ml->_net_receive_buffer; - - if (nrb) { - acc_update_self(&nrb->_cnt, sizeof(int)); - acc_update_self(&nrb->_size, sizeof(int)); - acc_update_self(&nrb->_pnt_offset, sizeof(int)); - acc_update_self(&nrb->_displ_cnt, sizeof(int)); - - acc_update_self(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_self(nrb->_nrb_index, sizeof(int) * nrb->_size); - } - } - - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_self(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_self(nt->_shadow_d, pcnt * sizeof(double)); - } - - if (nt->nrn_fast_imem) { - acc_update_self(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_update_self(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } - - if (nt->n_pntproc) { - acc_update_self(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - } - - if (nt->n_weight) { - acc_update_self(nt->weights, sizeof(double) * nt->n_weight); - } - - if (nt->n_presyn) { - acc_update_self(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_self(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - } - - { - TrajectoryRequests* tr = nt->trajec_requests; - if (tr && tr->varrays) { - // The full buffers have `bsize` entries, but only `vsize` - // of them are valid. 
- for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_self(tr->varrays[i], tr->vsize * sizeof(double)); - } - } - } - - /* dont update vdata, its pointer array - if(nt->_nvdata) { - acc_update_self(nt->_vdata, sizeof(double)*nt->_nvdata); - } - */ - } - } -#else - (void) threads; - (void) nthreads; -#endif -} - -void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC - - for (int i = 0; i < nthreads; i++) { - NrnThread* nt = threads + i; - - if (nt->compute_gpu && (nt->end > 0)) { - /* -- copy data to device -- */ - - int ne = nrn_soa_padded_size(nt->end, 0); + // clang-format off + nrn_pragma_acc(update self(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update from(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + // clang-format on - acc_update_device(nt->_actual_rhs, ne * sizeof(double)); - acc_update_device(nt->_actual_d, ne * sizeof(double)); - acc_update_device(nt->_actual_a, ne * sizeof(double)); - acc_update_device(nt->_actual_b, ne * sizeof(double)); - acc_update_device(nt->_actual_v, ne * sizeof(double)); - acc_update_device(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_device(nt->_actual_diam, ne * sizeof(double)); - } + nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) + nrn_pragma_omp( + target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ /* -- copy NrnThreadMembList list ml to host -- */ for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - acc_update_device(ml->data, pcnt * sizeof(double)); - - if (!corenrn.get_is_artificial()[type]) { - acc_update_device(ml->nodeindices, n * sizeof(int)); - } - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_device(ml->pdata, pcnt * sizeof(int)); - } - - auto nrb = tml->ml->_net_receive_buffer; - - if (nrb) { - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_size, sizeof(int)); - acc_update_device(&nrb->_pnt_offset, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_size); + if (!corenrn.get_is_artificial()[tml->index]) { + nrn_pragma_acc(update self(tml->index, tml->ml->nodecount)) + nrn_pragma_omp(target update from(tml->index, tml->ml->nodecount)) } + update_ml_on_host(tml->ml, tml->index); } - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_device(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_device(nt->_shadow_d, pcnt * sizeof(double)); - } + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + /* copy shadow_d to host */ + nrn_pragma_acc( + update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) + nrn_pragma_omp(target update from( + 
nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) - if (nt->nrn_fast_imem) { - acc_update_device(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_update_device(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } + // clang-format off + nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + // clang-format on - if (nt->n_pntproc) { - acc_update_device(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - } + nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) + nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) - if (nt->n_weight) { - acc_update_device(nt->weights, sizeof(double) * nt->n_weight); - } + nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if (nt->n_weight)) + nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) - if (nt->n_presyn) { - acc_update_device(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_device(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - } + nrn_pragma_acc(update self( + nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) + nrn_pragma_omp(target update from( + nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -863,15 +865,15 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_device(tr->varrays[i], tr->vsize * sizeof(double)); + nrn_pragma_acc(update self(tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update from(tr->varrays[i][:tr->vsize])) } } } - /* don't and don't update vdata, its pointer array - if(nt->_nvdata) { - acc_update_device(nt->_vdata, sizeof(double)*nt->_nvdata); - } + /* dont update vdata, its pointer array + nrn_pragma_acc(update self(nt->_vdata[:nt->_nvdata) if nt->_nvdata) + nrn_pragma_omp(target update from(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) */ } } @@ -894,85 +896,30 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { size_t n_weight = nt->n_weight; if (nt->compute_gpu && n_weight > 0) { double* weights = nt->weights; - // clang-format off - - #pragma acc update host(weights [0:n_weight]) - // clang-format on + nrn_pragma_acc(update host(weights [0:n_weight])) + nrn_pragma_omp(target update from(weights [0:n_weight])) } } } -void update_matrix_from_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* openacc routine doesn't allow asyn, use pragma */ - // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double)); - - /* RHS and D are contigious, copy them in one go! - * NOTE: in pragma you have to give actual pointer like below and not nt->rhs... 
- */ - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - -void update_matrix_to_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* while discussion with Michael we found that RHS is also needed on - * gpu because nrn_cap_jacob uses rhs which is being updated on GPU - */ - double* v = _nt->_actual_v; - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update device(v[0 : ne]) async(_nt->stream_id) - #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - /** Cleanup device memory that is being tracked by the OpenACC runtime. * - * This function painstakingly calls `acc_delete` in reverse order on all - * pointers that were passed to `acc_copyin` in `setup_nrnthreads_on_device`. + * This function painstakingly calls `cnrn_target_delete` in reverse order on all + * pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`. * This cleanup ensures that if the GPU is initialised multiple times from the * same process then the OpenACC runtime will not be polluted with old * pointers, which can cause errors. In particular if we do: * @code * { * // ... some_ptr is dynamically allocated ... - * acc_copyin(some_ptr, some_size); + * cnrn_target_copyin(some_ptr, some_size); * // ... do some work ... - * // acc_delete(some_ptr); + * // cnrn_target_delete(some_ptr); * free(some_ptr); * } * { * // ... same_ptr_again is dynamically allocated at the same address ... - * acc_copyin(same_ptr_again, some_other_size); // ERROR + * cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR * } * @endcode * the application will/may abort with an error such as: @@ -981,7 +928,7 @@ void update_matrix_to_gpu(NrnThread* _nt) { * the same process. 
*/ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; { @@ -989,132 +936,91 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { if (tr->varrays) { for (int i = 0; i < tr->n_trajec; ++i) { - acc_delete(tr->varrays[i], tr->bsize * sizeof(double)); + cnrn_target_delete(tr->varrays[i], tr->bsize); } - acc_delete(tr->varrays, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr->varrays, tr->n_trajec); } - acc_delete(tr->gather, sizeof(double*) * tr->n_trajec); - acc_delete(tr, sizeof(TrajectoryRequests)); + cnrn_target_delete(tr->gather, tr->n_trajec); + cnrn_target_delete(tr); } } if (nt->_permute) { if (interleave_permute_type == 1) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * nt->ncell); - acc_delete(info->lastnode, sizeof(int) * nt->ncell); - acc_delete(info->firstnode, sizeof(int) * nt->ncell); - acc_delete(info->stride, sizeof(int) * (info->nstride + 1)); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, nt->ncell); + cnrn_target_delete(info->lastnode, nt->ncell); + cnrn_target_delete(info->firstnode, nt->ncell); + cnrn_target_delete(info->stride, info->nstride + 1); + cnrn_target_delete(info); } else if (interleave_permute_type == 2) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * info->nwarp); - acc_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->stride, sizeof(int) * info->nstride); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, info->nwarp); + cnrn_target_delete(info->stridedispl, info->nwarp + 1); + cnrn_target_delete(info->lastnode, info->nwarp + 1); + cnrn_target_delete(info->firstnode, info->nwarp + 1); + cnrn_target_delete(info->stride, info->nstride); + cnrn_target_delete(info); } } if (nt->n_vecplay) { nrn_VecPlay_delete_from_device(nt); - acc_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_target_delete(nt->_vecplay, nt->n_vecplay); } // Cleanup send_receive buffer. if (nt->_net_send_buffer_size) { - acc_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size); } if (nt->n_presyn) { - acc_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_target_delete(nt->presyns, nt->n_presyn); + cnrn_target_delete(nt->presyns_helper, nt->n_presyn); } // Cleanup data that's setup in bbcore_read. 
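// Editorial note: the cnrn_target_* helpers used above and below are declared in
// coreneuron/utils/offload.hpp, which is not shown in this hunk. Judging from how the
// raw acc_* calls were rewritten, the helpers take element counts rather than byte
// sizes, so the sizeof(T) factor moves into the wrapper. A minimal illustrative sketch
// under that assumption (names suffixed with _sketch are hypothetical, not the real
// implementation):
#include <openacc.h>
#include <cstddef>

template <typename T>
T* cnrn_target_copyin_sketch(T* host_ptr, std::size_t count = 1) {
    // acc_copyin expects a size in bytes; the wrapper converts from an element count
    return static_cast<T*>(acc_copyin(host_ptr, count * sizeof(T)));
}

template <typename T>
void cnrn_target_delete_sketch(T* host_ptr, std::size_t count = 1) {
    acc_delete(host_ptr, count * sizeof(T));
}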
if (nt->_nvdata) { - acc_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_target_delete(nt->_vdata, nt->_nvdata); } // Cleanup weight vector used in NET_RECEIVE if (nt->n_weight) { - acc_delete(nt->weights, sizeof(double) * nt->n_weight); + cnrn_target_delete(nt->weights, nt->n_weight); } // Cleanup point processes if (nt->n_pntproc) { - acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_target_delete(nt->pntprocs, nt->n_pntproc); } if (nt->nrn_fast_imem) { - acc_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_delete(nt->nrn_fast_imem); } if (nt->shadow_rhs_cnt) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - acc_delete(nt->_shadow_d, pcnt * sizeof(double)); - acc_delete(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_d, pcnt); + cnrn_target_delete(nt->_shadow_rhs, pcnt); } for (auto tml = nt->tml; tml; tml = tml->next) { - // Cleanup the net send buffer if it exists - { - NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; - if (nsb) { - acc_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_delete(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_delete(nsb, sizeof(NetSendBuffer_t)); - } - } - // Cleanup the net receive buffer if it exists. - { - NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; - if (nrb) { - acc_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_delete(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_delete(nrb, sizeof(NetReceiveBuffer_t)); - } - } - int type = tml->index; - int n = tml->ml->nodecount; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - acc_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); - } - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_delete(tml->ml->pdata, sizeof(int) * pcnt); - } - if (!is_art) { - acc_delete(tml->ml->nodeindices, sizeof(int) * n); - } - acc_delete(tml->ml, sizeof(Memb_list)); - acc_delete(tml, sizeof(NrnThreadMembList)); + delete_ml_from_device(tml->ml, tml->index); + cnrn_target_delete(tml); } - acc_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - acc_delete(nt->_v_parent_index, nt->end * sizeof(int)); - acc_delete(nt->_data, nt->_ndata * sizeof(double)); + cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size()); + cnrn_target_delete(nt->_v_parent_index, nt->end); + cnrn_target_delete(nt->_data, nt->_ndata); } - acc_delete(threads, sizeof(NrnThread) * nthreads); + cnrn_target_delete(threads, nthreads); nrn_ion_global_map_delete_from_device(); #endif } void nrn_newtonspace_copyto_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of 
CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1123,59 +1029,59 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. - NewtonSpace* d_ns = (NewtonSpace*) acc_copyin(ns, sizeof(NewtonSpace)); + NewtonSpace* d_ns = cnrn_target_copyin(ns); double* pd; - pd = (double*) acc_copyin(ns->delta_x, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->delta_x, n); + cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd); - pd = (double*) acc_copyin(ns->high_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->high_value, n); + cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd); - pd = (double*) acc_copyin(ns->low_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->low_value, n); + cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd); - pd = (double*) acc_copyin(ns->rowmax, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->rowmax, n); + cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd); - auto pint = (int*) acc_copyin(ns->perm, n * sizeof(int)); - acc_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + auto pint = cnrn_target_copyin(ns->perm, n); + cnrn_target_memcpy_to_device(&(d_ns->perm), &pint); - auto ppd = (double**) acc_copyin(ns->jacobian, ns->n * sizeof(double*)); - acc_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + auto ppd = cnrn_target_copyin(ns->jacobian, ns->n); + cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) acc_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; - acc_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + cnrn_target_memcpy_to_device(&(ppd[i]), &pd); } #endif } void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { return; } int n = ns->n * ns->n_instance; - acc_delete(ns->jacobian[0], ns->n * n * sizeof(double)); - acc_delete(ns->jacobian, ns->n * sizeof(double*)); - acc_delete(ns->perm, n * sizeof(int)); - acc_delete(ns->rowmax, n * sizeof(double)); - acc_delete(ns->low_value, n * sizeof(double)); - acc_delete(ns->high_value, n * sizeof(double)); - acc_delete(ns->delta_x, n * sizeof(double)); - acc_delete(ns, sizeof(NewtonSpace)); + cnrn_target_delete(ns->jacobian[0], ns->n * n); + cnrn_target_delete(ns->jacobian, ns->n); + cnrn_target_delete(ns->perm, n); + cnrn_target_delete(ns->rowmax, n); + cnrn_target_delete(ns->low_value, n); + cnrn_target_delete(ns->high_value, n); + cnrn_target_delete(ns->delta_x, n); + cnrn_target_delete(ns); #endif } void nrn_sparseobj_copyto_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. 
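// Editorial note: nrn_newtonspace_copyto_device() above and nrn_sparseobj_copyto_device()
// below follow the same manual deep-copy recipe: copy the struct, copy each array it
// points to, then overwrite the pointer members of the *device* copy so that they hold
// device addresses. A hedged, generic sketch of that recipe (SomeStruct and
// copy_to_device_sketch are illustrative, not part of the code base):
struct SomeStruct {
    double* data;
    int n;
};

SomeStruct* copy_to_device_sketch(SomeStruct* host) {
    // 1. shallow copy: the device copy's `data` member still holds a host address
    SomeStruct* dev = cnrn_target_copyin(host);
    // 2. copy the pointed-to array to the device
    double* dev_data = cnrn_target_copyin(host->data, host->n);
    // 3. patch the pointer member of the device copy to point at device memory
    cnrn_target_memcpy_to_device(&dev->data, &dev_data);
    return dev;
}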
if (nrn_threads[0].compute_gpu == 0) { @@ -1183,82 +1089,82 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) acc_copyin(so, sizeof(SparseObj)); + SparseObj* d_so = cnrn_target_copyin(so); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // r_down, c_right, value // do not care about the Elm* ptr value, just the space. - Elm** d_rowst = (Elm**) acc_copyin(so->rowst, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + Elm** d_rowst = cnrn_target_copyin(so->rowst, n1); + cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst); - Elm** d_diag = (Elm**) acc_copyin(so->diag, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + Elm** d_diag = cnrn_target_copyin(so->diag, n1); + cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag); - auto pu = (unsigned*) acc_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu); - auto pd = (double*) acc_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->rhs), &pd); - auto d_coef_list = (double**) acc_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size); + cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) acc_copyin(elm, sizeof(Elm)); + Elm* pelm = cnrn_target_copyin(elm); if (elm == so->rowst[irow]) { - acc_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm); } else { - Elm* d_e = (Elm*) acc_deviceptr(elm->c_left); - acc_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->c_left); + cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e); } if (elm->col == elm->row) { - acc_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm); } if (irow > 1) { if (elm->r_up) { - Elm* d_e = (Elm*) acc_deviceptr(elm->r_up); - acc_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->r_up); + cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e); } } - pd = (double*) acc_copyin(elm->value, so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(elm->value, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(pelm->value), &pd); } } // visit all the Elm again and fill in pelm->r_down and pelm->c_left for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - auto pelm = (Elm*) acc_deviceptr(elm); + auto pelm = cnrn_target_deviceptr(elm); if (elm->r_down) { - auto d_e = (Elm*) acc_deviceptr(elm->r_down); - acc_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->r_down); + 
cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e); } if (elm->c_right) { - auto d_e = (Elm*) acc_deviceptr(elm->c_right); - acc_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->c_right); + cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e); } } } // Fill in the d_so->coef_list for (unsigned i = 0; i < so->coef_list_size; ++i) { - pd = (double*) acc_deviceptr(so->coef_list[i]); - acc_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + pd = cnrn_target_deviceptr(so->coef_list[i]); + cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd); } #endif } void nrn_sparseobj_delete_from_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1267,31 +1173,29 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { unsigned n1 = so->neqn + 1; for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - acc_delete(elm->value, so->_cntml_padded * sizeof(double)); - acc_delete(elm, sizeof(Elm)); + cnrn_target_delete(elm->value, so->_cntml_padded); + cnrn_target_delete(elm); } } - acc_delete(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_delete(so->diag, n1 * sizeof(Elm*)); - acc_delete(so->rowst, n1 * sizeof(Elm*)); - acc_delete(so, sizeof(SparseObj)); + cnrn_target_delete(so->coef_list, so->coef_list_size); + cnrn_target_delete(so->rhs, n1 * so->_cntml_padded); + cnrn_target_delete(so->ngetcall, so->_cntml_padded); + cnrn_target_delete(so->diag, n1); + cnrn_target_delete(so->rowst, n1); + cnrn_target_delete(so); #endif } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = (double**) acc_copyin(nrn_ion_global_map, - sizeof(double*) * nrn_ion_global_map_size); + double** d_data = cnrn_target_copyin(nrn_ion_global_map, nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = (double*) acc_copyin(nrn_ion_global_map[j], - ion_global_map_member_size * - sizeof(double)); - acc_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j], + ion_global_map_member_size); + cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap); } } } @@ -1300,20 +1204,17 @@ void nrn_ion_global_map_copyto_device() { void nrn_ion_global_map_delete_from_device() { for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - acc_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); + cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size); } } if (nrn_ion_global_map_size) { - acc_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size); } } void init_gpu() { - // choose nvidia GPU by default - acc_device_t device_type = acc_device_nvidia; - // check how many gpu devices available per node - int num_devices_per_node = acc_get_num_devices(device_type); + int num_devices_per_node = cnrn_target_get_num_devices(); // if no gpu found, can't run on GPU if (num_devices_per_node == 0) { @@ -1341,10 +1242,9 @@ void init_gpu() { } #endif - int device_num = local_rank % num_devices_per_node; 
- acc_set_device_num(device_num, device_type); + cnrn_target_set_default_device(local_rank % num_devices_per_node); - if (nrnmpi_myid == 0) { + if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size << " ranks per node\n"; } @@ -1355,42 +1255,44 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) acc_copyin(vecplay_instance, sizeof(VecPlayContinuous)); - acc_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); - - VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; + VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance); + cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance); /** copy y_, t_ and discon_indices_ */ copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_); copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); + // OL211213: beware, the test suite does not currently include anything + // with a non-null discon_indices_. if (vecplay_instance->discon_indices_) { + IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices); copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_), *(d_vecplay_instance->discon_indices_)); } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) acc_copyin(vecplay_instance->e_, - sizeof(PlayRecordEvent)); - acc_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); - acc_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_); + + cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance)); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is * already on GPU */ - double* d_pd_ = (double*) acc_deviceptr(vecplay_instance->pd_); - acc_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_); } } void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); - acc_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); + cnrn_target_delete(vecplay_instance->e_); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); } delete_ivoc_vect_from_device(vecplay_instance->t_); delete_ivoc_vect_from_device(vecplay_instance->y_); - acc_delete(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_target_delete(vecplay_instance); } } diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 67e6a058c..72d222cdd 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -9,25 +9,19 @@ #ifndef _nrn_device_manager_ #define _nrn_device_manager_ -#if defined(_OPENACC) -#include -#endif - #include "coreneuron/sim/multicore.hpp" namespace coreneuron { void setup_nrnthreads_on_device(NrnThread* threads, int nthreads); void delete_nrnthreads_on_device(NrnThread* threads, int nthreads); void 
update_nrnthreads_on_host(NrnThread* threads, int nthreads); -void update_nrnthreads_on_device(NrnThread* threads, int nthreads); -void modify_data_on_device(NrnThread* threads, int nthreads); -void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads); -void update_matrix_from_gpu(NrnThread* _nt); -void update_matrix_to_gpu(NrnThread* _nt); void update_net_receive_buffer(NrnThread* _nt); + +// Called by NModl void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml); void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); + void update_weights_from_gpu(NrnThread* threads, int nthreads); void init_gpu(); diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp index 646fbf5a0..2a001b85a 100644 --- a/coreneuron/io/lfp.cpp +++ b/coreneuron/io/lfp.cpp @@ -7,9 +7,6 @@ namespace coreneuron { -// extern variables require acc declare -#pragma acc declare create(pi) - namespace lfputils { double line_source_lfp_factor(const Point3D& e_pos, diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py index 35158908c..67cd93ebb 100644 --- a/coreneuron/kinderiv.py +++ b/coreneuron/kinderiv.py @@ -59,20 +59,26 @@ def write_out_kinderiv(fout): fout.write("\n/* declarations */\n") fout.write("\nnamespace coreneuron {\n") + if deriv or kin or euler: + fout.write('nrn_pragma_omp(declare target)\n') + for item in deriv: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int _newton_%s%s(_threadargsproto_);\n' % (item[0], item[1])) for item in kin: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(void*, double*, _threadargsproto_);\n' % (item[0], item[1])) for item in euler: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) + if deriv or kin or euler: + fout.write('nrn_pragma_omp(end declare target)\n') + fout.write("\n/* callback indices */\n") derivoffset = 1 kinoffset = 1 diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index ee62f660d..42c65cb18 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -12,25 +12,9 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/permute/data_layout.hpp" -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - +#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \ + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) #define _STRIDE _cntml_padded + _iml namespace coreneuron { @@ -78,15 +62,16 @@ void 
nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { (void) _cntml_padded; /* unused when layout=1*/ double* _vec_d = _nt->_actual_d; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif { /*if (use_cachevec) {*/ int* ni = ml->nodeindices; vdata = ml->data; - _PRAGMA_FOR_JACOB_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_d [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (_iml = 0; _iml < _cntml_actual; _iml++) { _vec_d[ni[_iml]] += cfac * cm; } @@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { /* no need to distinguish secondorder */ int* ni = ml->nodeindices; double* _vec_rhs = _nt->_actual_rhs; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif vdata = ml->data; - _PRAGMA_FOR_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; _iml++) { i_cap = cfac * cm * _vec_rhs[ni[_iml]]; } diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 76adc9045..8b58e858d 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -19,26 +19,6 @@ #define _STRIDE _cntml_padded + _iml -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - namespace coreneuron { // for each ion it refers to internal concentration, external concentration, and charge, @@ -197,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) { } } +nrn_pragma_omp(declare target) void nrn_wrote_conc(int type, double* p1, int p2, @@ -213,6 +194,7 @@ void nrn_wrote_conc(int type, pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius); } } +nrn_pragma_omp(end declare target) static double efun(double x) { if (fabs(x) < 1e-4) { @@ -277,14 +259,18 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { double* pd; Datum* ppd; (void) nt; /* unused */ -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif /*printf("ion_cur %s\n", memb_func[type].sym->name);*/ int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_CUR_ACC_LOOP_ + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu) + async(nt->stream_id)) + // clang-format on + nrn_pragma_omp(target teams distribute parallel for simd 
if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; cur = 0.; @@ -312,7 +298,18 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_INIT_ACC_LOOP_ + // There was no async(...) clause in the initial OpenACC implementation, so + // no `nowait` clause has been added to the OpenMP implementation. TODO: + // verify if this can be made asynchronous or if there is a strong reason it + // needs to be like this. + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + ppd[0:1], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu)) + // clang-format on + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { if (iontype & 04) { conci = conci0; @@ -332,9 +329,6 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int _cntml_padded; double* pd; (void) _nt; /* unused */ -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif double* _vec_rhs = _nt->_actual_rhs; if (secondorder == 2) { @@ -345,7 +339,11 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int* ni = ml->nodeindices; _cntml_padded = ml->_nodecount_padded; pd = ml->data; - _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { cur += dcurdv * (_vec_rhs[ni[_iml]]); } diff --git a/coreneuron/mechanism/mech/dimplic.cpp b/coreneuron/mechanism/mech/dimplic.cpp index e3b08207e..de8970560 100644 --- a/coreneuron/mechanism/mech/dimplic.cpp +++ b/coreneuron/mechanism/mech/dimplic.cpp @@ -24,6 +24,7 @@ #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "_kinderiv.h" namespace coreneuron { +nrn_pragma_omp(declare target) int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) { difun(fun); return 0; @@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) { switch (fun) { _NRN_KINETIC_CASES } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp index e4dee09ac..4c572dd18 100644 --- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp +++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp @@ -11,6 +11,7 @@ #include "coreneuron/sim/multicore.hpp" #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { @@ -35,15 +36,17 @@ using DIFUN = int; using NEWTFUN = int; using SPFUN = int; using EULFUN = int; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_derivimplicit_steer(int, _threadargsproto_); #define difun(arg) nrn_derivimplicit_steer(arg, _threadargs_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_newton_steer(int, _threadargsproto_); #define newtfun(arg) nrn_newton_steer(arg, _threadargs_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_euler_steer(int, _threadargsproto_); #define eulerfun(arg) nrn_euler_steer(arg, _threadargs_); +nrn_pragma_omp(end declare target) struct Elm { unsigned row; /* Row location */ @@ 
-89,15 +92,19 @@ struct SparseObj { /* all the state information */ int do_flag; }; -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml); +nrn_pragma_omp(end declare target) extern void* nrn_cons_sparseobj(SPFUN, int, Memb_list*, _threadargsproto_); extern void _nrn_destroy_sparseobj_thread(SparseObj* so); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int nrn_kinetic_steer(int, SparseObj*, double*, _threadargsproto_); +nrn_pragma_omp(end declare target) #define spfun(arg1, arg2, arg3) nrn_kinetic_steer(arg1, arg2, arg3, _threadargs_); // derived from nrn/src/scopmath/euler.c @@ -116,14 +123,15 @@ static inline int euler_thread(int neqn, int* var, int* der, DIFUN fun, _threada return 0; } -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int derivimplicit_thread(int, int*, int*, DIFUN, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int _ss_derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int sparse_thread(SparseObj*, int, int*, int*, double*, double, SPFUN, int, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) int _ss_sparse_thread(SparseObj*, int n, int* s, @@ -134,10 +142,11 @@ int _ss_sparse_thread(SparseObj*, int linflag, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double _modl_get_dt_thread(NrnThread*); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void _modl_set_dt_thread(double, NrnThread*); +nrn_pragma_omp(end declare target) void nrn_sparseobj_copyto_device(SparseObj* so); void nrn_sparseobj_delete_from_device(SparseObj* so); diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp index 3e7046e4e..65d7b29ce 100644 --- a/coreneuron/mechanism/mechanism.hpp +++ b/coreneuron/mechanism/mechanism.hpp @@ -88,7 +88,7 @@ struct NetSendBuffer_t: MemoryManaged { } void grow() { -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU int cannot_reallocate_on_device = 0; assert(cannot_reallocate_on_device); #else diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp index 7598edf50..ba7bf9281 100644 --- a/coreneuron/mechanism/membfunc.hpp +++ b/coreneuron/mechanism/membfunc.hpp @@ -11,6 +11,7 @@ #include #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { using Pfrpdat = Datum* (*) (void); @@ -109,12 +110,14 @@ extern void hoc_register_watch_check(nrn_watch_check_t, int); extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int); extern void nrn_writes_conc(int, int); -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern void nrn_wrote_conc(int, double*, int, int, double**, double, int); -#pragma acc routine seq +nrn_pragma_acc(routine seq) double nrn_nernst(double ci, double co, double z, double celsius); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double nrn_ghk(double v, double ci, double co, double z); +nrn_pragma_omp(end declare target) extern void hoc_register_prop_size(int, int, int); extern void hoc_register_dparam_semantics(int type, int, const char* name); @@ -175,8 +178,10 @@ extern void artcell_net_move(void**, Point_process*, double); extern void nrn2ncs_outputevent(int netcon_output_index, double firetime); 
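// Editorial note: the recurring pattern in these headers -- nrn_pragma_acc(routine seq)
// for OpenACC paired with an nrn_pragma_omp(declare target) ... nrn_pragma_omp(end
// declare target) region for OpenMP offload -- is what marks a function as callable from
// device code under either backend. A hedged sketch of how a new device-callable helper
// would be declared in the same style (my_device_helper is hypothetical):
nrn_pragma_omp(declare target)
nrn_pragma_acc(routine seq)
extern double my_device_helper(double x);
nrn_pragma_omp(end declare target)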
extern bool nrn_use_localgid_; extern void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int at_time(NrnThread*, double); +nrn_pragma_omp(end declare target) // _OPENACC and/or NET_RECEIVE_BUFFERING extern void net_sem_from_gpu(int, int, int, int, int, double, double); diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp index 3acdff1ea..433140b82 100644 --- a/coreneuron/mechanism/register_mech.cpp +++ b/coreneuron/mechanism/register_mech.cpp @@ -19,11 +19,9 @@ namespace coreneuron { int secondorder = 0; +nrn_pragma_omp(declare target) double t, dt, celsius, pi; -// declare copyin required for correct initialization -#pragma acc declare copyin(secondorder) -#pragma acc declare copyin(celsius) -#pragma acc declare copyin(pi) +nrn_pragma_omp(end declare target) int rev_dt; using Pfrv = void (*)(); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 6ed52dc34..31c18807e 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -55,17 +55,15 @@ void init_net_events() { net_cvode_instance->init_events(); } -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU /* weight vectors could be updated (from INITIAL block of NET_RECEIVE, update those on GPU's */ for (int ith = 0; ith < nrn_nthread; ++ith) { NrnThread* nt = nrn_threads + ith; double* weights = nt->weights; int n_weight = nt->n_weight; - if (n_weight) { - // clang-format off - - #pragma acc update device(weights[0 : n_weight]) if (nt->compute_gpu) - // clang-format on + if (n_weight && nt->compute_gpu) { + nrn_pragma_acc(update device(weights [0:n_weight])) + nrn_pragma_omp(target update to(weights [0:n_weight])) } } #endif @@ -88,6 +86,7 @@ void fixed_play_continuous(NrnThread* nt) { // NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc" // for the ISPC backend. If changes are required, make sure to change ISPC as well. +nrn_pragma_omp(declare target) int at_time(NrnThread* nt, double te) { double x = te - 1e-11; if (x <= nt->_t && x > (nt->_t - nt->_dt)) { @@ -95,5 +94,6 @@ int at_time(NrnThread* nt, double te) { } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index 899bc1e14..4fb1d165f 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -26,9 +26,6 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/utils/nrnoc_aux.hpp" -#ifdef _OPENACC -#include -#endif namespace coreneuron { #define PP2NT(pp) (nrn_threads + (pp)->_tid) #define PP2t(pp) (PP2NT(pp)->_t) @@ -531,28 +528,13 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method PreSynHelper* presyns_helper = nt->presyns_helper; double* actual_v = nt->_actual_v; -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif - if (nt->ncell == 0) return; - //_net_send_buffer_cnt is no longer used in openacc kernel, remove this? - //#ifdef _OPENACC - // if(nt->compute_gpu) - // acc_update_device(&(nt->_net_send_buffer_cnt), sizeof(int)); - //#endif - - // on GPU... 
- // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], presyns_helper[0:nt->n_presyn], \ - presyns[0:nt->n_presyn], actual_v[0:nt->end]) \ - copy(net_send_buf_count) if (nt->compute_gpu) \ - async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present( + nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) + copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu)) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; @@ -563,7 +545,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method int* flag = &(psh->flag_); if (pscheck(v, threshold, flag)) { -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nt->_net_send_buffer_cnt = net_send_buf_count; if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) { nt->_net_send_buffer_size *= 2; @@ -572,31 +554,23 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method } #endif - // clang-format off - - #pragma acc atomic capture - // clang-format on + nrn_pragma_acc(atomic capture) + nrn_pragma_omp(atomic capture) idx = net_send_buf_count++; nt->_net_send_buffer[idx] = i; } } - - // clang-format off - - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(wait(nt->stream_id)) nt->_net_send_buffer_cnt = net_send_buf_count; - if (nt->_net_send_buffer_cnt) { -#ifdef _OPENACC + if (nt->compute_gpu && nt->_net_send_buffer_cnt) { +#ifdef CORENEURON_ENABLE_GPU int* nsbuffer = nt->_net_send_buffer; #endif - // clang-format off - - #pragma acc update host(nsbuffer[0:nt->_net_send_buffer_cnt]) if (nt->compute_gpu) async(stream_id) - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id)) + nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])) } // on CPU... diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index e74d866ce..4c517e999 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -41,40 +41,38 @@ void nrnmpi_v_transfer() { // gather the source values. 
can be done in parallel for (int tid = 0; tid < nrn_nthread; ++tid) { auto& ttd = transfer_thread_data_[tid]; - auto& nt = nrn_threads[tid]; + auto* nt = &nrn_threads[tid]; int n = int(ttd.outsrc_indices.size()); if (n == 0) { continue; } - double* src_data = nt._data; + double* src_data = nt->_data; int* src_indices = ttd.src_indices.data(); // gather sources on gpu and copy to cpu, cpu scatters to outsrc_buf double* src_gather = ttd.src_gather.data(); size_t n_src_gather = ttd.src_gather.size(); - // clang-format off - #pragma acc parallel loop present( \ - src_indices[0:n_src_gather], src_data[0:nt._ndata], \ - src_gather[0 : n_src_gather]) /*copyout(src_gather[0:n_src_gather])*/ \ - if (nt.compute_gpu) async(nt.stream_id) + nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather], + src_data [0:nt->_ndata], + src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = 0; i < n_src_gather; ++i) { src_gather[i] = src_data[src_indices[i]]; } - // do not know why the copyout above did not work - // and the following update is needed - #pragma acc update host(src_gather[0 : n_src_gather]) \ - if (nrn_threads[0].compute_gpu) \ - async(nt.stream_id) - // clang-format on + nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)) } // copy gathered source values to outsrc_buf_ + bool compute_gpu = false; for (int tid = 0; tid < nrn_nthread; ++tid) { - // clang-format off - - #pragma acc wait(nrn_threads[tid].stream_id) - // clang-format on + if (nrn_threads[tid].compute_gpu) { + compute_gpu = true; + nrn_pragma_acc(wait(nrn_threads[tid].stream_id)) + } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size(); int* outsrc_indices = ttd.outsrc_indices.data(); @@ -102,12 +100,8 @@ void nrnmpi_v_transfer() { } // insrc_buf_ will get copied to targets via nrnthread_v_transfer - // clang-format off - - #pragma acc update device( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on + nrn_pragma_acc(update device(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) + nrn_pragma_omp(target update to(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) } void nrnthread_v_transfer(NrnThread* _nt) { @@ -119,33 +113,33 @@ void nrnthread_v_transfer(NrnThread* _nt) { int* insrc_indices = ttd.insrc_indices.data(); double* tar_data = _nt->_data; // last element in the displacement vector gives total length +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; int ndata = _nt->_ndata; +#endif - // clang-format off - - #pragma acc parallel loop present( \ - insrc_indices[0:ntar], \ - tar_data[0:ndata], \ - insrc_buf_[0:n_insrc_buf]) \ - if (_nt->compute_gpu) \ - async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar], + tar_data [0:ndata], + insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu)) for (size_t i = 0; i < ntar; ++i) { tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]; } } +/// TODO: Corresponding exit data cluase for OpenACC/OpenMP is missing and hence +/// GPU buffers are not freed. 
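// Editorial note: the TODO above points out that the `enter data` mappings created in
// gap_update_indices() below are never released. A hedged sketch of what the matching
// cleanup could look like for one of those buffers, written in the same dual-macro
// style (gap_delete_from_device_sketch is hypothetical):
void gap_delete_from_device_sketch() {
    if (insrcdspl_) {
        int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
        nrn_pragma_acc(exit data delete(insrc_buf_[0:n_insrc_buf]) if(corenrn_param.gpu))
        // clang-format off
        nrn_pragma_omp(target exit data map(delete: insrc_buf_[0:n_insrc_buf])
                       if(corenrn_param.gpu))
        // clang-format on
    }
}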
void nrn_partrans::gap_update_indices() { // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu. if (insrcdspl_) { int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; + nrn_pragma_acc(enter data create(insrc_buf_ [0:n_insrc_buf]) if (corenrn_param.gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(alloc: insrc_buf_[0:n_insrc_buf]) + if(corenrn_param.gpu)) // clang-format off - - #pragma acc enter data create( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on } for (int tid = 0; tid < nrn_nthread; ++tid) { TransferThreadData& ttd = transfer_thread_data_[tid]; @@ -154,21 +148,25 @@ void nrn_partrans::gap_update_indices() { size_t n_src_gather = ttd.src_gather.size(); NrnThread* nt = nrn_threads + tid; if (n_src_indices) { + int* src_indices = ttd.src_indices.data(); + double* src_gather = ttd.src_gather.data(); + nrn_pragma_acc(enter data copyin(src_indices[0:n_src_indices]) if(nt->compute_gpu)) + nrn_pragma_acc(enter data create(src_gather[0:n_src_gather]) if(nt->compute_gpu)) // clang-format off - - int *src_indices = ttd.src_indices.data(); - double *src_gather = ttd.src_gather.data(); - #pragma acc enter data copyin(src_indices[0 : n_src_indices]) if (nt->compute_gpu) - #pragma acc enter data create(src_gather[0 : n_src_gather]) if (nt->compute_gpu) + nrn_pragma_omp(target enter data map(to: src_indices [0:n_src_indices]) + map(alloc: src_gather[0:n_src_gather]) + if(nt->compute_gpu)) // clang-format on } if (ttd.insrc_indices.size()) { - // clang-format off - - int *insrc_indices = ttd.insrc_indices.data(); + int* insrc_indices = ttd.insrc_indices.data(); size_t n_insrc_indices = ttd.insrc_indices.size(); - #pragma acc enter data copyin(insrc_indices[0 : n_insrc_indices]) if (nt->compute_gpu) + nrn_pragma_acc( + enter data copyin(insrc_indices [0:n_insrc_indices]) if (nt->compute_gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(to: insrc_indices[0:n_insrc_indices]) + if(nt->compute_gpu)) // clang-format on } } diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h index 2c7fb8bb9..225d6d2ad 100644 --- a/coreneuron/nrnconf.h +++ b/coreneuron/nrnconf.h @@ -9,6 +9,8 @@ #ifndef _H_NRNCONF_ #define _H_NRNCONF_ +#include "coreneuron/utils/offload.hpp" + #include #include #include @@ -32,14 +34,16 @@ using Symbol = char; #define VECTORIZE 1 // extern variables require acc declare +nrn_pragma_omp(declare target) extern double celsius; -#pragma acc declare create(celsius) +nrn_pragma_acc(declare create(celsius)) extern double pi; -#pragma acc declare create(pi) +nrn_pragma_acc(declare create(pi)) extern int secondorder; -#pragma acc declare create(secondorder) +nrn_pragma_acc(declare create(secondorder)) +nrn_pragma_omp(end declare target) extern double t, dt; extern int rev_dt; diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 2b6167f57..6b4014a64 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -6,8 +6,6 @@ # ============================================================================= */ -#include - #include "coreneuron/nrnconf.h" #include "coreneuron/sim/multicore.hpp" #include "coreneuron/utils/nrn_assert.h" @@ -15,6 +13,7 @@ #include "coreneuron/network/tnode.hpp" #include "coreneuron/utils/lpt.hpp" #include "coreneuron/utils/memory.h" +#include "coreneuron/utils/offload.hpp" #include "coreneuron/apps/corenrn_parameters.hpp" #include "coreneuron/permute/node_permute.h" // for print_quality @@ -22,6 +21,9 @@ #ifdef _OPENACC #include 
#endif + +#include + namespace coreneuron { int interleave_permute_type; InterleaveInfo* interleave_info; // nrn_nthread array @@ -444,7 +446,7 @@ static void triang_interleaved(NrnThread* nt, if (istride < icellsize) { // only first icellsize strides matter // what is the index int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); // if (ip < 0) return; #endif double p = GPU_A(i) / GPU_D(i); @@ -466,7 +468,7 @@ static void bksub_interleaved(NrnThread* nt, GPU_RHS(icell) /= GPU_D(icell); // the root for (int istride = 0; istride < icellsize; ++istride) { int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); #endif GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); @@ -480,7 +482,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid int icycle = ncycle - 1; int istride = stride[icycle]; int i = lastnode - istride + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif @@ -488,10 +490,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid bool has_subtrees_to_compute = true; // clang-format off - - #pragma acc loop seq + nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -500,12 +501,14 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // what is the index int ip = GPU_PARENT(i); double p = GPU_A(i) / GPU_D(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_D(ip) -= p * GPU_B(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } #endif // if finished with all tree depths then ready to break @@ -517,7 +520,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid --icycle; istride = stride[icycle]; i -= istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU ii -= istride; #endif } @@ -532,25 +535,22 @@ static void bksub_interleaved2(NrnThread* nt, int ncycle, int* stride, int firstnode) { -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU for (int i = root; i < lastroot; i += 1) { #else - // clang-format off - - #pragma acc loop seq - // clang-format on + nrn_pragma_acc(loop seq) for (int i = root; i < lastroot; i += warpsize) { #endif GPU_RHS(i) /= GPU_D(i); // the root } int i = firstnode + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif for (int icycle = 0; icycle < ncycle; ++icycle) { int istride = stride[icycle]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -561,7 +561,7 @@ static void bksub_interleaved2(NrnThread* nt, GPU_RHS(i) /= GPU_D(i); } i += istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } ii += istride; #endif @@ -596,21 +596,18 @@ void solve_interleaved2(int ith) { int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize) int* rootbegin = ii.firstnode; // nwarp+1 of these int* nodebegin = ii.lastnode; // nwarp+1 of these -#ifdef _OPENACC +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int nstride = stridedispl[nwarp]; - int 
stream_id = nt->stream_id; -#endif - -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop gang vector vector_length(warpsize) \ - present(nt[0:1], strides[0:nstride], \ - ncycles[0:nwarp], stridedispl[0:nwarp+1], \ - rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) \ - if (nt->compute_gpu) async(stream_id) -// clang-format on #endif + nrn_pragma_acc(parallel loop gang vector vector_length( + warpsize) present(nt [0:1], + strides [0:nstride], + ncycles [0:nwarp], + stridedispl [0:nwarp + 1], + rootbegin [0:nwarp + 1], + nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask @@ -620,18 +617,16 @@ void solve_interleaved2(int ith) { int lastroot = rootbegin[iwarp + 1]; int firstnode = nodebegin[iwarp]; int lastnode = nodebegin[iwarp + 1]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU if (ic == 0) { // serial test mode. triang and bksub do all cores in warp #endif triang_interleaved2(nt, ic, ncycle, stride, lastnode); bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } // serial test mode #endif } -#ifdef _OPENACC -#pragma acc wait(nt->stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) #ifdef _OPENACC } #endif @@ -656,28 +651,23 @@ void solve_interleaved1(int ith) { int* firstnode = ii.firstnode; int* lastnode = ii.lastnode; int* cellsize = ii.cellsize; -#if _OPENACC - int stream_id = nt->stream_id; -#endif -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], stride[0:nstride], \ - firstnode[0:ncell], lastnode[0:ncell], \ - cellsize[0:ncell]) if (nt->compute_gpu) \ - async(stream_id) -// clang-format on -#endif + // OL211123: can we preserve the error checking behaviour of OpenACC's + // present clause with OpenMP? It is a bug if these data are not present, + // so diagnostics are helpful... 
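// Editorial note: one way to answer the OL211123 question above is a debug-only runtime
// check based on the standard OpenMP device-memory API; omp_target_is_present() reports
// whether a host pointer is currently mapped on a device. A hedged sketch (the
// assert_mapped_sketch helper is hypothetical and only meaningful when compiling with
// OpenMP target offload):
#include <cassert>
#include <omp.h>

inline void assert_mapped_sketch(const void* host_ptr) {
    // fails fast if the pointer has not been mapped by an earlier enter data / copyin
    assert(omp_target_is_present(host_ptr, omp_get_default_device()) != 0);
}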
+ nrn_pragma_acc(parallel loop present(nt [0:1], + stride [0:nstride], + firstnode [0:ncell], + lastnode [0:ncell], + cellsize [0:ncell]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icell = 0; icell < ncell; ++icell) { int icellsize = cellsize[icell]; triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); } -#ifdef _OPENACC -#pragma acc wait(stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) } void solve_interleaved(int ith) { diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu index 82198410f..0c1b5af2e 100644 --- a/coreneuron/permute/cellorder.cu +++ b/coreneuron/permute/cellorder.cu @@ -72,25 +72,37 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int int* rootbegin = ii->firstnode; // nwarp+1 of these int* nodebegin = ii->lastnode; // nwarp+1 of these - int iwarp = icore / warpsize; // figure out the >> value - int ic = icore & (warpsize - 1); // figure out the & mask - int ncycle = ncycles[iwarp]; - int* stride = strides + stridedispl[iwarp]; - int root = rootbegin[iwarp]; - int lastroot = rootbegin[iwarp + 1]; - int firstnode = nodebegin[iwarp]; - int lastnode = nodebegin[iwarp + 1]; - - triang_interleaved2_device(nt, ic, ncycle, stride, lastnode); - bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); + while (icore < ncore) { + int iwarp = icore / warpsize; // figure out the >> value + int ic = icore & (warpsize - 1); // figure out the & mask + int ncycle = ncycles[iwarp]; + int* stride = strides + stridedispl[iwarp]; + int root = rootbegin[iwarp]; + int lastroot = rootbegin[iwarp + 1]; + int firstnode = nodebegin[iwarp]; + int lastnode = nodebegin[iwarp + 1]; + + triang_interleaved2_device(nt, ic, ncycle, stride, lastnode); + bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); + + icore += blockDim.x * gridDim.x; + } } void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) { auto cuda_stream = static_cast(stream); + /// the selection of these parameters has been done after running the channel-benchmark for + /// typical production runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells. + /// In the OpenACC/OpenMP implementations threadsPerBlock is set to 32. From profiling the + /// channel-benchmark circuits mentioned above we figured out that the best performance was + /// achieved with this configuration int threadsPerBlock = warpsize; - // TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel? - int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock; + /// Max number of blocksPerGrid for NVIDIA GPUs is 65535, so we need to make sure that the + /// blocksPerGrid we launch the CUDA kernel with doesn't exceed this number + const auto maxBlocksPerGrid = 65535; + int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid ? 
provisionalBlocksPerGrid : maxBlocksPerGrid; solve_interleaved2_kernel<<>>(nt, info, ncore); diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 8f4ac14cf..ab6fc4bfb 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -78,10 +78,11 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ } else { nt->cj = 1.0 / dt; } + nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj) + async(nt->stream_id) if (nt->compute_gpu)) // clang-format off - - #pragma acc update device(nt->_t, nt->_dt, nt->cj) \ - async(nt->stream_id) if(nt->compute_gpu) + nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj) + if(nt->compute_gpu)) // clang-format on } } @@ -201,35 +202,24 @@ void update(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); double* vec_rhs = &(VEC_RHS(0)); int i2 = _nt->end; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif /* do not need to worry about linmod or extracellular*/ if (secondorder) { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += 2. * vec_rhs[i]; } } else { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += vec_rhs[i]; } } - // update_matrix_to_gpu(_nt); - if (_nt->tml) { assert(_nt->tml->index == CAP); nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index); @@ -304,10 +294,9 @@ void nrncore2nrn_send_values(NrnThread* nth) { // make sure we do not overflow the `varrays` buffers assert(vs < tr->bsize); - // clang-format off - - #pragma acc parallel loop present(tr[0:1]) if(nth->compute_gpu) async(nth->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu)) for (int i = 0; i < tr->n_trajec; ++i) { tr->varrays[i][vs] = *tr->gather[i]; } @@ -326,12 +315,11 @@ void nrncore2nrn_send_values(NrnThread* nth) { // https://github.com/BlueBrain/CoreNeuron/issues/611 for (int i = 0; i < tr->n_trajec; ++i) { double* gather_i = tr->gather[i]; - // clang-format off - - #pragma acc update self(gather_i[0:1]) if(nth->compute_gpu) async(nth->stream_id) + nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - #pragma acc wait(nth->stream_id) - // clang-format on + nrn_pragma_acc(wait(nth->stream_id)) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } @@ -351,15 +339,11 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { nth->_t += .5 * nth->_dt; if (nth->ncell) { -#if defined(_OPENACC) - int stream_id = nth->stream_id; - /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id) - #pragma acc 
wait(stream_id) -// clang-format on -#endif + /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can + launch kernel) */ + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); { @@ -393,12 +377,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id) - #pragma acc wait(nth->stream_id) - // clang-format on - + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); nonvint(nth); nrncore2nrn_send_values(nth); diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp index 8dfb0cd76..1218b7967 100644 --- a/coreneuron/sim/fast_imem.cpp +++ b/coreneuron/sim/fast_imem.cpp @@ -50,10 +50,10 @@ void nrn_calc_fast_imem(NrnThread* nt) { double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, \ - vec_area, \ - fast_imem_d, \ - fast_imem_rhs) if (nt->compute_gpu) async(nt->stream_id) + nrn_pragma_acc( + parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } @@ -68,8 +68,9 @@ void nrn_calc_fast_imem_init(NrnThread* nt) { double* vec_area = nt->_actual_area; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) \ - async(nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } diff --git a/coreneuron/sim/finitialize.cpp b/coreneuron/sim/finitialize.cpp index 1ae79a92f..d711ae247 100644 --- a/coreneuron/sim/finitialize.cpp +++ b/coreneuron/sim/finitialize.cpp @@ -53,12 +53,9 @@ void nrn_finitialize(int setv, double v) { if (setv) { for (auto _nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) { double* vec_v = &(VEC_V(0)); - // clang-format off - - #pragma acc parallel loop present( \ - _nt[0:1], vec_v[0:_nt->end]) \ - if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc( + parallel loop present(_nt [0:1], vec_v [0:_nt->end]) if (_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < _nt->end; ++i) { vec_v[i] = v; } diff --git a/coreneuron/sim/scopmath/crout_thread.cpp b/coreneuron/sim/scopmath/crout_thread.cpp index b180ea107..72a5c017f 100644 --- a/coreneuron/sim/scopmath/crout_thread.cpp +++ b/coreneuron/sim/scopmath/crout_thread.cpp @@ -50,6 +50,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) /* having a differnt permutation per instance may not be a good idea */ +nrn_pragma_omp(declare target) int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) { int save_i = 0; @@ 
-224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n, } } } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/sim/scopmath/newton_struct.h b/coreneuron/sim/scopmath/newton_struct.h index 8cd52732c..d01bfb822 100644 --- a/coreneuron/sim/scopmath/newton_struct.h +++ b/coreneuron/sim/scopmath/newton_struct.h @@ -25,10 +25,11 @@ struct NewtonSpace { double* rowmax; }; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_scopmath_solve_thread(int n, double** a, double* value, @@ -37,7 +38,7 @@ extern void nrn_scopmath_solve_thread(int n, int* s, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -45,7 +46,7 @@ extern int nrn_newton_thread(NewtonSpace* ns, double* value, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_buildjacobian_thread(NewtonSpace* ns, int n, int* s, @@ -53,6 +54,7 @@ extern void nrn_buildjacobian_thread(NewtonSpace* ns, double* value, double** jacobian, _threadargsproto_); +nrn_pragma_omp(end declare target) extern NewtonSpace* nrn_cons_newtonspace(int n, int n_instance); extern void nrn_destroy_newtonspace(NewtonSpace* ns); diff --git a/coreneuron/sim/scopmath/newton_thread.cpp b/coreneuron/sim/scopmath/newton_thread.cpp index 6c0f303ce..dc08ca04b 100644 --- a/coreneuron/sim/scopmath/newton_thread.cpp +++ b/coreneuron/sim/scopmath/newton_thread.cpp @@ -59,6 +59,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) #define s_(arg) _p[s[arg] * _STRIDE] +nrn_pragma_omp(declare target) int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns, return (error); } +nrn_pragma_omp(end declare target) /*------------------------------------------------------------*/ /* */ diff --git a/coreneuron/sim/scopmath/sparse_thread.cpp b/coreneuron/sim/scopmath/sparse_thread.cpp index d936e269a..71643430a 100644 --- a/coreneuron/sim/scopmath/sparse_thread.cpp +++ b/coreneuron/sim/scopmath/sparse_thread.cpp @@ -105,7 +105,7 @@ static void check_assert(SparseObj* so); static void re_link(SparseObj* so, unsigned i); static SparseObj* create_sparseobj(); -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU #undef emalloc #undef ecalloc #define emalloc(arg) malloc(arg) diff --git a/coreneuron/sim/scopmath/ssimplic_thread.cpp b/coreneuron/sim/scopmath/ssimplic_thread.cpp index fe11411d0..511e45d2b 100644 --- a/coreneuron/sim/scopmath/ssimplic_thread.cpp +++ b/coreneuron/sim/scopmath/ssimplic_thread.cpp @@ -9,12 +9,15 @@ #include "coreneuron/mechanism/mech/cfile/scoplib.h" #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "coreneuron/sim/scopmath/errcodes.h" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { #define s_(arg) _p[s[arg] * _STRIDE] -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) static int check_state(int, int*, _threadargsproto_); +nrn_pragma_omp(end declare target) int _ss_sparse_thread(SparseObj* v, int n, diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp index a24c8360f..60ba2b660 100644 --- a/coreneuron/sim/solve_core.cpp +++ b/coreneuron/sim/solve_core.cpp @@ -24,7 +24,9 @@ void nrn_solve_minimal(NrnThread* _nt) { } } -/** TODO loops are executed seq in OpenACC just for 
debugging, remove it! */ +/** @todo OpenACC GPU offload is sequential/slow. Because --cell-permute=0 and + * --gpu is forbidden anyway, no OpenMP target offload equivalent is implemented. + */ /* triangularization of the matrix equations */ static void triang(NrnThread* _nt) { @@ -37,17 +39,9 @@ static void triang(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_a[0:i3], vec_b[0:i3], vec_d[0:i3], \ - vec_rhs[0:i3], parent_index[0:i3]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present( + vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i3 - 1; i >= i2; --i) { double p = vec_a[i] / vec_d[i]; vec_d[parent_index[i]] -= p * vec_b[i]; @@ -66,33 +60,22 @@ static void bksub(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_d[0:i2], vec_rhs[0:i2]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i1; i < i2; ++i) { vec_rhs[i] /= vec_d[i]; } - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_b[0:i3], vec_d[0:i3], vec_rhs[0:i3], \ - parent_index[0:i3]) async(stream_id) \ - if (_nt->compute_gpu) + nrn_pragma_acc( + parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]]; vec_rhs[i] /= vec_d[i]; } - #pragma acc wait(stream_id) - // clang-format on + if (_nt->compute_gpu) { + nrn_pragma_acc(wait(_nt->stream_id)) + } } } // namespace coreneuron diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 943980bcd..208058fe1 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -32,12 +32,9 @@ static void nrn_rhs(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); int* parent_index = _nt->_v_parent_index; - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; vec_d[i] = 0.; @@ -46,9 +43,10 @@ static void nrn_rhs(NrnThread* _nt) { if (_nt->nrn_fast_imem) { double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(fast_imem_d [i1:i3], \ - fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \ - async(_nt->stream_id) + nrn_pragma_acc( + parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for 
if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; fast_imem_rhs[i] = 0.; @@ -76,7 +74,9 @@ static void nrn_rhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; } @@ -86,22 +86,24 @@ static void nrn_rhs(NrnThread* _nt) { The extracellular mechanism contribution is already done. rhs += ai_j*(vi_j - vi) */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3], \ - vec_a[0:i3], vec_b[0:i3], \ - vec_v[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], + vec_d [0:i3], + vec_a [0:i3], + vec_b [0:i3], + vec_v [0:i3], + parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; /* our connection coefficients are negative so */ - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[i] -= vec_b[i] * dv; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[parent_index[i]] += vec_a[i] * dv; } - // clang-format on } /* calculate left hand side of @@ -150,34 +152,32 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; -#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } } /* now add the axial currents */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_d[0:i3], vec_a[0:i3], \ - vec_b[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present( + vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[i] -= vec_b[i]; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[parent_index[i]] -= vec_a[i]; } - // clang-format on } /* for the fixed step method */ void* setup_tree_matrix_minimal(NrnThread* _nt) { nrn_rhs(_nt); nrn_lhs(_nt); - // update_matrix_from_gpu(_nt); - return nullptr; } } // namespace coreneuron diff --git a/coreneuron/utils/ivocvect.cpp b/coreneuron/utils/ivocvect.cpp index 1315d409f..b51a96ab8 100644 --- a/coreneuron/utils/ivocvect.cpp +++ b/coreneuron/utils/ivocvect.cpp @@ -7,6 +7,7 @@ */ #include "coreneuron/utils/ivocvect.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { IvocVect* vector_new(int n) { @@ -26,12 +27,12 @@ void* vector_new1(int n) { return (void*) (new IvocVect(n)); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) int vector_capacity(void* 
v) { return ((IvocVect*) v)->size(); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) double* vector_vec(void* v) { return ((IvocVect*) v)->data(); } diff --git a/coreneuron/utils/ivocvect.hpp b/coreneuron/utils/ivocvect.hpp index af4286e09..80440c74d 100644 --- a/coreneuron/utils/ivocvect.hpp +++ b/coreneuron/utils/ivocvect.hpp @@ -9,6 +9,8 @@ #ifndef ivoc_vector_h #define ivoc_vector_h +#include "coreneuron/utils/offload.hpp" + #include #include @@ -52,17 +54,17 @@ class fixed_vector { return data_[i]; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) const T* data(void) const { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) T* data(void) { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) size_t size() const { return n_; } @@ -76,9 +78,9 @@ extern double* vector_vec(IvocVect* v); // retro-compatibility API extern void* vector_new1(int n); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int vector_capacity(void* v); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double* vector_vec(void* v); } // namespace coreneuron diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h index 965c06e78..2f0e24458 100644 --- a/coreneuron/utils/memory.h +++ b/coreneuron/utils/memory.h @@ -115,8 +115,7 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) { } // namespace coreneuron /// for gpu builds with unified memory support -/// OL210812: why do we include __CUDACC__ here? -#if (defined(__CUDACC__) || defined(CORENEURON_UNIFIED_MEMORY)) +#ifdef CORENEURON_UNIFIED_MEMORY #include diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp new file mode 100644 index 000000000..078990107 --- /dev/null +++ b/coreneuron/utils/offload.hpp @@ -0,0 +1,96 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. 
+# ============================================================================= +*/ +#pragma once +#define nrn_pragma_stringify(x) #x +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x)) +#include +#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) +#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x)) +#define nrn_pragma_omp(x) +#include +#else +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) +#include +#endif + +#include + +namespace coreneuron { +template +T* cnrn_target_deviceptr(const T* h_ptr) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + return static_cast(acc_deviceptr(const_cast(h_ptr))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + T const* d_ptr{}; + + nrn_pragma_omp(target data use_device_ptr(h_ptr)) + { d_ptr = h_ptr; } + + return const_cast(d_ptr); +#else + throw std::runtime_error( + "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target enter data map(to : h_ptr[:len])) + return cnrn_target_deviceptr(h_ptr); +#else + throw std::runtime_error( + "cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_delete(h_ptr, len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target exit data map(delete : h_ptr[:len])) +#else + throw std::runtime_error( + "cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_target_memcpy(d_ptr, + const_cast(h_ptr), + len * sizeof(T), + 0, + 0, + omp_get_default_device(), + omp_get_initial_device()); +#else + throw std::runtime_error( + "cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +} // namespace coreneuron diff --git a/coreneuron/utils/profile/profiler_interface.h b/coreneuron/utils/profile/profiler_interface.h index f6a24eb2e..2c68a0ae1 100644 --- a/coreneuron/utils/profile/profiler_interface.h +++ b/coreneuron/utils/profile/profiler_interface.h @@ -15,7 +15,7 @@ #include #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING #include #endif @@ -163,7 +163,7 @@ struct Caliper { #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) 
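For reference, a minimal usage sketch of the cnrn_target_* helpers declared in coreneuron/utils/offload.hpp above. The function name and buffer are invented for illustration; in a CPU-only build these helpers throw std::runtime_error rather than touching a device.

    #include "coreneuron/utils/offload.hpp"
    #include <vector>

    void mirror_buffer_on_device() {
        std::vector<double> host(1024, 0.0);
        // Map the host buffer onto the device and keep its device address
        // (acc_copyin under OpenACC, target enter data under OpenMP).
        double* dev = coreneuron::cnrn_target_copyin(host.data(), host.size());
        host[0] = 42.0;
        // Refresh the existing device allocation from the updated host data.
        coreneuron::cnrn_target_memcpy_to_device(dev, host.data(), host.size());
        // Look up the device address of an already-mapped host pointer.
        double* same = coreneuron::cnrn_target_deviceptr(host.data());
        static_cast<void>(same);
        // Drop the mapping and device storage when no longer needed.
        coreneuron::cnrn_target_delete(host.data(), host.size());
    }
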
+#ifdef CORENEURON_CUDA_PROFILING struct CudaProfiling { inline static void phase_begin(const char* name){}; @@ -270,7 +270,7 @@ using InstrumentorImpl = detail::Instrumentor< #if defined CORENEURON_CALIPER detail::Caliper, #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING detail::CudaProfiling, #endif #if defined(CRAYPAT) diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index b13dad7eb..8a02c4e26 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -15,6 +15,11 @@ #include #include +#ifdef CORENEURON_USE_BOOST_POOL +#include +#include +#endif + // In a GPU build this file will be compiled by NVCC as CUDA code // In a CPU build this file will be compiled by a C++ compiler as C++ code #ifdef __CUDACC__ @@ -24,6 +29,48 @@ #endif namespace { +#ifdef CORENEURON_USE_BOOST_POOL +/** Tag type for use with boost::fast_pool_allocator that forwards to + * coreneuron::[de]allocate_unified(). Using a Random123-specific type here + * makes sure that allocations do not come from the same global pool as other + * usage of boost pools for objects with sizeof == sizeof(nrnran123_State). + * + * The messy m_block_sizes map is just because `deallocate_unified` uses sized + * deallocations, but the Boost pool allocators don't. Because this is hidden + * behind the pool mechanism, these methods are not called very often and the + * overhead is minimal. + */ +struct random123_allocate_unified { + using size_type = std::size_t; + using difference_type = std::size_t; + static char* malloc(const size_type bytes) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto* buffer = coreneuron::allocate_unified(bytes); + m_block_sizes[buffer] = bytes; + return reinterpret_cast(buffer); + } + static void free(char* const block) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto const iter = m_block_sizes.find(block); + assert(iter != m_block_sizes.end()); + auto const size = iter->second; + m_block_sizes.erase(iter); + return coreneuron::deallocate_unified(block, size); + } + static std::mutex m_mutex; + static std::unordered_map m_block_sizes; +}; + +std::mutex random123_allocate_unified::m_mutex{}; +std::unordered_map random123_allocate_unified::m_block_sizes{}; + +using random123_allocator = + boost::fast_pool_allocator; +#else +using random123_allocator = coreneuron::unified_allocator; +#endif /* Global data structure per process. Using a unique_ptr here causes [minor] * problems because its destructor can be called very late during application * shutdown. 
If the destructor calls cudaFree and the CUDA runtime has already @@ -212,9 +259,7 @@ nrnran123_State* nrnran123_newstream3(uint32_t id1, #endif nrnran123_State* s{nullptr}; if (use_unified_memory) { - s = coreneuron::allocate_unique( - coreneuron::unified_allocator{}) - .release(); + s = coreneuron::allocate_unique(random123_allocator{}).release(); } else { s = new nrnran123_State{}; } @@ -244,9 +289,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { --g_instance_count; } if (use_unified_memory) { - std::unique_ptr>> - _{s}; + std::unique_ptr> _{s}; } else { delete s; } diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index ab432f89c..c97592161 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -37,6 +37,8 @@ of the full distribution available from #define R123_USE_GNU_UINT128 1 #endif +#include "coreneuron/utils/offload.hpp" + #include #include @@ -46,17 +48,12 @@ of the full distribution available from #define CORENRN_HOST_DEVICE #endif -// Is there actually any harm leaving the pragma in when DISABLE_OPENACC is true? -#if defined(_OPENACC) && !defined(DISABLE_OPENACC) -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE _Pragma("acc routine seq") -#else -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE -#endif +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE nrn_pragma_acc(routine seq) // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU // support at all. In these two cases, request that the random123 state is // allocated using new/delete instead of CUDA unified memory. -#if (defined(__CUDACC__) || defined(_OPENACC)) && !defined(DISABLE_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(DISABLE_OPENACC) #define CORENRN_RAN123_USE_UNIFIED_MEMORY true #else #define CORENRN_RAN123_USE_UNIFIED_MEMORY false @@ -100,6 +97,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); /* minimal data stream */ +nrn_pragma_omp(declare target) CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); CORENRN_HOST_DEVICE_ACC void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); CORENRN_HOST_DEVICE_ACC void nrnran123_getids3(nrnran123_State*, @@ -128,6 +126,7 @@ CORENRN_HOST_DEVICE_ACC nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); CORENRN_HOST_DEVICE_ACC double nrnran123_uint2dbl(uint32_t); +nrn_pragma_omp(end declare target) } // namespace coreneuron #endif diff --git a/coreneuron/utils/vrecord.cpp b/coreneuron/utils/vrecord.cpp index 8af2b028e..a972e754a 100644 --- a/coreneuron/utils/vrecord.cpp +++ b/coreneuron/utils/vrecord.cpp @@ -78,7 +78,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { last_index_ = ubound_index_; // clang-format off - #pragma acc update device(last_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(last_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(last_index_) if (nt->compute_gpu)) // clang-format on if (discon_indices_) { if (discon_index_ < discon_indices_->size()) { @@ -96,7 +97,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { } // clang-format off - #pragma acc update device(ubound_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(ubound_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(ubound_index_) if (nt->compute_gpu)) // clang-format on continuous(tt); } @@ -105,7 +107,8 @@ void 
VecPlayContinuous::continuous(double tt) { NrnThread* nt = nrn_threads + ith_; // clang-format off - #pragma acc kernels present(this) if(nt->compute_gpu) + nrn_pragma_acc(kernels present(this) if(nt->compute_gpu)) + nrn_pragma_omp(target if(nt->compute_gpu)) { *pd_ = interpolate(tt); } diff --git a/external/nmodl b/external/nmodl index 85dec3618..46f8baf2b 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 85dec36180cc8d012db3392c06c065d39de79960 +Subproject commit 46f8baf2bbeaa0d21559d6306ec37b94c601f1ee diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt index 3e2ac8e80..ec795f178 100644 --- a/tests/unit/lfp/CMakeLists.txt +++ b/tests/unit/lfp/CMakeLists.txt @@ -22,3 +22,4 @@ set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) add_dependencies(lfp_test_bin nrniv-core) add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $) +set_tests_properties(lfp_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1)
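
Taken together, the pattern introduced by this change can be summarised in a short, hedged sketch (all names below are invented for illustration and nothing here is part of the patch): device-callable helpers are bracketed by nrn_pragma_omp(declare target) / nrn_pragma_omp(end declare target) and annotated with nrn_pragma_acc(routine seq), while host-to-device updates carry both the OpenACC and the OpenMP spelling on adjacent lines, with the inactive backend's macro expanding to nothing.

    #include "coreneuron/utils/offload.hpp"

    namespace example {
    double sim_time = 0.0;  // assumed to be mapped to the device elsewhere

    nrn_pragma_omp(declare target)
    nrn_pragma_acc(routine seq)
    inline double square(double x) {
        return x * x;  // callable from OpenACC/OpenMP offloaded regions
    }
    nrn_pragma_omp(end declare target)

    void push_time(bool compute_gpu) {
        // OpenACC build:  #pragma acc update device(sim_time) if(compute_gpu)
        // OpenMP offload: #pragma omp target update to(sim_time) if(compute_gpu)
        // CPU-only build: both macros expand to nothing
        nrn_pragma_acc(update device(sim_time) if (compute_gpu))
        nrn_pragma_omp(target update to(sim_time) if (compute_gpu))
    }
    }  // namespace example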