diff --git a/.clang-format.changes b/.clang-format.changes index 01b58702d..4c2b11b59 100644 --- a/.clang-format.changes +++ b/.clang-format.changes @@ -1,2 +1,3 @@ -SortIncludes: false IndentCaseLabels: true +SortIncludes: false +StatementMacros: [nrn_pragma_acc, nrn_pragma_omp] diff --git a/.cmake-format.changes.yaml b/.cmake-format.changes.yaml index 19ea9c084..2f20247f7 100644 --- a/.cmake-format.changes.yaml +++ b/.cmake-format.changes.yaml @@ -1,9 +1,4 @@ additional_commands: - cuda_add_library: - pargs: '*' - flags: ["STATIC", "SHARED", "MODULE", "EXCLUDE_FROM_ALL"] - kwargs: - OPTIONS: '*' cpp_cc_build_time_copy: flags: ['NO_TARGET'] kwargs: diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5e3967f7d..8b434bf81 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -45,7 +45,7 @@ spack_setup: variables: SPACK_PACKAGE: neuron SPACK_PACKAGE_REF: '' - SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit + SPACK_PACKAGE_SPEC: +coreneuron+debug+tests~legacy-unit model_tests=channel-benchmark,olfactory .gpu_node: variables: bb5_constraint: volta @@ -93,7 +93,19 @@ build:coreneuron+nmodl:gpu: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +nmodl+gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +nmodl+openmp+gpu+tests~legacy-unit~report~sympy build_type=RelWithDebInfo + extends: + - .spack_build + - .spack_nvhpc + needs: ["build:nmodl:gpu"] + +build:coreneuron+nmodl~openmp:gpu: + variables: + SPACK_PACKAGE: coreneuron + # +report pulls in a lot of dependencies and the tests fail. + # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type + # Sympy + OpenMP target offload does not currently work with NVHPC + SPACK_PACKAGE_SPEC: +nmodl~openmp+gpu+tests~legacy-unit~report+sympy build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc @@ -104,7 +116,7 @@ build:coreneuron:gpu: SPACK_PACKAGE: coreneuron # +report pulls in a lot of dependencies and the tests fail. # See https://github.com/BlueBrain/CoreNeuron/issues/518 re: build_type - SPACK_PACKAGE_SPEC: +gpu+tests~legacy-unit~report build_type=RelWithDebInfo + SPACK_PACKAGE_SPEC: +gpu+openmp+tests~legacy-unit~report build_type=RelWithDebInfo extends: - .spack_build - .spack_nvhpc @@ -121,6 +133,10 @@ test:coreneuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron+nmodl:gpu"] +test:coreneuron+nmodl~openmp:gpu: + extends: [.ctest, .gpu_node] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + test:coreneuron:gpu: extends: [.ctest, .gpu_node] needs: ["build:coreneuron:gpu"] @@ -153,6 +169,18 @@ build:neuron+nmodl:gpu: - !reference [.spack_build, before_script] needs: ["build:coreneuron+nmodl:gpu"] +build:neuron+nmodl~openmp:gpu: + stage: build_neuron + extends: + - .spack_build + - .spack_neuron + - .spack_nvhpc + before_script: + # Build py-cython and py-numpy with GCC instead of NVHPC. 
+ - SPACK_PACKAGE_DEPENDENCIES="${SPACK_PACKAGE_DEPENDENCIES}^py-cython%gcc^py-numpy%gcc" + - !reference [.spack_build, before_script] + needs: ["build:coreneuron+nmodl~openmp:gpu"] + build:neuron:gpu: stage: build_neuron extends: @@ -180,6 +208,11 @@ test:neuron+nmodl:gpu: extends: [.ctest, .gpu_node] needs: ["build:neuron+nmodl:gpu"] +test:neuron+nmodl~openmp:gpu: + stage: test_neuron + extends: [.ctest, .gpu_node] + needs: ["build:neuron+nmodl~openmp:gpu"] + test:neuron:gpu: stage: test_neuron extends: [.ctest, .gpu_node] diff --git a/CMake/OpenAccHelper.cmake b/CMake/OpenAccHelper.cmake index 7767a3672..5838742f8 100644 --- a/CMake/OpenAccHelper.cmake +++ b/CMake/OpenAccHelper.cmake @@ -10,6 +10,9 @@ if(CORENRN_ENABLE_GPU) # Enable cudaProfiler{Start,Stop}() behind the Instrumentor::phase... APIs add_compile_definitions(CORENEURON_CUDA_PROFILING CORENEURON_ENABLE_GPU) + # Plain C++ code in CoreNEURON may need to use CUDA runtime APIs for, for example, starting and + # stopping profiling. This makes sure those headers can be found. + include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # cuda unified memory support if(CORENRN_ENABLE_CUDA_UNIFIED_MEMORY) add_compile_definitions(CORENEURON_UNIFIED_MEMORY) @@ -47,25 +50,32 @@ if(CORENRN_ENABLE_GPU) endif() set(CORENRN_CUDA_VERSION_SHORT "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}") endif() - # -acc enables OpenACC support, -cuda links CUDA libraries and (very importantly!) seems to be - # required to make the NVHPC compiler do the device code linking. Otherwise the explicit CUDA - # device code (.cu files in libcoreneuron) has to be linked in a separate, earlier, step, which - # apparently causes problems with interoperability with OpenACC. Passing -cuda to nvc++ when - # compiling (as opposed to linking) seems to enable CUDA C++ support, which has other consequences - # due to e.g. __CUDACC__ being defined. See https://github.com/BlueBrain/CoreNeuron/issues/607 for - # more information about this. -gpu=cudaX.Y ensures that OpenACC code is compiled with the same - # CUDA version as is used for the explicit CUDA code. - set(NVHPC_ACC_COMP_FLAGS "-acc -gpu=cuda${CORENRN_CUDA_VERSION_SHORT}") - set(NVHPC_ACC_LINK_FLAGS "-acc -cuda") + # -cuda links CUDA libraries and also seems to be important to make the NVHPC do the device code + # linking. Without this, we had problems with linking between the explicit CUDA (.cu) device code + # and offloaded OpenACC/OpenMP code. Using -cuda when compiling seems to improve error messages in + # some cases, and to be recommended by NVIDIA. We pass -gpu=cudaX.Y to ensure that OpenACC/OpenMP + # code is compiled with the same CUDA version as the explicit CUDA code. + set(NVHPC_ACC_COMP_FLAGS "-cuda -gpu=cuda${CORENRN_CUDA_VERSION_SHORT},lineinfo") # Make sure that OpenACC code is generated for the same compute capabilities as the explicit CUDA # code. Otherwise there may be confusing linker errors. We cannot rely on nvcc and nvc++ using the # same default compute capabilities as each other, particularly on GPU-less build machines. foreach(compute_capability ${CMAKE_CUDA_ARCHITECTURES}) string(APPEND NVHPC_ACC_COMP_FLAGS ",cc${compute_capability}") endforeach() + if(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenMP") + # Enable OpenMP target offload to GPU and if both OpenACC and OpenMP directives are available + # for a region then prefer OpenMP. 
+ add_compile_definitions(CORENEURON_PREFER_OPENMP_OFFLOAD) + string(APPEND NVHPC_ACC_COMP_FLAGS " -mp=gpu") + elseif(CORENRN_ACCELERATOR_OFFLOAD STREQUAL "OpenACC") + # Only enable OpenACC offload for GPU + string(APPEND NVHPC_ACC_COMP_FLAGS " -acc") + else() + message(FATAL_ERROR "${CORENRN_ACCELERATOR_OFFLOAD} not supported with NVHPC compilers") + endif() # avoid PGI adding standard compliant "-A" flags set(CMAKE_CXX14_STANDARD_COMPILE_OPTION --c++14) - string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_LINK_FLAGS}") + string(APPEND CMAKE_EXE_LINKER_FLAGS " ${NVHPC_ACC_COMP_FLAGS}") # Use `-Mautoinline` option to compile .cpp files generated from .mod files only. This is # especially needed when we compile with -O0 or -O1 optimisation level where we get link errors. # Use of `-Mautoinline` ensure that the necessary functions like `net_receive_kernel` are inlined @@ -81,7 +91,7 @@ if(CORENRN_ENABLE_GPU) GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS - "${NVHPC_ACC_COMP_FLAGS} ${NVHPC_ACC_LINK_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" + "${NVHPC_ACC_COMP_FLAGS} -rdynamic -lrt -Wl,--whole-archive -L${CMAKE_HOST_SYSTEM_PROCESSOR} -lcorenrnmech -L${CMAKE_INSTALL_PREFIX}/lib -lcoreneuron -Wl,--no-whole-archive" ) else() set_property(GLOBAL PROPERTY CORENEURON_LIB_LINK_FLAGS diff --git a/CMakeLists.txt b/CMakeLists.txt index 1b3edf3a5..d3e1950d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,7 @@ add_subdirectory(${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11) # Build options # ============================================================================= option(CORENRN_ENABLE_OPENMP "Build the CORE NEURON with OpenMP implementation" ON) +option(CORENRN_ENABLE_OPENMP_OFFLOAD "Prefer OpenMP target offload to OpenACC" ON) option(CORENRN_ENABLE_TIMEOUT "Enable nrn_timeout implementation" ON) option(CORENRN_ENABLE_REPORTING "Enable use of ReportingLib for soma reports" OFF) option(CORENRN_ENABLE_MPI "Enable MPI-based execution" ON) @@ -117,6 +118,7 @@ else() set(CORENRN_HAVE_NVHPC_COMPILER OFF) endif() +set(CORENRN_ACCELERATOR_OFFLOAD "Disabled") if(CORENRN_ENABLE_GPU) # Older CMake versions than 3.15 have not been tested for GPU/CUDA/OpenACC support after # https://github.com/BlueBrain/CoreNeuron/pull/609. @@ -135,7 +137,7 @@ if(CORENRN_ENABLE_GPU) # Set some sensible default CUDA architectures. 
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 60 70 80) + set(CMAKE_CUDA_ARCHITECTURES 70 80) message(STATUS "Setting default CUDA architectures to ${CMAKE_CUDA_ARCHITECTURES}") endif() @@ -185,6 +187,18 @@ if(CORENRN_ENABLE_GPU) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr -Xcudafe --diag_suppress=3057,--diag_suppress=3085" ) + + if(CORENRN_ENABLE_NMODL) + # NMODL supports both OpenACC and OpenMP target offload + if(CORENRN_ENABLE_OPENMP AND CORENRN_ENABLE_OPENMP_OFFLOAD) + set(CORENRN_ACCELERATOR_OFFLOAD "OpenMP") + else() + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() + else() + # MOD2C only supports OpenACC offload + set(CORENRN_ACCELERATOR_OFFLOAD "OpenACC") + endif() endif() # ============================================================================= @@ -526,6 +540,7 @@ message(STATUS "MOD2CPP PATH | ${CORENRN_MOD2CPP_BINARY}") message(STATUS "GPU Support | ${CORENRN_ENABLE_GPU}") if(CORENRN_ENABLE_GPU) message(STATUS " CUDA | ${CUDAToolkit_LIBRARY_DIR}") + message(STATUS " Offload | ${CORENRN_ACCELERATOR_OFFLOAD}") message(STATUS " Unified Memory | ${CORENRN_ENABLE_CUDA_UNIFIED_MEMORY}") endif() message(STATUS "Auto Timeout | ${CORENRN_ENABLE_TIMEOUT}") diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 6fd5c98a8..d370df1df 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -284,6 +284,16 @@ target_include_directories(coreneuron SYSTEM target_include_directories(coreneuron SYSTEM PRIVATE ${CORENEURON_PROJECT_SOURCE_DIR}/external/CLI11/include) +if(CORENRN_ENABLE_GPU) + # nrnran123.cpp possibly-temporarily uses Boost.Pool in GPU builds if it's available. + find_package(Boost QUIET) + if(Boost_FOUND) + message(STATUS "Boost found, enabling use of memory pools for Random123...") + target_include_directories(coreneuron SYSTEM PRIVATE ${Boost_INCLUDE_DIRS}) + target_compile_definitions(coreneuron PRIVATE CORENEURON_USE_BOOST_POOL) + endif() +endif() + set_target_properties( coreneuron scopmath PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib diff --git a/coreneuron/apps/corenrn_parameters.cpp b/coreneuron/apps/corenrn_parameters.cpp index c0aa02ab0..4403f44db 100644 --- a/coreneuron/apps/corenrn_parameters.cpp +++ b/coreneuron/apps/corenrn_parameters.cpp @@ -47,7 +47,12 @@ corenrn_parameters::corenrn_parameters() { "Print number of instances of each mechanism and detailed memory stats."); auto sub_gpu = app.add_option_group("GPU", "Commands relative to GPU."); - sub_gpu->add_option("-W, --nwarp", this->nwarp, "Number of warps to balance.", true) + sub_gpu + ->add_option("-W, --nwarp", + this->nwarp, + "Number of warps to execute in parallel the Hines solver. Each warp solves a " + "group of cells. (Only used with cell permute 2)", + true) ->check(CLI::Range(0, 1'000'000)); sub_gpu ->add_option("-R, --cell-permute", diff --git a/coreneuron/apps/corenrn_parameters.hpp b/coreneuron/apps/corenrn_parameters.hpp index ea7ef8aba..e22cf348d 100644 --- a/coreneuron/apps/corenrn_parameters.hpp +++ b/coreneuron/apps/corenrn_parameters.hpp @@ -46,8 +46,8 @@ struct corenrn_parameters { unsigned ms_subint = 2; /// Number of multisend interval. 
1 or 2 unsigned spkcompress = 0; /// Spike Compression unsigned cell_interleave_permute = 0; /// Cell interleaving permutation - unsigned nwarp = 0; /// Number of warps to balance for cell_interleave_permute == 2 - unsigned num_gpus = 0; /// Number of gpus to use per node + unsigned nwarp = 65536; /// Number of warps to balance for cell_interleave_permute == 2 + unsigned num_gpus = 0; /// Number of gpus to use per node unsigned report_buff_size = report_buff_size_default; /// Size in MB of the report buffer. int seed = -1; /// Initialization seed for random number generator (int) diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index 0fdaa509b..5bfda9421 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -193,10 +193,11 @@ void nrn_init_and_load_data(int argc, // precedence is: set by user, globals.dat, 34.0 celsius = corenrn_param.celsius; -#if _OPENACC +#if CORENEURON_ENABLE_GPU if (!corenrn_param.gpu && corenrn_param.cell_interleave_permute == 2) { fprintf(stderr, - "compiled with _OPENACC does not allow the combination of --cell-permute=2 and " + "compiled with CORENEURON_ENABLE_GPU does not allow the combination of " + "--cell-permute=2 and " "missing --gpu\n"); exit(1); } @@ -497,7 +498,7 @@ extern "C" void mk_mech_init(int argc, char** argv) { } #endif -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (corenrn_param.gpu) { init_gpu(); } @@ -558,10 +559,8 @@ extern "C" int run_solve_core(int argc, char** argv) { #endif bool compute_gpu = corenrn_param.gpu; - // clang-format off - - #pragma acc update device(celsius, secondorder, pi) if (compute_gpu) - // clang-format on + nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu)) + nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu)) { double v = corenrn_param.voltage; double dt = corenrn_param.dt; diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp index ac98f5420..d5e723527 100644 --- a/coreneuron/gpu/nrn_acc_manager.cpp +++ b/coreneuron/gpu/nrn_acc_manager.cpp @@ -24,13 +24,14 @@ #include "coreneuron/mpi/nrnmpidec.h" #include "coreneuron/utils/utils.hpp" -#ifdef _OPENACC -#include -#endif - #ifdef CRAYPAT #include #endif + +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#include +#endif + namespace coreneuron { extern InterleaveInfo* interleave_info; void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div); @@ -40,9 +41,227 @@ void nrn_ion_global_map_delete_from_device(); void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay); void nrn_VecPlay_delete_from_device(NrnThread* nt); +int cnrn_target_get_num_devices() { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + // choose nvidia GPU by default + acc_device_t device_type = acc_device_nvidia; + // check how many gpu devices available per node + return acc_get_num_devices(device_type); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + return omp_get_num_devices(); +#else + throw std::runtime_error( + "cnrn_target_get_num_devices() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +void cnrn_target_set_default_device(int device_num) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_set_device_num(device_num, acc_device_nvidia); +#elif defined(CORENEURON_ENABLE_GPU) && 
defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_set_default_device(device_num); + // It seems that with NVHPC 21.9 then only setting the default OpenMP device + // is not enough: there were errors on some nodes when not-the-0th GPU was + // used. These seemed to be related to the NMODL instance structs, which are + // allocated using cudaMallocManaged. + auto const cuda_code = cudaSetDevice(device_num); + assert(cuda_code == cudaSuccess); +#else + throw std::runtime_error( + "cnrn_target_set_default_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +static Memb_list* copy_ml_to_device(const Memb_list* ml, int type) { + // As we never run code for artificial cell inside GPU we don't copy it. + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return nullptr; + } + + auto d_ml = cnrn_target_copyin(ml); + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + double* dptr = cnrn_target_deviceptr(ml->data); + cnrn_target_memcpy_to_device(&(d_ml->data), &(dptr)); + + + int* d_nodeindices = cnrn_target_copyin(ml->nodeindices, n); + cnrn_target_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices); + + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + int* d_pdata = cnrn_target_copyin(ml->pdata, pcnt); + cnrn_target_memcpy_to_device(&(d_ml->pdata), &d_pdata); + } + + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + ThreadDatum* td = cnrn_target_copyin(ml->_thread, ts); + cnrn_target_memcpy_to_device(&(d_ml->_thread), &td); + } + + // net_receive buffer associated with mechanism + NetReceiveBuffer_t* nrb = ml->_net_receive_buffer; + + // if net receive buffer exist for mechanism + if (nrb) { + NetReceiveBuffer_t* d_nrb = cnrn_target_copyin(nrb); + cnrn_target_memcpy_to_device(&(d_ml->_net_receive_buffer), &d_nrb); + + int* d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); + + int* d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); + + double* d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); + + double* d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); + + int* d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); + + int* d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); + } + + /* copy NetSendBuffer_t on to GPU */ + NetSendBuffer_t* nsb = ml->_net_send_buffer; + + if (nsb) { + NetSendBuffer_t* d_nsb; + int* d_iptr; + double* d_dptr; + + d_nsb = cnrn_target_copyin(nsb); + cnrn_target_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb); + + d_iptr = cnrn_target_copyin(nsb->_sendtype, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_vdata_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_pnt_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr); + + d_iptr = cnrn_target_copyin(nsb->_weight_index, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr); + + d_dptr = 
cnrn_target_copyin(nsb->_nsb_t, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr); + + d_dptr = cnrn_target_copyin(nsb->_nsb_flag, nsb->_size); + cnrn_target_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr); + } + + return d_ml; +} + +static void update_ml_on_host(const Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + // Artificial mechanisms such as PatternStim and IntervalFire + // are not copied onto the GPU. They should not, therefore, be + // updated from the GPU. + return; + } + + int n = ml->nodecount; + int szp = corenrn.get_prop_param_size()[type]; + int szdp = corenrn.get_prop_dparam_size()[type]; + + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; + + nrn_pragma_acc(update self(ml->data[:pcnt], ml->nodeindices[:n])) + nrn_pragma_omp(target update from(ml->data[:pcnt], ml->nodeindices[:n])) + + int dpcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + nrn_pragma_acc(update self(ml->pdata[:dpcnt]) if (szdp)) + nrn_pragma_omp(target update from(ml->pdata[:dpcnt]) if (szdp)) + + auto nrb = ml->_net_receive_buffer; + + // clang-format off + nrn_pragma_acc(update self(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + nrn_pragma_omp(target update from(nrb->_cnt, + nrb->_size, + nrb->_pnt_offset, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_size], + nrb->_weight_index[:nrb->_size], + nrb->_displ[:nrb->_size + 1], + nrb->_nrb_index[:nrb->_size]) + if (nrb != nullptr)) + // clang-format on +} + +static void delete_ml_from_device(Memb_list* ml, int type) { + int is_art = corenrn.get_is_artificial()[type]; + if (is_art) { + return; + } + // Cleanup the net send buffer if it exists + { + NetSendBuffer_t* nsb{ml->_net_send_buffer}; + if (nsb) { + cnrn_target_delete(nsb->_nsb_flag, nsb->_size); + cnrn_target_delete(nsb->_nsb_t, nsb->_size); + cnrn_target_delete(nsb->_weight_index, nsb->_size); + cnrn_target_delete(nsb->_pnt_index, nsb->_size); + cnrn_target_delete(nsb->_vdata_index, nsb->_size); + cnrn_target_delete(nsb->_sendtype, nsb->_size); + cnrn_target_delete(nsb); + } + } + // Cleanup the net receive buffer if it exists. 
+ { + NetReceiveBuffer_t* nrb{ml->_net_receive_buffer}; + if (nrb) { + cnrn_target_delete(nrb->_nrb_index, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb); + } + } + int n = ml->nodecount; + int szdp = corenrn.get_prop_dparam_size()[type]; + int ts = corenrn.get_memb_funcs()[type].thread_size_; + if (ts) { + cnrn_target_delete(ml->_thread, ts); + } + if (szdp) { + int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; + cnrn_target_delete(ml->pdata, pcnt); + } + cnrn_target_delete(ml->nodeindices, n); + cnrn_target_delete(ml); +} + /* note: threads here are corresponding to global nrn_threads array */ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // initialize NrnThreads for gpu execution // empty thread or only artificial cells should be on cpu for (int i = 0; i < nthreads; i++) { @@ -58,13 +277,13 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { NrnThread* nt = threads + i; // NrnThread on host if (nt->n_presyn) { - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); // note: we are using unified memory for NrnThread. Once VecPlay is copied to gpu, // we dont want to update nt->vecplay because it will also set gpu pointer of vecplay // inside nt on cpu (due to unified memory). @@ -82,7 +301,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * find * corresponding NrnThread using Point_process in NET_RECEIVE block */ - NrnThread* d_threads = (NrnThread*) acc_copyin(threads, sizeof(NrnThread) * nthreads); + NrnThread* d_threads = cnrn_target_copyin(threads, nthreads); if (interleave_info == nullptr) { printf("\n Warning: No permutation data? 
Required for linear algebra!"); @@ -101,7 +320,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { /* -- copy _data to device -- */ /*copy all double data for thread */ - d__data = (double*) acc_copyin(nt->_data, nt->_ndata * sizeof(double)); + d__data = cnrn_target_copyin(nt->_data, nt->_ndata); + /* Here is the example of using OpenACC data enter/exit * Remember that we are not allowed to use nt->_data but we have to use: @@ -111,7 +331,7 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { */ /*update d_nt._data to point to device copy */ - acc_memcpy_to_device(&(d_nt->_data), &d__data, sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_data), &d__data); /* -- setup rhs, d, a, b, v, node_aread to point to device copy -- */ double* dptr; @@ -120,36 +340,34 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); dptr = d__data + 0 * ne; - acc_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_rhs), &(dptr)); dptr = d__data + 1 * ne; - acc_memcpy_to_device(&(d_nt->_actual_d), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_d), &(dptr)); dptr = d__data + 2 * ne; - acc_memcpy_to_device(&(d_nt->_actual_a), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_a), &(dptr)); dptr = d__data + 3 * ne; - acc_memcpy_to_device(&(d_nt->_actual_b), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_b), &(dptr)); dptr = d__data + 4 * ne; - acc_memcpy_to_device(&(d_nt->_actual_v), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_v), &(dptr)); dptr = d__data + 5 * ne; - acc_memcpy_to_device(&(d_nt->_actual_area), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_area), &(dptr)); if (nt->_actual_diam) { dptr = d__data + 6 * ne; - acc_memcpy_to_device(&(d_nt->_actual_diam), &(dptr), sizeof(double*)); + cnrn_target_memcpy_to_device(&(d_nt->_actual_diam), &(dptr)); } - int* d_v_parent_index = (int*) acc_copyin(nt->_v_parent_index, nt->end * sizeof(int)); - acc_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index), sizeof(int*)); + int* d_v_parent_index = cnrn_target_copyin(nt->_v_parent_index, nt->end); + cnrn_target_memcpy_to_device(&(d_nt->_v_parent_index), &(d_v_parent_index)); /* nt._ml_list is used in NET_RECEIVE block and should have valid membrane list id*/ - Memb_list** d_ml_list = (Memb_list**) acc_copyin(nt->_ml_list, - corenrn.get_memb_funcs().size() * - sizeof(Memb_list*)); - acc_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list), sizeof(Memb_list**)); + Memb_list** d_ml_list = cnrn_target_copyin(nt->_ml_list, corenrn.get_memb_funcs().size()); + cnrn_target_memcpy_to_device(&(d_nt->_ml_list), &(d_ml_list)); /* -- copy NrnThreadMembList list ml to device -- */ @@ -160,119 +378,25 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { for (auto tml = nt->tml; tml; tml = tml->next) { /*copy tml to device*/ /*QUESTIONS: does tml will point to nullptr as in host ? 
: I assume so!*/ - auto d_tml = (NrnThreadMembList*) acc_copyin(tml, sizeof(NrnThreadMembList)); + auto d_tml = cnrn_target_copyin(tml); /*first tml is pointed by nt */ if (first_tml) { - acc_memcpy_to_device(&(d_nt->tml), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_nt->tml), &d_tml); first_tml = false; } else { /*rest of tml forms linked list */ - acc_memcpy_to_device(&(d_last_tml->next), &d_tml, sizeof(NrnThreadMembList*)); + cnrn_target_memcpy_to_device(&(d_last_tml->next), &d_tml); } // book keeping for linked-list d_last_tml = d_tml; /* now for every tml, there is a ml. copy that and setup pointer */ - auto d_ml = (Memb_list*) acc_copyin(tml->ml, sizeof(Memb_list)); - acc_memcpy_to_device(&(d_tml->ml), &d_ml, sizeof(Memb_list*)); - + Memb_list* d_ml = copy_ml_to_device(tml->ml, tml->index); + cnrn_target_memcpy_to_device(&(d_tml->ml), &d_ml); /* setup nt._ml_list */ - acc_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml, sizeof(Memb_list*)); - - int type = tml->index; - int n = tml->ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // get device pointer for corresponding mechanism data - dptr = (double*) acc_deviceptr(tml->ml->data); - acc_memcpy_to_device(&(d_ml->data), &(dptr), sizeof(double*)); - - - if (!is_art) { - int* d_nodeindices = (int*) acc_copyin(tml->ml->nodeindices, sizeof(int) * n); - acc_memcpy_to_device(&(d_ml->nodeindices), &d_nodeindices, sizeof(int*)); - } - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - int* d_pdata = (int*) acc_copyin(tml->ml->pdata, sizeof(int) * pcnt); - acc_memcpy_to_device(&(d_ml->pdata), &d_pdata, sizeof(int*)); - } - - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - ThreadDatum* td = (ThreadDatum*) acc_copyin(tml->ml->_thread, - ts * sizeof(ThreadDatum)); - acc_memcpy_to_device(&(d_ml->_thread), &td, sizeof(ThreadDatum*)); - } - - NetReceiveBuffer_t *nrb, *d_nrb; - int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; - double *d_nrb_t, *d_nrb_flag; - - // net_receive buffer associated with mechanism - nrb = tml->ml->_net_receive_buffer; - - // if net receive buffer exist for mechanism - if (nrb) { - d_nrb = (NetReceiveBuffer_t*) acc_copyin(nrb, sizeof(NetReceiveBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_receive_buffer), - &d_nrb, - sizeof(NetReceiveBuffer_t*)); - - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); - - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); - - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); - - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); - - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); - - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); - } - - /* copy NetSendBuffer_t on to GPU */ - NetSendBuffer_t* nsb; - nsb = tml->ml->_net_send_buffer; - - if (nsb) { - NetSendBuffer_t* d_nsb; - int* d_iptr; - double* d_dptr; 
- - d_nsb = (NetSendBuffer_t*) acc_copyin(nsb, sizeof(NetSendBuffer_t)); - acc_memcpy_to_device(&(d_ml->_net_send_buffer), &d_nsb, sizeof(NetSendBuffer_t*)); - - d_iptr = (int*) acc_copyin(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_sendtype), &d_iptr, sizeof(int*)); - - d_iptr = (int*) acc_copyin(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_vdata_index), &d_iptr, sizeof(int*)); - - d_iptr = (int*) acc_copyin(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_pnt_index), &d_iptr, sizeof(int*)); - - d_iptr = (int*) acc_copyin(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_weight_index), &d_iptr, sizeof(int*)); - - d_dptr = (double*) acc_copyin(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_t), &d_dptr, sizeof(double*)); - - d_dptr = (double*) acc_copyin(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_memcpy_to_device(&(d_nsb->_nsb_flag), &d_dptr, sizeof(double*)); - } + cnrn_target_memcpy_to_device(&(d_ml_list[tml->index]), &d_ml); } if (nt->shadow_rhs_cnt) { @@ -281,50 +405,46 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); /* copy shadow_rhs to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_rhs, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_rhs, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_rhs), &d_shadow_ptr); /* copy shadow_d to device and fix-up the pointer */ - d_shadow_ptr = (double*) acc_copyin(nt->_shadow_d, pcnt * sizeof(double)); - acc_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr, sizeof(double*)); + d_shadow_ptr = cnrn_target_copyin(nt->_shadow_d, pcnt); + cnrn_target_memcpy_to_device(&(d_nt->_shadow_d), &d_shadow_ptr); } /* Fast membrane current calculation struct */ if (nt->nrn_fast_imem) { - auto* d_fast_imem = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem, sizeof(NrnFastImem))); - acc_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem, sizeof(NrnFastImem*)); + NrnFastImem* d_fast_imem = cnrn_target_copyin(nt->nrn_fast_imem); + cnrn_target_memcpy_to_device(&(d_nt->nrn_fast_imem), &d_fast_imem); { - auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_rhs), &d_ptr); } { - auto* d_ptr = reinterpret_cast( - acc_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double))); - acc_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr, sizeof(double*)); + double* d_ptr = cnrn_target_copyin(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_memcpy_to_device(&(d_fast_imem->nrn_sav_d), &d_ptr); } } if (nt->n_pntproc) { /* copy Point_processes array and fix the pointer to execute net_receive blocks on GPU */ - Point_process* pntptr = - (Point_process*) acc_copyin(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - acc_memcpy_to_device(&(d_nt->pntprocs), &pntptr, sizeof(Point_process*)); + Point_process* pntptr = cnrn_target_copyin(nt->pntprocs, nt->n_pntproc); + cnrn_target_memcpy_to_device(&(d_nt->pntprocs), &pntptr); } if (nt->n_weight) { /* copy weight vector used in NET_RECEIVE which is pointed by netcon.weight */ - double* 
d_weights = (double*) acc_copyin(nt->weights, sizeof(double) * nt->n_weight); - acc_memcpy_to_device(&(d_nt->weights), &d_weights, sizeof(double*)); + double* d_weights = cnrn_target_copyin(nt->weights, nt->n_weight); + cnrn_target_memcpy_to_device(&(d_nt->weights), &d_weights); } if (nt->_nvdata) { /* copy vdata which is setup in bbcore_read. This contains cuda allocated * nrnran123_State * */ - void** d_vdata = (void**) acc_copyin(nt->_vdata, sizeof(void*) * nt->_nvdata); - acc_memcpy_to_device(&(d_nt->_vdata), &d_vdata, sizeof(void**)); + void** d_vdata = cnrn_target_copyin(nt->_vdata, nt->_nvdata); + cnrn_target_memcpy_to_device(&(d_nt->_vdata), &d_vdata); } if (nt->n_presyn) { @@ -333,25 +453,24 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { * while updating PreSyn objects which has virtual base class. May be this is issue due * to * VTable and alignment */ - PreSynHelper* d_presyns_helper = - (PreSynHelper*) acc_copyin(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper, sizeof(PreSynHelper*)); - PreSyn* d_presyns = (PreSyn*) acc_copyin(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_memcpy_to_device(&(d_nt->presyns), &d_presyns, sizeof(PreSyn*)); + PreSynHelper* d_presyns_helper = cnrn_target_copyin(nt->presyns_helper, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns_helper), &d_presyns_helper); + PreSyn* d_presyns = cnrn_target_copyin(nt->presyns, nt->n_presyn); + cnrn_target_memcpy_to_device(&(d_nt->presyns), &d_presyns); } if (nt->_net_send_buffer_size) { /* copy send_receive buffer */ - int* d_net_send_buffer = (int*) acc_copyin(nt->_net_send_buffer, - sizeof(int) * nt->_net_send_buffer_size); - acc_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer, sizeof(int*)); + int* d_net_send_buffer = cnrn_target_copyin(nt->_net_send_buffer, + nt->_net_send_buffer_size); + cnrn_target_memcpy_to_device(&(d_nt->_net_send_buffer), &d_net_send_buffer); } if (nt->n_vecplay) { /* copy VecPlayContinuous instances */ /** just empty containers */ - void** d_vecplay = (void**) acc_copyin(nt->_vecplay, sizeof(void*) * nt->n_vecplay); - acc_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay, sizeof(void**)); + void** d_vecplay = cnrn_target_copyin(nt->_vecplay, nt->n_vecplay); + cnrn_target_memcpy_to_device(&(d_nt->_vecplay), &d_vecplay); nrn_VecPlay_copyto_device(nt, d_vecplay); } @@ -360,41 +479,41 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (interleave_permute_type == 1) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); int* d_ptr = nullptr; + InterleaveInfo* d_info = cnrn_target_copyin(info); - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * (info->nstride + 1)); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride + 1); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) acc_copyin(info->firstnode, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, nt->ncell); + 
cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * nt->ncell); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, nt->ncell); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else if (interleave_permute_type == 2) { /* todo: not necessary to setup pointers, just copy it */ InterleaveInfo* info = interleave_info + i; - InterleaveInfo* d_info = (InterleaveInfo*) acc_copyin(info, sizeof(InterleaveInfo)); + InterleaveInfo* d_info = cnrn_target_copyin(info); int* d_ptr = nullptr; - d_ptr = (int*) acc_copyin(info->stride, sizeof(int) * info->nstride); - acc_memcpy_to_device(&(d_info->stride), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stride, info->nstride); + cnrn_target_memcpy_to_device(&(d_info->stride), &d_ptr); - d_ptr = (int*) acc_copyin(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->firstnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->firstnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->firstnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->lastnode), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->lastnode, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->lastnode), &d_ptr); - d_ptr = (int*) acc_copyin(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_memcpy_to_device(&(d_info->stridedispl), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->stridedispl, info->nwarp + 1); + cnrn_target_memcpy_to_device(&(d_info->stridedispl), &d_ptr); - d_ptr = (int*) acc_copyin(info->cellsize, sizeof(int) * info->nwarp); - acc_memcpy_to_device(&(d_info->cellsize), &d_ptr, sizeof(int*)); + d_ptr = cnrn_target_copyin(info->cellsize, info->nwarp); + cnrn_target_memcpy_to_device(&(d_info->cellsize), &d_ptr); } else { printf("\n ERROR: only --cell_permute = [12] implemented"); abort(); @@ -408,38 +527,30 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { // Create a device-side copy of the `trajec_requests` struct and // make sure the device-side NrnThread object knows about it. - auto* d_trajec_requests = reinterpret_cast( - acc_copyin(tr, sizeof(TrajectoryRequests))); - acc_memcpy_to_device(&(d_nt->trajec_requests), - &d_trajec_requests, - sizeof(TrajectoryRequests*)); + TrajectoryRequests* d_trajec_requests = cnrn_target_copyin(tr); + cnrn_target_memcpy_to_device(&(d_nt->trajec_requests), &d_trajec_requests); // Initialise the double** gather member of the struct. - auto* d_tr_gather = reinterpret_cast( - acc_copyin(tr->gather, sizeof(double*) * tr->n_trajec)); - acc_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather, sizeof(double**)); + double** d_tr_gather = cnrn_target_copyin(tr->gather, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->gather), &d_tr_gather); // Initialise the double** varrays member of the struct if it's // set. 
double** d_tr_varrays{nullptr}; if (tr->varrays) { - d_tr_varrays = reinterpret_cast( - acc_copyin(tr->varrays, sizeof(double*) * tr->n_trajec)); - acc_memcpy_to_device(&(d_trajec_requests->varrays), - &d_tr_varrays, - sizeof(double**)); + d_tr_varrays = cnrn_target_copyin(tr->varrays, tr->n_trajec); + cnrn_target_memcpy_to_device(&(d_trajec_requests->varrays), &d_tr_varrays); } for (int i = 0; i < tr->n_trajec; ++i) { if (tr->varrays) { // tr->varrays[i] is a buffer of tr->bsize doubles on the host, // make a device-side copy of it and store a pointer to it in // the device-side version of tr->varrays. - auto* d_buf_traj_i = reinterpret_cast( - acc_copyin(tr->varrays[i], tr->bsize * sizeof(double))); - acc_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i, sizeof(double*)); + double* d_buf_traj_i = cnrn_target_copyin(tr->varrays[i], tr->bsize); + cnrn_target_memcpy_to_device(&(d_tr_varrays[i]), &d_buf_traj_i); } // tr->gather[i] is a double* referring to (host) data in the // (host) _data block - auto* d_gather_i = acc_deviceptr(tr->gather[i]); - acc_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i, sizeof(double*)); + auto* d_gather_i = cnrn_target_deviceptr(tr->gather[i]); + cnrn_target_memcpy_to_device(&(d_tr_gather[i]), &d_gather_i); } // TODO: other `double** scatter` and `void** vpr` members of // the TrajectoryRequests struct are not copied to the device. @@ -457,14 +568,14 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) { } void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { -#ifdef _OPENACC - IvocVect* d_iv = (IvocVect*) acc_copyin((void*) &from, sizeof(IvocVect)); - acc_memcpy_to_device(&to, &d_iv, sizeof(IvocVect*)); +#ifdef CORENEURON_ENABLE_GPU + /// by default `to` is desitionation pointer on a device + IvocVect* d_iv = &to; size_t n = from.size(); if (n) { - double* d_data = (double*) acc_copyin((void*) from.data(), sizeof(double) * n); - acc_memcpy_to_device(&(d_iv->data_), &d_data, sizeof(double*)); + double* d_data = cnrn_target_copyin(from.data(), n); + cnrn_target_memcpy_to_device(&(d_iv->data_), &d_data); } #else (void) from; @@ -473,12 +584,12 @@ void copy_ivoc_vect_to_device(const IvocVect& from, IvocVect& to) { } void delete_ivoc_vect_from_device(IvocVect& vec) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU auto const n = vec.size(); if (n) { - acc_delete(vec.data(), sizeof(double) * n); + cnrn_target_delete(vec.data(), n); } - acc_delete(&vec, sizeof(IvocVect)); + cnrn_target_delete(&vec); #else (void) vec; #endif @@ -490,15 +601,15 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { return; } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { // free existing vectors in buffers on gpu - acc_delete(nrb->_pnt_index, nrb->_size * sizeof(int)); - acc_delete(nrb->_weight_index, nrb->_size * sizeof(int)); - acc_delete(nrb->_nrb_t, nrb->_size * sizeof(double)); - acc_delete(nrb->_nrb_flag, nrb->_size * sizeof(double)); - acc_delete(nrb->_displ, (nrb->_size + 1) * sizeof(int)); - acc_delete(nrb->_nrb_index, nrb->_size * sizeof(int)); + cnrn_target_delete(nrb->_pnt_index, nrb->_size); + cnrn_target_delete(nrb->_weight_index, nrb->_size); + cnrn_target_delete(nrb->_nrb_t, nrb->_size); + cnrn_target_delete(nrb->_nrb_flag, nrb->_size); + cnrn_target_delete(nrb->_displ, nrb->_size + 1); + cnrn_target_delete(nrb->_nrb_index, nrb->_size); } #endif @@ -511,34 +622,35 @@ void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml) { nrb->_displ = (int*) erealloc(nrb->_displ, (nrb->_size + 1) * sizeof(int)); 
nrb->_nrb_index = (int*) erealloc(nrb->_nrb_index, nrb->_size * sizeof(int)); -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (nt->compute_gpu) { int *d_weight_index, *d_pnt_index, *d_displ, *d_nrb_index; double *d_nrb_t, *d_nrb_flag; // update device copy - acc_update_device(nrb, sizeof(NetReceiveBuffer_t)); + nrn_pragma_acc(update device(nrb)); + nrn_pragma_omp(target update to(nrb)); - NetReceiveBuffer_t* d_nrb = (NetReceiveBuffer_t*) acc_deviceptr(nrb); + NetReceiveBuffer_t* d_nrb = cnrn_target_deviceptr(nrb); // recopy the vectors in the buffer - d_pnt_index = (int*) acc_copyin(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index, sizeof(int*)); + d_pnt_index = cnrn_target_copyin(nrb->_pnt_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_pnt_index), &d_pnt_index); - d_weight_index = (int*) acc_copyin(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index, sizeof(int*)); + d_weight_index = cnrn_target_copyin(nrb->_weight_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_weight_index), &d_weight_index); - d_nrb_t = (double*) acc_copyin(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t, sizeof(double*)); + d_nrb_t = cnrn_target_copyin(nrb->_nrb_t, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_t), &d_nrb_t); - d_nrb_flag = (double*) acc_copyin(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag, sizeof(double*)); + d_nrb_flag = cnrn_target_copyin(nrb->_nrb_flag, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_flag), &d_nrb_flag); - d_displ = (int*) acc_copyin(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_memcpy_to_device(&(d_nrb->_displ), &d_displ, sizeof(int*)); + d_displ = cnrn_target_copyin(nrb->_displ, nrb->_size + 1); + cnrn_target_memcpy_to_device(&(d_nrb->_displ), &d_displ); - d_nrb_index = (int*) acc_copyin(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index, sizeof(int*)); + d_nrb_index = cnrn_target_copyin(nrb->_nrb_index, nrb->_size); + cnrn_target_memcpy_to_device(&(d_nrb->_nrb_index), &d_nrb_index); } #endif } @@ -597,6 +709,10 @@ static void net_receive_buffer_order(NetReceiveBuffer_t* nrb) { void update_net_receive_buffer(NrnThread* nt) { Instrumentor::phase p_update_net_receive_buffer("update-net-receive-buf"); for (auto tml = nt->tml; tml; tml = tml->next) { + int is_art = corenrn.get_is_artificial()[tml->index]; + if (is_art) { + continue; + } // net_receive buffer to copy NetReceiveBuffer_t* nrb = tml->ml->_net_receive_buffer; @@ -605,29 +721,39 @@ void update_net_receive_buffer(NrnThread* nt) { // instance order to avoid race. 
setup _displ and _nrb_index net_receive_buffer_order(nrb); -#ifdef _OPENACC if (nt->compute_gpu) { Instrumentor::phase p_net_receive_buffer_order("net-receive-buf-cpu2gpu"); // note that dont update nrb otherwise we lose pointers + // clang-format off + /* update scalar elements */ - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_cnt); - acc_update_device(nrb->_nrb_t, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_nrb_flag, sizeof(double) * nrb->_cnt); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_displ_cnt + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_cnt); + nrn_pragma_acc(update device(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt]) + async(nt->stream_id)) + nrn_pragma_omp(target update to(nrb->_cnt, + nrb->_displ_cnt, + nrb->_pnt_index[:nrb->_cnt], + nrb->_weight_index[:nrb->_cnt], + nrb->_nrb_t[:nrb->_cnt], + nrb->_nrb_flag[:nrb->_cnt], + nrb->_displ[:nrb->_displ_cnt + 1], + nrb->_nrb_index[:nrb->_cnt])) + // clang-format on } -#endif } } + nrn_pragma_acc(wait(nt->stream_id)) } void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU if (!nt->compute_gpu) return; @@ -641,13 +767,23 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { if (nsb->_cnt) { Instrumentor::phase p_net_receive_buffer_order("net-send-buf-gpu2cpu"); - acc_update_self(nsb->_sendtype, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_vdata_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_pnt_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_weight_index, sizeof(int) * nsb->_cnt); - acc_update_self(nsb->_nsb_t, sizeof(double) * nsb->_cnt); - acc_update_self(nsb->_nsb_flag, sizeof(double) * nsb->_cnt); } + // clang-format off + nrn_pragma_acc(update self(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + nrn_pragma_omp(target update from(nsb->_sendtype[:nsb->_cnt], + nsb->_vdata_index[:nsb->_cnt], + nsb->_pnt_index[:nsb->_cnt], + nsb->_weight_index[:nsb->_cnt], + nsb->_nsb_t[:nsb->_cnt], + nsb->_nsb_flag[:nsb->_cnt]) + if (nsb->_cnt)) + // clang-format on #else (void) nt; (void) nsb; @@ -655,7 +791,7 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) { } void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; @@ -665,197 +801,63 @@ void update_nrnthreads_on_host(NrnThread* threads, int nthreads) { int ne = nrn_soa_padded_size(nt->end, 0); - acc_update_self(nt->_actual_rhs, ne * sizeof(double)); - acc_update_self(nt->_actual_d, ne * sizeof(double)); - acc_update_self(nt->_actual_a, ne * sizeof(double)); - acc_update_self(nt->_actual_b, ne * sizeof(double)); - acc_update_self(nt->_actual_v, ne * sizeof(double)); - acc_update_self(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_self(nt->_actual_diam, ne * sizeof(double)); - } - - /* @todo: nt._ml_list[tml->index] = tml->ml; */ - - /* -- copy NrnThreadMembList list ml 
to host -- */ - for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - - acc_update_self(&tml->index, sizeof(int)); - acc_update_self(&ml->nodecount, sizeof(int)); - - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - - // Artificial mechanisms such as PatternStim and IntervalFire - // are not copied onto the GPU. They should not, therefore, be - // updated from the GPU. - if (is_art) { - continue; - } - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - acc_update_self(ml->data, pcnt * sizeof(double)); - acc_update_self(ml->nodeindices, n * sizeof(int)); - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_self(ml->pdata, pcnt * sizeof(int)); - } - - auto nrb = tml->ml->_net_receive_buffer; - - if (nrb) { - acc_update_self(&nrb->_cnt, sizeof(int)); - acc_update_self(&nrb->_size, sizeof(int)); - acc_update_self(&nrb->_pnt_offset, sizeof(int)); - acc_update_self(&nrb->_displ_cnt, sizeof(int)); - - acc_update_self(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_self(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_self(nrb->_nrb_index, sizeof(int) * nrb->_size); - } - } - - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_self(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_self(nt->_shadow_d, pcnt * sizeof(double)); - } - - if (nt->nrn_fast_imem) { - acc_update_self(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_update_self(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } - - if (nt->n_pntproc) { - acc_update_self(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - } - - if (nt->n_weight) { - acc_update_self(nt->weights, sizeof(double) * nt->n_weight); - } - - if (nt->n_presyn) { - acc_update_self(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_self(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - } - - { - TrajectoryRequests* tr = nt->trajec_requests; - if (tr && tr->varrays) { - // The full buffers have `bsize` entries, but only `vsize` - // of them are valid. 
- for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_self(tr->varrays[i], tr->vsize * sizeof(double)); - } - } - } - - /* dont update vdata, its pointer array - if(nt->_nvdata) { - acc_update_self(nt->_vdata, sizeof(double)*nt->_nvdata); - } - */ - } - } -#else - (void) threads; - (void) nthreads; -#endif -} - -void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC - - for (int i = 0; i < nthreads; i++) { - NrnThread* nt = threads + i; - - if (nt->compute_gpu && (nt->end > 0)) { - /* -- copy data to device -- */ - - int ne = nrn_soa_padded_size(nt->end, 0); + // clang-format off + nrn_pragma_acc(update self(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + nrn_pragma_omp(target update from(nt->_actual_rhs[:ne], + nt->_actual_d[:ne], + nt->_actual_a[:ne], + nt->_actual_b[:ne], + nt->_actual_v[:ne], + nt->_actual_area[:ne])) + // clang-format on - acc_update_device(nt->_actual_rhs, ne * sizeof(double)); - acc_update_device(nt->_actual_d, ne * sizeof(double)); - acc_update_device(nt->_actual_a, ne * sizeof(double)); - acc_update_device(nt->_actual_b, ne * sizeof(double)); - acc_update_device(nt->_actual_v, ne * sizeof(double)); - acc_update_device(nt->_actual_area, ne * sizeof(double)); - if (nt->_actual_diam) { - acc_update_device(nt->_actual_diam, ne * sizeof(double)); - } + nrn_pragma_acc(update self(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) + nrn_pragma_omp( + target update from(nt->_actual_diam[:ne]) if (nt->_actual_diam != nullptr)) /* @todo: nt._ml_list[tml->index] = tml->ml; */ /* -- copy NrnThreadMembList list ml to host -- */ for (auto tml = nt->tml; tml; tml = tml->next) { - Memb_list* ml = tml->ml; - int type = tml->index; - int n = ml->nodecount; - int szp = corenrn.get_prop_param_size()[type]; - int szdp = corenrn.get_prop_dparam_size()[type]; - - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szp; - - acc_update_device(ml->data, pcnt * sizeof(double)); - - if (!corenrn.get_is_artificial()[type]) { - acc_update_device(ml->nodeindices, n * sizeof(int)); - } - - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_update_device(ml->pdata, pcnt * sizeof(int)); - } - - auto nrb = tml->ml->_net_receive_buffer; - - if (nrb) { - acc_update_device(&nrb->_cnt, sizeof(int)); - acc_update_device(&nrb->_size, sizeof(int)); - acc_update_device(&nrb->_pnt_offset, sizeof(int)); - acc_update_device(&nrb->_displ_cnt, sizeof(int)); - - acc_update_device(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_update_device(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_update_device(nrb->_nrb_index, sizeof(int) * nrb->_size); + if (!corenrn.get_is_artificial()[tml->index]) { + nrn_pragma_acc(update self(tml->index, tml->ml->nodecount)) + nrn_pragma_omp(target update from(tml->index, tml->ml->nodecount)) } + update_ml_on_host(tml->ml, tml->index); } - if (nt->shadow_rhs_cnt) { - int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - /* copy shadow_rhs to host */ - acc_update_device(nt->_shadow_rhs, pcnt * sizeof(double)); - /* copy shadow_d to host */ - acc_update_device(nt->_shadow_d, pcnt * sizeof(double)); - } + int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); + /* copy shadow_rhs to host */ + /* copy shadow_d to host */ + nrn_pragma_acc( + update self(nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) + nrn_pragma_omp(target update from( + 
nt->_shadow_rhs[:pcnt], nt->_shadow_d[:pcnt]) if (nt->shadow_rhs_cnt)) - if (nt->nrn_fast_imem) { - acc_update_device(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_update_device(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - } + // clang-format off + nrn_pragma_acc(update self(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + nrn_pragma_omp(target update from(nt->nrn_fast_imem->nrn_sav_rhs[:nt->end], + nt->nrn_fast_imem->nrn_sav_d[:nt->end]) + if (nt->nrn_fast_imem != nullptr)) + // clang-format on - if (nt->n_pntproc) { - acc_update_device(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); - } + nrn_pragma_acc(update self(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) + nrn_pragma_omp(target update from(nt->pntprocs[:nt->n_pntproc]) if (nt->n_pntproc)) - if (nt->n_weight) { - acc_update_device(nt->weights, sizeof(double) * nt->n_weight); - } + nrn_pragma_acc(update self(nt->weights[:nt->n_weight]) if (nt->n_weight)) + nrn_pragma_omp(target update from(nt->weights[:nt->n_weight]) if (nt->n_weight)) - if (nt->n_presyn) { - acc_update_device(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); - acc_update_device(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - } + nrn_pragma_acc(update self( + nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) + nrn_pragma_omp(target update from( + nt->presyns_helper[:nt->n_presyn], nt->presyns[:nt->n_presyn]) if (nt->n_presyn)) { TrajectoryRequests* tr = nt->trajec_requests; @@ -863,15 +865,15 @@ void update_nrnthreads_on_device(NrnThread* threads, int nthreads) { // The full buffers have `bsize` entries, but only `vsize` // of them are valid. for (int i = 0; i < tr->n_trajec; ++i) { - acc_update_device(tr->varrays[i], tr->vsize * sizeof(double)); + nrn_pragma_acc(update self(tr->varrays[i][:tr->vsize])) + nrn_pragma_omp(target update from(tr->varrays[i][:tr->vsize])) } } } - /* don't and don't update vdata, its pointer array - if(nt->_nvdata) { - acc_update_device(nt->_vdata, sizeof(double)*nt->_nvdata); - } + /* dont update vdata, its pointer array + nrn_pragma_acc(update self(nt->_vdata[:nt->_nvdata) if nt->_nvdata) + nrn_pragma_omp(target update from(nt->_vdata[:nt->_nvdata) if (nt->_nvdata)) */ } } @@ -894,85 +896,30 @@ void update_weights_from_gpu(NrnThread* threads, int nthreads) { size_t n_weight = nt->n_weight; if (nt->compute_gpu && n_weight > 0) { double* weights = nt->weights; - // clang-format off - - #pragma acc update host(weights [0:n_weight]) - // clang-format on + nrn_pragma_acc(update host(weights [0:n_weight])) + nrn_pragma_omp(target update from(weights [0:n_weight])) } } } -void update_matrix_from_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* openacc routine doesn't allow asyn, use pragma */ - // acc_update_self(_nt->_actual_rhs, 2*_nt->end*sizeof(double)); - - /* RHS and D are contigious, copy them in one go! - * NOTE: in pragma you have to give actual pointer like below and not nt->rhs... 
- */ - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update host(rhs[0 : 2 * ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - -void update_matrix_to_gpu(NrnThread* _nt) { -#ifdef _OPENACC - if (_nt->compute_gpu && (_nt->end > 0)) { - /* before copying, make sure all computations in the stream are completed */ - - // clang-format off - - #pragma acc wait(_nt->stream_id) - - /* while discussion with Michael we found that RHS is also needed on - * gpu because nrn_cap_jacob uses rhs which is being updated on GPU - */ - double* v = _nt->_actual_v; - double* rhs = _nt->_actual_rhs; - int ne = nrn_soa_padded_size(_nt->end, 0); - - #pragma acc update device(v[0 : ne]) async(_nt->stream_id) - #pragma acc update device(rhs[0 : ne]) async(_nt->stream_id) - #pragma acc wait(_nt->stream_id) - // clang-format on - } -#else - (void) _nt; -#endif -} - /** Cleanup device memory that is being tracked by the OpenACC runtime. * - * This function painstakingly calls `acc_delete` in reverse order on all - * pointers that were passed to `acc_copyin` in `setup_nrnthreads_on_device`. + * This function painstakingly calls `cnrn_target_delete` in reverse order on all + * pointers that were passed to `cnrn_target_copyin` in `setup_nrnthreads_on_device`. * This cleanup ensures that if the GPU is initialised multiple times from the * same process then the OpenACC runtime will not be polluted with old * pointers, which can cause errors. In particular if we do: * @code * { * // ... some_ptr is dynamically allocated ... - * acc_copyin(some_ptr, some_size); + * cnrn_target_copyin(some_ptr, some_size); * // ... do some work ... - * // acc_delete(some_ptr); + * // cnrn_target_delete(some_ptr); * free(some_ptr); * } * { * // ... same_ptr_again is dynamically allocated at the same address ... - * acc_copyin(same_ptr_again, some_other_size); // ERROR + * cnrn_target_copyin(same_ptr_again, some_other_size); // ERROR * } * @endcode * the application will/may abort with an error such as: @@ -981,7 +928,7 @@ void update_matrix_to_gpu(NrnThread* _nt) { * the same process. 
*/ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU for (int i = 0; i < nthreads; i++) { NrnThread* nt = threads + i; { @@ -989,132 +936,91 @@ void delete_nrnthreads_on_device(NrnThread* threads, int nthreads) { if (tr) { if (tr->varrays) { for (int i = 0; i < tr->n_trajec; ++i) { - acc_delete(tr->varrays[i], tr->bsize * sizeof(double)); + cnrn_target_delete(tr->varrays[i], tr->bsize); } - acc_delete(tr->varrays, sizeof(double*) * tr->n_trajec); + cnrn_target_delete(tr->varrays, tr->n_trajec); } - acc_delete(tr->gather, sizeof(double*) * tr->n_trajec); - acc_delete(tr, sizeof(TrajectoryRequests)); + cnrn_target_delete(tr->gather, tr->n_trajec); + cnrn_target_delete(tr); } } if (nt->_permute) { if (interleave_permute_type == 1) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * nt->ncell); - acc_delete(info->lastnode, sizeof(int) * nt->ncell); - acc_delete(info->firstnode, sizeof(int) * nt->ncell); - acc_delete(info->stride, sizeof(int) * (info->nstride + 1)); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, nt->ncell); + cnrn_target_delete(info->lastnode, nt->ncell); + cnrn_target_delete(info->firstnode, nt->ncell); + cnrn_target_delete(info->stride, info->nstride + 1); + cnrn_target_delete(info); } else if (interleave_permute_type == 2) { InterleaveInfo* info = interleave_info + i; - acc_delete(info->cellsize, sizeof(int) * info->nwarp); - acc_delete(info->stridedispl, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->lastnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->firstnode, sizeof(int) * (info->nwarp + 1)); - acc_delete(info->stride, sizeof(int) * info->nstride); - acc_delete(info, sizeof(InterleaveInfo)); + cnrn_target_delete(info->cellsize, info->nwarp); + cnrn_target_delete(info->stridedispl, info->nwarp + 1); + cnrn_target_delete(info->lastnode, info->nwarp + 1); + cnrn_target_delete(info->firstnode, info->nwarp + 1); + cnrn_target_delete(info->stride, info->nstride); + cnrn_target_delete(info); } } if (nt->n_vecplay) { nrn_VecPlay_delete_from_device(nt); - acc_delete(nt->_vecplay, sizeof(void*) * nt->n_vecplay); + cnrn_target_delete(nt->_vecplay, nt->n_vecplay); } // Cleanup send_receive buffer. if (nt->_net_send_buffer_size) { - acc_delete(nt->_net_send_buffer, sizeof(int) * nt->_net_send_buffer_size); + cnrn_target_delete(nt->_net_send_buffer, nt->_net_send_buffer_size); } if (nt->n_presyn) { - acc_delete(nt->presyns, sizeof(PreSyn) * nt->n_presyn); - acc_delete(nt->presyns_helper, sizeof(PreSynHelper) * nt->n_presyn); + cnrn_target_delete(nt->presyns, nt->n_presyn); + cnrn_target_delete(nt->presyns_helper, nt->n_presyn); } // Cleanup data that's setup in bbcore_read. 
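// Editorial note: the cnrn_target_* helpers used above and below are declared in
// coreneuron/utils/offload.hpp, which is not shown in this hunk. Judging from how the
// raw acc_* calls were rewritten, the helpers take element counts rather than byte
// sizes, so the sizeof(T) factor moves into the wrapper. A minimal illustrative sketch
// under that assumption (names suffixed with _sketch are hypothetical, not the real
// implementation):
#include <openacc.h>
#include <cstddef>

template <typename T>
T* cnrn_target_copyin_sketch(T* host_ptr, std::size_t count = 1) {
    // acc_copyin expects a size in bytes; the wrapper converts from an element count
    return static_cast<T*>(acc_copyin(host_ptr, count * sizeof(T)));
}

template <typename T>
void cnrn_target_delete_sketch(T* host_ptr, std::size_t count = 1) {
    acc_delete(host_ptr, count * sizeof(T));
}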
if (nt->_nvdata) { - acc_delete(nt->_vdata, sizeof(void*) * nt->_nvdata); + cnrn_target_delete(nt->_vdata, nt->_nvdata); } // Cleanup weight vector used in NET_RECEIVE if (nt->n_weight) { - acc_delete(nt->weights, sizeof(double) * nt->n_weight); + cnrn_target_delete(nt->weights, nt->n_weight); } // Cleanup point processes if (nt->n_pntproc) { - acc_delete(nt->pntprocs, nt->n_pntproc * sizeof(Point_process)); + cnrn_target_delete(nt->pntprocs, nt->n_pntproc); } if (nt->nrn_fast_imem) { - acc_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end * sizeof(double)); - acc_delete(nt->nrn_fast_imem, sizeof(NrnFastImem)); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_d, nt->end); + cnrn_target_delete(nt->nrn_fast_imem->nrn_sav_rhs, nt->end); + cnrn_target_delete(nt->nrn_fast_imem); } if (nt->shadow_rhs_cnt) { int pcnt = nrn_soa_padded_size(nt->shadow_rhs_cnt, 0); - acc_delete(nt->_shadow_d, pcnt * sizeof(double)); - acc_delete(nt->_shadow_rhs, pcnt * sizeof(double)); + cnrn_target_delete(nt->_shadow_d, pcnt); + cnrn_target_delete(nt->_shadow_rhs, pcnt); } for (auto tml = nt->tml; tml; tml = tml->next) { - // Cleanup the net send buffer if it exists - { - NetSendBuffer_t* nsb{tml->ml->_net_send_buffer}; - if (nsb) { - acc_delete(nsb->_nsb_flag, sizeof(double) * nsb->_size); - acc_delete(nsb->_nsb_t, sizeof(double) * nsb->_size); - acc_delete(nsb->_weight_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_pnt_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_vdata_index, sizeof(int) * nsb->_size); - acc_delete(nsb->_sendtype, sizeof(int) * nsb->_size); - acc_delete(nsb, sizeof(NetSendBuffer_t)); - } - } - // Cleanup the net receive buffer if it exists. - { - NetReceiveBuffer_t* nrb{tml->ml->_net_receive_buffer}; - if (nrb) { - acc_delete(nrb->_nrb_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_displ, sizeof(int) * (nrb->_size + 1)); - acc_delete(nrb->_nrb_flag, sizeof(double) * nrb->_size); - acc_delete(nrb->_nrb_t, sizeof(double) * nrb->_size); - acc_delete(nrb->_weight_index, sizeof(int) * nrb->_size); - acc_delete(nrb->_pnt_index, sizeof(int) * nrb->_size); - acc_delete(nrb, sizeof(NetReceiveBuffer_t)); - } - } - int type = tml->index; - int n = tml->ml->nodecount; - int szdp = corenrn.get_prop_dparam_size()[type]; - int is_art = corenrn.get_is_artificial()[type]; - int ts = corenrn.get_memb_funcs()[type].thread_size_; - if (ts) { - acc_delete(tml->ml->_thread, ts * sizeof(ThreadDatum)); - } - if (szdp) { - int pcnt = nrn_soa_padded_size(n, SOA_LAYOUT) * szdp; - acc_delete(tml->ml->pdata, sizeof(int) * pcnt); - } - if (!is_art) { - acc_delete(tml->ml->nodeindices, sizeof(int) * n); - } - acc_delete(tml->ml, sizeof(Memb_list)); - acc_delete(tml, sizeof(NrnThreadMembList)); + delete_ml_from_device(tml->ml, tml->index); + cnrn_target_delete(tml); } - acc_delete(nt->_ml_list, corenrn.get_memb_funcs().size() * sizeof(Memb_list*)); - acc_delete(nt->_v_parent_index, nt->end * sizeof(int)); - acc_delete(nt->_data, nt->_ndata * sizeof(double)); + cnrn_target_delete(nt->_ml_list, corenrn.get_memb_funcs().size()); + cnrn_target_delete(nt->_v_parent_index, nt->end); + cnrn_target_delete(nt->_data, nt->_ndata); } - acc_delete(threads, sizeof(NrnThread) * nthreads); + cnrn_target_delete(threads, nthreads); nrn_ion_global_map_delete_from_device(); #endif } void nrn_newtonspace_copyto_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of 
CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1123,59 +1029,59 @@ void nrn_newtonspace_copyto_device(NewtonSpace* ns) { int n = ns->n * ns->n_instance; // actually, the values of double do not matter, only the pointers. - NewtonSpace* d_ns = (NewtonSpace*) acc_copyin(ns, sizeof(NewtonSpace)); + NewtonSpace* d_ns = cnrn_target_copyin(ns); double* pd; - pd = (double*) acc_copyin(ns->delta_x, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->delta_x), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->delta_x, n); + cnrn_target_memcpy_to_device(&(d_ns->delta_x), &pd); - pd = (double*) acc_copyin(ns->high_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->high_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->high_value, n); + cnrn_target_memcpy_to_device(&(d_ns->high_value), &pd); - pd = (double*) acc_copyin(ns->low_value, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->low_value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->low_value, n); + cnrn_target_memcpy_to_device(&(d_ns->low_value), &pd); - pd = (double*) acc_copyin(ns->rowmax, n * sizeof(double)); - acc_memcpy_to_device(&(d_ns->rowmax), &pd, sizeof(double*)); + pd = cnrn_target_copyin(ns->rowmax, n); + cnrn_target_memcpy_to_device(&(d_ns->rowmax), &pd); - auto pint = (int*) acc_copyin(ns->perm, n * sizeof(int)); - acc_memcpy_to_device(&(d_ns->perm), &pint, sizeof(int*)); + auto pint = cnrn_target_copyin(ns->perm, n); + cnrn_target_memcpy_to_device(&(d_ns->perm), &pint); - auto ppd = (double**) acc_copyin(ns->jacobian, ns->n * sizeof(double*)); - acc_memcpy_to_device(&(d_ns->jacobian), &ppd, sizeof(double**)); + auto ppd = cnrn_target_copyin(ns->jacobian, ns->n); + cnrn_target_memcpy_to_device(&(d_ns->jacobian), &ppd); // the actual jacobian doubles were allocated as a single array - double* d_jacdat = (double*) acc_copyin(ns->jacobian[0], ns->n * n * sizeof(double)); + double* d_jacdat = cnrn_target_copyin(ns->jacobian[0], ns->n * n); for (int i = 0; i < ns->n; ++i) { pd = d_jacdat + i * n; - acc_memcpy_to_device(&(ppd[i]), &pd, sizeof(double*)); + cnrn_target_memcpy_to_device(&(ppd[i]), &pd); } #endif } void nrn_newtonspace_delete_from_device(NewtonSpace* ns) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { return; } int n = ns->n * ns->n_instance; - acc_delete(ns->jacobian[0], ns->n * n * sizeof(double)); - acc_delete(ns->jacobian, ns->n * sizeof(double*)); - acc_delete(ns->perm, n * sizeof(int)); - acc_delete(ns->rowmax, n * sizeof(double)); - acc_delete(ns->low_value, n * sizeof(double)); - acc_delete(ns->high_value, n * sizeof(double)); - acc_delete(ns->delta_x, n * sizeof(double)); - acc_delete(ns, sizeof(NewtonSpace)); + cnrn_target_delete(ns->jacobian[0], ns->n * n); + cnrn_target_delete(ns->jacobian, ns->n); + cnrn_target_delete(ns->perm, n); + cnrn_target_delete(ns->rowmax, n); + cnrn_target_delete(ns->low_value, n); + cnrn_target_delete(ns->high_value, n); + cnrn_target_delete(ns->delta_x, n); + cnrn_target_delete(ns); #endif } void nrn_sparseobj_copyto_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. 
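// Editorial note: nrn_newtonspace_copyto_device() above and nrn_sparseobj_copyto_device()
// below follow the same manual deep-copy recipe: copy the struct, copy each array it
// points to, then overwrite the pointer members of the *device* copy so that they hold
// device addresses. A hedged, generic sketch of that recipe (SomeStruct and
// copy_to_device_sketch are illustrative, not part of the code base):
struct SomeStruct {
    double* data;
    int n;
};

SomeStruct* copy_to_device_sketch(SomeStruct* host) {
    // 1. shallow copy: the device copy's `data` member still holds a host address
    SomeStruct* dev = cnrn_target_copyin(host);
    // 2. copy the pointed-to array to the device
    double* dev_data = cnrn_target_copyin(host->data, host->n);
    // 3. patch the pointer member of the device copy to point at device memory
    cnrn_target_memcpy_to_device(&dev->data, &dev_data);
    return dev;
}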
if (nrn_threads[0].compute_gpu == 0) { @@ -1183,82 +1089,82 @@ void nrn_sparseobj_copyto_device(SparseObj* so) { } unsigned n1 = so->neqn + 1; - SparseObj* d_so = (SparseObj*) acc_copyin(so, sizeof(SparseObj)); + SparseObj* d_so = cnrn_target_copyin(so); // only pointer fields in SparseObj that need setting up are // rowst, diag, rhs, ngetcall, coef_list // only pointer fields in Elm that need setting up are // r_down, c_right, value // do not care about the Elm* ptr value, just the space. - Elm** d_rowst = (Elm**) acc_copyin(so->rowst, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->rowst), &d_rowst, sizeof(Elm**)); + Elm** d_rowst = cnrn_target_copyin(so->rowst, n1); + cnrn_target_memcpy_to_device(&(d_so->rowst), &d_rowst); - Elm** d_diag = (Elm**) acc_copyin(so->diag, n1 * sizeof(Elm*)); - acc_memcpy_to_device(&(d_so->diag), &d_diag, sizeof(Elm**)); + Elm** d_diag = cnrn_target_copyin(so->diag, n1); + cnrn_target_memcpy_to_device(&(d_so->diag), &d_diag); - auto pu = (unsigned*) acc_copyin(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_memcpy_to_device(&(d_so->ngetcall), &pu, sizeof(Elm**)); + unsigned* pu = cnrn_target_copyin(so->ngetcall, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->ngetcall), &pu); - auto pd = (double*) acc_copyin(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(d_so->rhs), &pd, sizeof(double*)); + double* pd = cnrn_target_copyin(so->rhs, n1 * so->_cntml_padded); + cnrn_target_memcpy_to_device(&(d_so->rhs), &pd); - auto d_coef_list = (double**) acc_copyin(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_memcpy_to_device(&(d_so->coef_list), &d_coef_list, sizeof(double**)); + double** d_coef_list = cnrn_target_copyin(so->coef_list, so->coef_list_size); + cnrn_target_memcpy_to_device(&(d_so->coef_list), &d_coef_list); // Fill in relevant Elm pointer values for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - Elm* pelm = (Elm*) acc_copyin(elm, sizeof(Elm)); + Elm* pelm = cnrn_target_copyin(elm); if (elm == so->rowst[irow]) { - acc_memcpy_to_device(&(d_rowst[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_rowst[irow]), &pelm); } else { - Elm* d_e = (Elm*) acc_deviceptr(elm->c_left); - acc_memcpy_to_device(&(pelm->c_left), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->c_left); + cnrn_target_memcpy_to_device(&(pelm->c_left), &d_e); } if (elm->col == elm->row) { - acc_memcpy_to_device(&(d_diag[irow]), &pelm, sizeof(Elm*)); + cnrn_target_memcpy_to_device(&(d_diag[irow]), &pelm); } if (irow > 1) { if (elm->r_up) { - Elm* d_e = (Elm*) acc_deviceptr(elm->r_up); - acc_memcpy_to_device(&(pelm->r_up), &d_e, sizeof(Elm*)); + Elm* d_e = cnrn_target_deviceptr(elm->r_up); + cnrn_target_memcpy_to_device(&(pelm->r_up), &d_e); } } - pd = (double*) acc_copyin(elm->value, so->_cntml_padded * sizeof(double)); - acc_memcpy_to_device(&(pelm->value), &pd, sizeof(double*)); + pd = cnrn_target_copyin(elm->value, so->_cntml_padded); + cnrn_target_memcpy_to_device(&(pelm->value), &pd); } } // visit all the Elm again and fill in pelm->r_down and pelm->c_left for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - auto pelm = (Elm*) acc_deviceptr(elm); + auto pelm = cnrn_target_deviceptr(elm); if (elm->r_down) { - auto d_e = (Elm*) acc_deviceptr(elm->r_down); - acc_memcpy_to_device(&(pelm->r_down), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->r_down); + 
cnrn_target_memcpy_to_device(&(pelm->r_down), &d_e); } if (elm->c_right) { - auto d_e = (Elm*) acc_deviceptr(elm->c_right); - acc_memcpy_to_device(&(pelm->c_right), &d_e, sizeof(Elm*)); + auto d_e = cnrn_target_deviceptr(elm->c_right); + cnrn_target_memcpy_to_device(&(pelm->c_right), &d_e); } } } // Fill in the d_so->coef_list for (unsigned i = 0; i < so->coef_list_size; ++i) { - pd = (double*) acc_deviceptr(so->coef_list[i]); - acc_memcpy_to_device(&(d_coef_list[i]), &pd, sizeof(double*)); + pd = cnrn_target_deviceptr(so->coef_list[i]); + cnrn_target_memcpy_to_device(&(d_coef_list[i]), &pd); } #endif } void nrn_sparseobj_delete_from_device(SparseObj* so) { -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU // FIXME this check needs to be tweaked if we ever want to run with a mix // of CPU and GPU threads. if (nrn_threads[0].compute_gpu == 0) { @@ -1267,31 +1173,29 @@ void nrn_sparseobj_delete_from_device(SparseObj* so) { unsigned n1 = so->neqn + 1; for (unsigned irow = 1; irow < n1; ++irow) { for (Elm* elm = so->rowst[irow]; elm; elm = elm->c_right) { - acc_delete(elm->value, so->_cntml_padded * sizeof(double)); - acc_delete(elm, sizeof(Elm)); + cnrn_target_delete(elm->value, so->_cntml_padded); + cnrn_target_delete(elm); } } - acc_delete(so->coef_list, so->coef_list_size * sizeof(double*)); - acc_delete(so->rhs, n1 * so->_cntml_padded * sizeof(double)); - acc_delete(so->ngetcall, so->_cntml_padded * sizeof(unsigned)); - acc_delete(so->diag, n1 * sizeof(Elm*)); - acc_delete(so->rowst, n1 * sizeof(Elm*)); - acc_delete(so, sizeof(SparseObj)); + cnrn_target_delete(so->coef_list, so->coef_list_size); + cnrn_target_delete(so->rhs, n1 * so->_cntml_padded); + cnrn_target_delete(so->ngetcall, so->_cntml_padded); + cnrn_target_delete(so->diag, n1); + cnrn_target_delete(so->rowst, n1); + cnrn_target_delete(so); #endif } -#ifdef _OPENACC +#ifdef CORENEURON_ENABLE_GPU void nrn_ion_global_map_copyto_device() { if (nrn_ion_global_map_size) { - double** d_data = (double**) acc_copyin(nrn_ion_global_map, - sizeof(double*) * nrn_ion_global_map_size); + double** d_data = cnrn_target_copyin(nrn_ion_global_map, nrn_ion_global_map_size); for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - double* d_mechmap = (double*) acc_copyin(nrn_ion_global_map[j], - ion_global_map_member_size * - sizeof(double)); - acc_memcpy_to_device(&(d_data[j]), &d_mechmap, sizeof(double*)); + double* d_mechmap = cnrn_target_copyin(nrn_ion_global_map[j], + ion_global_map_member_size); + cnrn_target_memcpy_to_device(&(d_data[j]), &d_mechmap); } } } @@ -1300,20 +1204,17 @@ void nrn_ion_global_map_copyto_device() { void nrn_ion_global_map_delete_from_device() { for (int j = 0; j < nrn_ion_global_map_size; j++) { if (nrn_ion_global_map[j]) { - acc_delete(nrn_ion_global_map[j], ion_global_map_member_size * sizeof(double)); + cnrn_target_delete(nrn_ion_global_map[j], ion_global_map_member_size); } } if (nrn_ion_global_map_size) { - acc_delete(nrn_ion_global_map, sizeof(double*) * nrn_ion_global_map_size); + cnrn_target_delete(nrn_ion_global_map, nrn_ion_global_map_size); } } void init_gpu() { - // choose nvidia GPU by default - acc_device_t device_type = acc_device_nvidia; - // check how many gpu devices available per node - int num_devices_per_node = acc_get_num_devices(device_type); + int num_devices_per_node = cnrn_target_get_num_devices(); // if no gpu found, can't run on GPU if (num_devices_per_node == 0) { @@ -1341,10 +1242,9 @@ void init_gpu() { } #endif - int device_num = local_rank % num_devices_per_node; 
- acc_set_device_num(device_num, device_type); + cnrn_target_set_default_device(local_rank % num_devices_per_node); - if (nrnmpi_myid == 0) { + if (nrnmpi_myid == 0 && !corenrn_param.is_quiet()) { std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size << " ranks per node\n"; } @@ -1355,42 +1255,44 @@ void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) { VecPlayContinuous* vecplay_instance = (VecPlayContinuous*) nt->_vecplay[i]; /** just VecPlayContinuous object */ - void* d_p = (void*) acc_copyin(vecplay_instance, sizeof(VecPlayContinuous)); - acc_memcpy_to_device(&(d_vecplay[i]), &d_p, sizeof(void*)); - - VecPlayContinuous* d_vecplay_instance = (VecPlayContinuous*) d_p; + VecPlayContinuous* d_vecplay_instance = cnrn_target_copyin(vecplay_instance); + cnrn_target_memcpy_to_device((VecPlayContinuous**) (&(d_vecplay[i])), &d_vecplay_instance); /** copy y_, t_ and discon_indices_ */ copy_ivoc_vect_to_device(vecplay_instance->y_, d_vecplay_instance->y_); copy_ivoc_vect_to_device(vecplay_instance->t_, d_vecplay_instance->t_); + // OL211213: beware, the test suite does not currently include anything + // with a non-null discon_indices_. if (vecplay_instance->discon_indices_) { + IvocVect* d_discon_indices = cnrn_target_copyin(vecplay_instance->discon_indices_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->discon_indices_), &d_discon_indices); copy_ivoc_vect_to_device(*(vecplay_instance->discon_indices_), *(d_vecplay_instance->discon_indices_)); } /** copy PlayRecordEvent : todo: verify this */ - PlayRecordEvent* d_e_ = (PlayRecordEvent*) acc_copyin(vecplay_instance->e_, - sizeof(PlayRecordEvent)); - acc_memcpy_to_device(&(d_e_->plr_), &d_vecplay_instance, sizeof(VecPlayContinuous*)); - acc_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_, sizeof(PlayRecordEvent*)); + PlayRecordEvent* d_e_ = cnrn_target_copyin(vecplay_instance->e_); + + cnrn_target_memcpy_to_device(&(d_e_->plr_), (PlayRecord**) (&d_vecplay_instance)); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->e_), &d_e_); /** copy pd_ : note that it's pointer inside ml->data and hence data itself is * already on GPU */ - double* d_pd_ = (double*) acc_deviceptr(vecplay_instance->pd_); - acc_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_, sizeof(double*)); + double* d_pd_ = cnrn_target_deviceptr(vecplay_instance->pd_); + cnrn_target_memcpy_to_device(&(d_vecplay_instance->pd_), &d_pd_); } } void nrn_VecPlay_delete_from_device(NrnThread* nt) { for (int i = 0; i < nt->n_vecplay; i++) { auto* vecplay_instance = reinterpret_cast(nt->_vecplay[i]); - acc_delete(vecplay_instance->e_, sizeof(PlayRecordEvent)); + cnrn_target_delete(vecplay_instance->e_); if (vecplay_instance->discon_indices_) { delete_ivoc_vect_from_device(*(vecplay_instance->discon_indices_)); } delete_ivoc_vect_from_device(vecplay_instance->t_); delete_ivoc_vect_from_device(vecplay_instance->y_); - acc_delete(vecplay_instance, sizeof(VecPlayContinuous)); + cnrn_target_delete(vecplay_instance); } } diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp index 67e6a058c..72d222cdd 100644 --- a/coreneuron/gpu/nrn_acc_manager.hpp +++ b/coreneuron/gpu/nrn_acc_manager.hpp @@ -9,25 +9,19 @@ #ifndef _nrn_device_manager_ #define _nrn_device_manager_ -#if defined(_OPENACC) -#include -#endif - #include "coreneuron/sim/multicore.hpp" namespace coreneuron { void setup_nrnthreads_on_device(NrnThread* threads, int nthreads); void delete_nrnthreads_on_device(NrnThread* threads, int nthreads); void 
update_nrnthreads_on_host(NrnThread* threads, int nthreads); -void update_nrnthreads_on_device(NrnThread* threads, int nthreads); -void modify_data_on_device(NrnThread* threads, int nthreads); -void dump_nt_to_file(char* filename, NrnThread* threads, int nthreads); -void update_matrix_from_gpu(NrnThread* _nt); -void update_matrix_to_gpu(NrnThread* _nt); void update_net_receive_buffer(NrnThread* _nt); + +// Called by NModl void realloc_net_receive_buffer(NrnThread* nt, Memb_list* ml); void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb); + void update_weights_from_gpu(NrnThread* threads, int nthreads); void init_gpu(); diff --git a/coreneuron/io/lfp.cpp b/coreneuron/io/lfp.cpp index 646fbf5a0..2a001b85a 100644 --- a/coreneuron/io/lfp.cpp +++ b/coreneuron/io/lfp.cpp @@ -7,9 +7,6 @@ namespace coreneuron { -// extern variables require acc declare -#pragma acc declare create(pi) - namespace lfputils { double line_source_lfp_factor(const Point3D& e_pos, diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py index 35158908c..67cd93ebb 100644 --- a/coreneuron/kinderiv.py +++ b/coreneuron/kinderiv.py @@ -59,20 +59,26 @@ def write_out_kinderiv(fout): fout.write("\n/* declarations */\n") fout.write("\nnamespace coreneuron {\n") + if deriv or kin or euler: + fout.write('nrn_pragma_omp(declare target)\n') + for item in deriv: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int _newton_%s%s(_threadargsproto_);\n' % (item[0], item[1])) for item in kin: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(void*, double*, _threadargsproto_);\n' % (item[0], item[1])) for item in euler: - fout.write('#pragma acc routine seq\n') + fout.write('nrn_pragma_acc(routine seq)\n') fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1])) + if deriv or kin or euler: + fout.write('nrn_pragma_omp(end declare target)\n') + fout.write("\n/* callback indices */\n") derivoffset = 1 kinoffset = 1 diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp index ee62f660d..42c65cb18 100644 --- a/coreneuron/mechanism/capac.cpp +++ b/coreneuron/mechanism/capac.cpp @@ -12,25 +12,9 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/permute/data_layout.hpp" -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma("acc parallel loop present(vdata[0:_cntml_padded*nparm]) if(_nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(vdata[0:_cntml_padded*nparm], ni[0:_cntml_actual], _vec_d[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_JACOB_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - +#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm]) if (_nt->compute_gpu)) \ + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) #define _STRIDE _cntml_padded + _iml namespace coreneuron { @@ -78,15 +62,16 @@ void 
nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { (void) _cntml_padded; /* unused when layout=1*/ double* _vec_d = _nt->_actual_d; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif { /*if (use_cachevec) {*/ int* ni = ml->nodeindices; vdata = ml->data; - _PRAGMA_FOR_JACOB_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_d [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (_iml = 0; _iml < _cntml_actual; _iml++) { _vec_d[ni[_iml]] += cfac * cm; } @@ -126,12 +111,13 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) { /* no need to distinguish secondorder */ int* ni = ml->nodeindices; double* _vec_rhs = _nt->_actual_rhs; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif vdata = ml->data; - _PRAGMA_FOR_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; _iml++) { i_cap = cfac * cm * _vec_rhs[ni[_iml]]; } diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 76adc9045..8b58e858d 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -19,26 +19,6 @@ #define _STRIDE _cntml_padded + _iml -// clang-format off - -#if defined(_OPENACC) -#define _PRAGMA_FOR_INIT_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ppd[0:1], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu)") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], nrn_ion_global_map[0:nrn_ion_global_map_size][0:ion_global_map_member_size]) if(nt->compute_gpu) async(stream_id)") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ \ - _Pragma( \ - "acc parallel loop present(pd[0:_cntml_padded*5], ni[0:_cntml_actual], _vec_rhs[0:_nt->end]) if(_nt->compute_gpu) async(stream_id)") -#else -#define _PRAGMA_FOR_INIT_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_CUR_ACC_LOOP_ _Pragma("") -#define _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ _Pragma("") -#endif - -// clang-format on - namespace coreneuron { // for each ion it refers to internal concentration, external concentration, and charge, @@ -197,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) { } } +nrn_pragma_omp(declare target) void nrn_wrote_conc(int type, double* p1, int p2, @@ -213,6 +194,7 @@ void nrn_wrote_conc(int type, pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius); } } +nrn_pragma_omp(end declare target) static double efun(double x) { if (fabs(x) < 1e-4) { @@ -277,14 +259,18 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { double* pd; Datum* ppd; (void) nt; /* unused */ -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif /*printf("ion_cur %s\n", memb_func[type].sym->name);*/ int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_CUR_ACC_LOOP_ + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu) + async(nt->stream_id)) + // clang-format on + nrn_pragma_omp(target teams distribute parallel for simd 
if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { dcurdv = 0.; cur = 0.; @@ -312,7 +298,18 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { int _cntml_padded = ml->_nodecount_padded; pd = ml->data; ppd = ml->pdata; - _PRAGMA_FOR_INIT_ACC_LOOP_ + // There was no async(...) clause in the initial OpenACC implementation, so + // no `nowait` clause has been added to the OpenMP implementation. TODO: + // verify if this can be made asynchronous or if there is a strong reason it + // needs to be like this. + // clang-format off + nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + ppd[0:1], + nrn_ion_global_map[0:nrn_ion_global_map_size] + [0:ion_global_map_member_size]) + if (nt->compute_gpu)) + // clang-format on + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { if (iontype & 04) { conci = conci0; @@ -332,9 +329,6 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int _cntml_padded; double* pd; (void) _nt; /* unused */ -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif double* _vec_rhs = _nt->_actual_rhs; if (secondorder == 2) { @@ -345,7 +339,11 @@ void second_order_cur(NrnThread* _nt, int secondorder) { int* ni = ml->nodeindices; _cntml_padded = ml->_nodecount_padded; pd = ml->data; - _PRAGMA_FOR_SEC_ORDER_CUR_ACC_LOOP_ + nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5], + ni [0:_cntml_actual], + _vec_rhs [0:_nt->end]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int _iml = 0; _iml < _cntml_actual; ++_iml) { cur += dcurdv * (_vec_rhs[ni[_iml]]); } diff --git a/coreneuron/mechanism/mech/dimplic.cpp b/coreneuron/mechanism/mech/dimplic.cpp index e3b08207e..de8970560 100644 --- a/coreneuron/mechanism/mech/dimplic.cpp +++ b/coreneuron/mechanism/mech/dimplic.cpp @@ -24,6 +24,7 @@ #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "_kinderiv.h" namespace coreneuron { +nrn_pragma_omp(declare target) int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) { difun(fun); return 0; @@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) { switch (fun) { _NRN_KINETIC_CASES } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/mechanism/mech/mod2c_core_thread.hpp b/coreneuron/mechanism/mech/mod2c_core_thread.hpp index e4dee09ac..4c572dd18 100644 --- a/coreneuron/mechanism/mech/mod2c_core_thread.hpp +++ b/coreneuron/mechanism/mech/mod2c_core_thread.hpp @@ -11,6 +11,7 @@ #include "coreneuron/sim/multicore.hpp" #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { @@ -35,15 +36,17 @@ using DIFUN = int; using NEWTFUN = int; using SPFUN = int; using EULFUN = int; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_derivimplicit_steer(int, _threadargsproto_); #define difun(arg) nrn_derivimplicit_steer(arg, _threadargs_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_newton_steer(int, _threadargsproto_); #define newtfun(arg) nrn_newton_steer(arg, _threadargs_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_euler_steer(int, _threadargsproto_); #define eulerfun(arg) nrn_euler_steer(arg, _threadargs_); +nrn_pragma_omp(end declare target) struct Elm { unsigned row; /* Row location */ @@ 
-89,15 +92,19 @@ struct SparseObj { /* all the state information */ int do_flag; }; -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern double* _nrn_thread_getelm(SparseObj* so, int row, int col, int _iml); +nrn_pragma_omp(end declare target) extern void* nrn_cons_sparseobj(SPFUN, int, Memb_list*, _threadargsproto_); extern void _nrn_destroy_sparseobj_thread(SparseObj* so); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int nrn_kinetic_steer(int, SparseObj*, double*, _threadargsproto_); +nrn_pragma_omp(end declare target) #define spfun(arg1, arg2, arg3) nrn_kinetic_steer(arg1, arg2, arg3, _threadargs_); // derived from nrn/src/scopmath/euler.c @@ -116,14 +123,15 @@ static inline int euler_thread(int neqn, int* var, int* der, DIFUN fun, _threada return 0; } -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int derivimplicit_thread(int, int*, int*, DIFUN, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int _ss_derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int sparse_thread(SparseObj*, int, int*, int*, double*, double, SPFUN, int, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) int _ss_sparse_thread(SparseObj*, int n, int* s, @@ -134,10 +142,11 @@ int _ss_sparse_thread(SparseObj*, int linflag, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double _modl_get_dt_thread(NrnThread*); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void _modl_set_dt_thread(double, NrnThread*); +nrn_pragma_omp(end declare target) void nrn_sparseobj_copyto_device(SparseObj* so); void nrn_sparseobj_delete_from_device(SparseObj* so); diff --git a/coreneuron/mechanism/mechanism.hpp b/coreneuron/mechanism/mechanism.hpp index 3e7046e4e..65d7b29ce 100644 --- a/coreneuron/mechanism/mechanism.hpp +++ b/coreneuron/mechanism/mechanism.hpp @@ -88,7 +88,7 @@ struct NetSendBuffer_t: MemoryManaged { } void grow() { -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU int cannot_reallocate_on_device = 0; assert(cannot_reallocate_on_device); #else diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp index 7598edf50..ba7bf9281 100644 --- a/coreneuron/mechanism/membfunc.hpp +++ b/coreneuron/mechanism/membfunc.hpp @@ -11,6 +11,7 @@ #include #include "coreneuron/mechanism/mechanism.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { using Pfrpdat = Datum* (*) (void); @@ -109,12 +110,14 @@ extern void hoc_register_watch_check(nrn_watch_check_t, int); extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int); extern void nrn_writes_conc(int, int); -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern void nrn_wrote_conc(int, double*, int, int, double**, double, int); -#pragma acc routine seq +nrn_pragma_acc(routine seq) double nrn_nernst(double ci, double co, double z, double celsius); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double nrn_ghk(double v, double ci, double co, double z); +nrn_pragma_omp(end declare target) extern void hoc_register_prop_size(int, int, int); extern void hoc_register_dparam_semantics(int type, int, const char* name); @@ -175,8 +178,10 @@ extern void artcell_net_move(void**, Point_process*, double); extern void nrn2ncs_outputevent(int netcon_output_index, double firetime); 
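// Editorial note: the recurring pattern in these headers -- nrn_pragma_acc(routine seq)
// for OpenACC paired with an nrn_pragma_omp(declare target) ... nrn_pragma_omp(end
// declare target) region for OpenMP offload -- is what marks a function as callable from
// device code under either backend. A hedged sketch of how a new device-callable helper
// would be declared in the same style (my_device_helper is hypothetical):
nrn_pragma_omp(declare target)
nrn_pragma_acc(routine seq)
extern double my_device_helper(double x);
nrn_pragma_omp(end declare target)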
extern bool nrn_use_localgid_; extern void net_sem_from_gpu(int sendtype, int i_vdata, int, int ith, int ipnt, double, double); -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) extern int at_time(NrnThread*, double); +nrn_pragma_omp(end declare target) // _OPENACC and/or NET_RECEIVE_BUFFERING extern void net_sem_from_gpu(int, int, int, int, int, double, double); diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp index 3acdff1ea..433140b82 100644 --- a/coreneuron/mechanism/register_mech.cpp +++ b/coreneuron/mechanism/register_mech.cpp @@ -19,11 +19,9 @@ namespace coreneuron { int secondorder = 0; +nrn_pragma_omp(declare target) double t, dt, celsius, pi; -// declare copyin required for correct initialization -#pragma acc declare copyin(secondorder) -#pragma acc declare copyin(celsius) -#pragma acc declare copyin(pi) +nrn_pragma_omp(end declare target) int rev_dt; using Pfrv = void (*)(); diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp index 6ed52dc34..31c18807e 100644 --- a/coreneuron/network/cvodestb.cpp +++ b/coreneuron/network/cvodestb.cpp @@ -55,17 +55,15 @@ void init_net_events() { net_cvode_instance->init_events(); } -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU /* weight vectors could be updated (from INITIAL block of NET_RECEIVE, update those on GPU's */ for (int ith = 0; ith < nrn_nthread; ++ith) { NrnThread* nt = nrn_threads + ith; double* weights = nt->weights; int n_weight = nt->n_weight; - if (n_weight) { - // clang-format off - - #pragma acc update device(weights[0 : n_weight]) if (nt->compute_gpu) - // clang-format on + if (n_weight && nt->compute_gpu) { + nrn_pragma_acc(update device(weights [0:n_weight])) + nrn_pragma_omp(target update to(weights [0:n_weight])) } } #endif @@ -88,6 +86,7 @@ void fixed_play_continuous(NrnThread* nt) { // NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc" // for the ISPC backend. If changes are required, make sure to change ISPC as well. +nrn_pragma_omp(declare target) int at_time(NrnThread* nt, double te) { double x = te - 1e-11; if (x <= nt->_t && x > (nt->_t - nt->_dt)) { @@ -95,5 +94,6 @@ int at_time(NrnThread* nt, double te) { } return 0; } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp index 899bc1e14..4fb1d165f 100644 --- a/coreneuron/network/netcvode.cpp +++ b/coreneuron/network/netcvode.cpp @@ -26,9 +26,6 @@ #include "coreneuron/coreneuron.hpp" #include "coreneuron/utils/nrnoc_aux.hpp" -#ifdef _OPENACC -#include -#endif namespace coreneuron { #define PP2NT(pp) (nrn_threads + (pp)->_tid) #define PP2t(pp) (PP2NT(pp)->_t) @@ -531,28 +528,13 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method PreSynHelper* presyns_helper = nt->presyns_helper; double* actual_v = nt->_actual_v; -#if defined(_OPENACC) - int stream_id = nt->stream_id; -#endif - if (nt->ncell == 0) return; - //_net_send_buffer_cnt is no longer used in openacc kernel, remove this? - //#ifdef _OPENACC - // if(nt->compute_gpu) - // acc_update_device(&(nt->_net_send_buffer_cnt), sizeof(int)); - //#endif - - // on GPU... 
- // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], presyns_helper[0:nt->n_presyn], \ - presyns[0:nt->n_presyn], actual_v[0:nt->end]) \ - copy(net_send_buf_count) if (nt->compute_gpu) \ - async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present( + nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end]) + copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu)) for (int i = 0; i < nt->ncell; ++i) { PreSyn* ps = presyns + i; PreSynHelper* psh = presyns_helper + i; @@ -563,7 +545,7 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method int* flag = &(psh->flag_); if (pscheck(v, threshold, flag)) { -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nt->_net_send_buffer_cnt = net_send_buf_count; if (nt->_net_send_buffer_cnt >= nt->_net_send_buffer_size) { nt->_net_send_buffer_size *= 2; @@ -572,31 +554,23 @@ void NetCvode::check_thresh(NrnThread* nt) { // for default method } #endif - // clang-format off - - #pragma acc atomic capture - // clang-format on + nrn_pragma_acc(atomic capture) + nrn_pragma_omp(atomic capture) idx = net_send_buf_count++; nt->_net_send_buffer[idx] = i; } } - - // clang-format off - - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(wait(nt->stream_id)) nt->_net_send_buffer_cnt = net_send_buf_count; - if (nt->_net_send_buffer_cnt) { -#ifdef _OPENACC + if (nt->compute_gpu && nt->_net_send_buffer_cnt) { +#ifdef CORENEURON_ENABLE_GPU int* nsbuffer = nt->_net_send_buffer; #endif - // clang-format off - - #pragma acc update host(nsbuffer[0:nt->_net_send_buffer_cnt]) if (nt->compute_gpu) async(stream_id) - #pragma acc wait(stream_id) - // clang-format on + nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id)) + nrn_pragma_acc(wait(nt->stream_id)) + nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])) } // on CPU... diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp index e74d866ce..4c517e999 100644 --- a/coreneuron/network/partrans.cpp +++ b/coreneuron/network/partrans.cpp @@ -41,40 +41,38 @@ void nrnmpi_v_transfer() { // gather the source values. 
can be done in parallel for (int tid = 0; tid < nrn_nthread; ++tid) { auto& ttd = transfer_thread_data_[tid]; - auto& nt = nrn_threads[tid]; + auto* nt = &nrn_threads[tid]; int n = int(ttd.outsrc_indices.size()); if (n == 0) { continue; } - double* src_data = nt._data; + double* src_data = nt->_data; int* src_indices = ttd.src_indices.data(); // gather sources on gpu and copy to cpu, cpu scatters to outsrc_buf double* src_gather = ttd.src_gather.data(); size_t n_src_gather = ttd.src_gather.size(); - // clang-format off - #pragma acc parallel loop present( \ - src_indices[0:n_src_gather], src_data[0:nt._ndata], \ - src_gather[0 : n_src_gather]) /*copyout(src_gather[0:n_src_gather])*/ \ - if (nt.compute_gpu) async(nt.stream_id) + nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather], + src_data [0:nt->_ndata], + src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = 0; i < n_src_gather; ++i) { src_gather[i] = src_data[src_indices[i]]; } - // do not know why the copyout above did not work - // and the following update is needed - #pragma acc update host(src_gather[0 : n_src_gather]) \ - if (nrn_threads[0].compute_gpu) \ - async(nt.stream_id) - // clang-format on + nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)) } // copy gathered source values to outsrc_buf_ + bool compute_gpu = false; for (int tid = 0; tid < nrn_nthread; ++tid) { - // clang-format off - - #pragma acc wait(nrn_threads[tid].stream_id) - // clang-format on + if (nrn_threads[tid].compute_gpu) { + compute_gpu = true; + nrn_pragma_acc(wait(nrn_threads[tid].stream_id)) + } TransferThreadData& ttd = transfer_thread_data_[tid]; size_t n_outsrc_indices = ttd.outsrc_indices.size(); int* outsrc_indices = ttd.outsrc_indices.data(); @@ -102,12 +100,8 @@ void nrnmpi_v_transfer() { } // insrc_buf_ will get copied to targets via nrnthread_v_transfer - // clang-format off - - #pragma acc update device( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on + nrn_pragma_acc(update device(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) + nrn_pragma_omp(target update to(insrc_buf_ [0:n_insrc_buf]) if (compute_gpu)) } void nrnthread_v_transfer(NrnThread* _nt) { @@ -119,33 +113,33 @@ void nrnthread_v_transfer(NrnThread* _nt) { int* insrc_indices = ttd.insrc_indices.data(); double* tar_data = _nt->_data; // last element in the displacement vector gives total length +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; int ndata = _nt->_ndata; +#endif - // clang-format off - - #pragma acc parallel loop present( \ - insrc_indices[0:ntar], \ - tar_data[0:ndata], \ - insrc_buf_[0:n_insrc_buf]) \ - if (_nt->compute_gpu) \ - async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar], + tar_data [0:ndata], + insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu)) for (size_t i = 0; i < ntar; ++i) { tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]]; } } +/// TODO: Corresponding exit data cluase for OpenACC/OpenMP is missing and hence +/// GPU buffers are not freed. 
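// Editorial note: the TODO above points out that the `enter data` mappings created in
// gap_update_indices() below are never released. A hedged sketch of what the matching
// cleanup could look like for one of those buffers, written in the same dual-macro
// style (gap_delete_from_device_sketch is hypothetical):
void gap_delete_from_device_sketch() {
    if (insrcdspl_) {
        int n_insrc_buf = insrcdspl_[nrnmpi_numprocs];
        nrn_pragma_acc(exit data delete(insrc_buf_[0:n_insrc_buf]) if(corenrn_param.gpu))
        // clang-format off
        nrn_pragma_omp(target exit data map(delete: insrc_buf_[0:n_insrc_buf])
                       if(corenrn_param.gpu))
        // clang-format on
    }
}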
void nrn_partrans::gap_update_indices() { // Ensure index vectors, src_gather, and insrc_buf_ are on the gpu. if (insrcdspl_) { int n_insrc_buf = insrcdspl_[nrnmpi_numprocs]; + nrn_pragma_acc(enter data create(insrc_buf_ [0:n_insrc_buf]) if (corenrn_param.gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(alloc: insrc_buf_[0:n_insrc_buf]) + if(corenrn_param.gpu)) // clang-format off - - #pragma acc enter data create( \ - insrc_buf_[0:n_insrc_buf]) \ - if (nrn_threads[0].compute_gpu) - // clang-format on } for (int tid = 0; tid < nrn_nthread; ++tid) { TransferThreadData& ttd = transfer_thread_data_[tid]; @@ -154,21 +148,25 @@ void nrn_partrans::gap_update_indices() { size_t n_src_gather = ttd.src_gather.size(); NrnThread* nt = nrn_threads + tid; if (n_src_indices) { + int* src_indices = ttd.src_indices.data(); + double* src_gather = ttd.src_gather.data(); + nrn_pragma_acc(enter data copyin(src_indices[0:n_src_indices]) if(nt->compute_gpu)) + nrn_pragma_acc(enter data create(src_gather[0:n_src_gather]) if(nt->compute_gpu)) // clang-format off - - int *src_indices = ttd.src_indices.data(); - double *src_gather = ttd.src_gather.data(); - #pragma acc enter data copyin(src_indices[0 : n_src_indices]) if (nt->compute_gpu) - #pragma acc enter data create(src_gather[0 : n_src_gather]) if (nt->compute_gpu) + nrn_pragma_omp(target enter data map(to: src_indices [0:n_src_indices]) + map(alloc: src_gather[0:n_src_gather]) + if(nt->compute_gpu)) // clang-format on } if (ttd.insrc_indices.size()) { - // clang-format off - - int *insrc_indices = ttd.insrc_indices.data(); + int* insrc_indices = ttd.insrc_indices.data(); size_t n_insrc_indices = ttd.insrc_indices.size(); - #pragma acc enter data copyin(insrc_indices[0 : n_insrc_indices]) if (nt->compute_gpu) + nrn_pragma_acc( + enter data copyin(insrc_indices [0:n_insrc_indices]) if (nt->compute_gpu)) + // clang-format off + nrn_pragma_omp(target enter data map(to: insrc_indices[0:n_insrc_indices]) + if(nt->compute_gpu)) // clang-format on } } diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h index 2c7fb8bb9..225d6d2ad 100644 --- a/coreneuron/nrnconf.h +++ b/coreneuron/nrnconf.h @@ -9,6 +9,8 @@ #ifndef _H_NRNCONF_ #define _H_NRNCONF_ +#include "coreneuron/utils/offload.hpp" + #include #include #include @@ -32,14 +34,16 @@ using Symbol = char; #define VECTORIZE 1 // extern variables require acc declare +nrn_pragma_omp(declare target) extern double celsius; -#pragma acc declare create(celsius) +nrn_pragma_acc(declare create(celsius)) extern double pi; -#pragma acc declare create(pi) +nrn_pragma_acc(declare create(pi)) extern int secondorder; -#pragma acc declare create(secondorder) +nrn_pragma_acc(declare create(secondorder)) +nrn_pragma_omp(end declare target) extern double t, dt; extern int rev_dt; diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 2b6167f57..6b4014a64 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -6,8 +6,6 @@ # ============================================================================= */ -#include - #include "coreneuron/nrnconf.h" #include "coreneuron/sim/multicore.hpp" #include "coreneuron/utils/nrn_assert.h" @@ -15,6 +13,7 @@ #include "coreneuron/network/tnode.hpp" #include "coreneuron/utils/lpt.hpp" #include "coreneuron/utils/memory.h" +#include "coreneuron/utils/offload.hpp" #include "coreneuron/apps/corenrn_parameters.hpp" #include "coreneuron/permute/node_permute.h" // for print_quality @@ -22,6 +21,9 @@ #ifdef _OPENACC #include 
#endif + +#include + namespace coreneuron { int interleave_permute_type; InterleaveInfo* interleave_info; // nrn_nthread array @@ -444,7 +446,7 @@ static void triang_interleaved(NrnThread* nt, if (istride < icellsize) { // only first icellsize strides matter // what is the index int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); // if (ip < 0) return; #endif double p = GPU_A(i) / GPU_D(i); @@ -466,7 +468,7 @@ static void bksub_interleaved(NrnThread* nt, GPU_RHS(icell) /= GPU_D(icell); // the root for (int istride = 0; istride < icellsize; ++istride) { int ip = GPU_PARENT(i); -#ifndef _OPENACC +#ifndef CORENEURON_ENABLE_GPU nrn_assert(ip >= 0); #endif GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); @@ -480,7 +482,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid int icycle = ncycle - 1; int istride = stride[icycle]; int i = lastnode - istride + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif @@ -488,10 +490,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid bool has_subtrees_to_compute = true; // clang-format off - - #pragma acc loop seq + nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -500,12 +501,14 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // what is the index int ip = GPU_PARENT(i); double p = GPU_A(i) / GPU_D(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_D(ip) -= p * GPU_B(i); - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } #endif // if finished with all tree depths then ready to break @@ -517,7 +520,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid --icycle; istride = stride[icycle]; i -= istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU ii -= istride; #endif } @@ -532,25 +535,22 @@ static void bksub_interleaved2(NrnThread* nt, int ncycle, int* stride, int firstnode) { -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU for (int i = root; i < lastroot; i += 1) { #else - // clang-format off - - #pragma acc loop seq - // clang-format on + nrn_pragma_acc(loop seq) for (int i = root; i < lastroot; i += warpsize) { #endif GPU_RHS(i) /= GPU_D(i); // the root } int i = firstnode + icore; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU int ii = i; #endif for (int icycle = 0; icycle < ncycle; ++icycle) { int istride = stride[icycle]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; @@ -561,7 +561,7 @@ static void bksub_interleaved2(NrnThread* nt, GPU_RHS(i) /= GPU_D(i); } i += istride; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } ii += istride; #endif @@ -596,21 +596,18 @@ void solve_interleaved2(int ith) { int* strides = ii.stride; // sum ncycles of these (bad since ncompart/warpsize) int* rootbegin = ii.firstnode; // nwarp+1 of these int* nodebegin = ii.lastnode; // nwarp+1 of these -#ifdef _OPENACC +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) int nstride = stridedispl[nwarp]; - int 
stream_id = nt->stream_id; -#endif - -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop gang vector vector_length(warpsize) \ - present(nt[0:1], strides[0:nstride], \ - ncycles[0:nwarp], stridedispl[0:nwarp+1], \ - rootbegin[0:nwarp+1], nodebegin[0:nwarp+1]) \ - if (nt->compute_gpu) async(stream_id) -// clang-format on #endif + nrn_pragma_acc(parallel loop gang vector vector_length( + warpsize) present(nt [0:1], + strides [0:nstride], + ncycles [0:nwarp], + stridedispl [0:nwarp + 1], + rootbegin [0:nwarp + 1], + nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icore = 0; icore < ncore; ++icore) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask @@ -620,18 +617,16 @@ void solve_interleaved2(int ith) { int lastroot = rootbegin[iwarp + 1]; int firstnode = nodebegin[iwarp]; int lastnode = nodebegin[iwarp + 1]; -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU if (ic == 0) { // serial test mode. triang and bksub do all cores in warp #endif triang_interleaved2(nt, ic, ncycle, stride, lastnode); bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); -#if !defined(_OPENACC) +#ifndef CORENEURON_ENABLE_GPU } // serial test mode #endif } -#ifdef _OPENACC -#pragma acc wait(nt->stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) #ifdef _OPENACC } #endif @@ -656,28 +651,23 @@ void solve_interleaved1(int ith) { int* firstnode = ii.firstnode; int* lastnode = ii.lastnode; int* cellsize = ii.cellsize; -#if _OPENACC - int stream_id = nt->stream_id; -#endif -#ifdef _OPENACC - // clang-format off - - #pragma acc parallel loop present( \ - nt[0:1], stride[0:nstride], \ - firstnode[0:ncell], lastnode[0:ncell], \ - cellsize[0:ncell]) if (nt->compute_gpu) \ - async(stream_id) -// clang-format on -#endif + // OL211123: can we preserve the error checking behaviour of OpenACC's + // present clause with OpenMP? It is a bug if these data are not present, + // so diagnostics are helpful... 
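// Editorial note: one way to answer the OL211123 question above is a debug-only runtime
// check based on the standard OpenMP device-memory API; omp_target_is_present() reports
// whether a host pointer is currently mapped on a device. A hedged sketch (the
// assert_mapped_sketch helper is hypothetical and only meaningful when compiling with
// OpenMP target offload):
#include <cassert>
#include <omp.h>

inline void assert_mapped_sketch(const void* host_ptr) {
    // fails fast if the pointer has not been mapped by an earlier enter data / copyin
    assert(omp_target_is_present(host_ptr, omp_get_default_device()) != 0);
}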
+ nrn_pragma_acc(parallel loop present(nt [0:1], + stride [0:nstride], + firstnode [0:ncell], + lastnode [0:ncell], + cellsize [0:ncell]) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int icell = 0; icell < ncell; ++icell) { int icellsize = cellsize[icell]; triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode); bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode); } -#ifdef _OPENACC -#pragma acc wait(stream_id) -#endif + nrn_pragma_acc(wait(nt->stream_id)) } void solve_interleaved(int ith) { diff --git a/coreneuron/permute/cellorder.cu b/coreneuron/permute/cellorder.cu index 82198410f..0c1b5af2e 100644 --- a/coreneuron/permute/cellorder.cu +++ b/coreneuron/permute/cellorder.cu @@ -72,25 +72,37 @@ __global__ void solve_interleaved2_kernel(NrnThread* nt, InterleaveInfo* ii, int int* rootbegin = ii->firstnode; // nwarp+1 of these int* nodebegin = ii->lastnode; // nwarp+1 of these - int iwarp = icore / warpsize; // figure out the >> value - int ic = icore & (warpsize - 1); // figure out the & mask - int ncycle = ncycles[iwarp]; - int* stride = strides + stridedispl[iwarp]; - int root = rootbegin[iwarp]; - int lastroot = rootbegin[iwarp + 1]; - int firstnode = nodebegin[iwarp]; - int lastnode = nodebegin[iwarp + 1]; - - triang_interleaved2_device(nt, ic, ncycle, stride, lastnode); - bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); + while (icore < ncore) { + int iwarp = icore / warpsize; // figure out the >> value + int ic = icore & (warpsize - 1); // figure out the & mask + int ncycle = ncycles[iwarp]; + int* stride = strides + stridedispl[iwarp]; + int root = rootbegin[iwarp]; + int lastroot = rootbegin[iwarp + 1]; + int firstnode = nodebegin[iwarp]; + int lastnode = nodebegin[iwarp + 1]; + + triang_interleaved2_device(nt, ic, ncycle, stride, lastnode); + bksub_interleaved2_device(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); + + icore += blockDim.x * gridDim.x; + } } void solve_interleaved2_launcher(NrnThread* nt, InterleaveInfo* info, int ncore, void* stream) { auto cuda_stream = static_cast(stream); + /// the selection of these parameters has been done after running the channel-benchmark for + /// typical production runs, i.e. 1 MPI task with 1440 cells & 6 MPI tasks with 8800 cells. + /// In the OpenACC/OpenMP implementations threadsPerBlock is set to 32. From profiling the + /// channel-benchmark circuits mentioned above we figured out that the best performance was + /// achieved with this configuration int threadsPerBlock = warpsize; - // TODO: Should blocksPerGrid be a fixed number and have a while block inside the kernel? - int blocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock; + /// Max number of blocksPerGrid for NVIDIA GPUs is 65535, so we need to make sure that the + /// blocksPerGrid we launch the CUDA kernel with doesn't exceed this number + const auto maxBlocksPerGrid = 65535; + int provisionalBlocksPerGrid = (ncore + threadsPerBlock - 1) / threadsPerBlock; + int blocksPerGrid = provisionalBlocksPerGrid <= maxBlocksPerGrid ? 
provisionalBlocksPerGrid : maxBlocksPerGrid; solve_interleaved2_kernel<<>>(nt, info, ncore); diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp index 8f4ac14cf..ab6fc4bfb 100644 --- a/coreneuron/sim/fadvance_core.cpp +++ b/coreneuron/sim/fadvance_core.cpp @@ -78,10 +78,11 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */ } else { nt->cj = 1.0 / dt; } + nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj) + async(nt->stream_id) if (nt->compute_gpu)) // clang-format off - - #pragma acc update device(nt->_t, nt->_dt, nt->cj) \ - async(nt->stream_id) if(nt->compute_gpu) + nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj) + if(nt->compute_gpu)) // clang-format on } } @@ -201,35 +202,24 @@ void update(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); double* vec_rhs = &(VEC_RHS(0)); int i2 = _nt->end; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif /* do not need to worry about linmod or extracellular*/ if (secondorder) { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += 2. * vec_rhs[i]; } } else { - // clang-format off - - #pragma acc parallel loop present( \ - vec_v[0:i2], vec_rhs[0:i2]) \ - if (_nt->compute_gpu) async(stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < i2; ++i) { vec_v[i] += vec_rhs[i]; } } - // update_matrix_to_gpu(_nt); - if (_nt->tml) { assert(_nt->tml->index == CAP); nrn_cur_capacitance(_nt, _nt->tml->ml, _nt->tml->index); @@ -304,10 +294,9 @@ void nrncore2nrn_send_values(NrnThread* nth) { // make sure we do not overflow the `varrays` buffers assert(vs < tr->bsize); - // clang-format off - - #pragma acc parallel loop present(tr[0:1]) if(nth->compute_gpu) async(nth->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu)) for (int i = 0; i < tr->n_trajec; ++i) { tr->varrays[i][vs] = *tr->gather[i]; } @@ -326,12 +315,11 @@ void nrncore2nrn_send_values(NrnThread* nth) { // https://github.com/BlueBrain/CoreNeuron/issues/611 for (int i = 0; i < tr->n_trajec; ++i) { double* gather_i = tr->gather[i]; - // clang-format off - - #pragma acc update self(gather_i[0:1]) if(nth->compute_gpu) async(nth->stream_id) + nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu) + async(nth->stream_id)) + nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)) } - #pragma acc wait(nth->stream_id) - // clang-format on + nrn_pragma_acc(wait(nth->stream_id)) for (int i = 0; i < tr->n_trajec; ++i) { *(tr->scatter[i]) = *(tr->gather[i]); } @@ -351,15 +339,11 @@ static void* nrn_fixed_step_thread(NrnThread* nth) { nth->_t += .5 * nth->_dt; if (nth->ncell) { -#if defined(_OPENACC) - int stream_id = nth->stream_id; - /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(stream_id) - #pragma acc 
wait(stream_id) -// clang-format on -#endif + /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can + launch kernel) */ + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); { @@ -393,12 +377,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) { if (nth->ncell) { /*@todo: do we need to update nth->_t on GPU */ - // clang-format off - - #pragma acc update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id) - #pragma acc wait(nth->stream_id) - // clang-format on - + nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id)) + nrn_pragma_acc(wait(nth->stream_id)) + nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu)) fixed_play_continuous(nth); nonvint(nth); nrncore2nrn_send_values(nth); diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp index 8dfb0cd76..1218b7967 100644 --- a/coreneuron/sim/fast_imem.cpp +++ b/coreneuron/sim/fast_imem.cpp @@ -50,10 +50,10 @@ void nrn_calc_fast_imem(NrnThread* nt) { double* fast_imem_d = nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, \ - vec_area, \ - fast_imem_d, \ - fast_imem_rhs) if (nt->compute_gpu) async(nt->stream_id) + nrn_pragma_acc( + parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } @@ -68,8 +68,9 @@ void nrn_calc_fast_imem_init(NrnThread* nt) { double* vec_area = nt->_actual_area; double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) \ - async(nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu) + async(nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01; } diff --git a/coreneuron/sim/finitialize.cpp b/coreneuron/sim/finitialize.cpp index 1ae79a92f..d711ae247 100644 --- a/coreneuron/sim/finitialize.cpp +++ b/coreneuron/sim/finitialize.cpp @@ -53,12 +53,9 @@ void nrn_finitialize(int setv, double v) { if (setv) { for (auto _nt = nrn_threads; _nt < nrn_threads + nrn_nthread; ++_nt) { double* vec_v = &(VEC_V(0)); - // clang-format off - - #pragma acc parallel loop present( \ - _nt[0:1], vec_v[0:_nt->end]) \ - if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc( + parallel loop present(_nt [0:1], vec_v [0:_nt->end]) if (_nt->compute_gpu)) + nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu)) for (int i = 0; i < _nt->end; ++i) { vec_v[i] = v; } diff --git a/coreneuron/sim/scopmath/crout_thread.cpp b/coreneuron/sim/scopmath/crout_thread.cpp index b180ea107..72a5c017f 100644 --- a/coreneuron/sim/scopmath/crout_thread.cpp +++ b/coreneuron/sim/scopmath/crout_thread.cpp @@ -50,6 +50,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) /* having a differnt permutation per instance may not be a good idea */ +nrn_pragma_omp(declare target) int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) { int save_i = 0; @@ 
-224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n, } } } +nrn_pragma_omp(end declare target) } // namespace coreneuron diff --git a/coreneuron/sim/scopmath/newton_struct.h b/coreneuron/sim/scopmath/newton_struct.h index 8cd52732c..d01bfb822 100644 --- a/coreneuron/sim/scopmath/newton_struct.h +++ b/coreneuron/sim/scopmath/newton_struct.h @@ -25,10 +25,11 @@ struct NewtonSpace { double* rowmax; }; -#pragma acc routine seq +nrn_pragma_omp(declare target) +nrn_pragma_acc(routine seq) extern int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_scopmath_solve_thread(int n, double** a, double* value, @@ -37,7 +38,7 @@ extern void nrn_scopmath_solve_thread(int n, int* s, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -45,7 +46,7 @@ extern int nrn_newton_thread(NewtonSpace* ns, double* value, _threadargsproto_); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern void nrn_buildjacobian_thread(NewtonSpace* ns, int n, int* s, @@ -53,6 +54,7 @@ extern void nrn_buildjacobian_thread(NewtonSpace* ns, double* value, double** jacobian, _threadargsproto_); +nrn_pragma_omp(end declare target) extern NewtonSpace* nrn_cons_newtonspace(int n, int n_instance); extern void nrn_destroy_newtonspace(NewtonSpace* ns); diff --git a/coreneuron/sim/scopmath/newton_thread.cpp b/coreneuron/sim/scopmath/newton_thread.cpp index 6c0f303ce..dc08ca04b 100644 --- a/coreneuron/sim/scopmath/newton_thread.cpp +++ b/coreneuron/sim/scopmath/newton_thread.cpp @@ -59,6 +59,7 @@ namespace coreneuron { #define ix(arg) ((arg) *_STRIDE) #define s_(arg) _p[s[arg] * _STRIDE] +nrn_pragma_omp(declare target) int nrn_newton_thread(NewtonSpace* ns, int n, int* s, @@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns, return (error); } +nrn_pragma_omp(end declare target) /*------------------------------------------------------------*/ /* */ diff --git a/coreneuron/sim/scopmath/sparse_thread.cpp b/coreneuron/sim/scopmath/sparse_thread.cpp index d936e269a..71643430a 100644 --- a/coreneuron/sim/scopmath/sparse_thread.cpp +++ b/coreneuron/sim/scopmath/sparse_thread.cpp @@ -105,7 +105,7 @@ static void check_assert(SparseObj* so); static void re_link(SparseObj* so, unsigned i); static SparseObj* create_sparseobj(); -#if defined(_OPENACC) +#ifdef CORENEURON_ENABLE_GPU #undef emalloc #undef ecalloc #define emalloc(arg) malloc(arg) diff --git a/coreneuron/sim/scopmath/ssimplic_thread.cpp b/coreneuron/sim/scopmath/ssimplic_thread.cpp index fe11411d0..511e45d2b 100644 --- a/coreneuron/sim/scopmath/ssimplic_thread.cpp +++ b/coreneuron/sim/scopmath/ssimplic_thread.cpp @@ -9,12 +9,15 @@ #include "coreneuron/mechanism/mech/cfile/scoplib.h" #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp" #include "coreneuron/sim/scopmath/errcodes.h" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { #define s_(arg) _p[s[arg] * _STRIDE] -#pragma acc routine seq +nrn_pragma_acc(routine seq) +nrn_pragma_omp(declare target) static int check_state(int, int*, _threadargsproto_); +nrn_pragma_omp(end declare target) int _ss_sparse_thread(SparseObj* v, int n, diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp index a24c8360f..60ba2b660 100644 --- a/coreneuron/sim/solve_core.cpp +++ b/coreneuron/sim/solve_core.cpp @@ -24,7 +24,9 @@ void nrn_solve_minimal(NrnThread* _nt) { } } -/** TODO loops are executed seq in OpenACC just for 
debugging, remove it! */ +/** @todo OpenACC GPU offload is sequential/slow. Because --cell-permute=0 and + * --gpu is forbidden anyway, no OpenMP target offload equivalent is implemented. + */ /* triangularization of the matrix equations */ static void triang(NrnThread* _nt) { @@ -37,17 +39,9 @@ static void triang(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_a[0:i3], vec_b[0:i3], vec_d[0:i3], \ - vec_rhs[0:i3], parent_index[0:i3]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present( + vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i3 - 1; i >= i2; --i) { double p = vec_a[i] / vec_d[i]; vec_d[parent_index[i]] -= p * vec_b[i]; @@ -66,33 +60,22 @@ static void bksub(NrnThread* _nt) { double* vec_rhs = &(VEC_RHS(0)); int* parent_index = _nt->_v_parent_index; -#if defined(_OPENACC) - int stream_id = _nt->stream_id; -#endif - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_d[0:i2], vec_rhs[0:i2]) \ - async(stream_id) if (_nt->compute_gpu) - // clang-format on + nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i1; i < i2; ++i) { vec_rhs[i] /= vec_d[i]; } - /** @todo: just for benchmarking, otherwise produces wrong results */ - // clang-format off - - #pragma acc parallel loop seq present( \ - vec_b[0:i3], vec_d[0:i3], vec_rhs[0:i3], \ - parent_index[0:i3]) async(stream_id) \ - if (_nt->compute_gpu) + nrn_pragma_acc( + parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3]) + async(_nt->stream_id) if (_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]]; vec_rhs[i] /= vec_d[i]; } - #pragma acc wait(stream_id) - // clang-format on + if (_nt->compute_gpu) { + nrn_pragma_acc(wait(_nt->stream_id)) + } } } // namespace coreneuron diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp index 943980bcd..208058fe1 100644 --- a/coreneuron/sim/treeset_core.cpp +++ b/coreneuron/sim/treeset_core.cpp @@ -32,12 +32,9 @@ static void nrn_rhs(NrnThread* _nt) { double* vec_v = &(VEC_V(0)); int* parent_index = _nt->_v_parent_index; - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) - // clang-format on + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { vec_rhs[i] = 0.; vec_d[i] = 0.; @@ -46,9 +43,10 @@ static void nrn_rhs(NrnThread* _nt) { if (_nt->nrn_fast_imem) { double* fast_imem_d = _nt->nrn_fast_imem->nrn_sav_d; double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(fast_imem_d [i1:i3], \ - fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) \ - async(_nt->stream_id) + nrn_pragma_acc( + parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for 
if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { fast_imem_d[i] = 0.; fast_imem_rhs[i] = 0.; @@ -76,7 +74,9 @@ static void nrn_rhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_rhs; -#pragma acc parallel loop present(p, vec_rhs) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] -= vec_rhs[i]; } @@ -86,22 +86,24 @@ static void nrn_rhs(NrnThread* _nt) { The extracellular mechanism contribution is already done. rhs += ai_j*(vi_j - vi) */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_rhs[0:i3], vec_d[0:i3], \ - vec_a[0:i3], vec_b[0:i3], \ - vec_v[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], + vec_d [0:i3], + vec_a [0:i3], + vec_b [0:i3], + vec_v [0:i3], + parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { double dv = vec_v[parent_index[i]] - vec_v[i]; /* our connection coefficients are negative so */ - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[i] -= vec_b[i] * dv; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_rhs[parent_index[i]] += vec_a[i] * dv; } - // clang-format on } /* calculate left hand side of @@ -150,34 +152,32 @@ static void nrn_lhs(NrnThread* _nt) { so here we transform so it only has membrane current contribution */ double* p = _nt->nrn_fast_imem->nrn_sav_d; -#pragma acc parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i1; i < i3; ++i) { p[i] += vec_d[i]; } } /* now add the axial currents */ - // clang-format off - - #pragma acc parallel loop present( \ - vec_d[0:i3], vec_a[0:i3], \ - vec_b[0:i3], parent_index[0:i3]) \ - if (_nt->compute_gpu) async(_nt->stream_id) + nrn_pragma_acc(parallel loop present( + vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu) + async(_nt->stream_id)) + nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)) for (int i = i2; i < i3; ++i) { - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[i] -= vec_b[i]; - #pragma acc atomic update + nrn_pragma_acc(atomic update) + nrn_pragma_omp(atomic update) vec_d[parent_index[i]] -= vec_a[i]; } - // clang-format on } /* for the fixed step method */ void* setup_tree_matrix_minimal(NrnThread* _nt) { nrn_rhs(_nt); nrn_lhs(_nt); - // update_matrix_from_gpu(_nt); - return nullptr; } } // namespace coreneuron diff --git a/coreneuron/utils/ivocvect.cpp b/coreneuron/utils/ivocvect.cpp index 1315d409f..b51a96ab8 100644 --- a/coreneuron/utils/ivocvect.cpp +++ b/coreneuron/utils/ivocvect.cpp @@ -7,6 +7,7 @@ */ #include "coreneuron/utils/ivocvect.hpp" +#include "coreneuron/utils/offload.hpp" namespace coreneuron { IvocVect* vector_new(int n) { @@ -26,12 +27,12 @@ void* vector_new1(int n) { return (void*) (new IvocVect(n)); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) int vector_capacity(void* 
v) { return ((IvocVect*) v)->size(); } -#pragma acc routine seq +nrn_pragma_acc(routine seq) double* vector_vec(void* v) { return ((IvocVect*) v)->data(); } diff --git a/coreneuron/utils/ivocvect.hpp b/coreneuron/utils/ivocvect.hpp index af4286e09..80440c74d 100644 --- a/coreneuron/utils/ivocvect.hpp +++ b/coreneuron/utils/ivocvect.hpp @@ -9,6 +9,8 @@ #ifndef ivoc_vector_h #define ivoc_vector_h +#include "coreneuron/utils/offload.hpp" + #include #include @@ -52,17 +54,17 @@ class fixed_vector { return data_[i]; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) const T* data(void) const { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) T* data(void) { return data_; } -#pragma acc routine seq + nrn_pragma_acc(routine seq) size_t size() const { return n_; } @@ -76,9 +78,9 @@ extern double* vector_vec(IvocVect* v); // retro-compatibility API extern void* vector_new1(int n); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern int vector_capacity(void* v); -#pragma acc routine seq +nrn_pragma_acc(routine seq) extern double* vector_vec(void* v); } // namespace coreneuron diff --git a/coreneuron/utils/memory.h b/coreneuron/utils/memory.h index 965c06e78..2f0e24458 100644 --- a/coreneuron/utils/memory.h +++ b/coreneuron/utils/memory.h @@ -115,8 +115,7 @@ auto allocate_unique(const Alloc& alloc, Args&&... args) { } // namespace coreneuron /// for gpu builds with unified memory support -/// OL210812: why do we include __CUDACC__ here? -#if (defined(__CUDACC__) || defined(CORENEURON_UNIFIED_MEMORY)) +#ifdef CORENEURON_UNIFIED_MEMORY #include diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp new file mode 100644 index 000000000..078990107 --- /dev/null +++ b/coreneuron/utils/offload.hpp @@ -0,0 +1,96 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. 
+# ============================================================================= +*/ +#pragma once +#define nrn_pragma_stringify(x) #x +#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP) +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x)) +#include +#elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) +#define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x)) +#define nrn_pragma_omp(x) +#include +#else +#define nrn_pragma_acc(x) +#define nrn_pragma_omp(x) +#include +#endif + +#include + +namespace coreneuron { +template +T* cnrn_target_deviceptr(const T* h_ptr) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + return static_cast(acc_deviceptr(const_cast(h_ptr))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + T const* d_ptr{}; + + nrn_pragma_omp(target data use_device_ptr(h_ptr)) + { d_ptr = h_ptr; } + + return const_cast(d_ptr); +#else + throw std::runtime_error( + "cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + return static_cast(acc_copyin(const_cast(h_ptr), len * sizeof(T))); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target enter data map(to : h_ptr[:len])) + return cnrn_target_deviceptr(h_ptr); +#else + throw std::runtime_error( + "cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_delete(T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_delete(h_ptr, len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + nrn_pragma_omp(target exit data map(delete : h_ptr[:len])) +#else + throw std::runtime_error( + "cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +template +void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) { +#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENACC) + acc_memcpy_to_device(d_ptr, const_cast(h_ptr), len * sizeof(T)); +#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \ + defined(_OPENMP) + omp_target_memcpy(d_ptr, + const_cast(h_ptr), + len * sizeof(T), + 0, + 0, + omp_get_default_device(), + omp_get_initial_device()); +#else + throw std::runtime_error( + "cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build"); +#endif +} + +} // namespace coreneuron diff --git a/coreneuron/utils/profile/profiler_interface.h b/coreneuron/utils/profile/profiler_interface.h index f6a24eb2e..2c68a0ae1 100644 --- a/coreneuron/utils/profile/profiler_interface.h +++ b/coreneuron/utils/profile/profiler_interface.h @@ -15,7 +15,7 @@ #include #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING #include #endif @@ -163,7 +163,7 @@ struct Caliper { #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) 
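For reference, a minimal usage sketch of the cnrn_target_* helpers declared in coreneuron/utils/offload.hpp above. The function name and buffer are invented for illustration; in a CPU-only build these helpers throw std::runtime_error rather than touching a device.

    #include "coreneuron/utils/offload.hpp"
    #include <vector>

    void mirror_buffer_on_device() {
        std::vector<double> host(1024, 0.0);
        // Map the host buffer onto the device and keep its device address
        // (acc_copyin under OpenACC, target enter data under OpenMP).
        double* dev = coreneuron::cnrn_target_copyin(host.data(), host.size());
        host[0] = 42.0;
        // Refresh the existing device allocation from the updated host data.
        coreneuron::cnrn_target_memcpy_to_device(dev, host.data(), host.size());
        // Look up the device address of an already-mapped host pointer.
        double* same = coreneuron::cnrn_target_deviceptr(host.data());
        static_cast<void>(same);
        // Drop the mapping and device storage when no longer needed.
        coreneuron::cnrn_target_delete(host.data(), host.size());
    }
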
+#ifdef CORENEURON_CUDA_PROFILING struct CudaProfiling { inline static void phase_begin(const char* name){}; @@ -270,7 +270,7 @@ using InstrumentorImpl = detail::Instrumentor< #if defined CORENEURON_CALIPER detail::Caliper, #endif -#if defined(CORENEURON_CUDA_PROFILING) && (defined(__CUDACC__) || defined(_OPENACC)) +#ifdef CORENEURON_CUDA_PROFILING detail::CudaProfiling, #endif #if defined(CRAYPAT) diff --git a/coreneuron/utils/randoms/nrnran123.cu b/coreneuron/utils/randoms/nrnran123.cu index b13dad7eb..8a02c4e26 100644 --- a/coreneuron/utils/randoms/nrnran123.cu +++ b/coreneuron/utils/randoms/nrnran123.cu @@ -15,6 +15,11 @@ #include #include +#ifdef CORENEURON_USE_BOOST_POOL +#include +#include +#endif + // In a GPU build this file will be compiled by NVCC as CUDA code // In a CPU build this file will be compiled by a C++ compiler as C++ code #ifdef __CUDACC__ @@ -24,6 +29,48 @@ #endif namespace { +#ifdef CORENEURON_USE_BOOST_POOL +/** Tag type for use with boost::fast_pool_allocator that forwards to + * coreneuron::[de]allocate_unified(). Using a Random123-specific type here + * makes sure that allocations do not come from the same global pool as other + * usage of boost pools for objects with sizeof == sizeof(nrnran123_State). + * + * The messy m_block_sizes map is just because `deallocate_unified` uses sized + * deallocations, but the Boost pool allocators don't. Because this is hidden + * behind the pool mechanism, these methods are not called very often and the + * overhead is minimal. + */ +struct random123_allocate_unified { + using size_type = std::size_t; + using difference_type = std::size_t; + static char* malloc(const size_type bytes) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto* buffer = coreneuron::allocate_unified(bytes); + m_block_sizes[buffer] = bytes; + return reinterpret_cast(buffer); + } + static void free(char* const block) { + std::lock_guard const lock{m_mutex}; + static_cast(lock); + auto const iter = m_block_sizes.find(block); + assert(iter != m_block_sizes.end()); + auto const size = iter->second; + m_block_sizes.erase(iter); + return coreneuron::deallocate_unified(block, size); + } + static std::mutex m_mutex; + static std::unordered_map m_block_sizes; +}; + +std::mutex random123_allocate_unified::m_mutex{}; +std::unordered_map random123_allocate_unified::m_block_sizes{}; + +using random123_allocator = + boost::fast_pool_allocator; +#else +using random123_allocator = coreneuron::unified_allocator; +#endif /* Global data structure per process. Using a unique_ptr here causes [minor] * problems because its destructor can be called very late during application * shutdown. 
If the destructor calls cudaFree and the CUDA runtime has already @@ -212,9 +259,7 @@ nrnran123_State* nrnran123_newstream3(uint32_t id1, #endif nrnran123_State* s{nullptr}; if (use_unified_memory) { - s = coreneuron::allocate_unique( - coreneuron::unified_allocator{}) - .release(); + s = coreneuron::allocate_unique(random123_allocator{}).release(); } else { s = new nrnran123_State{}; } @@ -244,9 +289,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory) { --g_instance_count; } if (use_unified_memory) { - std::unique_ptr>> - _{s}; + std::unique_ptr> _{s}; } else { delete s; } diff --git a/coreneuron/utils/randoms/nrnran123.h b/coreneuron/utils/randoms/nrnran123.h index ab432f89c..c97592161 100644 --- a/coreneuron/utils/randoms/nrnran123.h +++ b/coreneuron/utils/randoms/nrnran123.h @@ -37,6 +37,8 @@ of the full distribution available from #define R123_USE_GNU_UINT128 1 #endif +#include "coreneuron/utils/offload.hpp" + #include #include @@ -46,17 +48,12 @@ of the full distribution available from #define CORENRN_HOST_DEVICE #endif -// Is there actually any harm leaving the pragma in when DISABLE_OPENACC is true? -#if defined(_OPENACC) && !defined(DISABLE_OPENACC) -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE _Pragma("acc routine seq") -#else -#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE -#endif +#define CORENRN_HOST_DEVICE_ACC CORENRN_HOST_DEVICE nrn_pragma_acc(routine seq) // Some files are compiled with DISABLE_OPENACC, and some builds have no GPU // support at all. In these two cases, request that the random123 state is // allocated using new/delete instead of CUDA unified memory. -#if (defined(__CUDACC__) || defined(_OPENACC)) && !defined(DISABLE_OPENACC) +#if defined(CORENEURON_ENABLE_GPU) && !defined(DISABLE_OPENACC) #define CORENRN_RAN123_USE_UNIFIED_MEMORY true #else #define CORENRN_RAN123_USE_UNIFIED_MEMORY false @@ -100,6 +97,7 @@ void nrnran123_deletestream(nrnran123_State* s, bool use_unified_memory = CORENRN_RAN123_USE_UNIFIED_MEMORY); /* minimal data stream */ +nrn_pragma_omp(declare target) CORENRN_HOST_DEVICE_ACC void nrnran123_getseq(nrnran123_State*, uint32_t* seq, char* which); CORENRN_HOST_DEVICE_ACC void nrnran123_getids(nrnran123_State*, uint32_t* id1, uint32_t* id2); CORENRN_HOST_DEVICE_ACC void nrnran123_getids3(nrnran123_State*, @@ -128,6 +126,7 @@ CORENRN_HOST_DEVICE_ACC nrnran123_array4x32 nrnran123_iran(uint32_t seq, uint32_t id1, uint32_t id2); CORENRN_HOST_DEVICE_ACC double nrnran123_uint2dbl(uint32_t); +nrn_pragma_omp(end declare target) } // namespace coreneuron #endif diff --git a/coreneuron/utils/vrecord.cpp b/coreneuron/utils/vrecord.cpp index 8af2b028e..a972e754a 100644 --- a/coreneuron/utils/vrecord.cpp +++ b/coreneuron/utils/vrecord.cpp @@ -78,7 +78,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { last_index_ = ubound_index_; // clang-format off - #pragma acc update device(last_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(last_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(last_index_) if (nt->compute_gpu)) // clang-format on if (discon_indices_) { if (discon_index_ < discon_indices_->size()) { @@ -96,7 +97,8 @@ void VecPlayContinuous::deliver(double tt, NetCvode* ns) { } // clang-format off - #pragma acc update device(ubound_index_) if (nt->compute_gpu) + nrn_pragma_acc(update device(ubound_index_) if (nt->compute_gpu)) + nrn_pragma_omp(target update to(ubound_index_) if (nt->compute_gpu)) // clang-format on continuous(tt); } @@ -105,7 +107,8 @@ void 
VecPlayContinuous::continuous(double tt) { NrnThread* nt = nrn_threads + ith_; // clang-format off - #pragma acc kernels present(this) if(nt->compute_gpu) + nrn_pragma_acc(kernels present(this) if(nt->compute_gpu)) + nrn_pragma_omp(target if(nt->compute_gpu)) { *pd_ = interpolate(tt); } diff --git a/external/nmodl b/external/nmodl index 85dec3618..46f8baf2b 160000 --- a/external/nmodl +++ b/external/nmodl @@ -1 +1 @@ -Subproject commit 85dec36180cc8d012db3392c06c065d39de79960 +Subproject commit 46f8baf2bbeaa0d21559d6306ec37b94c601f1ee diff --git a/tests/unit/lfp/CMakeLists.txt b/tests/unit/lfp/CMakeLists.txt index 3e2ac8e80..ec795f178 100644 --- a/tests/unit/lfp/CMakeLists.txt +++ b/tests/unit/lfp/CMakeLists.txt @@ -22,3 +22,4 @@ set_target_properties(lfp_test_bin PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS OFF) target_compile_options(lfp_test_bin PRIVATE ${CORENEURON_BOOST_UNIT_TEST_COMPILE_FLAGS}) add_dependencies(lfp_test_bin nrniv-core) add_test(NAME lfp_test COMMAND ${TEST_EXEC_PREFIX} $) +set_tests_properties(lfp_test PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1)
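
Taken together, the pattern introduced by this change can be summarised in a short, hedged sketch (all names below are invented for illustration and nothing here is part of the patch): device-callable helpers are bracketed by nrn_pragma_omp(declare target) / nrn_pragma_omp(end declare target) and annotated with nrn_pragma_acc(routine seq), while host-to-device updates carry both the OpenACC and the OpenMP spelling on adjacent lines, with the inactive backend's macro expanding to nothing.

    #include "coreneuron/utils/offload.hpp"

    namespace example {
    double sim_time = 0.0;  // assumed to be mapped to the device elsewhere

    nrn_pragma_omp(declare target)
    nrn_pragma_acc(routine seq)
    inline double square(double x) {
        return x * x;  // callable from OpenACC/OpenMP offloaded regions
    }
    nrn_pragma_omp(end declare target)

    void push_time(bool compute_gpu) {
        // OpenACC build:  #pragma acc update device(sim_time) if(compute_gpu)
        // OpenMP offload: #pragma omp target update to(sim_time) if(compute_gpu)
        // CPU-only build: both macros expand to nothing
        nrn_pragma_acc(update device(sim_time) if (compute_gpu))
        nrn_pragma_omp(target update to(sim_time) if (compute_gpu))
    }
    }  // namespace example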