to ensure the device-side values of these parameters are updated.
The full diff of manual changes to the generate C++ file that was tested locally is shown below. This includes three different sets of changes:
--- hhwatch-original.cpp 2021-06-01 09:31:52.879278000 +0200
+++ hhwatch-patched.cpp 2021-06-01 09:27:37.270874000 +0200
@@ -230,9 +230,10 @@
static inline void net_send_buffering(NetSendBuffer_t* nsb, int type, int vdata_index, int weight_index, int point_index, double t, double flag) {
- int i = nsb->_cnt;
- nsb->_cnt++;
- if(nsb->_cnt >= nsb->_size) {
+ int i = 0;
+#pragma acc atomic capture
+ i = nsb->_cnt++;
+ if(i >= nsb->_size) {
printf("Error : netsend buffer size (%d) exceeded\n", nsb->_cnt);
coreneuron_abort();
}
@@ -291,49 +292,47 @@
if (v > -55.0) {
if ((inst->watch1[4*pnodecount+id]&1) == 0) {
net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 2.0);
- inst->watch1[4*pnodecount+id] = 3;
- }
- else {
- inst->watch1[4*pnodecount+id] = 2;
}
+ inst->watch1[4*pnodecount+id] = 3;
+ } else {
+ inst->watch1[4*pnodecount+id] = 2;
}
}
if (inst->watch2[5*pnodecount+id]&2) {
if (v > 10.0) {
if ((inst->watch2[5*pnodecount+id]&1) == 0) {
net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 3.0);
- inst->watch2[5*pnodecount+id] = 3;
- }
- else {
- inst->watch2[5*pnodecount+id] = 2;
}
+ inst->watch2[5*pnodecount+id] = 3;
+ } else {
+ inst->watch2[5*pnodecount+id] = 2;
}
}
if (inst->watch3[6*pnodecount+id]&2) {
if (v < -70.0) {
if ((inst->watch3[6*pnodecount+id]&1) == 0) {
net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 4.0);
- inst->watch3[6*pnodecount+id] = 3;
- }
- else {
- inst->watch3[6*pnodecount+id] = 2;
}
+ inst->watch3[6*pnodecount+id] = 3;
+ } else {
+ inst->watch3[6*pnodecount+id] = 2;
}
}
if (inst->watch4[7*pnodecount+id]&2) {
if (v > -55.0) {
if ((inst->watch4[7*pnodecount+id]&1) == 0) {
net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 2.0);
- inst->watch4[7*pnodecount+id] = 3;
- }
- else {
- inst->watch4[7*pnodecount+id] = 2;
}
+ inst->watch4[7*pnodecount+id] = 3;
+ } else {
+ inst->watch4[7*pnodecount+id] = 2;
}
}
}
-
NetSendBuffer_t* nsb = ml->_net_send_buffer;
+ #pragma acc wait(nt->stream_id)
+ #pragma acc update self(nsb->_cnt) if(nt->compute_gpu)
+ update_net_send_buffer_on_host(nt, nsb);
for (int i=0; i < nsb->_cnt; i++) {
int type = nsb->_sendtype[i];
int tid = nt->id;
@@ -345,6 +344,7 @@
net_sem_from_gpu(type, vdata_index, weight_index, tid, point_index, t, flag);
}
nsb->_cnt = 0;
+ #pragma acc update device(nsb->_cnt) if (nt->compute_gpu)
}
}
@@ -425,8 +425,12 @@
net_receive_kernel_hhwatch(t, point_process, inst, nt, ml, weight_index, flag);
}
}
-
+ #pragma acc wait(nt->stream_id)
+ nrb->_displ_cnt = 0;
+ nrb->_cnt = 0;
NetSendBuffer_t* nsb = ml->_net_send_buffer;
+ #pragma acc update self(nsb->_cnt) if(nt->compute_gpu)
+ update_net_send_buffer_on_host(nt, nsb);
for (int i=0; i < nsb->_cnt; i++) {
int type = nsb->_sendtype[i];
int tid = nt->id;
@@ -438,15 +442,14 @@
net_sem_from_gpu(type, vdata_index, weight_index, tid, point_index, t, flag);
}
nsb->_cnt = 0;
-
- nrb->_displ_cnt = 0;
- nrb->_cnt = 0;
+ #pragma acc update device(nsb->_cnt) if (nt->compute_gpu)
}
}
/** initialize channel */
void nrn_init_hhwatch(NrnThread* nt, Memb_list* ml, int type) {
+ #pragma acc update device (hhwatch_global) if(nt->compute_gpu)
#pragma acc data present(nt, ml, hhwatch_global) if(nt->compute_gpu)
{
int nodecount = ml->nodecount;
@@ -470,10 +473,26 @@
double v = voltage[node_id];
inst->g[id] = hhwatch_global.gpas;
inst->e[id] = hhwatch_global.erev;
- net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], -1, inst->point_process[1*pnodecount+id], nt->_t+0.0, 1.0);
+ net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 1.0);
}
}
}
+ #pragma acc wait(nt->stream_id)
+ NetSendBuffer_t* nsb = ml->_net_send_buffer;
+ #pragma acc update self(nsb->_cnt) if(nt->compute_gpu)
+ update_net_send_buffer_on_host(nt, nsb);
+ for (int i=0; i < nsb->_cnt; i++) {
+ int type = nsb->_sendtype[i];
+ int tid = nt->id;
+ double t = nsb->_nsb_t[i];
+ double flag = nsb->_nsb_flag[i];
+ int vdata_index = nsb->_vdata_index[i];
+ int weight_index = nsb->_weight_index[i];
+ int point_index = nsb->_pnt_index[i];
+ net_sem_from_gpu(type, vdata_index, weight_index, tid, point_index, t, flag);
+ }
+ nsb->_cnt = 0;
+ #pragma acc update device(nsb->_cnt) if (nt->compute_gpu)
}
NMODL generates OpenACC code from the hhwatch.mod mechanism that gives incorrect results. This leads to test failures when the NEURON test suite is run using CoreNEURON and NMODL, i.e.
cmake .. -DNRN_ENABLE_TESTS=ON -DNRN_ENABLE_CORENEURON=ON -DCORENRN_ENABLE_GPU=ON -DCORENRN_ENABLE_NMODL=ON cmake --build . --parallel ctest -j 8The
testcorenrn_watch::compare_resultstest fails because the output spikes from the CoreNEURON+GPU tests do not match the NEURON and CoreNEURON+CPU runs, and the reference file. In the currentmasterthis is affected by #675, but if that is worked around there is a second issue: these tests configure non-default values for some global parameters, and these values are not updated on the GPU. The OpenACC code generated by NMODL should include:/** initialize channel */ void nrn_init_hhwatch(NrnThread* nt, Memb_list* ml, int type) { + #pragma acc update device (hhwatch_global) if(nt->compute_gpu) #pragma acc data present(nt, ml, hhwatch_global) if(nt->compute_gpu) {to ensure the device-side values of these parameters are updated.
The full diff of manual changes to the generate C++ file that was tested locally is shown below. This includes three different sets of changes:
if ( ... ) { ... } else { ... }structure of thenrn_watch_check_...method to match MOD2C's generate code, see NMODL generates different WATCH statement logic to MOD2C #679)#pragma acc update device (hhwatch_global) if(nt->compute_gpu)) noted aboveThe full diff: