NMODL does not update global variables on GPU

NMODL generates OpenACC code from the [hhwatch.mod](https://github.com/neuronsimulator/testcorenrn/blob/7b781b3a7ac1e0b785f897e4db5c3bf84c5127d2/mod/hhwatch.mod) mechanism that gives incorrect results. This leads to test failures when the NEURON test suite is run using CoreNEURON and NMODL, i.e.
```sh
cmake ..  -DNRN_ENABLE_TESTS=ON -DNRN_ENABLE_CORENEURON=ON -DCORENRN_ENABLE_GPU=ON -DCORENRN_ENABLE_NMODL=ON
cmake --build . --parallel
ctest -j 8
```
The `testcorenrn_watch::compare_results` test fails because the output spikes from the CoreNEURON+GPU tests do not match the NEURON and CoreNEURON+CPU runs, and the reference file. In the current `master` this is affected by https://github.com/BlueBrain/nmodl/issues/675, but if that is worked around there is a second issue: these tests [configure non-default values for some global parameters](https://github.com/neuronsimulator/testcorenrn/blob/7b781b3a7ac1e0b785f897e4db5c3bf84c5127d2/testwatch.hoc#L17-L19), and these values are not updated on the GPU. The OpenACC code generated by NMODL should include:
```diff
     /** initialize channel */
     void nrn_init_hhwatch(NrnThread* nt, Memb_list* ml, int type) {
+        #pragma acc update device (hhwatch_global) if(nt->compute_gpu)
         #pragma acc data present(nt, ml, hhwatch_global) if(nt->compute_gpu)
         {
```
to ensure the device-side values of these parameters are updated.

The full diff of manual changes to the generate C++ file that was tested locally is shown below. This includes three different sets of changes:
- [ ] Workarounds for https://github.com/BlueBrain/nmodl/issues/675
- [x] Changes to the `if ( ... ) { ... } else { ... }` structure of the `nrn_watch_check_...` method to match MOD2C's generate code, see https://github.com/BlueBrain/nmodl/issues/679)
- [x] The global variable fix (`#pragma acc update device (hhwatch_global) if(nt->compute_gpu)`) noted above
- [ ] MOD2C's behaviour is adopted for https://github.com/BlueBrain/nmodl/issues/680
- [ ] Also bring changes from PR BlueBrain/mod2c#63 mentioned in 679
-
The **full** diff:
```diff
--- hhwatch-original.cpp 2021-06-01 09:31:52.879278000 +0200
+++ hhwatch-patched.cpp  2021-06-01 09:27:37.270874000 +0200
@@ -230,9 +230,10 @@


     static inline void net_send_buffering(NetSendBuffer_t* nsb, int type, int vdata_index, int weight_index, int point_index, double t, double flag)  {
-        int i = nsb->_cnt;
-        nsb->_cnt++;
-        if(nsb->_cnt >= nsb->_size) {
+        int i = 0;
+#pragma acc atomic capture
+        i = nsb->_cnt++;
+        if(i >= nsb->_size) {
             printf("Error : netsend buffer size (%d) exceeded\n", nsb->_cnt);
             coreneuron_abort();
         }
@@ -291,49 +292,47 @@
                     if (v >  -55.0) {
                         if ((inst->watch1[4*pnodecount+id]&1) == 0) {
                             net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 2.0);
-                            inst->watch1[4*pnodecount+id] = 3;
-                        }
-                        else {
-                            inst->watch1[4*pnodecount+id] = 2;
                         }
+                        inst->watch1[4*pnodecount+id] = 3;
+                    } else {
+                        inst->watch1[4*pnodecount+id] = 2;
                     }
                 }
                 if (inst->watch2[5*pnodecount+id]&2) {
                     if (v > 10.0) {
                         if ((inst->watch2[5*pnodecount+id]&1) == 0) {
                             net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 3.0);
-                            inst->watch2[5*pnodecount+id] = 3;
-                        }
-                        else {
-                            inst->watch2[5*pnodecount+id] = 2;
                         }
+                        inst->watch2[5*pnodecount+id] = 3;
+                    } else {
+                        inst->watch2[5*pnodecount+id] = 2;
                     }
                 }
                 if (inst->watch3[6*pnodecount+id]&2) {
                     if (v <  -70.0) {
                         if ((inst->watch3[6*pnodecount+id]&1) == 0) {
                             net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 4.0);
-                            inst->watch3[6*pnodecount+id] = 3;
-                        }
-                        else {
-                            inst->watch3[6*pnodecount+id] = 2;
                         }
+                        inst->watch3[6*pnodecount+id] = 3;
+                    } else {
+                        inst->watch3[6*pnodecount+id] = 2;
                     }
                 }
                 if (inst->watch4[7*pnodecount+id]&2) {
                     if (v >  -55.0) {
                         if ((inst->watch4[7*pnodecount+id]&1) == 0) {
                             net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 2.0);
-                            inst->watch4[7*pnodecount+id] = 3;
-                        }
-                        else {
-                            inst->watch4[7*pnodecount+id] = 2;
                         }
+                        inst->watch4[7*pnodecount+id] = 3;
+                    } else {
+                        inst->watch4[7*pnodecount+id] = 2;
                     }
                 }
             }
-
             NetSendBuffer_t* nsb = ml->_net_send_buffer;
+            #pragma acc wait(nt->stream_id)
+            #pragma acc update self(nsb->_cnt) if(nt->compute_gpu)
+            update_net_send_buffer_on_host(nt, nsb);
             for (int i=0; i < nsb->_cnt; i++) {
                 int type = nsb->_sendtype[i];
                 int tid = nt->id;
@@ -345,6 +344,7 @@
                 net_sem_from_gpu(type, vdata_index, weight_index, tid, point_index, t, flag);
             }
             nsb->_cnt = 0;
+            #pragma acc update device(nsb->_cnt) if (nt->compute_gpu)
         }
     }

@@ -425,8 +425,12 @@
                     net_receive_kernel_hhwatch(t, point_process, inst, nt, ml, weight_index, flag);
                 }
             }
-
+            #pragma acc wait(nt->stream_id)
+            nrb->_displ_cnt = 0;
+            nrb->_cnt = 0;
             NetSendBuffer_t* nsb = ml->_net_send_buffer;
+            #pragma acc update self(nsb->_cnt) if(nt->compute_gpu)
+            update_net_send_buffer_on_host(nt, nsb);
             for (int i=0; i < nsb->_cnt; i++) {
                 int type = nsb->_sendtype[i];
                 int tid = nt->id;
@@ -438,15 +442,14 @@
                 net_sem_from_gpu(type, vdata_index, weight_index, tid, point_index, t, flag);
             }
             nsb->_cnt = 0;
-
-            nrb->_displ_cnt = 0;
-            nrb->_cnt = 0;
+            #pragma acc update device(nsb->_cnt) if (nt->compute_gpu)
         }
     }


     /** initialize channel */
     void nrn_init_hhwatch(NrnThread* nt, Memb_list* ml, int type) {
+        #pragma acc update device (hhwatch_global) if(nt->compute_gpu)
         #pragma acc data present(nt, ml, hhwatch_global) if(nt->compute_gpu)
         {
             int nodecount = ml->nodecount;
@@ -470,10 +473,26 @@
                     double v = voltage[node_id];
                     inst->g[id] = hhwatch_global.gpas;
                     inst->e[id] = hhwatch_global.erev;
-                    net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], -1, inst->point_process[1*pnodecount+id], nt->_t+0.0, 1.0);
+                    net_send_buffering(ml->_net_send_buffer, 0, inst->tqitem[2*pnodecount+id], 0, inst->point_process[1*pnodecount+id], nt->_t+0.0, 1.0);
                 }
             }
         }
+        #pragma acc wait(nt->stream_id)
+        NetSendBuffer_t* nsb = ml->_net_send_buffer;
+        #pragma acc update self(nsb->_cnt) if(nt->compute_gpu)
+        update_net_send_buffer_on_host(nt, nsb);
+        for (int i=0; i < nsb->_cnt; i++) {
+            int type = nsb->_sendtype[i];
+            int tid = nt->id;
+            double t = nsb->_nsb_t[i];
+            double flag = nsb->_nsb_flag[i];
+            int vdata_index = nsb->_vdata_index[i];
+            int weight_index = nsb->_weight_index[i];
+            int point_index = nsb->_pnt_index[i];
+            net_sem_from_gpu(type, vdata_index, weight_index, tid, point_index, t, flag);
+        }
+        nsb->_cnt = 0;
+        #pragma acc update device(nsb->_cnt) if (nt->compute_gpu)
     }
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

NMODL does not update global variables on GPU #678

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

NMODL does not update global variables on GPU #678

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions