! Loop nest over horizontal points (jj outer, ji inner). All work below is done
! for a single (ji,jj) point at a time.
do jj = ntsj, ntej, 1
do ji = ntsi, ntei, 1
! Per-point value: rsfact divided by the iteration count kiter(ji,jj), then halved.
zstep = rsfact / REAL(kiter(ji,jj), kind=wp) / 2.
! The trip count of this loop is read from kiter(ji,jj), so it can differ from
! one (ji,jj) point to the next.
do jt = 1, kiter(ji,jj), 1
! Reset the 1-D work array zakz. It is indexed only by idx (not by ji/jj), so
! every (ji,jj) iteration rewrites the same elements.
do idx = loop_start, loop_stop, 1
zakz(idx) = 0.e0
enddo
! Copy tr(ji,jj,:,jp_tra,kbb) for this point into the 1-D work array ztrb. The
! added offset is the difference between the lower bound of tr's third dimension
! and the lower bound of ztrb. Like zakz, ztrb is not indexed by ji/jj.
do idx_1 = loop_start_1, loop_stop_1, 1
ztrb(idx_1) = tr(ji,jj,idx_1 + (LBOUND(tr, dim=3) - LBOUND(ztrb, dim=1)),jp_tra,kbb)
enddo
! Two passes over the jk loops below.
do jn = 1, 2, 1
! Difference of the first two ztrb entries, multiplied by tmask(ji,jj,2).
ztraz_km1 = (ztrb(1) - ztrb(2)) * tmask(ji,jj,2)
! Loop over jk = 2..jpkm1 (body elided in this excerpt).
do jk = 2, jpkm1, 1
...
enddo
! Loop over jk = 1..jpkm1 (body elided in this excerpt).
do jk = 1, jpkm1, 1
...
enddo
enddo
! Final loop over jk = 1..jpkm1 for this jt iteration (body elided in this excerpt).
do jk = 1, jpkm1, 1
...
enddo
enddo
enddo
enddo
And, as in other parts of NEMO v5, PSyclone only parallelises the innermost loop, reporting write-on-write accesses to the temporary arrays as the reason not to parallelise the outer loops. In these cases we typically apply tmp-rank-increase-and-reorder or array privatisation, but this case:
The `trc_sink2_mus` routine from NEMO main has become very slow on GPUs. It looks something like this:
And, as in other parts of NEMO v5, PSyclone only parallelises the innermost loop, reporting write-on-write accesses to the temporary arrays as the reason not to parallelise the outer loops. In these cases we typically apply tmp-rank-increase-and-reorder or array privatisation, but this case:
(the `do jt = 1, kiter(ji,jj), 1` loop)