From c5fb952ba0a5eacf7033560b91a03f7a82965325 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 18 May 2022 22:27:39 -0700 Subject: [PATCH 01/16] Use the right LHS type in unary reductions --- src/cunumeric/unary/unary_red.cc | 32 +---- src/cunumeric/unary/unary_red.cu | 100 ++-------------- src/cunumeric/unary/unary_red_omp.cc | 68 +---------- src/cunumeric/unary/unary_red_template.inl | 44 +------ src/cunumeric/unary/unary_red_util.h | 132 ++++++++++++++------- 5 files changed, 104 insertions(+), 272 deletions(-) diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index aef90964aa..115587b289 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -26,23 +26,10 @@ template struct UnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - lhs.reduce(point, rhs[point]); - } - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, + AccessorRO rhs, const Rect& rect, const Pitches& pitches, int collapsed_dim, @@ -50,7 +37,7 @@ struct UnaryRedImplBody { { for (size_t idx = 0; idx < volume; ++idx) { auto point = pitches.unflatten(idx, rect.lo); - OP::template fold(lhs[point], rhs[point]); + lhs.reduce(point, OP::convert(point, collapsed_dim, rhs[point])); } } }; @@ -74,19 +61,6 @@ struct ArgRedImplBody { lhs.reduce(point, LHS(point[collapsed_dim], rhs[point])); } } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - OP::template fold(lhs[point], LHS(point[collapsed_dim], rhs[point])); - } - } }; /*static*/ void UnaryRedTask::cpu_variant(TaskContext& context) diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index 86003c00e5..1cc0d46536 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -38,7 +38,7 @@ static constexpr coord_t WARP_SIZE = 32; // have enough elements to be assigned to the threads, we also parallelize on // the collapsing domain. One exceptional case to this strategy is where the collapsing // dimension is the innermost one, in which case we prefer that dimension to the others -// in order to enjoy wrap coalescing. The maximum degree of such parallelism woudl be 32, +// in order to enjoy wrap coalescing. The maximum degree of such parallelism would be 32, // which is the size of a wrap. template struct ThreadBlock { @@ -203,9 +203,8 @@ std::ostream& operator<<(std::ostream& os, const ThreadBlocks& blocks) return os; } -template -static __device__ __forceinline__ Point local_reduce(CTOR ctor, - LHS& result, +template +static __device__ __forceinline__ Point local_reduce(LHS& result, AccessorRO in, LHS identity, const ThreadBlocks& blocks, @@ -218,7 +217,7 @@ static __device__ __forceinline__ Point local_reduce(CTOR ctor, if (!domain.contains(point)) return point; while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { - LHS value = ctor(point, in[point], collapsed_dim); + LHS value = OP::convert(point, collapsed_dim, in[point]); REDOP::template fold(result, value); blocks.next_point(point); } @@ -274,22 +273,7 @@ static __device__ __forceinline__ Point local_reduce(CTOR ctor, return point; } -template -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - reduce_with_rw_acc(AccessorRW out, - AccessorRO in, - LHS identity, - ThreadBlocks blocks, - Rect domain, - int32_t collapsed_dim) -{ - auto result = identity; - auto point = local_reduce( - CTOR{}, result, in, identity, blocks, domain, collapsed_dim); - if (result != identity) REDOP::template fold(out[point], result); -} - -template +template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduce_with_rd_acc(AccessorRD out, AccessorRO in, @@ -299,64 +283,17 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) int32_t collapsed_dim) { auto result = identity; - auto point = local_reduce( - CTOR{}, result, in, identity, blocks, domain, collapsed_dim); + auto point = + local_reduce(result, in, identity, blocks, domain, collapsed_dim); if (result != identity) out.reduce(point, result); } template struct UnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - using CTOR = ValueConstructor; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - auto Kernel = reduce_with_rd_acc; - auto stream = get_cached_stream(); - - ThreadBlocks blocks; - blocks.initialize(rect, collapsed_dim); - - blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); - CHECK_CUDA_STREAM(stream); - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - auto Kernel = reduce_with_rw_acc; - auto stream = get_cached_stream(); - - ThreadBlocks blocks; - blocks.initialize(rect, collapsed_dim); - - blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); - CHECK_CUDA_STREAM(stream); - } -}; - -template -struct ArgRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using RHS = legate_type_of; - using LHS = Argval; - using CTOR = ArgvalConstructor; + using LHS = typename OP::VAL; void operator()(AccessorRD lhs, AccessorRO rhs, @@ -365,26 +302,7 @@ struct ArgRedImplBody { int collapsed_dim, size_t volume) const { - auto Kernel = reduce_with_rd_acc; - auto stream = get_cached_stream(); - - ThreadBlocks blocks; - blocks.initialize(rect, collapsed_dim); - - blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); - CHECK_CUDA_STREAM(stream); - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - auto Kernel = reduce_with_rw_acc; + auto Kernel = reduce_with_rd_acc; auto stream = get_cached_stream(); ThreadBlocks blocks; diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cunumeric/unary/unary_red_omp.cc index 1cdb40b44c..6696c0f7b0 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cunumeric/unary/unary_red_omp.cc @@ -79,10 +79,10 @@ template struct UnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD lhs, - AccessorRO rhs, + AccessorRO rhs, const Rect& rect, const Pitches& pitches, int collapsed_dim, @@ -95,69 +95,7 @@ struct UnaryRedImplBody { for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { auto point = splitter.combine(o_idx, i_idx, rect.lo); - lhs.reduce(point, rhs[point]); - } - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - Splitter splitter; - auto split = splitter.split(rect, collapsed_dim); - -#pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) - for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - OP::template fold(lhs[point], rhs[point]); - } - } -}; - -template -struct ArgRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - using ARGVAL = Argval; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - Splitter splitter; - auto split = splitter.split(rect, collapsed_dim); - -#pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) - for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - lhs.reduce(point, ARGVAL(point[collapsed_dim], rhs[point])); - } - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - Splitter splitter; - auto split = splitter.split(rect, collapsed_dim); - -#pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) - for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - OP::template fold(lhs[point], ARGVAL(point[collapsed_dim], rhs[point])); + lhs.reduce(point, OP::convert(point, collapsed_dim, rhs[point])); } } }; diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index 309dec413d..1818860e65 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -37,7 +37,7 @@ struct UnaryRedImpl { void operator()(UnaryRedArgs& args) const { using OP = UnaryRedOp; - using VAL = legate_type_of; + using RHS = legate_type_of; Pitches pitches; auto rect = args.rhs.shape(); @@ -45,7 +45,7 @@ struct UnaryRedImpl { if (volume == 0) return; - auto rhs = args.rhs.read_accessor(rect); + auto rhs = args.rhs.read_accessor(rect); auto lhs = args.lhs.reduce_accessor(rect); UnaryRedImplBody()( @@ -61,52 +61,14 @@ struct UnaryRedImpl { } }; -template -struct ArgRedImpl { - template 1) && UnaryRedOp::valid>* = nullptr> - void operator()(UnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using VAL = legate_type_of; - using ARGVAL = Argval; - - Pitches pitches; - auto rect = args.rhs.shape(); - auto volume = pitches.flatten(rect); - - if (volume == 0) return; - - auto rhs = args.rhs.read_accessor(rect); - - auto lhs = args.lhs.reduce_accessor(rect); - ArgRedImplBody()(lhs, rhs, rect, pitches, args.collapsed_dim, volume); - } - - template ::valid>* = nullptr> - void operator()(UnaryRedArgs& args) const - { - assert(false); - } -}; - template struct UnaryRedDispatch { - template ::value>* = nullptr> + template void operator()(UnaryRedArgs& args) const { auto dim = std::max(1, args.rhs.dim()); return double_dispatch(dim, args.rhs.code(), UnaryRedImpl{}, args); } - template ::value>* = nullptr> - void operator()(UnaryRedArgs& args) const - { - auto dim = std::max(1, args.rhs.dim()); - return double_dispatch(dim, args.rhs.code(), ArgRedImpl{}, args); - } }; template diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index 233190731a..c72817d918 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -72,26 +72,6 @@ constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&.. return f.template operator()(std::forward(args)...); } -template -struct ValueConstructor { - __CUDA_HD__ inline constexpr T operator()(const Legion::Point&, - const T& value, - int32_t) const - { - return value; - } -}; - -template -struct ArgvalConstructor { - __CUDA_HD__ inline constexpr Argval operator()(const Legion::Point& point, - const T& value, - int32_t collapsed_dim) const - { - return Argval(point[collapsed_dim], value); - } -}; - template struct UnaryRedOp { static constexpr bool valid = false; @@ -101,13 +81,20 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = bool; using OP = Legion::ProdReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs != RHS(0); } }; @@ -120,13 +107,20 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = bool; using OP = Legion::SumReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) { - OP::template fold(rhs1, rhs2); + return rhs != RHS(0); } }; @@ -134,13 +128,20 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::MaxReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) { - OP::template fold(rhs1, rhs2); + return rhs; } }; @@ -153,13 +154,20 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::MinReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) { - OP::template fold(rhs1, rhs2); + return rhs; } }; @@ -172,13 +180,20 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::ProdReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs; } }; @@ -191,13 +206,20 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::SumReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) { - OP::template fold(rhs1, rhs2); + return rhs; } }; @@ -205,13 +227,22 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = Argval>; - using OP = ArgmaxReduction>; + using RHS = legate::legate_type_of; + using VAL = Argval; + using OP = ArgmaxReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + int32_t collapsed_dim, + const RHS& rhs) { - OP::template fold(rhs1, rhs2); + return VAL(point[collapsed_dim], rhs); } }; @@ -224,13 +255,22 @@ template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = Argval>; - using OP = ArgminReduction>; + using RHS = legate::legate_type_of; + using VAL = Argval; + using OP = ArgminReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + int32_t collapsed_dim, + const RHS& rhs) { - OP::template fold(rhs1, rhs2); + return VAL(point[collapsed_dim], rhs); } }; From 16eb415dfa10e60bcd38eb200993e01f19ce3767 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 18 May 2022 23:08:25 -0700 Subject: [PATCH 02/16] Fix scalar reductions with any and all --- src/cunumeric/unary/scalar_unary_red.cc | 74 +------------ src/cunumeric/unary/scalar_unary_red.cu | 104 ++---------------- src/cunumeric/unary/scalar_unary_red_omp.cc | 85 ++------------ .../unary/scalar_unary_red_template.inl | 66 +---------- src/cunumeric/unary/unary_red_util.h | 12 ++ 5 files changed, 37 insertions(+), 304 deletions(-) diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc index 2e01700e80..6abc2cb0ec 100644 --- a/src/cunumeric/unary/scalar_unary_red.cc +++ b/src/cunumeric/unary/scalar_unary_red.cc @@ -26,11 +26,11 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(OP func, AccessorRD out, - AccessorRO in, + AccessorRO in, const Rect& rect, const Pitches& pitches, bool dense) const @@ -39,82 +39,18 @@ struct ScalarUnaryRedImplBody { const size_t volume = rect.volume(); if (dense) { auto inptr = in.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) OP::template fold(result, inptr[idx]); + for (size_t idx = 0; idx < volume; ++idx) + OP::template fold(result, OP::convert(inptr[idx])); } else { for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - OP::template fold(result, in[p]); + OP::template fold(result, OP::convert(in[p])); } } out.reduce(0, result); } }; -namespace detail { - -template -void logical_operator(bool& result, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) -{ - const size_t volume = rect.volume(); - if (dense) { - auto inptr = in.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) { - bool tmp1 = detail::convert_to_bool(inptr[idx]); - OP::template fold(result, tmp1); - } - } else { - for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches.unflatten(idx, rect.lo); - bool tmp1 = detail::convert_to_bool(in[p]); - OP::template fold(result, tmp1); - } - } -} - -} // namespace detail - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - auto result = LG_OP::identity; - detail::logical_operator(result, in, rect, pitches, dense); - out.reduce(0, result); - } -}; - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - auto result = LG_OP::identity; - detail::logical_operator(result, in, rect, pitches, dense); - out.reduce(0, result); - } -}; - template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu index 796220ee5b..a62ba95c11 100644 --- a/src/cunumeric/unary/scalar_unary_red.cu +++ b/src/cunumeric/unary/scalar_unary_red.cu @@ -23,7 +23,8 @@ namespace cunumeric { using namespace Legion; -template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduction_kernel(size_t volume, - Op op, + OP, + LG_OP, Output out, ReadAcc in, Pitches pitches, @@ -44,7 +46,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (offset < volume) { auto point = pitches.unflatten(offset, origin); - Op::template fold(value, in[point]); + LG_OP::template fold(value, OP::convert(in[point])); } } // Every thread in the thread block must participate in the exchange to get correct results @@ -93,11 +95,12 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; + using LHS = typename OP::VAL; void operator()(OP func, AccessorRD out, - AccessorRO in, + AccessorRO in, const Rect& rect, const Pitches& pitches, bool dense) const @@ -107,15 +110,15 @@ struct ScalarUnaryRedImplBody { const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; DeferredReduction result; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(VAL); + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(LHS); if (blocks >= MAX_REDUCTION_CTAS) { const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; reduction_kernel<<>>( - volume, typename OP::OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity); + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity); } else reduction_kernel<<>>( - volume, typename OP::OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity); + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity); copy_kernel<<<1, 1, 0, stream>>>(result, out); CHECK_CUDA_STREAM(stream); @@ -156,91 +159,6 @@ struct ScalarUnaryRedImplBody -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) logical_kernel( - size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, VAL identity) -{ - auto value = identity; - for (size_t idx = 0; idx < iters; idx++) { - const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (offset < volume) { - auto point = pitches.unflatten(offset, origin); - Op::template fold(value, convert_to_bool(in[point])); - } - } - // Every thread in the thread block must participate in the exchange to get correct results - reduce_output(out, value); -} - -template -void logical_operator_gpu(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) -{ - auto stream = get_cached_stream(); - - const size_t volume = rect.volume(); - const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - DeferredReduction result; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool); - - if (blocks >= MAX_REDUCTION_CTAS) { - const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; - logical_kernel<<>>( - volume, result, in, pitches, rect.lo, iters, LG_OP::identity); - } else - logical_kernel<<>>( - volume, result, in, pitches, rect.lo, 1, LG_OP::identity); - - copy_kernel<<<1, 1, 0, stream>>>(result, out); - CHECK_CUDA_STREAM(stream); -} - -} // namespace detail - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_gpu>(out, in, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_gpu>(out, in, rect, pitches, dense); - } -}; - template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc index 6d190cdf51..f575d3626c 100644 --- a/src/cunumeric/unary/scalar_unary_red_omp.cc +++ b/src/cunumeric/unary/scalar_unary_red_omp.cc @@ -29,11 +29,12 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; + using LHS = typename OP::VAL; void operator()(OP func, AccessorRD out, - AccessorRO in, + AccessorRO in, const Rect& rect, const Pitches& pitches, bool dense) const @@ -41,7 +42,7 @@ struct ScalarUnaryRedImplBody { auto result = LG_OP::identity; const size_t volume = rect.volume(); const auto max_threads = omp_get_max_threads(); - ThreadLocalStorage locals(max_threads); + ThreadLocalStorage locals(max_threads); for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity; if (dense) { auto inptr = in.ptr(rect); @@ -49,7 +50,8 @@ struct ScalarUnaryRedImplBody { { const int tid = omp_get_thread_num(); #pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) OP::template fold(locals[tid], inptr[idx]); + for (size_t idx = 0; idx < volume; ++idx) + OP::template fold(locals[tid], OP::convert(inptr[idx])); } } else { #pragma omp parallel @@ -58,7 +60,7 @@ struct ScalarUnaryRedImplBody { #pragma omp for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - OP::template fold(locals[tid], in[p]); + OP::template fold(locals[tid], OP::convert(in[p])); } } } @@ -111,79 +113,6 @@ struct ScalarUnaryRedImplBody -void logical_operator_omp(AccessorRO in, - AccessorRD out, - const Rect& rect, - const Pitches& pitches, - bool dense) -{ - const size_t volume = rect.volume(); - const auto max_threads = omp_get_max_threads(); - ThreadLocalStorage locals(max_threads); - for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity; - if (dense) { - auto inptr = in.ptr(rect); -#pragma omp parallel - { - const int tid = omp_get_thread_num(); -#pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) - OP::template fold(locals[tid], convert_to_bool(inptr[idx])); - } - } else { -#pragma omp parallel - { - const int tid = omp_get_thread_num(); -#pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches.unflatten(idx, rect.lo); - OP::template fold(locals[tid], convert_to_bool(in[p])); - } - } - } - - for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]); -} - -} // namespace detail - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_omp(in, out, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_omp(in, out, rect, pitches, dense); - } -}; - template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index d11866cade..54baa5559f 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -34,7 +34,7 @@ struct ScalarUnaryRedImpl { { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; auto rect = args.in.shape(); @@ -44,7 +44,7 @@ struct ScalarUnaryRedImpl { if (0 == volume) return; auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); + auto in = args.in.read_accessor(rect); #ifndef LEGION_BOUNDS_CHECKS // Check to see if this is dense or not @@ -66,68 +66,6 @@ struct ScalarUnaryRedImpl { } }; -template -struct ScalarUnaryRedImpl { - template - void operator()(ScalarUnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - auto rect = args.in.shape(); - - Pitches pitches; - size_t volume = pitches.flatten(rect); - - if (0 == volume) return; - - auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - // Check to see if this is dense or not - bool dense = in.accessor.is_dense_row_major(rect); -#else - // No dense execution if we're doing bounds checks - bool dense = false; -#endif - - ScalarUnaryRedImplBody()(out, in, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImpl { - template - void operator()(ScalarUnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - auto rect = args.in.shape(); - - Pitches pitches; - size_t volume = pitches.flatten(rect); - - if (0 == volume) return; - - auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - // Check to see if this is dense or not - bool dense = in.accessor.is_dense_row_major(rect); -#else - // No dense execution if we're doing bounds checks - bool dense = false; -#endif - - ScalarUnaryRedImplBody()(out, in, rect, pitches, dense); - } -}; - template struct ScalarUnaryRedImpl { template diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index c72817d918..91eca97a7e 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -96,6 +96,8 @@ struct UnaryRedOp { { return rhs != RHS(0); } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); } }; template <> @@ -122,6 +124,8 @@ struct UnaryRedOp { { return rhs != RHS(0); } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); } }; template @@ -143,6 +147,8 @@ struct UnaryRedOp { { return rhs; } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template <> @@ -169,6 +175,8 @@ struct UnaryRedOp { { return rhs; } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template <> @@ -195,6 +203,8 @@ struct UnaryRedOp { { return rhs; } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template <> @@ -221,6 +231,8 @@ struct UnaryRedOp { { return rhs; } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template From 3d2c2d55e810c701caade23bce65c32542c1ba7f Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 00:28:09 -0700 Subject: [PATCH 03/16] Remove unnecessary template specializations --- src/cunumeric/unary/unary_red_util.h | 30 +++++----------------------- 1 file changed, 5 insertions(+), 25 deletions(-) diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index 91eca97a7e..48d4256e33 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -79,7 +79,7 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; using RHS = legate::legate_type_of; using VAL = bool; @@ -100,14 +100,9 @@ struct UnaryRedOp { __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); } }; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; -}; - template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; using RHS = legate::legate_type_of; using VAL = bool; @@ -130,7 +125,7 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; using RHS = legate::legate_type_of; using VAL = RHS; @@ -151,14 +146,9 @@ struct UnaryRedOp { __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; -}; - template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; using RHS = legate::legate_type_of; using VAL = RHS; @@ -179,14 +169,9 @@ struct UnaryRedOp { __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; -}; - template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; using RHS = legate::legate_type_of; using VAL = RHS; @@ -207,11 +192,6 @@ struct UnaryRedOp { __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; -}; - template struct UnaryRedOp { static constexpr bool valid = true; From e456cd5cf5563cbb2e956be9eb46e85ac1effdd2 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 00:35:50 -0700 Subject: [PATCH 04/16] Remove specializations for count_nonzero and use the template instead --- src/cunumeric/unary/scalar_unary_red.cc | 27 ----------- src/cunumeric/unary/scalar_unary_red.cu | 48 ------------------- src/cunumeric/unary/scalar_unary_red_omp.cc | 41 ---------------- .../unary/scalar_unary_red_template.inl | 39 +-------------- src/cunumeric/unary/unary_red_util.h | 25 ++++++++++ 5 files changed, 26 insertions(+), 154 deletions(-) diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc index 6abc2cb0ec..a084943fe3 100644 --- a/src/cunumeric/unary/scalar_unary_red.cc +++ b/src/cunumeric/unary/scalar_unary_red.cc @@ -87,33 +87,6 @@ struct ScalarUnaryRedImplBody -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - { - auto result = LG_OP::identity; - const size_t volume = rect.volume(); - if (dense) { - auto inptr = in.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) result += inptr[idx] != VAL(0); - } else { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - result += in[point] != VAL(0); - } - } - out.reduce(0, result); - } -}; - /*static*/ void ScalarUnaryRedTask::cpu_variant(TaskContext& context) { scalar_unary_red_template(context); diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu index a62ba95c11..e72b8d1b11 100644 --- a/src/cunumeric/unary/scalar_unary_red.cu +++ b/src/cunumeric/unary/scalar_unary_red.cu @@ -69,22 +69,6 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) con reduce_output(out, value); } -template -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) count_nonzero_kernel( - size_t volume, Output out, AccessorRO in, Pitches pitches, Point origin, size_t iters) -{ - uint64_t value = 0; - for (size_t idx = 0; idx < iters; idx++) { - const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (offset < volume) { - auto point = pitches.unflatten(offset, origin); - SumReduction::fold(value, in[point] != VAL(0)); - } - } - // Every thread in the thread block must participate in the exchange to get correct results - reduce_output(out, value); -} - template static __global__ void __launch_bounds__(1, 1) copy_kernel(Buffer result, RedAcc out) { @@ -159,38 +143,6 @@ struct ScalarUnaryRedImplBody -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - { - auto stream = get_cached_stream(); - - const size_t volume = rect.volume(); - const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - DeferredReduction> result; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t); - - if (blocks >= MAX_REDUCTION_CTAS) { - const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; - count_nonzero_kernel<<>>( - volume, result, in, pitches, rect.lo, iters); - } else - count_nonzero_kernel<<>>( - volume, result, in, pitches, rect.lo, 1); - - copy_kernel<<<1, 1, 0, stream>>>(result, out); - CHECK_CUDA_STREAM(stream); - } -}; - /*static*/ void ScalarUnaryRedTask::gpu_variant(TaskContext& context) { scalar_unary_red_template(context); diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc index f575d3626c..db50f15427 100644 --- a/src/cunumeric/unary/scalar_unary_red_omp.cc +++ b/src/cunumeric/unary/scalar_unary_red_omp.cc @@ -113,47 +113,6 @@ struct ScalarUnaryRedImplBody -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - { - auto result = LG_OP::identity; - const size_t volume = rect.volume(); - const auto max_threads = omp_get_max_threads(); - ThreadLocalStorage locals(max_threads); - for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = 0; - if (dense) { - auto inptr = in.ptr(rect); -#pragma omp parallel - { - const int tid = omp_get_thread_num(); -#pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) locals[tid] += inptr[idx] != VAL(0); - } - } else { -#pragma omp parallel - { - const int tid = omp_get_thread_num(); -#pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - locals[tid] += in[point] != VAL(0); - } - } - } - - for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]); - } -}; - /*static*/ void ScalarUnaryRedTask::omp_variant(TaskContext& context) { scalar_unary_red_template(context); diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index 54baa5559f..d750ebbb8b 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -98,38 +98,6 @@ struct ScalarUnaryRedImpl { } }; -template -struct ScalarUnaryRedImpl { - template - void operator()(ScalarUnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - auto rect = args.in.shape(); - - Pitches pitches; - size_t volume = pitches.flatten(rect); - - if (0 == volume) return; - - auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - // Check to see if this is dense or not - bool dense = in.accessor.is_dense_row_major(rect); -#else - // No dense execution if we're doing bounds checks - bool dense = false; -#endif - - ScalarUnaryRedImplBody()( - out, in, rect, pitches, dense); - } -}; - template struct ScalarUnaryRedDispatch { template ::value>* = nullptr> @@ -156,12 +124,7 @@ static void scalar_unary_red_template(TaskContext& context) ScalarUnaryRedArgs args{ context.reductions()[0], inputs[0], scalars[0].value(), std::move(extra_args)}; - if (args.op_code == UnaryRedCode::COUNT_NONZERO) { - auto dim = std::max(1, args.in.dim()); - double_dispatch( - dim, args.in.code(), ScalarUnaryRedImpl{}, args); - } else - op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); + op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); } } // namespace cunumeric diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index 48d4256e33..b385a5907c 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -58,6 +58,8 @@ constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&.. return f.template operator()(std::forward(args)...); case UnaryRedCode::CONTAINS: return f.template operator()(std::forward(args)...); + case UnaryRedCode::COUNT_NONZERO: + return f.template operator()(std::forward(args)...); case UnaryRedCode::MAX: return f.template operator()(std::forward(args)...); case UnaryRedCode::MIN: @@ -123,6 +125,29 @@ struct UnaryRedOp { __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); } }; +template +struct UnaryRedOp { + static constexpr bool valid = true; + + using RHS = legate::legate_type_of; + using VAL = uint64_t; + using OP = Legion::SumReduction; + + template + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return static_cast(rhs != RHS(0)); + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return static_cast(rhs != RHS(0)); } +}; + template struct UnaryRedOp { static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; From a147e1200d852b426055e0c904c366351a589744 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 10:30:52 -0700 Subject: [PATCH 05/16] More tests for unary reductions --- tests/integration/test_logical.py | 76 ++++++++++++------------------- tests/integration/test_nonzero.py | 11 ++--- 2 files changed, 34 insertions(+), 53 deletions(-) diff --git a/tests/integration/test_logical.py b/tests/integration/test_logical.py index 1ef81d48c2..b84148e634 100644 --- a/tests/integration/test_logical.py +++ b/tests/integration/test_logical.py @@ -18,53 +18,35 @@ import cunumeric as num - -def test_any_basic(): - assert num.array_equal(num.any([-1, 4, 5]), np.any([-1, 4, 5])) - - x = [5, 10, 0, 100] - cx = num.array(x) - assert num.array_equal(num.any(cx), np.any(x)) - - y = [[0, 0], [0, 0]] - cy = num.array(y) - assert num.array_equal(num.any(cy), np.any(y)) - - -def test_any_axis(): - x = np.array([[True, True, False], [True, True, True]]) - cx = num.array(x) - - assert num.array_equal(num.any(cx), np.any(x)) - assert num.array_equal(num.any(cx, axis=0), np.any(x, axis=0)) - - -def test_all_basic(): - assert num.array_equal(num.all([-1, 4, 5]), np.all([-1, 4, 5])) - - x = [5, 10, 0, 100] - cx = num.array(x) - assert num.array_equal(num.all(cx), np.all(x)) - - y = [[0, 0], [0, 0]] - cy = num.array(y) - assert num.array_equal(num.all(cy), np.all(y)) - - -def test_all_axis(): - x = np.array([[True, True, False], [True, True, True]]) - cx = num.array(x) - - assert num.array_equal(num.all(cx), np.all(x)) - assert num.array_equal(num.all(cx, axis=0), np.all(x, axis=0)) - - -def test_nan(): - assert num.equal(num.all(num.nan), np.all(np.nan)) - assert num.equal(num.any(num.nan), np.any(np.nan)) - - assert num.array_equal(num.all(num.nan), np.all(np.nan)) - assert num.array_equal(num.any(num.nan), np.any(np.nan)) +INPUTS = ( + [-1, 4, 5], + [5, 10, 0, 100], + [[0, 0], [0, 0]], + [[True, True, False], [True, True, True]], + [[False, True, False]], + [[0.0, 1.0, 0.0]], + [[1, 0 + 1j, 1 + 1j]], + [[1, 0 + 1j, 0 + 0j]], + [np.nan], +) + + +@pytest.mark.parametrize("input", INPUTS) +def test_any_and_all(input): + in_np = np.array(input) + # cuNumeric doesn't support reductions for complex128 + if in_np.dtype.kind == "c": + in_np = in_np.astype("F") + in_num = num.array(in_np) + + for fn in ("any", "all"): + fn_np = getattr(np, fn) + fn_num = getattr(num, fn) + assert np.array_equal(fn_np(in_np), fn_num(in_num)) + for axis in range(in_num.ndim): + out_np = fn_np(in_np, axis=axis) + out_num = fn_num(in_num, axis=axis) + assert np.array_equal(out_np, out_num) @pytest.mark.skip diff --git a/tests/integration/test_nonzero.py b/tests/integration/test_nonzero.py index e09340c8be..27538ae003 100644 --- a/tests/integration/test_nonzero.py +++ b/tests/integration/test_nonzero.py @@ -98,12 +98,11 @@ def test_axis(): x_np, axis=(0, 1, 2) ) - # TODO: Put this back once we have per-axis count_nonzero - # for axis in range(3): - # assert_equal( - # num.count_nonzero(x, axis=axis), - # np.count_nonzero(x_np, axis=axis), - # ) + for axis in range(3): + assert_equal( + num.count_nonzero(x, axis=axis), + np.count_nonzero(x_np, axis=axis), + ) def test_deprecated_0d(): From 3f2769dd3e95aa4ad078d1a8c0db475038068f91 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 12:44:31 -0700 Subject: [PATCH 06/16] Remove unused ArgRedImplBody --- src/cunumeric/unary/unary_red.cc | 21 --------------------- src/cunumeric/unary/unary_red_template.inl | 3 --- 2 files changed, 24 deletions(-) diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index 115587b289..8b9b93942c 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -42,27 +42,6 @@ struct UnaryRedImplBody { } }; -template -struct ArgRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using RHS = legate_type_of; - using LHS = Argval; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - lhs.reduce(point, LHS(point[collapsed_dim], rhs[point])); - } - } -}; - /*static*/ void UnaryRedTask::cpu_variant(TaskContext& context) { unary_red_template(context); diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index 1818860e65..7283949642 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -26,9 +26,6 @@ using namespace legate; template struct UnaryRedImplBody; -template -struct ArgRedImplBody; - template struct UnaryRedImpl { template Date: Thu, 19 May 2022 12:45:34 -0700 Subject: [PATCH 07/16] Clean up the valid fields that were missed in the previous pass --- src/cunumeric/unary/unary_red_util.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index b385a5907c..75a8f86da5 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -242,7 +242,7 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; using RHS = legate::legate_type_of; using VAL = Argval; @@ -263,14 +263,9 @@ struct UnaryRedOp { } }; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; -}; - template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; using RHS = legate::legate_type_of; using VAL = Argval; @@ -291,9 +286,4 @@ struct UnaryRedOp { } }; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; -}; - } // namespace cunumeric From ee513263a97068398886b5af5b21553976fb0df3 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 13:22:34 -0700 Subject: [PATCH 08/16] Clean up spaghetti in unary reduction and correctly handle outputs of mismatching dtypes --- cunumeric/array.py | 186 +++++++++++++++----------------------------- cunumeric/module.py | 1 - 2 files changed, 62 insertions(+), 125 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 467c5d62e9..955739bc14 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -585,9 +585,8 @@ def __contains__(self, item): UnaryRedCode.CONTAINS, self, axis=None, - dtype=np.dtype(np.bool_), + dtype=bool, args=args, - check_types=False, ) def __copy__(self): @@ -1501,10 +1500,9 @@ def all( UnaryRedCode.ALL, self, axis=axis, - dst=out, + out=out, keepdims=keepdims, - dtype=np.dtype(np.bool_), - check_types=False, + dtype=bool, initial=initial, where=where, ) @@ -1537,10 +1535,9 @@ def any( UnaryRedCode.ANY, self, axis=axis, - dst=out, + out=out, keepdims=keepdims, - dtype=np.dtype(np.bool_), - check_types=False, + dtype=bool, initial=initial, where=where, ) @@ -1574,8 +1571,7 @@ def argmax(self, axis=None, out=None): self, axis=axis, dtype=np.dtype(np.int64), - dst=out, - check_types=False, + out=out, ) def argmin(self, axis=None, out=None): @@ -1607,8 +1603,7 @@ def argmin(self, axis=None, out=None): self, axis=axis, dtype=np.dtype(np.int64), - dst=out, - check_types=False, + out=out, ) def astype( @@ -2590,7 +2585,7 @@ def max( UnaryRedCode.MAX, self, axis=axis, - dst=out, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -2686,7 +2681,7 @@ def min( UnaryRedCode.MIN, self, axis=axis, - dst=out, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -2781,7 +2776,7 @@ def prod( UnaryRedCode.PROD, self_array, axis=axis, - dst=out, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -3054,7 +3049,7 @@ def sum( UnaryRedCode.SUM, self_array, axis=axis, - dst=out, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -3486,13 +3481,14 @@ def _perform_unary_reduction( src, axis=None, dtype=None, - dst=None, + out=None, keepdims=False, args=None, - check_types=True, initial=None, where=True, ): + where = convert_to_predicate_ndarray(where) + # TODO: Need to require initial to be given when the array is empty # or a where mask is given. if isinstance(where, ndarray): @@ -3516,121 +3512,63 @@ def _perform_unary_reduction( "(arg)max/min not supported for complex-type arrays" ) # Compute the output shape - if axis is not None: - to_reduce = set() - if type(axis) == int: - if axis < 0: - axis = len(src.shape) + axis - if axis < 0: - raise ValueError("Illegal 'axis' value") - elif axis >= src.ndim: - raise ValueError("Illegal 'axis' value") - to_reduce.add(axis) - axes = (axis,) - elif type(axis) == tuple: - for ax in axis: - if ax < 0: - ax = len(src.shape) + ax - if ax < 0: - raise ValueError("Illegal 'axis' value") - elif ax >= src.ndim: - raise ValueError("Illegal 'axis' value") - to_reduce.add(ax) - axes = axis - else: - raise TypeError( - "Illegal type passed for 'axis' argument " - + str(type(axis)) - ) - out_shape = () - for dim in range(len(src.shape)): - if dim in to_reduce: - if keepdims: - out_shape += (1,) - else: - out_shape += (src.shape[dim],) - else: - # Collapsing down to a single value in this case - out_shape = () - axes = None - # if src.size == 0: - # return nd - if dst is None: - if dtype is not None: - dst = ndarray( - shape=out_shape, - dtype=dtype, - inputs=(src, where), - ) - else: - dst = ndarray( - shape=out_shape, - dtype=src.dtype, - inputs=(src, where), - ) - else: - if dtype is not None and dtype != dst.dtype: - raise TypeError( - "Output array type does not match requested dtype" - ) - if dst.shape != out_shape: - raise TypeError( - "Output array shape " - + str(dst.shape) - + " does not match expected shape " - + str(out_shape) - ) - # Quick exit - if where is False: - return dst - if check_types and src.dtype != dst.dtype: - out_dtype = cls.find_common_type(src, dst) - if src.dtype != out_dtype: - temp = ndarray( - src.shape, - dtype=out_dtype, - inputs=(src, where), - ) - temp._thunk.convert(src._thunk) - src = temp - if dst.dtype != out_dtype: - temp = ndarray( - dst.shape, - dtype=out_dtype, - inputs=(src, where), - ) + axes = axis + if axes is None: + axes = tuple(range(src.ndim)) + elif not isinstance(axes, tuple): + axes = (axes,) - temp._thunk.unary_reduction( - op, - src._thunk, - cls._get_where_thunk(where, dst.shape), - axes, - keepdims, - args, - initial, - ) - dst._thunk.convert(temp._thunk) - else: - dst._thunk.unary_reduction( - op, - src._thunk, - cls._get_where_thunk(where, dst.shape), - axes, - keepdims, - args, - initial, - ) + if any(type(ax) != int for ax in axes): + raise TypeError( + "'axis' must be an integer or a tuple of integers, " + f"but got {axis}" + ) + + axes = tuple(ax + src.ndim if ax < 0 else ax for ax in axes) + + if any(ax < 0 for ax in axes): + raise ValueError(f"Invalid 'axis' value {axis}") + + out_shape = () + for axis in range(src.ndim): + if axis not in axes: + out_shape += (src.shape[axis],) + elif keepdims: + out_shape += (1,) + + # If no output dtype is given, the output has the same dtype as the + # input + if dtype is None: + dtype = src.dtype + + if out is None: + out = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where)) + elif out.shape != out_shape: + raise ValueError( + f"the output shape mismatch: expected {out_shape} but got " + f"{out.shape}" + ) + + if out.dtype == dtype: + result = out else: - dst._thunk.unary_reduction( + result = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where)) + + if where: + result._thunk.unary_reduction( op, src._thunk, - cls._get_where_thunk(where, dst.shape), + cls._get_where_thunk(where, result.shape), axes, keepdims, args, initial, ) - return dst + + if result is not out: + out._thunk.convert(result._thunk) + + return out @classmethod def _perform_binary_reduction( diff --git a/cunumeric/module.py b/cunumeric/module.py index 44f2037802..89adda9e00 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4524,7 +4524,6 @@ def count_nonzero(a, axis=None): a, axis=axis, dtype=np.dtype(np.uint64), - check_types=False, ) From dafa481fa4be22e44ebc251628d23f25b5922c82 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 13:23:45 -0700 Subject: [PATCH 09/16] Add more tests for any and all --- tests/integration/test_logical.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/integration/test_logical.py b/tests/integration/test_logical.py index b84148e634..4eda4388e6 100644 --- a/tests/integration/test_logical.py +++ b/tests/integration/test_logical.py @@ -17,6 +17,7 @@ import pytest import cunumeric as num +from legate.core import LEGATE_MAX_DIM INPUTS = ( [-1, 4, 5], @@ -49,6 +50,31 @@ def test_any_and_all(input): assert np.array_equal(out_np, out_num) +@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +def test_nd_inputs(ndim): + shape = (3,) * ndim + in_np = np.random.random(shape) + in_num = num.array(in_np) + + for fn in ("any", "all"): + fn_np = getattr(np, fn) + fn_num = getattr(num, fn) + for axis in range(in_num.ndim): + out_np = fn_np(in_np, axis=axis) + out_num = fn_num(in_num, axis=axis) + assert np.array_equal(out_np, out_num) + + out_np = np.empty(out_np.shape, dtype="D") + out_num = num.empty(out_num.shape, dtype="D") + fn_np(in_np, axis=axis, out=out_np) + fn_num(in_num, axis=axis, out=out_num) + assert np.array_equal(out_np, out_num) + + out_np = fn_np(in_np[1:], axis=axis) + out_num = fn_num(in_num[1:], axis=axis) + assert np.array_equal(out_np, out_num) + + @pytest.mark.skip def test_where(): x = np.array([[True, True, False], [True, True, True]]) From 0983a85268d3ec45a7907e04bd342e4c87611909 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 15:02:57 -0700 Subject: [PATCH 10/16] Add some missing add_boilerplates calls --- cunumeric/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 955739bc14..20706502aa 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1542,6 +1542,7 @@ def any( where=where, ) + @add_boilerplate() def argmax(self, axis=None, out=None): """a.argmax(axis=None, out=None) @@ -1574,6 +1575,7 @@ def argmax(self, axis=None, out=None): out=out, ) + @add_boilerplate() def argmin(self, axis=None, out=None): """a.argmin(axis=None, out=None) @@ -3487,8 +3489,6 @@ def _perform_unary_reduction( initial=None, where=True, ): - where = convert_to_predicate_ndarray(where) - # TODO: Need to require initial to be given when the array is empty # or a where mask is given. if isinstance(where, ndarray): From 868b68b7adafeab1060ae5f3cffacb0b980207f2 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 15:28:28 -0700 Subject: [PATCH 11/16] Give more accurate names to some of the template parameters --- src/cunumeric/unary/scalar_unary_red.cc | 6 +++--- src/cunumeric/unary/scalar_unary_red.cu | 14 +++++++------- src/cunumeric/unary/scalar_unary_red_omp.cc | 6 +++--- src/cunumeric/unary/scalar_unary_red_template.inl | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc index a084943fe3..386a06eba1 100644 --- a/src/cunumeric/unary/scalar_unary_red.cc +++ b/src/cunumeric/unary/scalar_unary_red.cc @@ -55,17 +55,17 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD out, - AccessorRO in, + AccessorRO in, const Store& to_find_scalar, const Rect& rect, const Pitches& pitches, bool dense) const { auto result = LG_OP::identity; - const auto to_find = to_find_scalar.scalar(); + const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); if (dense) { auto inptr = in.ptr(rect); diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu index e72b8d1b11..5b3500162e 100644 --- a/src/cunumeric/unary/scalar_unary_red.cu +++ b/src/cunumeric/unary/scalar_unary_red.cu @@ -29,7 +29,7 @@ template + typename LHS> static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduction_kernel(size_t volume, OP, @@ -39,7 +39,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) Pitches pitches, Point origin, size_t iters, - VAL identity) + LHS identity) { auto value = identity; for (size_t idx = 0; idx < iters; idx++) { @@ -53,9 +53,9 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduce_output(out, value); } -template +template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) contains_kernel( - size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, VAL to_find) + size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, RHS to_find) { bool value = false; for (size_t idx = 0; idx < iters; idx++) { @@ -113,10 +113,10 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD out, - AccessorRO in, + AccessorRO in, const Store& to_find_scalar, const Rect& rect, const Pitches& pitches, @@ -124,7 +124,7 @@ struct ScalarUnaryRedImplBody(); + const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; DeferredReduction> result; diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc index db50f15427..4621a2152d 100644 --- a/src/cunumeric/unary/scalar_unary_red_omp.cc +++ b/src/cunumeric/unary/scalar_unary_red_omp.cc @@ -73,17 +73,17 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD out, - AccessorRO in, + AccessorRO in, const Store& to_find_scalar, const Rect& rect, const Pitches& pitches, bool dense) const { auto result = LG_OP::identity; - const auto to_find = to_find_scalar.scalar(); + const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); const auto max_threads = omp_get_max_threads(); ThreadLocalStorage locals(max_threads); diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index d750ebbb8b..a482854f2f 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -73,7 +73,7 @@ struct ScalarUnaryRedImpl { { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; auto rect = args.in.shape(); @@ -83,7 +83,7 @@ struct ScalarUnaryRedImpl { if (0 == volume) return; auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); + auto in = args.in.read_accessor(rect); #ifndef LEGION_BOUNDS_CHECKS // Check to see if this is dense or not From dbb22229a813684621c0d0228f9171120e576b0b Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 21:04:59 -0700 Subject: [PATCH 12/16] Handle 'dtype' correctly in sum and prod --- cunumeric/_ufunc/ufunc.py | 6 +--- cunumeric/array.py | 46 +++++++++++++++++------- cunumeric/module.py | 2 +- tests/integration/test_reduction_axis.py | 7 ++-- 4 files changed, 40 insertions(+), 21 deletions(-) diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 899f9728d9..64dec6b61b 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -663,10 +663,6 @@ def reduce( raise NotImplementedError( f"reduction for {self} is not yet implemented" ) - if out is not None: - raise NotImplementedError( - "reduction for {self} does not take an `out` argument" - ) if not isinstance(where, bool) or not where: raise NotImplementedError( "the 'where' keyword is not yet supported" @@ -682,7 +678,7 @@ def reduce( array, axis=axis, dtype=dtype, - # out=out, + out=out, keepdims=keepdims, initial=initial, where=where, diff --git a/cunumeric/array.py b/cunumeric/array.py index 20706502aa..393eb002ce 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -585,7 +585,7 @@ def __contains__(self, item): UnaryRedCode.CONTAINS, self, axis=None, - dtype=bool, + res_dtype=bool, args=args, ) @@ -1500,9 +1500,9 @@ def all( UnaryRedCode.ALL, self, axis=axis, + res_dtype=bool, out=out, keepdims=keepdims, - dtype=bool, initial=initial, where=where, ) @@ -1535,9 +1535,9 @@ def any( UnaryRedCode.ANY, self, axis=axis, + res_dtype=bool, out=out, keepdims=keepdims, - dtype=bool, initial=initial, where=where, ) @@ -1571,7 +1571,7 @@ def argmax(self, axis=None, out=None): UnaryRedCode.ARGMAX, self, axis=axis, - dtype=np.dtype(np.int64), + res_dtype=np.dtype(np.int64), out=out, ) @@ -1604,7 +1604,7 @@ def argmin(self, axis=None, out=None): UnaryRedCode.ARGMIN, self, axis=axis, - dtype=np.dtype(np.int64), + res_dtype=np.dtype(np.int64), out=out, ) @@ -2778,6 +2778,7 @@ def prod( UnaryRedCode.PROD, self_array, axis=axis, + dtype=dtype, out=out, keepdims=keepdims, initial=initial, @@ -3051,6 +3052,7 @@ def sum( UnaryRedCode.SUM, self_array, axis=axis, + dtype=dtype, out=out, keepdims=keepdims, initial=initial, @@ -3483,12 +3485,28 @@ def _perform_unary_reduction( src, axis=None, dtype=None, + res_dtype=None, out=None, keepdims=False, args=None, initial=None, where=True, ): + # When 'res_dtype' is not None, the input and output of the reduction + # have different types. Such reduction operators don't take a dtype of + # the accumulator + if res_dtype is not None: + assert dtype is None + dtype = src.dtype + else: + # If 'dtype' exists, that determines both the accumulation dtype + # and the output dtype + if dtype is not None: + res_dtype = dtype + else: + dtype = src.dtype + res_dtype = src.dtype + # TODO: Need to require initial to be given when the array is empty # or a where mask is given. if isinstance(where, ndarray): @@ -3536,23 +3554,25 @@ def _perform_unary_reduction( elif keepdims: out_shape += (1,) - # If no output dtype is given, the output has the same dtype as the - # input - if dtype is None: - dtype = src.dtype - if out is None: - out = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where)) + out = ndarray( + shape=out_shape, dtype=res_dtype, inputs=(src, where) + ) elif out.shape != out_shape: raise ValueError( f"the output shape mismatch: expected {out_shape} but got " f"{out.shape}" ) - if out.dtype == dtype: + if dtype != src.dtype: + src = src.astype(dtype) + + if out.dtype == res_dtype: result = out else: - result = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where)) + result = ndarray( + shape=out_shape, dtype=res_dtype, inputs=(src, where) + ) if where: result._thunk.unary_reduction( diff --git a/cunumeric/module.py b/cunumeric/module.py index 89adda9e00..3486b7d4d9 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4522,8 +4522,8 @@ def count_nonzero(a, axis=None): return ndarray._perform_unary_reduction( UnaryRedCode.COUNT_NONZERO, a, + res_dtype=np.dtype(np.uint64), axis=axis, - dtype=np.dtype(np.uint64), ) diff --git a/tests/integration/test_reduction_axis.py b/tests/integration/test_reduction_axis.py index bfe67a41cc..7b6ff05553 100644 --- a/tests/integration/test_reduction_axis.py +++ b/tests/integration/test_reduction_axis.py @@ -21,8 +21,8 @@ import cunumeric as cn -def _sum(shape, axis, lib): - return lib.ones(shape).sum(axis=axis) +def _sum(shape, axis, lib, dtype=None): + return lib.ones(shape).sum(axis=axis, dtype=dtype) # Try various non-square shapes, to nudge the core towards trying many @@ -31,6 +31,9 @@ def _sum(shape, axis, lib): @pytest.mark.parametrize("shape", permutations((3, 4, 5)), ids=str) def test_3d(shape, axis): assert np.array_equal(_sum(shape, axis, np), _sum(shape, axis, cn)) + assert np.array_equal( + _sum(shape, axis, np, dtype="D"), _sum(shape, axis, cn, dtype="D") + ) if __name__ == "__main__": From 8341ded03dc7a42c7cd0b839ff74b83c8181db26 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 22:05:55 -0700 Subject: [PATCH 13/16] Fix the plumbing code on the Python side for argmin and argmax --- cunumeric/array.py | 30 ++++++++++++------------------ cunumeric/deferred.py | 30 +++++++++++++++--------------- cunumeric/module.py | 22 ++++++++++++---------- 3 files changed, 39 insertions(+), 43 deletions(-) diff --git a/cunumeric/array.py b/cunumeric/array.py index 393eb002ce..5dbd49ab40 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -1543,7 +1543,7 @@ def any( ) @add_boilerplate() - def argmax(self, axis=None, out=None): + def argmax(self, axis=None, out=None, keepdims=False): """a.argmax(axis=None, out=None) Return indices of the maximum values along the given axis. @@ -1559,24 +1559,21 @@ def argmax(self, axis=None, out=None): Multiple GPUs, Multiple CPUs """ - if self.size == 1: - return 0 - if axis is None: - axis = self.ndim - 1 - elif type(axis) != int: - raise TypeError("'axis' argument for argmax must be an 'int'") - elif axis < 0 or axis >= self.ndim: - raise TypeError("invalid 'axis' argument for argmax " + str(axis)) + if out is not None and out.dtype != np.int64: + raise ValueError("output array must have int64 dtype") + if axis is not None and not isinstance(axis, int): + raise ValueError("axis must be an integer") return self._perform_unary_reduction( UnaryRedCode.ARGMAX, self, axis=axis, res_dtype=np.dtype(np.int64), out=out, + keepdims=keepdims, ) @add_boilerplate() - def argmin(self, axis=None, out=None): + def argmin(self, axis=None, out=None, keepdims=False): """a.argmin(axis=None, out=None) Return indices of the minimum values along the given axis. @@ -1592,20 +1589,17 @@ def argmin(self, axis=None, out=None): Multiple GPUs, Multiple CPUs """ - if self.size == 1: - return 0 - if axis is None: - axis = self.ndim - 1 - elif type(axis) != int: - raise TypeError("'axis' argument for argmin must be an 'int'") - elif axis < 0 or axis >= self.ndim: - raise TypeError("invalid 'axis' argument for argmin " + str(axis)) + if out is not None and out.dtype != np.int64: + raise ValueError("output array must have int64 dtype") + if axis is not None and not isinstance(axis, int): + raise ValueError("axis must be an integer") return self._perform_unary_reduction( UnaryRedCode.ARGMIN, self, axis=axis, res_dtype=np.dtype(np.int64), out=out, + keepdims=keepdims, ) def astype( diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5c2c4904a5..a59101745e 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1686,6 +1686,14 @@ def unary_reduction( argred = op in (UnaryRedCode.ARGMAX, UnaryRedCode.ARGMIN) + if argred: + argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype) + lhs_array = self.runtime.create_empty_thunk( + lhs_array.shape, + dtype=argred_dtype, + inputs=[self], + ) + # See if we are doing reduction to a point or another region if lhs_array.size == 1: assert axes is None or len(axes) == ( @@ -1711,14 +1719,6 @@ def unary_reduction( task.execute() else: - if argred: - argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype) - lhs_array = self.runtime.create_empty_thunk( - lhs_array.shape, - dtype=argred_dtype, - inputs=[self], - ) - # Before we perform region reduction, make sure to have the lhs # initialized. If an initial value is given, we use it, otherwise # we use the identity of the reduction operator @@ -1758,13 +1758,13 @@ def unary_reduction( task.execute() - if argred: - self.unary_op( - UnaryOpCode.GETARG, - lhs_array, - True, - [], - ) + if argred: + self.unary_op( + UnaryOpCode.GETARG, + lhs_array, + True, + [], + ) def isclose(self, rhs1, rhs2, rtol, atol, equal_nan): assert not equal_nan diff --git a/cunumeric/module.py b/cunumeric/module.py index 3486b7d4d9..710f01f68b 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4411,7 +4411,7 @@ def partition(a, kth, axis=-1, kind="introselect", order=None): @add_boilerplate("a") -def argmax(a, axis=None, out=None): +def argmax(a, axis=None, out=None, *, keepdims=False): """ Returns the indices of the maximum values along an axis. @@ -4426,6 +4426,10 @@ def argmax(a, axis=None, out=None): out : ndarray, optional If provided, the result will be inserted into this array. It should be of the appropriate shape and dtype. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the array. Returns ------- @@ -4441,14 +4445,11 @@ def argmax(a, axis=None, out=None): -------- Multiple GPUs, Multiple CPUs """ - if out is not None: - if out.dtype != np.int64: - raise ValueError("output array must have int64 dtype") - return a.argmax(axis=axis, out=out) + return a.argmax(axis=axis, out=out, keepdims=keepdims) @add_boilerplate("a") -def argmin(a, axis=None, out=None): +def argmin(a, axis=None, out=None, *, keepdims=False): """ Returns the indices of the minimum values along an axis. @@ -4463,6 +4464,10 @@ def argmin(a, axis=None, out=None): out : ndarray, optional If provided, the result will be inserted into this array. It should be of the appropriate shape and dtype. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the array. Returns ------- @@ -4478,10 +4483,7 @@ def argmin(a, axis=None, out=None): -------- Multiple GPUs, Multiple CPUs """ - if out is not None: - if out is not None and out.dtype != np.int64: - raise ValueError("output array must have int64 dtype") - return a.argmin(axis=axis, out=out) + return a.argmin(axis=axis, out=out, keepdims=keepdims) # Counting From 4a658686dcd7cb9fff162f73997b8391d44083a0 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 19 May 2022 22:52:07 -0700 Subject: [PATCH 14/16] Implement scalar reduction for argmin and argmax --- cunumeric/array.py | 7 +- cunumeric/deferred.py | 2 + cunumeric/eager.py | 19 ++++-- cunumeric/thunk.py | 1 + src/cunumeric/arg.h | 4 +- src/cunumeric/arg.inl | 5 -- src/cunumeric/cuda_help.h | 29 +++++++++ src/cunumeric/unary/scalar_unary_red.cc | 32 ++++++++- src/cunumeric/unary/scalar_unary_red.cu | 65 ++++++++++++++++++- src/cunumeric/unary/scalar_unary_red.h | 14 +--- src/cunumeric/unary/scalar_unary_red_omp.cc | 46 ++++++++++++- .../unary/scalar_unary_red_template.inl | 14 ++-- src/cunumeric/unary/unary_red_util.h | 20 ++++++ .../{test_argmin.py => test_arg_reduce.py} | 36 +++++----- 14 files changed, 238 insertions(+), 56 deletions(-) rename tests/integration/{test_argmin.py => test_arg_reduce.py} (54%) diff --git a/cunumeric/array.py b/cunumeric/array.py index 5dbd49ab40..dcc85fd855 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3542,9 +3542,9 @@ def _perform_unary_reduction( raise ValueError(f"Invalid 'axis' value {axis}") out_shape = () - for axis in range(src.ndim): - if axis not in axes: - out_shape += (src.shape[axis],) + for dim in range(src.ndim): + if dim not in axes: + out_shape += (src.shape[dim],) elif keepdims: out_shape += (1,) @@ -3573,6 +3573,7 @@ def _perform_unary_reduction( op, src._thunk, cls._get_where_thunk(where, result.shape), + axis, axes, keepdims, args, diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index a59101745e..da125c301e 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1675,6 +1675,7 @@ def unary_reduction( op, src, where, + orig_axis, axes, keepdims, args, @@ -1713,6 +1714,7 @@ def unary_reduction( task.add_reduction(lhs_array.base, _UNARY_RED_TO_REDUCTION_OPS[op]) task.add_input(rhs_array.base) task.add_scalar_arg(op, ty.int32) + task.add_scalar_arg(rhs_array.shape, (ty.int64,)) self.add_arguments(task, args) diff --git a/cunumeric/eager.py b/cunumeric/eager.py index f84310ce46..2f85c4592e 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -700,13 +700,16 @@ def unary_op(self, op, rhs, where, args, multiout=None): else: raise RuntimeError("unsupported unary op " + str(op)) - def unary_reduction(self, op, rhs, where, axes, keepdims, args, initial): + def unary_reduction( + self, op, rhs, where, orig_axis, axes, keepdims, args, initial + ): self.check_eager_args(rhs) if self.deferred is not None: self.deferred.unary_reduction( op, rhs, where, + orig_axis, axes, keepdims, args, @@ -722,22 +725,24 @@ def unary_reduction(self, op, rhs, where, axes, keepdims, args, initial): fn( rhs.array, out=self.array, - axis=axes, + axis=orig_axis, keepdims=keepdims, where=where if not isinstance(where, EagerArray) else where.array, ) elif op == UnaryRedCode.ARGMAX: - assert len(axes) == 1 - np.argmax(rhs.array, out=self.array, axis=axes[0]) + np.argmax( + rhs.array, + out=self.array, + axis=orig_axis, + ) elif op == UnaryRedCode.ARGMIN: - assert len(axes) == 1 - np.argmin(rhs.array, out=self.array, axis=axes[0]) + np.argmin(rhs.array, out=self.array, axis=orig_axis) elif op == UnaryRedCode.CONTAINS: self.array.fill(args[0] in rhs.array) elif op == UnaryRedCode.COUNT_NONZERO: - self.array[()] = np.count_nonzero(rhs.array, axis=axes) + self.array[()] = np.count_nonzero(rhs.array, axis=orig_axis) else: raise RuntimeError("unsupported unary reduction op " + str(op)) diff --git a/cunumeric/thunk.py b/cunumeric/thunk.py index 153f133d92..a0d67d584b 100644 --- a/cunumeric/thunk.py +++ b/cunumeric/thunk.py @@ -193,6 +193,7 @@ def unary_reduction( redop, rhs, where, + orig_axis, axes, keepdims, args, diff --git a/src/cunumeric/arg.h b/src/cunumeric/arg.h index c31a839807..7705cdf8b9 100644 --- a/src/cunumeric/arg.h +++ b/src/cunumeric/arg.h @@ -25,8 +25,10 @@ namespace cunumeric { template class Argval { public: + // Calling this constructor manually is unsafe, as the members are left uninitialized. + // This constructor exists only to make nvcc happy when we use a shared memory of Argval. __CUDA_HD__ - Argval(); + Argval() {} __CUDA_HD__ Argval(T value); __CUDA_HD__ diff --git a/src/cunumeric/arg.inl b/src/cunumeric/arg.inl index fff5d1f93f..0f57b42621 100644 --- a/src/cunumeric/arg.inl +++ b/src/cunumeric/arg.inl @@ -16,11 +16,6 @@ namespace cunumeric { -template -__CUDA_HD__ Argval::Argval() : arg(LLONG_MAX), arg_value(0) -{ -} - template __CUDA_HD__ Argval::Argval(T v) : arg(LLONG_MAX), arg_value(v) { diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 1c0efbb3d4..f0d410bf00 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -19,6 +19,7 @@ #include "legate.h" #include "core/cuda/cuda_help.h" #include "core/cuda/stream_pool.h" +#include "cunumeric/arg.h" #include #include #include @@ -237,6 +238,34 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReduction +__device__ __forceinline__ void reduce_output(Legion::DeferredReduction result, + Argval value) +{ + __shared__ Argval trampoline[THREADS_PER_BLOCK / 32]; + // Reduce across the warp + const int laneid = threadIdx.x & 0x1f; + const int warpid = threadIdx.x >> 5; + for (int i = 16; i >= 1; i /= 2) { + const Argval shuffle_value = shuffle(0xffffffff, value, i, 32); + REDUCTION::template fold(value, shuffle_value); + } + // Write warp values into shared memory + if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value; + __syncthreads(); + // Output reduction + if (threadIdx.x == 0) { + for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++) + REDUCTION::template fold(value, trampoline[i]); + result <<= value; + // Make sure the result is visible externally + __threadfence_system(); + } +} + template __device__ __forceinline__ void reduce_output(Legion::DeferredReduction result, T value) { diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc index 386a06eba1..9bbaeb6cc9 100644 --- a/src/cunumeric/unary/scalar_unary_red.cc +++ b/src/cunumeric/unary/scalar_unary_red.cc @@ -28,12 +28,15 @@ struct ScalarUnaryRedImplBody { using LG_OP = typename OP::OP; using RHS = legate_type_of; + template ::value>* = nullptr> void operator()(OP func, AccessorRD out, AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto result = LG_OP::identity; const size_t volume = rect.volume(); @@ -49,6 +52,33 @@ struct ScalarUnaryRedImplBody { } out.reduce(0, result); } + + template ::value>* = nullptr> + void operator()(OP func, + AccessorRD out, + AccessorRO in, + const Rect& rect, + const Pitches& pitches, + bool dense, + const Point& shape) const + { + auto result = LG_OP::identity; + const size_t volume = rect.volume(); + if (dense) { + auto inptr = in.ptr(rect); + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + OP::template fold(result, OP::convert(p, shape, inptr[idx])); + } + } else { + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + OP::template fold(result, OP::convert(p, shape, in[p])); + } + } + out.reduce(0, result); + } }; template diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu index 5b3500162e..6f2059847c 100644 --- a/src/cunumeric/unary/scalar_unary_red.cu +++ b/src/cunumeric/unary/scalar_unary_red.cu @@ -53,6 +53,37 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduce_output(out, value); } +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + arg_reduction_kernel(size_t volume, + OP, + LG_OP, + Output out, + ReadAcc in, + Pitches pitches, + Point origin, + size_t iters, + LHS identity, + Point shape) +{ + auto value = identity; + for (size_t idx = 0; idx < iters; idx++) { + const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; + if (offset < volume) { + auto point = pitches.unflatten(offset, origin); + LG_OP::template fold(value, OP::convert(point, shape, in[point])); + } + } + // Every thread in the thread block must participate in the exchange to get correct results + reduce_output(out, value); +} + template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) contains_kernel( size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, RHS to_find) @@ -82,12 +113,15 @@ struct ScalarUnaryRedImplBody { using RHS = legate_type_of; using LHS = typename OP::VAL; + template ::value>* = nullptr> void operator()(OP func, AccessorRD out, AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto stream = get_cached_stream(); @@ -107,6 +141,35 @@ struct ScalarUnaryRedImplBody { copy_kernel<<<1, 1, 0, stream>>>(result, out); CHECK_CUDA_STREAM(stream); } + + template ::value>* = nullptr> + void operator()(OP func, + AccessorRD out, + AccessorRO in, + const Rect& rect, + const Pitches& pitches, + bool dense, + const Point& shape) const + { + auto stream = get_cached_stream(); + + const size_t volume = rect.volume(); + const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + DeferredReduction result; + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(LHS); + + if (blocks >= MAX_REDUCTION_CTAS) { + const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; + arg_reduction_kernel<<>>( + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity, shape); + } else + arg_reduction_kernel<<>>( + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity, shape); + + copy_kernel<<<1, 1, 0, stream>>>(result, out); + CHECK_CUDA_STREAM(stream); + } }; template diff --git a/src/cunumeric/unary/scalar_unary_red.h b/src/cunumeric/unary/scalar_unary_red.h index f8beab279d..e529abd7f4 100644 --- a/src/cunumeric/unary/scalar_unary_red.h +++ b/src/cunumeric/unary/scalar_unary_red.h @@ -25,6 +25,7 @@ struct ScalarUnaryRedArgs { const Array& out; const Array& in; UnaryRedCode op_code; + Legion::DomainPoint shape; std::vector args; }; @@ -43,17 +44,4 @@ class ScalarUnaryRedTask : public CuNumericTask { #endif }; -namespace detail { -template ::value>* = nullptr> -__CUDA_HD__ inline bool convert_to_bool(const _T& in) -{ - return bool(in); -} -template ::value>* = nullptr> -__CUDA_HD__ inline bool convert_to_bool(const _T& in) -{ - return bool(in.real()); -} -} // namespace detail - } // namespace cunumeric diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc index 4621a2152d..e276b4da5f 100644 --- a/src/cunumeric/unary/scalar_unary_red_omp.cc +++ b/src/cunumeric/unary/scalar_unary_red_omp.cc @@ -32,12 +32,15 @@ struct ScalarUnaryRedImplBody { using RHS = legate_type_of; using LHS = typename OP::VAL; + template ::value>* = nullptr> void operator()(OP func, AccessorRD out, AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto result = LG_OP::identity; const size_t volume = rect.volume(); @@ -67,6 +70,47 @@ struct ScalarUnaryRedImplBody { for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]); } + + template ::value>* = nullptr> + void operator()(OP func, + AccessorRD out, + AccessorRO in, + const Rect& rect, + const Pitches& pitches, + bool dense, + const Point& shape) const + { + auto result = LG_OP::identity; + const size_t volume = rect.volume(); + const auto max_threads = omp_get_max_threads(); + ThreadLocalStorage locals(max_threads); + for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity; + if (dense) { + auto inptr = in.ptr(rect); +#pragma omp parallel + { + const int tid = omp_get_thread_num(); +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + OP::template fold(locals[tid], OP::convert(p, shape, inptr[idx])); + } + } + } else { +#pragma omp parallel + { + const int tid = omp_get_thread_num(); +#pragma omp for schedule(static) + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + OP::template fold(locals[tid], OP::convert(p, shape, in[p])); + } + } + } + + for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]); + } }; template diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index a482854f2f..d95a1b6d50 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -54,7 +54,8 @@ struct ScalarUnaryRedImpl { bool dense = false; #endif - ScalarUnaryRedImplBody()(OP{}, out, in, rect, pitches, dense); + ScalarUnaryRedImplBody()( + OP{}, out, in, rect, pitches, dense, args.shape); } template { template struct ScalarUnaryRedDispatch { - template ::value>* = nullptr> + template void operator()(ScalarUnaryRedArgs& args) const { auto dim = std::max(1, args.in.dim()); double_dispatch(dim, args.in.code(), ScalarUnaryRedImpl{}, args); } - template ::value>* = nullptr> - void operator()(ScalarUnaryRedArgs& args) const - { - assert(false); - } }; template @@ -122,8 +118,10 @@ static void scalar_unary_red_template(TaskContext& context) std::vector extra_args; for (size_t idx = 1; idx < inputs.size(); ++idx) extra_args.push_back(std::move(inputs[idx])); + auto op_code = scalars[0].value(); + auto shape = scalars[1].value(); ScalarUnaryRedArgs args{ - context.reductions()[0], inputs[0], scalars[0].value(), std::move(extra_args)}; + context.reductions()[0], inputs[0], op_code, shape, std::move(extra_args)}; op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); } diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index 75a8f86da5..316445a02b 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -261,6 +261,16 @@ struct UnaryRedOp { { return VAL(point[collapsed_dim], rhs); } + + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + const Legion::Point& shape, + const RHS& rhs) + { + int64_t idx = 0; + for (int32_t dim = 0; dim < DIM; ++dim) idx = idx * shape[dim] + point[dim]; + return VAL(idx, rhs); + } }; template @@ -284,6 +294,16 @@ struct UnaryRedOp { { return VAL(point[collapsed_dim], rhs); } + + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + const Legion::Point& shape, + const RHS& rhs) + { + int64_t idx = 0; + for (int32_t dim = 0; dim < DIM; ++dim) idx = idx * shape[dim] + point[dim]; + return VAL(idx, rhs); + } }; } // namespace cunumeric diff --git a/tests/integration/test_argmin.py b/tests/integration/test_arg_reduce.py similarity index 54% rename from tests/integration/test_argmin.py rename to tests/integration/test_arg_reduce.py index 7281a1096a..ab6f2f6daa 100644 --- a/tests/integration/test_argmin.py +++ b/tests/integration/test_arg_reduce.py @@ -17,22 +17,26 @@ import pytest import cunumeric as num - -anp = np.random.randn(4, 5) - - -def test_argmin(): - a = num.array(anp) - - assert np.array_equal(num.argmin(a, axis=0), np.argmin(anp, axis=0)) - assert np.array_equal(num.argmin(a, axis=1), np.argmin(anp, axis=1)) - - -def test_argmax(): - a = num.array(anp) - - assert np.array_equal(num.argmax(a, axis=0), np.argmax(anp, axis=0)) - assert np.array_equal(num.argmax(a, axis=1), np.argmax(anp, axis=1)) +from legate.core import LEGATE_MAX_DIM + + +@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +def test_argmax_and_argmin(ndim): + shape = (5,) * ndim + + in_np = np.random.random(shape) + in_num = num.array(in_np) + + for fn in ("argmax", "argmin"): + fn_np = getattr(np, fn) + fn_num = getattr(num, fn) + assert np.array_equal(fn_np(in_np), fn_num(in_num)) + if in_num.ndim == 1: + continue + for axis in range(in_num.ndim): + out_np = fn_np(in_np, axis=axis) + out_num = fn_num(in_num, axis=axis) + assert np.array_equal(out_np, out_num) if __name__ == "__main__": From d5a945a8e9c8973c4a0a1e8b8670ebf202d3a1e5 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 20 May 2022 10:32:19 -0700 Subject: [PATCH 15/16] Minor fix for scalar reductions on scalar stores --- src/cunumeric/unary/scalar_unary_red_template.inl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl index d95a1b6d50..0d0644a47a 100644 --- a/src/cunumeric/unary/scalar_unary_red_template.inl +++ b/src/cunumeric/unary/scalar_unary_red_template.inl @@ -120,6 +120,11 @@ static void scalar_unary_red_template(TaskContext& context) auto op_code = scalars[0].value(); auto shape = scalars[1].value(); + // If the RHS was a scalar, use (1,) as the shape + if (shape.dim == 0) { + shape.dim = 1; + shape[0] = 1; + } ScalarUnaryRedArgs args{ context.reductions()[0], inputs[0], op_code, shape, std::move(extra_args)}; op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); From c5d7f85afd505eeba09c2d65e1f7b05d16f104ef Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Fri, 20 May 2022 16:11:48 -0700 Subject: [PATCH 16/16] Make sure we use the dtype of out when it exists --- cunumeric/array.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cunumeric/array.py b/cunumeric/array.py index dcc85fd855..f9cdd13df3 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -3497,6 +3497,9 @@ def _perform_unary_reduction( # and the output dtype if dtype is not None: res_dtype = dtype + elif out is not None: + dtype = out.dtype + res_dtype = out.dtype else: dtype = src.dtype res_dtype = src.dtype