From c5fb952ba0a5eacf7033560b91a03f7a82965325 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Wed, 18 May 2022 22:27:39 -0700
Subject: [PATCH 01/16] Use the right LHS type in unary reductions

---
 src/cunumeric/unary/unary_red.cc           |  32 +----
 src/cunumeric/unary/unary_red.cu           | 100 ++--------------
 src/cunumeric/unary/unary_red_omp.cc       |  68 +----------
 src/cunumeric/unary/unary_red_template.inl |  44 +------
 src/cunumeric/unary/unary_red_util.h       | 132 ++++++++++++++-------
 5 files changed, 104 insertions(+), 272 deletions(-)
diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc
index aef90964aa..115587b289 100644
--- a/src/cunumeric/unary/unary_red.cc
+++ b/src/cunumeric/unary/unary_red.cc
@@ -26,23 +26,10 @@ template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
 struct UnaryRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
   using OP    = UnaryRedOp<OP_CODE, CODE>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
 
   void operator()(AccessorRD<LG_OP, true, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    for (size_t idx = 0; idx < volume; ++idx) {
-      auto point = pitches.unflatten(idx, rect.lo);
-      lhs.reduce(point, rhs[point]);
-    }
-  }
-
-  void operator()(AccessorRW<VAL, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
+                  AccessorRO<RHS, DIM> rhs,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
                   int collapsed_dim,
@@ -50,7 +37,7 @@ struct UnaryRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
   {
     for (size_t idx = 0; idx < volume; ++idx) {
       auto point = pitches.unflatten(idx, rect.lo);
-      OP::template fold<true>(lhs[point], rhs[point]);
+      lhs.reduce(point, OP::convert(point, collapsed_dim, rhs[point]));
     }
   }
 };
@@ -74,19 +61,6 @@ struct ArgRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
       lhs.reduce(point, LHS(point[collapsed_dim], rhs[point]));
     }
   }
-
-  void operator()(AccessorRW<LHS, DIM> lhs,
-                  AccessorRO<RHS, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    for (size_t idx = 0; idx < volume; ++idx) {
-      auto point = pitches.unflatten(idx, rect.lo);
-      OP::template fold<true>(lhs[point], LHS(point[collapsed_dim], rhs[point]));
-    }
-  }
 };
 
 /*static*/ void UnaryRedTask::cpu_variant(TaskContext& context)
diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu
index 86003c00e5..1cc0d46536 100644
--- a/src/cunumeric/unary/unary_red.cu
+++ b/src/cunumeric/unary/unary_red.cu
@@ -38,7 +38,7 @@ static constexpr coord_t WARP_SIZE = 32;
 // have enough elements to be assigned to the threads, we also parallelize on
 // the collapsing domain. One exceptional case to this strategy is where the collapsing
 // dimension is the innermost one, in which case we prefer that dimension to the others
-// in order to enjoy wrap coalescing. The maximum degree of such parallelism woudl be 32,
+// in order to enjoy wrap coalescing. The maximum degree of such parallelism would be 32,
 // which is the size of a wrap.
 template <int32_t DIM>
 struct ThreadBlock {
@@ -203,9 +203,8 @@ std::ostream& operator<<(std::ostream& os, const ThreadBlocks<DIM>& blocks)
   return os;
 }
 
-template <typename REDOP, typename CTOR, typename LHS, typename RHS, int32_t DIM>
-static __device__ __forceinline__ Point<DIM> local_reduce(CTOR ctor,
-                                                          LHS& result,
+template <typename OP, typename REDOP, typename LHS, typename RHS, int32_t DIM>
+static __device__ __forceinline__ Point<DIM> local_reduce(LHS& result,
                                                           AccessorRO<RHS, DIM> in,
                                                           LHS identity,
                                                           const ThreadBlocks<DIM>& blocks,
@@ -218,7 +217,7 @@ static __device__ __forceinline__ Point<DIM> local_reduce(CTOR ctor,
   if (!domain.contains(point)) return point;
 
   while (point[collapsed_dim] <= domain.hi[collapsed_dim]) {
-    LHS value = ctor(point, in[point], collapsed_dim);
+    LHS value = OP::convert(point, collapsed_dim, in[point]);
     REDOP::template fold<true>(result, value);
     blocks.next_point(point);
   }
@@ -274,22 +273,7 @@ static __device__ __forceinline__ Point<DIM> local_reduce(CTOR ctor,
   return point;
 }
 
-template <typename REDOP, typename CTOR, typename LHS, typename RHS, int32_t DIM>
-static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
-  reduce_with_rw_acc(AccessorRW<LHS, DIM> out,
-                     AccessorRO<RHS, DIM> in,
-                     LHS identity,
-                     ThreadBlocks<DIM> blocks,
-                     Rect<DIM> domain,
-                     int32_t collapsed_dim)
-{
-  auto result = identity;
-  auto point  = local_reduce<REDOP, CTOR, LHS, RHS, DIM>(
-    CTOR{}, result, in, identity, blocks, domain, collapsed_dim);
-  if (result != identity) REDOP::template fold<false>(out[point], result);
-}
-
-template <typename REDOP, typename CTOR, typename LHS, typename RHS, int32_t DIM>
+template <typename OP, typename REDOP, typename LHS, typename RHS, int32_t DIM>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   reduce_with_rd_acc(AccessorRD<REDOP, false, DIM> out,
                      AccessorRO<RHS, DIM> in,
@@ -299,64 +283,17 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
                      int32_t collapsed_dim)
 {
   auto result = identity;
-  auto point  = local_reduce<REDOP, CTOR, LHS, RHS, DIM>(
-    CTOR{}, result, in, identity, blocks, domain, collapsed_dim);
+  auto point =
+    local_reduce<OP, REDOP, LHS, RHS, DIM>(result, in, identity, blocks, domain, collapsed_dim);
   if (result != identity) out.reduce(point, result);
 }
 
 template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
 struct UnaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
-  using OP    = UnaryRedOp<OP_CODE, CODE>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-  using CTOR  = ValueConstructor<VAL, DIM>;
-
-  void operator()(AccessorRD<LG_OP, false, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    auto Kernel = reduce_with_rd_acc<LG_OP, CTOR, VAL, VAL, DIM>;
-    auto stream = get_cached_stream();
-
-    ThreadBlocks<DIM> blocks;
-    blocks.initialize(rect, collapsed_dim);
-
-    blocks.compute_maximum_concurrency(reinterpret_cast<const void*>(Kernel));
-    Kernel<<<blocks.num_blocks(), blocks.num_threads(), 0, stream>>>(
-      lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim);
-    CHECK_CUDA_STREAM(stream);
-  }
-
-  void operator()(AccessorRW<VAL, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    auto Kernel = reduce_with_rw_acc<LG_OP, CTOR, VAL, VAL, DIM>;
-    auto stream = get_cached_stream();
-
-    ThreadBlocks<DIM> blocks;
-    blocks.initialize(rect, collapsed_dim);
-
-    blocks.compute_maximum_concurrency(reinterpret_cast<const void*>(Kernel));
-    Kernel<<<blocks.num_blocks(), blocks.num_threads(), 0, stream>>>(
-      lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim);
-    CHECK_CUDA_STREAM(stream);
-  }
-};
-
-template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
-struct ArgRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
   using OP    = UnaryRedOp<OP_CODE, CODE>;
   using LG_OP = typename OP::OP;
   using RHS   = legate_type_of<CODE>;
-  using LHS   = Argval<RHS>;
-  using CTOR  = ArgvalConstructor<RHS, DIM>;
+  using LHS   = typename OP::VAL;
 
   void operator()(AccessorRD<LG_OP, false, DIM> lhs,
                   AccessorRO<RHS, DIM> rhs,
@@ -365,26 +302,7 @@ struct ArgRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
                   int collapsed_dim,
                   size_t volume) const
   {
-    auto Kernel = reduce_with_rd_acc<LG_OP, CTOR, LHS, RHS, DIM>;
-    auto stream = get_cached_stream();
-
-    ThreadBlocks<DIM> blocks;
-    blocks.initialize(rect, collapsed_dim);
-
-    blocks.compute_maximum_concurrency(reinterpret_cast<const void*>(Kernel));
-    Kernel<<<blocks.num_blocks(), blocks.num_threads(), 0, stream>>>(
-      lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim);
-    CHECK_CUDA_STREAM(stream);
-  }
-
-  void operator()(AccessorRW<LHS, DIM> lhs,
-                  AccessorRO<RHS, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    auto Kernel = reduce_with_rw_acc<LG_OP, CTOR, LHS, RHS, DIM>;
+    auto Kernel = reduce_with_rd_acc<OP, LG_OP, LHS, RHS, DIM>;
     auto stream = get_cached_stream();
 
     ThreadBlocks<DIM> blocks;
diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cunumeric/unary/unary_red_omp.cc
index 1cdb40b44c..6696c0f7b0 100644
--- a/src/cunumeric/unary/unary_red_omp.cc
+++ b/src/cunumeric/unary/unary_red_omp.cc
@@ -79,10 +79,10 @@ template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
 struct UnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
   using OP    = UnaryRedOp<OP_CODE, CODE>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
 
   void operator()(AccessorRD<LG_OP, true, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
+                  AccessorRO<RHS, DIM> rhs,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
                   int collapsed_dim,
@@ -95,69 +95,7 @@ struct UnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
     for (size_t o_idx = 0; o_idx < split.outer; ++o_idx)
       for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) {
         auto point = splitter.combine(o_idx, i_idx, rect.lo);
-        lhs.reduce(point, rhs[point]);
-      }
-  }
-
-  void operator()(AccessorRW<VAL, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    Splitter<DIM> splitter;
-    auto split = splitter.split(rect, collapsed_dim);
-
-#pragma omp parallel for schedule(static)
-    for (size_t o_idx = 0; o_idx < split.outer; ++o_idx)
-      for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) {
-        auto point = splitter.combine(o_idx, i_idx, rect.lo);
-        OP::template fold<true>(lhs[point], rhs[point]);
-      }
-  }
-};
-
-template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
-struct ArgRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
-  using OP     = UnaryRedOp<OP_CODE, CODE>;
-  using LG_OP  = typename OP::OP;
-  using VAL    = legate_type_of<CODE>;
-  using ARGVAL = Argval<VAL>;
-
-  void operator()(AccessorRD<LG_OP, true, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    Splitter<DIM> splitter;
-    auto split = splitter.split(rect, collapsed_dim);
-
-#pragma omp parallel for schedule(static)
-    for (size_t o_idx = 0; o_idx < split.outer; ++o_idx)
-      for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) {
-        auto point = splitter.combine(o_idx, i_idx, rect.lo);
-        lhs.reduce(point, ARGVAL(point[collapsed_dim], rhs[point]));
-      }
-  }
-
-  void operator()(AccessorRW<ARGVAL, DIM> lhs,
-                  AccessorRO<VAL, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    Splitter<DIM> splitter;
-    auto split = splitter.split(rect, collapsed_dim);
-
-#pragma omp parallel for schedule(static)
-    for (size_t o_idx = 0; o_idx < split.outer; ++o_idx)
-      for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) {
-        auto point = splitter.combine(o_idx, i_idx, rect.lo);
-        OP::template fold<true>(lhs[point], ARGVAL(point[collapsed_dim], rhs[point]));
+        lhs.reduce(point, OP::convert(point, collapsed_dim, rhs[point]));
       }
   }
 };
diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl
index 309dec413d..1818860e65 100644
--- a/src/cunumeric/unary/unary_red_template.inl
+++ b/src/cunumeric/unary/unary_red_template.inl
@@ -37,7 +37,7 @@ struct UnaryRedImpl {
   void operator()(UnaryRedArgs& args) const
   {
     using OP  = UnaryRedOp<OP_CODE, CODE>;
-    using VAL = legate_type_of<CODE>;
+    using RHS = legate_type_of<CODE>;
 
     Pitches<DIM - 1> pitches;
     auto rect   = args.rhs.shape<DIM>();
@@ -45,7 +45,7 @@ struct UnaryRedImpl {
 
     if (volume == 0) return;
 
-    auto rhs = args.rhs.read_accessor<VAL, DIM>(rect);
+    auto rhs = args.rhs.read_accessor<RHS, DIM>(rect);
 
     auto lhs = args.lhs.reduce_accessor<typename OP::OP, KIND != VariantKind::GPU, DIM>(rect);
     UnaryRedImplBody<KIND, OP_CODE, CODE, DIM>()(
@@ -61,52 +61,14 @@ struct UnaryRedImpl {
   }
 };
 
-template <VariantKind KIND, UnaryRedCode OP_CODE>
-struct ArgRedImpl {
-  template <LegateTypeCode CODE,
-            int DIM,
-            std::enable_if_t<(DIM > 1) && UnaryRedOp<OP_CODE, CODE>::valid>* = nullptr>
-  void operator()(UnaryRedArgs& args) const
-  {
-    using OP     = UnaryRedOp<OP_CODE, CODE>;
-    using VAL    = legate_type_of<CODE>;
-    using ARGVAL = Argval<VAL>;
-
-    Pitches<DIM - 1> pitches;
-    auto rect   = args.rhs.shape<DIM>();
-    auto volume = pitches.flatten(rect);
-
-    if (volume == 0) return;
-
-    auto rhs = args.rhs.read_accessor<VAL, DIM>(rect);
-
-    auto lhs = args.lhs.reduce_accessor<typename OP::OP, KIND != VariantKind::GPU, DIM>(rect);
-    ArgRedImplBody<KIND, OP_CODE, CODE, DIM>()(lhs, rhs, rect, pitches, args.collapsed_dim, volume);
-  }
-
-  template <LegateTypeCode CODE,
-            int DIM,
-            std::enable_if_t<DIM <= 1 || !UnaryRedOp<OP_CODE, CODE>::valid>* = nullptr>
-  void operator()(UnaryRedArgs& args) const
-  {
-    assert(false);
-  }
-};
-
 template <VariantKind KIND>
 struct UnaryRedDispatch {
-  template <UnaryRedCode OP_CODE, std::enable_if_t<!is_arg_reduce<OP_CODE>::value>* = nullptr>
+  template <UnaryRedCode OP_CODE>
   void operator()(UnaryRedArgs& args) const
   {
     auto dim = std::max(1, args.rhs.dim());
     return double_dispatch(dim, args.rhs.code(), UnaryRedImpl<KIND, OP_CODE>{}, args);
   }
-  template <UnaryRedCode OP_CODE, std::enable_if_t<is_arg_reduce<OP_CODE>::value>* = nullptr>
-  void operator()(UnaryRedArgs& args) const
-  {
-    auto dim = std::max(1, args.rhs.dim());
-    return double_dispatch(dim, args.rhs.code(), ArgRedImpl<KIND, OP_CODE>{}, args);
-  }
 };
 
 template <VariantKind KIND>
diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h
index 233190731a..c72817d918 100644
--- a/src/cunumeric/unary/unary_red_util.h
+++ b/src/cunumeric/unary/unary_red_util.h
@@ -72,26 +72,6 @@ constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&..
   return f.template operator()<UnaryRedCode::MAX>(std::forward<Fnargs>(args)...);
 }
 
-template <typename T, int32_t DIM>
-struct ValueConstructor {
-  __CUDA_HD__ inline constexpr T operator()(const Legion::Point<DIM>&,
-                                            const T& value,
-                                            int32_t) const
-  {
-    return value;
-  }
-};
-
-template <typename T, int32_t DIM>
-struct ArgvalConstructor {
-  __CUDA_HD__ inline constexpr Argval<T> operator()(const Legion::Point<DIM>& point,
-                                                    const T& value,
-                                                    int32_t collapsed_dim) const
-  {
-    return Argval<T>(point[collapsed_dim], value);
-  }
-};
-
 template <UnaryRedCode OP_CODE, legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp {
   static constexpr bool valid = false;
@@ -101,13 +81,20 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ALL, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = legate::legate_type_of<TYPE_CODE>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = bool;
   using OP  = Legion::ProdReduction<VAL>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>&, int32_t, const RHS& rhs)
+  {
+    return rhs != RHS(0);
   }
 };
 
@@ -120,13 +107,20 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ANY, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = legate::legate_type_of<TYPE_CODE>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = bool;
   using OP  = Legion::SumReduction<VAL>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
+  {
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>&, int32_t, const RHS& rhs)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    return rhs != RHS(0);
   }
 };
 
@@ -134,13 +128,20 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::MAX, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = legate::legate_type_of<TYPE_CODE>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = RHS;
   using OP  = Legion::MaxReduction<VAL>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
+  {
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>&, int32_t, const RHS& rhs)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    return rhs;
   }
 };
 
@@ -153,13 +154,20 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::MIN, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = legate::legate_type_of<TYPE_CODE>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = RHS;
   using OP  = Legion::MinReduction<VAL>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
+  {
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>&, int32_t, const RHS& rhs)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    return rhs;
   }
 };
 
@@ -172,13 +180,20 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::PROD, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = legate::legate_type_of<TYPE_CODE>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = RHS;
   using OP  = Legion::ProdReduction<VAL>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>&, int32_t, const RHS& rhs)
+  {
+    return rhs;
   }
 };
 
@@ -191,13 +206,20 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::SUM, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = legate::legate_type_of<TYPE_CODE>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = RHS;
   using OP  = Legion::SumReduction<VAL>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
+  {
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>&, int32_t, const RHS& rhs)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    return rhs;
   }
 };
 
@@ -205,13 +227,22 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ARGMAX, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = Argval<legate::legate_type_of<TYPE_CODE>>;
-  using OP  = ArgmaxReduction<legate::legate_type_of<TYPE_CODE>>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = Argval<RHS>;
+  using OP  = ArgmaxReduction<RHS>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
+  {
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>& point,
+                                 int32_t collapsed_dim,
+                                 const RHS& rhs)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    return VAL(point[collapsed_dim], rhs);
   }
 };
 
@@ -224,13 +255,22 @@ template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ARGMIN, TYPE_CODE> {
   static constexpr bool valid = true;
 
-  using VAL = Argval<legate::legate_type_of<TYPE_CODE>>;
-  using OP  = ArgminReduction<legate::legate_type_of<TYPE_CODE>>;
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = Argval<RHS>;
+  using OP  = ArgminReduction<RHS>;
 
   template <bool EXCLUSIVE>
-  __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2)
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
+  {
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>& point,
+                                 int32_t collapsed_dim,
+                                 const RHS& rhs)
   {
-    OP::template fold<EXCLUSIVE>(rhs1, rhs2);
+    return VAL(point[collapsed_dim], rhs);
   }
 };
 

From 16eb415dfa10e60bcd38eb200993e01f19ce3767 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Wed, 18 May 2022 23:08:25 -0700
Subject: [PATCH 02/16] Fix scalar reductions with any and all

---
 src/cunumeric/unary/scalar_unary_red.cc       |  74 +------------
 src/cunumeric/unary/scalar_unary_red.cu       | 104 ++----------------
 src/cunumeric/unary/scalar_unary_red_omp.cc   |  85 ++------------
 .../unary/scalar_unary_red_template.inl       |  66 +----------
 src/cunumeric/unary/unary_red_util.h          |  12 ++
 5 files changed, 37 insertions(+), 304 deletions(-)

diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc
index 2e01700e80..6abc2cb0ec 100644
--- a/src/cunumeric/unary/scalar_unary_red.cc
+++ b/src/cunumeric/unary/scalar_unary_red.cc
@@ -26,11 +26,11 @@ template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
   using OP    = UnaryRedOp<OP_CODE, CODE>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
 
   void operator()(OP func,
                   AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
+                  AccessorRO<RHS, DIM> in,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
                   bool dense) const
@@ -39,82 +39,18 @@ struct ScalarUnaryRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
     const size_t volume = rect.volume();
     if (dense) {
       auto inptr = in.ptr(rect);
-      for (size_t idx = 0; idx < volume; ++idx) OP::template fold<true>(result, inptr[idx]);
+      for (size_t idx = 0; idx < volume; ++idx)
+        OP::template fold<true>(result, OP::convert(inptr[idx]));
     } else {
       for (size_t idx = 0; idx < volume; ++idx) {
         auto p = pitches.unflatten(idx, rect.lo);
-        OP::template fold<true>(result, in[p]);
+        OP::template fold<true>(result, OP::convert(in[p]));
       }
     }
     out.reduce(0, result);
   }
 };
 
-namespace detail {
-
-template <typename OP, typename VAL, int DIM>
-void logical_operator(bool& result,
-                      AccessorRO<VAL, DIM> in,
-                      const Rect<DIM>& rect,
-                      const Pitches<DIM - 1>& pitches,
-                      bool dense)
-{
-  const size_t volume = rect.volume();
-  if (dense) {
-    auto inptr = in.ptr(rect);
-    for (size_t idx = 0; idx < volume; ++idx) {
-      bool tmp1 = detail::convert_to_bool(inptr[idx]);
-      OP::template fold<true>(result, tmp1);
-    }
-  } else {
-    for (size_t idx = 0; idx < volume; ++idx) {
-      auto p    = pitches.unflatten(idx, rect.lo);
-      bool tmp1 = detail::convert_to_bool(in[p]);
-      OP::template fold<true>(result, tmp1);
-    }
-  }
-}
-
-}  // namespace detail
-
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::CPU, UnaryRedCode::ALL, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::PROD, LegateTypeCode::BOOL_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-
-  {
-    auto result = LG_OP::identity;
-    detail::logical_operator<OP>(result, in, rect, pitches, dense);
-    out.reduce(0, result);
-  }
-};
-
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::CPU, UnaryRedCode::ANY, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-
-  {
-    auto result = LG_OP::identity;
-    detail::logical_operator<OP>(result, in, rect, pitches, dense);
-    out.reduce(0, result);
-  }
-};
-
 template <LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::CPU, UnaryRedCode::CONTAINS, CODE, DIM> {
   using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu
index 796220ee5b..a62ba95c11 100644
--- a/src/cunumeric/unary/scalar_unary_red.cu
+++ b/src/cunumeric/unary/scalar_unary_red.cu
@@ -23,7 +23,8 @@ namespace cunumeric {
 
 using namespace Legion;
 
-template <typename Op,
+template <typename OP,
+          typename LG_OP,
           typename Output,
           typename ReadAcc,
           typename Pitches,
@@ -31,7 +32,8 @@ template <typename Op,
           typename VAL>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   reduction_kernel(size_t volume,
-                   Op op,
+                   OP,
+                   LG_OP,
                    Output out,
                    ReadAcc in,
                    Pitches pitches,
@@ -44,7 +46,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
     const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
     if (offset < volume) {
       auto point = pitches.unflatten(offset, origin);
-      Op::template fold<true>(value, in[point]);
+      LG_OP::template fold<true>(value, OP::convert(in[point]));
     }
   }
   // Every thread in the thread block must participate in the exchange to get correct results
@@ -93,11 +95,12 @@ template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
   using OP    = UnaryRedOp<OP_CODE, CODE>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
+  using LHS   = typename OP::VAL;
 
   void operator()(OP func,
                   AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
+                  AccessorRO<RHS, DIM> in,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
                   bool dense) const
@@ -107,15 +110,15 @@ struct ScalarUnaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
     const size_t volume = rect.volume();
     const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     DeferredReduction<typename OP::OP> result;
-    size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(VAL);
+    size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(LHS);
 
     if (blocks >= MAX_REDUCTION_CTAS) {
       const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
       reduction_kernel<<<MAX_REDUCTION_CTAS, THREADS_PER_BLOCK, shmem_size, stream>>>(
-        volume, typename OP::OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity);
+        volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity);
     } else
       reduction_kernel<<<blocks, THREADS_PER_BLOCK, shmem_size, stream>>>(
-        volume, typename OP::OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity);
+        volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity);
 
     copy_kernel<<<1, 1, 0, stream>>>(result, out);
     CHECK_CUDA_STREAM(stream);
@@ -156,91 +159,6 @@ struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::CONTAINS, CODE, DI
   }
 };
 
-namespace detail {
-
-template <typename Op,
-          typename Output,
-          typename ReadAcc,
-          typename Pitches,
-          typename Point,
-          typename VAL>
-static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) logical_kernel(
-  size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, VAL identity)
-{
-  auto value = identity;
-  for (size_t idx = 0; idx < iters; idx++) {
-    const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (offset < volume) {
-      auto point = pitches.unflatten(offset, origin);
-      Op::template fold<true>(value, convert_to_bool(in[point]));
-    }
-  }
-  // Every thread in the thread block must participate in the exchange to get correct results
-  reduce_output(out, value);
-}
-
-template <typename OP, typename LG_OP, typename VAL, int DIM>
-void logical_operator_gpu(AccessorRD<LG_OP, true, 1> out,
-                          AccessorRO<VAL, DIM> in,
-                          const Rect<DIM>& rect,
-                          const Pitches<DIM - 1>& pitches,
-                          bool dense)
-{
-  auto stream = get_cached_stream();
-
-  const size_t volume = rect.volume();
-  const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-  DeferredReduction<OP> result;
-  size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool);
-
-  if (blocks >= MAX_REDUCTION_CTAS) {
-    const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
-    logical_kernel<OP><<<MAX_REDUCTION_CTAS, THREADS_PER_BLOCK, shmem_size, stream>>>(
-      volume, result, in, pitches, rect.lo, iters, LG_OP::identity);
-  } else
-    logical_kernel<OP><<<blocks, THREADS_PER_BLOCK, shmem_size, stream>>>(
-      volume, result, in, pitches, rect.lo, 1, LG_OP::identity);
-
-  copy_kernel<<<1, 1, 0, stream>>>(result, out);
-  CHECK_CUDA_STREAM(stream);
-}
-
-}  // namespace detail
-
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::ALL, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::PROD, LegateTypeCode::BOOL_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-
-  {
-    detail::logical_operator_gpu<ProdReduction<bool>>(out, in, rect, pitches, dense);
-  }
-};
-
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::ANY, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-
-  {
-    detail::logical_operator_gpu<SumReduction<bool>>(out, in, rect, pitches, dense);
-  }
-};
-
 template <LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::COUNT_NONZERO, CODE, DIM> {
   using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::UINT64_LT>;
diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc
index 6d190cdf51..f575d3626c 100644
--- a/src/cunumeric/unary/scalar_unary_red_omp.cc
+++ b/src/cunumeric/unary/scalar_unary_red_omp.cc
@@ -29,11 +29,12 @@ template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
   using OP    = UnaryRedOp<OP_CODE, CODE>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
+  using LHS   = typename OP::VAL;
 
   void operator()(OP func,
                   AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
+                  AccessorRO<RHS, DIM> in,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
                   bool dense) const
@@ -41,7 +42,7 @@ struct ScalarUnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
     auto result            = LG_OP::identity;
     const size_t volume    = rect.volume();
     const auto max_threads = omp_get_max_threads();
-    ThreadLocalStorage<VAL> locals(max_threads);
+    ThreadLocalStorage<LHS> locals(max_threads);
     for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity;
     if (dense) {
       auto inptr = in.ptr(rect);
@@ -49,7 +50,8 @@ struct ScalarUnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
       {
         const int tid = omp_get_thread_num();
 #pragma omp for schedule(static)
-        for (size_t idx = 0; idx < volume; ++idx) OP::template fold<true>(locals[tid], inptr[idx]);
+        for (size_t idx = 0; idx < volume; ++idx)
+          OP::template fold<true>(locals[tid], OP::convert(inptr[idx]));
       }
     } else {
 #pragma omp parallel
@@ -58,7 +60,7 @@ struct ScalarUnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
 #pragma omp for schedule(static)
         for (size_t idx = 0; idx < volume; ++idx) {
           auto p = pitches.unflatten(idx, rect.lo);
-          OP::template fold<true>(locals[tid], in[p]);
+          OP::template fold<true>(locals[tid], OP::convert(in[p]));
         }
       }
     }
@@ -111,79 +113,6 @@ struct ScalarUnaryRedImplBody<VariantKind::OMP, UnaryRedCode::CONTAINS, CODE, DI
   }
 };
 
-namespace detail {
-
-template <typename OP, typename LG_OP, typename VAL, int DIM>
-void logical_operator_omp(AccessorRO<VAL, DIM> in,
-                          AccessorRD<LG_OP, true, 1> out,
-                          const Rect<DIM>& rect,
-                          const Pitches<DIM - 1>& pitches,
-                          bool dense)
-{
-  const size_t volume    = rect.volume();
-  const auto max_threads = omp_get_max_threads();
-  ThreadLocalStorage<bool> locals(max_threads);
-  for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity;
-  if (dense) {
-    auto inptr = in.ptr(rect);
-#pragma omp parallel
-    {
-      const int tid = omp_get_thread_num();
-#pragma omp for schedule(static)
-      for (size_t idx = 0; idx < volume; ++idx)
-        OP::template fold<true>(locals[tid], convert_to_bool(inptr[idx]));
-    }
-  } else {
-#pragma omp parallel
-    {
-      const int tid = omp_get_thread_num();
-#pragma omp for schedule(static)
-      for (size_t idx = 0; idx < volume; ++idx) {
-        auto p = pitches.unflatten(idx, rect.lo);
-        OP::template fold<true>(locals[tid], convert_to_bool(in[p]));
-      }
-    }
-  }
-
-  for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]);
-}
-
-}  // namespace detail
-
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::OMP, UnaryRedCode::ALL, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::PROD, LegateTypeCode::BOOL_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-
-  {
-    detail::logical_operator_omp<OP, LG_OP>(in, out, rect, pitches, dense);
-  }
-};
-
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::OMP, UnaryRedCode::ANY, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-
-  {
-    detail::logical_operator_omp<OP, LG_OP>(in, out, rect, pitches, dense);
-  }
-};
-
 template <LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::OMP, UnaryRedCode::COUNT_NONZERO, CODE, DIM> {
   using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::UINT64_LT>;
diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl
index d11866cade..54baa5559f 100644
--- a/src/cunumeric/unary/scalar_unary_red_template.inl
+++ b/src/cunumeric/unary/scalar_unary_red_template.inl
@@ -34,7 +34,7 @@ struct ScalarUnaryRedImpl {
   {
     using OP    = UnaryRedOp<OP_CODE, CODE>;
     using LG_OP = typename OP::OP;
-    using VAL   = legate_type_of<CODE>;
+    using RHS   = legate_type_of<CODE>;
 
     auto rect = args.in.shape<DIM>();
 
@@ -44,7 +44,7 @@ struct ScalarUnaryRedImpl {
     if (0 == volume) return;
 
     auto out = args.out.reduce_accessor<LG_OP, true, 1>();
-    auto in  = args.in.read_accessor<VAL, DIM>(rect);
+    auto in  = args.in.read_accessor<RHS, DIM>(rect);
 
 #ifndef LEGION_BOUNDS_CHECKS
     // Check to see if this is dense or not
@@ -66,68 +66,6 @@ struct ScalarUnaryRedImpl {
   }
 };
 
-template <VariantKind KIND>
-struct ScalarUnaryRedImpl<KIND, UnaryRedCode::ALL> {
-  template <LegateTypeCode CODE, int DIM>
-  void operator()(ScalarUnaryRedArgs& args) const
-  {
-    using OP    = UnaryRedOp<UnaryRedCode::PROD, LegateTypeCode::BOOL_LT>;
-    using LG_OP = typename OP::OP;
-    using VAL   = legate_type_of<CODE>;
-
-    auto rect = args.in.shape<DIM>();
-
-    Pitches<DIM - 1> pitches;
-    size_t volume = pitches.flatten(rect);
-
-    if (0 == volume) return;
-
-    auto out = args.out.reduce_accessor<LG_OP, true, 1>();
-    auto in  = args.in.read_accessor<VAL, DIM>(rect);
-
-#ifndef LEGION_BOUNDS_CHECKS
-    // Check to see if this is dense or not
-    bool dense = in.accessor.is_dense_row_major(rect);
-#else
-    // No dense execution if we're doing bounds checks
-    bool dense = false;
-#endif
-
-    ScalarUnaryRedImplBody<KIND, UnaryRedCode::ALL, CODE, DIM>()(out, in, rect, pitches, dense);
-  }
-};
-
-template <VariantKind KIND>
-struct ScalarUnaryRedImpl<KIND, UnaryRedCode::ANY> {
-  template <LegateTypeCode CODE, int DIM>
-  void operator()(ScalarUnaryRedArgs& args) const
-  {
-    using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
-    using LG_OP = typename OP::OP;
-    using VAL   = legate_type_of<CODE>;
-
-    auto rect = args.in.shape<DIM>();
-
-    Pitches<DIM - 1> pitches;
-    size_t volume = pitches.flatten(rect);
-
-    if (0 == volume) return;
-
-    auto out = args.out.reduce_accessor<LG_OP, true, 1>();
-    auto in  = args.in.read_accessor<VAL, DIM>(rect);
-
-#ifndef LEGION_BOUNDS_CHECKS
-    // Check to see if this is dense or not
-    bool dense = in.accessor.is_dense_row_major(rect);
-#else
-    // No dense execution if we're doing bounds checks
-    bool dense = false;
-#endif
-
-    ScalarUnaryRedImplBody<KIND, UnaryRedCode::ANY, CODE, DIM>()(out, in, rect, pitches, dense);
-  }
-};
-
 template <VariantKind KIND>
 struct ScalarUnaryRedImpl<KIND, UnaryRedCode::CONTAINS> {
   template <LegateTypeCode CODE, int DIM>
diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h
index c72817d918..91eca97a7e 100644
--- a/src/cunumeric/unary/unary_red_util.h
+++ b/src/cunumeric/unary/unary_red_util.h
@@ -96,6 +96,8 @@ struct UnaryRedOp<UnaryRedCode::ALL, TYPE_CODE> {
   {
     return rhs != RHS(0);
   }
+
+  __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); }
 };
 
 template <>
@@ -122,6 +124,8 @@ struct UnaryRedOp<UnaryRedCode::ANY, TYPE_CODE> {
   {
     return rhs != RHS(0);
   }
+
+  __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); }
 };
 
 template <legate::LegateTypeCode TYPE_CODE>
@@ -143,6 +147,8 @@ struct UnaryRedOp<UnaryRedCode::MAX, TYPE_CODE> {
   {
     return rhs;
   }
+
+  __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; }
 };
 
 template <>
@@ -169,6 +175,8 @@ struct UnaryRedOp<UnaryRedCode::MIN, TYPE_CODE> {
   {
     return rhs;
   }
+
+  __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; }
 };
 
 template <>
@@ -195,6 +203,8 @@ struct UnaryRedOp<UnaryRedCode::PROD, TYPE_CODE> {
   {
     return rhs;
   }
+
+  __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; }
 };
 
 template <>
@@ -221,6 +231,8 @@ struct UnaryRedOp<UnaryRedCode::SUM, TYPE_CODE> {
   {
     return rhs;
   }
+
+  __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; }
 };
 
 template <legate::LegateTypeCode TYPE_CODE>

From 3d2c2d55e810c701caade23bce65c32542c1ba7f Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 00:28:09 -0700
Subject: [PATCH 03/16] Remove unnecessary template specializations

---
 src/cunumeric/unary/unary_red_util.h | 30 +++++-----------------------
 1 file changed, 5 insertions(+), 25 deletions(-)

diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h
index 91eca97a7e..48d4256e33 100644
--- a/src/cunumeric/unary/unary_red_util.h
+++ b/src/cunumeric/unary/unary_red_util.h
@@ -79,7 +79,7 @@ struct UnaryRedOp {
 
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ALL, TYPE_CODE> {
-  static constexpr bool valid = true;
+  static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;
 
   using RHS = legate::legate_type_of<TYPE_CODE>;
   using VAL = bool;
@@ -100,14 +100,9 @@ struct UnaryRedOp<UnaryRedCode::ALL, TYPE_CODE> {
   __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); }
 };
 
-template <>
-struct UnaryRedOp<UnaryRedCode::ALL, legate::LegateTypeCode::COMPLEX128_LT> {
-  static constexpr bool valid = false;
-};
-
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ANY, TYPE_CODE> {
-  static constexpr bool valid = true;
+  static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;
 
   using RHS = legate::legate_type_of<TYPE_CODE>;
   using VAL = bool;
@@ -130,7 +125,7 @@ struct UnaryRedOp<UnaryRedCode::ANY, TYPE_CODE> {
 
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::MAX, TYPE_CODE> {
-  static constexpr bool valid = true;
+  static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;
 
   using RHS = legate::legate_type_of<TYPE_CODE>;
   using VAL = RHS;
@@ -151,14 +146,9 @@ struct UnaryRedOp<UnaryRedCode::MAX, TYPE_CODE> {
   __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; }
 };
 
-template <>
-struct UnaryRedOp<UnaryRedCode::MAX, legate::LegateTypeCode::COMPLEX128_LT> {
-  static constexpr bool valid = false;
-};
-
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::MIN, TYPE_CODE> {
-  static constexpr bool valid = true;
+  static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;
 
   using RHS = legate::legate_type_of<TYPE_CODE>;
   using VAL = RHS;
@@ -179,14 +169,9 @@ struct UnaryRedOp<UnaryRedCode::MIN, TYPE_CODE> {
   __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; }
 };
 
-template <>
-struct UnaryRedOp<UnaryRedCode::MIN, legate::LegateTypeCode::COMPLEX128_LT> {
-  static constexpr bool valid = false;
-};
-
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::PROD, TYPE_CODE> {
-  static constexpr bool valid = true;
+  static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;
 
   using RHS = legate::legate_type_of<TYPE_CODE>;
   using VAL = RHS;
@@ -207,11 +192,6 @@ struct UnaryRedOp<UnaryRedCode::PROD, TYPE_CODE> {
   __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; }
 };
 
-template <>
-struct UnaryRedOp<UnaryRedCode::PROD, legate::LegateTypeCode::COMPLEX128_LT> {
-  static constexpr bool valid = false;
-};
-
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::SUM, TYPE_CODE> {
   static constexpr bool valid = true;

From e456cd5cf5563cbb2e956be9eb46e85ac1effdd2 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 00:35:50 -0700
Subject: [PATCH 04/16] Remove specializations for count_nonzero and use the
 template instead

---
 src/cunumeric/unary/scalar_unary_red.cc       | 27 -----------
 src/cunumeric/unary/scalar_unary_red.cu       | 48 -------------------
 src/cunumeric/unary/scalar_unary_red_omp.cc   | 41 ----------------
 .../unary/scalar_unary_red_template.inl       | 39 +--------------
 src/cunumeric/unary/unary_red_util.h          | 25 ++++++++++
 5 files changed, 26 insertions(+), 154 deletions(-)

diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc
index 6abc2cb0ec..a084943fe3 100644
--- a/src/cunumeric/unary/scalar_unary_red.cc
+++ b/src/cunumeric/unary/scalar_unary_red.cc
@@ -87,33 +87,6 @@ struct ScalarUnaryRedImplBody<VariantKind::CPU, UnaryRedCode::CONTAINS, CODE, DI
   }
 };
 
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::CPU, UnaryRedCode::COUNT_NONZERO, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::UINT64_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-  {
-    auto result         = LG_OP::identity;
-    const size_t volume = rect.volume();
-    if (dense) {
-      auto inptr = in.ptr(rect);
-      for (size_t idx = 0; idx < volume; ++idx) result += inptr[idx] != VAL(0);
-    } else {
-      for (size_t idx = 0; idx < volume; ++idx) {
-        auto point = pitches.unflatten(idx, rect.lo);
-        result += in[point] != VAL(0);
-      }
-    }
-    out.reduce(0, result);
-  }
-};
-
 /*static*/ void ScalarUnaryRedTask::cpu_variant(TaskContext& context)
 {
   scalar_unary_red_template<VariantKind::CPU>(context);
diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu
index a62ba95c11..e72b8d1b11 100644
--- a/src/cunumeric/unary/scalar_unary_red.cu
+++ b/src/cunumeric/unary/scalar_unary_red.cu
@@ -69,22 +69,6 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) con
   reduce_output(out, value);
 }
 
-template <typename Output, typename Pitches, typename Point, typename VAL, int32_t DIM>
-static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) count_nonzero_kernel(
-  size_t volume, Output out, AccessorRO<VAL, DIM> in, Pitches pitches, Point origin, size_t iters)
-{
-  uint64_t value = 0;
-  for (size_t idx = 0; idx < iters; idx++) {
-    const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
-    if (offset < volume) {
-      auto point = pitches.unflatten(offset, origin);
-      SumReduction<uint64_t>::fold<true>(value, in[point] != VAL(0));
-    }
-  }
-  // Every thread in the thread block must participate in the exchange to get correct results
-  reduce_output(out, value);
-}
-
 template <typename Buffer, typename RedAcc>
 static __global__ void __launch_bounds__(1, 1) copy_kernel(Buffer result, RedAcc out)
 {
@@ -159,38 +143,6 @@ struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::CONTAINS, CODE, DI
   }
 };
 
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::COUNT_NONZERO, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::UINT64_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-  {
-    auto stream = get_cached_stream();
-
-    const size_t volume = rect.volume();
-    const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
-    DeferredReduction<SumReduction<uint64_t>> result;
-    size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t);
-
-    if (blocks >= MAX_REDUCTION_CTAS) {
-      const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
-      count_nonzero_kernel<<<MAX_REDUCTION_CTAS, THREADS_PER_BLOCK, shmem_size, stream>>>(
-        volume, result, in, pitches, rect.lo, iters);
-    } else
-      count_nonzero_kernel<<<blocks, THREADS_PER_BLOCK, shmem_size, stream>>>(
-        volume, result, in, pitches, rect.lo, 1);
-
-    copy_kernel<<<1, 1, 0, stream>>>(result, out);
-    CHECK_CUDA_STREAM(stream);
-  }
-};
-
 /*static*/ void ScalarUnaryRedTask::gpu_variant(TaskContext& context)
 {
   scalar_unary_red_template<VariantKind::GPU>(context);
diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc
index f575d3626c..db50f15427 100644
--- a/src/cunumeric/unary/scalar_unary_red_omp.cc
+++ b/src/cunumeric/unary/scalar_unary_red_omp.cc
@@ -113,47 +113,6 @@ struct ScalarUnaryRedImplBody<VariantKind::OMP, UnaryRedCode::CONTAINS, CODE, DI
   }
 };
 
-template <LegateTypeCode CODE, int DIM>
-struct ScalarUnaryRedImplBody<VariantKind::OMP, UnaryRedCode::COUNT_NONZERO, CODE, DIM> {
-  using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::UINT64_LT>;
-  using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
-
-  void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  bool dense) const
-  {
-    auto result            = LG_OP::identity;
-    const size_t volume    = rect.volume();
-    const auto max_threads = omp_get_max_threads();
-    ThreadLocalStorage<uint64_t> locals(max_threads);
-    for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = 0;
-    if (dense) {
-      auto inptr = in.ptr(rect);
-#pragma omp parallel
-      {
-        const int tid = omp_get_thread_num();
-#pragma omp for schedule(static)
-        for (size_t idx = 0; idx < volume; ++idx) locals[tid] += inptr[idx] != VAL(0);
-      }
-    } else {
-#pragma omp parallel
-      {
-        const int tid = omp_get_thread_num();
-#pragma omp for schedule(static)
-        for (size_t idx = 0; idx < volume; ++idx) {
-          auto point = pitches.unflatten(idx, rect.lo);
-          locals[tid] += in[point] != VAL(0);
-        }
-      }
-    }
-
-    for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]);
-  }
-};
-
 /*static*/ void ScalarUnaryRedTask::omp_variant(TaskContext& context)
 {
   scalar_unary_red_template<VariantKind::OMP>(context);
diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl
index 54baa5559f..d750ebbb8b 100644
--- a/src/cunumeric/unary/scalar_unary_red_template.inl
+++ b/src/cunumeric/unary/scalar_unary_red_template.inl
@@ -98,38 +98,6 @@ struct ScalarUnaryRedImpl<KIND, UnaryRedCode::CONTAINS> {
   }
 };
 
-template <VariantKind KIND>
-struct ScalarUnaryRedImpl<KIND, UnaryRedCode::COUNT_NONZERO> {
-  template <LegateTypeCode CODE, int DIM>
-  void operator()(ScalarUnaryRedArgs& args) const
-  {
-    using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::UINT64_LT>;
-    using LG_OP = typename OP::OP;
-    using VAL   = legate_type_of<CODE>;
-
-    auto rect = args.in.shape<DIM>();
-
-    Pitches<DIM - 1> pitches;
-    size_t volume = pitches.flatten(rect);
-
-    if (0 == volume) return;
-
-    auto out = args.out.reduce_accessor<LG_OP, true, 1>();
-    auto in  = args.in.read_accessor<VAL, DIM>(rect);
-
-#ifndef LEGION_BOUNDS_CHECKS
-    // Check to see if this is dense or not
-    bool dense = in.accessor.is_dense_row_major(rect);
-#else
-    // No dense execution if we're doing bounds checks
-    bool dense = false;
-#endif
-
-    ScalarUnaryRedImplBody<KIND, UnaryRedCode::COUNT_NONZERO, CODE, DIM>()(
-      out, in, rect, pitches, dense);
-  }
-};
-
 template <VariantKind KIND>
 struct ScalarUnaryRedDispatch {
   template <UnaryRedCode OP_CODE, std::enable_if_t<!is_arg_reduce<OP_CODE>::value>* = nullptr>
@@ -156,12 +124,7 @@ static void scalar_unary_red_template(TaskContext& context)
 
   ScalarUnaryRedArgs args{
     context.reductions()[0], inputs[0], scalars[0].value<UnaryRedCode>(), std::move(extra_args)};
-  if (args.op_code == UnaryRedCode::COUNT_NONZERO) {
-    auto dim = std::max(1, args.in.dim());
-    double_dispatch(
-      dim, args.in.code(), ScalarUnaryRedImpl<KIND, UnaryRedCode::COUNT_NONZERO>{}, args);
-  } else
-    op_dispatch(args.op_code, ScalarUnaryRedDispatch<KIND>{}, args);
+  op_dispatch(args.op_code, ScalarUnaryRedDispatch<KIND>{}, args);
 }
 
 }  // namespace cunumeric
diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h
index 48d4256e33..b385a5907c 100644
--- a/src/cunumeric/unary/unary_red_util.h
+++ b/src/cunumeric/unary/unary_red_util.h
@@ -58,6 +58,8 @@ constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&..
       return f.template operator()<UnaryRedCode::ARGMIN>(std::forward<Fnargs>(args)...);
     case UnaryRedCode::CONTAINS:
       return f.template operator()<UnaryRedCode::CONTAINS>(std::forward<Fnargs>(args)...);
+    case UnaryRedCode::COUNT_NONZERO:
+      return f.template operator()<UnaryRedCode::COUNT_NONZERO>(std::forward<Fnargs>(args)...);
     case UnaryRedCode::MAX:
       return f.template operator()<UnaryRedCode::MAX>(std::forward<Fnargs>(args)...);
     case UnaryRedCode::MIN:
@@ -123,6 +125,29 @@ struct UnaryRedOp<UnaryRedCode::ANY, TYPE_CODE> {
   __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); }
 };
 
+template <legate::LegateTypeCode TYPE_CODE>
+struct UnaryRedOp<UnaryRedCode::COUNT_NONZERO, TYPE_CODE> {
+  static constexpr bool valid = true;
+
+  using RHS = legate::legate_type_of<TYPE_CODE>;
+  using VAL = uint64_t;
+  using OP  = Legion::SumReduction<VAL>;
+
+  template <bool EXCLUSIVE>
+  __CUDA_HD__ static void fold(VAL& a, VAL b)
+  {
+    OP::template fold<EXCLUSIVE>(a, b);
+  }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>&, int32_t, const RHS& rhs)
+  {
+    return static_cast<VAL>(rhs != RHS(0));
+  }
+
+  __CUDA_HD__ static VAL convert(const RHS& rhs) { return static_cast<VAL>(rhs != RHS(0)); }
+};
+
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::MAX, TYPE_CODE> {
   static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;

From a147e1200d852b426055e0c904c366351a589744 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 10:30:52 -0700
Subject: [PATCH 05/16] More tests for unary reductions

---
 tests/integration/test_logical.py | 76 ++++++++++++-------------------
 tests/integration/test_nonzero.py | 11 ++---
 2 files changed, 34 insertions(+), 53 deletions(-)

diff --git a/tests/integration/test_logical.py b/tests/integration/test_logical.py
index 1ef81d48c2..b84148e634 100644
--- a/tests/integration/test_logical.py
+++ b/tests/integration/test_logical.py
@@ -18,53 +18,35 @@
 
 import cunumeric as num
 
-
-def test_any_basic():
-    assert num.array_equal(num.any([-1, 4, 5]), np.any([-1, 4, 5]))
-
-    x = [5, 10, 0, 100]
-    cx = num.array(x)
-    assert num.array_equal(num.any(cx), np.any(x))
-
-    y = [[0, 0], [0, 0]]
-    cy = num.array(y)
-    assert num.array_equal(num.any(cy), np.any(y))
-
-
-def test_any_axis():
-    x = np.array([[True, True, False], [True, True, True]])
-    cx = num.array(x)
-
-    assert num.array_equal(num.any(cx), np.any(x))
-    assert num.array_equal(num.any(cx, axis=0), np.any(x, axis=0))
-
-
-def test_all_basic():
-    assert num.array_equal(num.all([-1, 4, 5]), np.all([-1, 4, 5]))
-
-    x = [5, 10, 0, 100]
-    cx = num.array(x)
-    assert num.array_equal(num.all(cx), np.all(x))
-
-    y = [[0, 0], [0, 0]]
-    cy = num.array(y)
-    assert num.array_equal(num.all(cy), np.all(y))
-
-
-def test_all_axis():
-    x = np.array([[True, True, False], [True, True, True]])
-    cx = num.array(x)
-
-    assert num.array_equal(num.all(cx), np.all(x))
-    assert num.array_equal(num.all(cx, axis=0), np.all(x, axis=0))
-
-
-def test_nan():
-    assert num.equal(num.all(num.nan), np.all(np.nan))
-    assert num.equal(num.any(num.nan), np.any(np.nan))
-
-    assert num.array_equal(num.all(num.nan), np.all(np.nan))
-    assert num.array_equal(num.any(num.nan), np.any(np.nan))
+INPUTS = (
+    [-1, 4, 5],
+    [5, 10, 0, 100],
+    [[0, 0], [0, 0]],
+    [[True, True, False], [True, True, True]],
+    [[False, True, False]],
+    [[0.0, 1.0, 0.0]],
+    [[1, 0 + 1j, 1 + 1j]],
+    [[1, 0 + 1j, 0 + 0j]],
+    [np.nan],
+)
+
+
+@pytest.mark.parametrize("input", INPUTS)
+def test_any_and_all(input):
+    in_np = np.array(input)
+    # cuNumeric doesn't support reductions for complex128
+    if in_np.dtype.kind == "c":
+        in_np = in_np.astype("F")
+    in_num = num.array(in_np)
+
+    for fn in ("any", "all"):
+        fn_np = getattr(np, fn)
+        fn_num = getattr(num, fn)
+        assert np.array_equal(fn_np(in_np), fn_num(in_num))
+        for axis in range(in_num.ndim):
+            out_np = fn_np(in_np, axis=axis)
+            out_num = fn_num(in_num, axis=axis)
+            assert np.array_equal(out_np, out_num)
 
 
 @pytest.mark.skip
diff --git a/tests/integration/test_nonzero.py b/tests/integration/test_nonzero.py
index e09340c8be..27538ae003 100644
--- a/tests/integration/test_nonzero.py
+++ b/tests/integration/test_nonzero.py
@@ -98,12 +98,11 @@ def test_axis():
         x_np, axis=(0, 1, 2)
     )
 
-    # TODO: Put this back once we have per-axis count_nonzero
-    # for axis in range(3):
-    #    assert_equal(
-    #        num.count_nonzero(x, axis=axis),
-    #        np.count_nonzero(x_np, axis=axis),
-    #    )
+    for axis in range(3):
+        assert_equal(
+            num.count_nonzero(x, axis=axis),
+            np.count_nonzero(x_np, axis=axis),
+        )
 
 
 def test_deprecated_0d():

From 3f2769dd3e95aa4ad078d1a8c0db475038068f91 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 12:44:31 -0700
Subject: [PATCH 06/16] Remove unused ArgRedImplBody

---
 src/cunumeric/unary/unary_red.cc           | 21 ---------------------
 src/cunumeric/unary/unary_red_template.inl |  3 ---
 2 files changed, 24 deletions(-)

diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc
index 115587b289..8b9b93942c 100644
--- a/src/cunumeric/unary/unary_red.cc
+++ b/src/cunumeric/unary/unary_red.cc
@@ -42,27 +42,6 @@ struct UnaryRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
   }
 };
 
-template <UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
-struct ArgRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
-  using OP    = UnaryRedOp<OP_CODE, CODE>;
-  using LG_OP = typename OP::OP;
-  using RHS   = legate_type_of<CODE>;
-  using LHS   = Argval<RHS>;
-
-  void operator()(AccessorRD<LG_OP, true, DIM> lhs,
-                  AccessorRO<RHS, DIM> rhs,
-                  const Rect<DIM>& rect,
-                  const Pitches<DIM - 1>& pitches,
-                  int collapsed_dim,
-                  size_t volume) const
-  {
-    for (size_t idx = 0; idx < volume; ++idx) {
-      auto point = pitches.unflatten(idx, rect.lo);
-      lhs.reduce(point, LHS(point[collapsed_dim], rhs[point]));
-    }
-  }
-};
-
 /*static*/ void UnaryRedTask::cpu_variant(TaskContext& context)
 {
   unary_red_template<VariantKind::CPU>(context);
diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl
index 1818860e65..7283949642 100644
--- a/src/cunumeric/unary/unary_red_template.inl
+++ b/src/cunumeric/unary/unary_red_template.inl
@@ -26,9 +26,6 @@ using namespace legate;
 template <VariantKind KIND, UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
 struct UnaryRedImplBody;
 
-template <VariantKind KIND, UnaryRedCode OP_CODE, LegateTypeCode CODE, int DIM>
-struct ArgRedImplBody;
-
 template <VariantKind KIND, UnaryRedCode OP_CODE>
 struct UnaryRedImpl {
   template <LegateTypeCode CODE,

From 08fd23f7c909db55ed87d1d7a4977bfef2960dfc Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 12:45:34 -0700
Subject: [PATCH 07/16] Clean up the valid fields that were missed in the
 previous pass

---
 src/cunumeric/unary/unary_red_util.h | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h
index b385a5907c..75a8f86da5 100644
--- a/src/cunumeric/unary/unary_red_util.h
+++ b/src/cunumeric/unary/unary_red_util.h
@@ -242,7 +242,7 @@ struct UnaryRedOp<UnaryRedCode::SUM, TYPE_CODE> {
 
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ARGMAX, TYPE_CODE> {
-  static constexpr bool valid = true;
+  static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;
 
   using RHS = legate::legate_type_of<TYPE_CODE>;
   using VAL = Argval<RHS>;
@@ -263,14 +263,9 @@ struct UnaryRedOp<UnaryRedCode::ARGMAX, TYPE_CODE> {
   }
 };
 
-template <>
-struct UnaryRedOp<UnaryRedCode::ARGMAX, legate::LegateTypeCode::COMPLEX128_LT> {
-  static constexpr bool valid = false;
-};
-
 template <legate::LegateTypeCode TYPE_CODE>
 struct UnaryRedOp<UnaryRedCode::ARGMIN, TYPE_CODE> {
-  static constexpr bool valid = true;
+  static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT;
 
   using RHS = legate::legate_type_of<TYPE_CODE>;
   using VAL = Argval<RHS>;
@@ -291,9 +286,4 @@ struct UnaryRedOp<UnaryRedCode::ARGMIN, TYPE_CODE> {
   }
 };
 
-template <>
-struct UnaryRedOp<UnaryRedCode::ARGMIN, legate::LegateTypeCode::COMPLEX128_LT> {
-  static constexpr bool valid = false;
-};
-
 }  // namespace cunumeric

From ee513263a97068398886b5af5b21553976fb0df3 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 13:22:34 -0700
Subject: [PATCH 08/16] Clean up spaghetti in unary reduction and correctly
 handle outputs of mismatching dtypes

---
 cunumeric/array.py  | 186 +++++++++++++++-----------------------------
 cunumeric/module.py |   1 -
 2 files changed, 62 insertions(+), 125 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 467c5d62e9..955739bc14 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -585,9 +585,8 @@ def __contains__(self, item):
             UnaryRedCode.CONTAINS,
             self,
             axis=None,
-            dtype=np.dtype(np.bool_),
+            dtype=bool,
             args=args,
-            check_types=False,
         )
 
     def __copy__(self):
@@ -1501,10 +1500,9 @@ def all(
             UnaryRedCode.ALL,
             self,
             axis=axis,
-            dst=out,
+            out=out,
             keepdims=keepdims,
-            dtype=np.dtype(np.bool_),
-            check_types=False,
+            dtype=bool,
             initial=initial,
             where=where,
         )
@@ -1537,10 +1535,9 @@ def any(
             UnaryRedCode.ANY,
             self,
             axis=axis,
-            dst=out,
+            out=out,
             keepdims=keepdims,
-            dtype=np.dtype(np.bool_),
-            check_types=False,
+            dtype=bool,
             initial=initial,
             where=where,
         )
@@ -1574,8 +1571,7 @@ def argmax(self, axis=None, out=None):
             self,
             axis=axis,
             dtype=np.dtype(np.int64),
-            dst=out,
-            check_types=False,
+            out=out,
         )
 
     def argmin(self, axis=None, out=None):
@@ -1607,8 +1603,7 @@ def argmin(self, axis=None, out=None):
             self,
             axis=axis,
             dtype=np.dtype(np.int64),
-            dst=out,
-            check_types=False,
+            out=out,
         )
 
     def astype(
@@ -2590,7 +2585,7 @@ def max(
             UnaryRedCode.MAX,
             self,
             axis=axis,
-            dst=out,
+            out=out,
             keepdims=keepdims,
             initial=initial,
             where=where,
@@ -2686,7 +2681,7 @@ def min(
             UnaryRedCode.MIN,
             self,
             axis=axis,
-            dst=out,
+            out=out,
             keepdims=keepdims,
             initial=initial,
             where=where,
@@ -2781,7 +2776,7 @@ def prod(
             UnaryRedCode.PROD,
             self_array,
             axis=axis,
-            dst=out,
+            out=out,
             keepdims=keepdims,
             initial=initial,
             where=where,
@@ -3054,7 +3049,7 @@ def sum(
             UnaryRedCode.SUM,
             self_array,
             axis=axis,
-            dst=out,
+            out=out,
             keepdims=keepdims,
             initial=initial,
             where=where,
@@ -3486,13 +3481,14 @@ def _perform_unary_reduction(
         src,
         axis=None,
         dtype=None,
-        dst=None,
+        out=None,
         keepdims=False,
         args=None,
-        check_types=True,
         initial=None,
         where=True,
     ):
+        where = convert_to_predicate_ndarray(where)
+
         # TODO: Need to require initial to be given when the array is empty
         #       or a where mask is given.
         if isinstance(where, ndarray):
@@ -3516,121 +3512,63 @@ def _perform_unary_reduction(
                 "(arg)max/min not supported for complex-type arrays"
             )
         # Compute the output shape
-        if axis is not None:
-            to_reduce = set()
-            if type(axis) == int:
-                if axis < 0:
-                    axis = len(src.shape) + axis
-                    if axis < 0:
-                        raise ValueError("Illegal 'axis' value")
-                elif axis >= src.ndim:
-                    raise ValueError("Illegal 'axis' value")
-                to_reduce.add(axis)
-                axes = (axis,)
-            elif type(axis) == tuple:
-                for ax in axis:
-                    if ax < 0:
-                        ax = len(src.shape) + ax
-                        if ax < 0:
-                            raise ValueError("Illegal 'axis' value")
-                    elif ax >= src.ndim:
-                        raise ValueError("Illegal 'axis' value")
-                    to_reduce.add(ax)
-                axes = axis
-            else:
-                raise TypeError(
-                    "Illegal type passed for 'axis' argument "
-                    + str(type(axis))
-                )
-            out_shape = ()
-            for dim in range(len(src.shape)):
-                if dim in to_reduce:
-                    if keepdims:
-                        out_shape += (1,)
-                else:
-                    out_shape += (src.shape[dim],)
-        else:
-            # Collapsing down to a single value in this case
-            out_shape = ()
-            axes = None
-        # if src.size == 0:
-        # return nd
-        if dst is None:
-            if dtype is not None:
-                dst = ndarray(
-                    shape=out_shape,
-                    dtype=dtype,
-                    inputs=(src, where),
-                )
-            else:
-                dst = ndarray(
-                    shape=out_shape,
-                    dtype=src.dtype,
-                    inputs=(src, where),
-                )
-        else:
-            if dtype is not None and dtype != dst.dtype:
-                raise TypeError(
-                    "Output array type does not match requested dtype"
-                )
-            if dst.shape != out_shape:
-                raise TypeError(
-                    "Output array shape "
-                    + str(dst.shape)
-                    + " does not match expected shape "
-                    + str(out_shape)
-                )
-        # Quick exit
-        if where is False:
-            return dst
-        if check_types and src.dtype != dst.dtype:
-            out_dtype = cls.find_common_type(src, dst)
-            if src.dtype != out_dtype:
-                temp = ndarray(
-                    src.shape,
-                    dtype=out_dtype,
-                    inputs=(src, where),
-                )
-                temp._thunk.convert(src._thunk)
-                src = temp
-            if dst.dtype != out_dtype:
-                temp = ndarray(
-                    dst.shape,
-                    dtype=out_dtype,
-                    inputs=(src, where),
-                )
+        axes = axis
+        if axes is None:
+            axes = tuple(range(src.ndim))
+        elif not isinstance(axes, tuple):
+            axes = (axes,)
 
-                temp._thunk.unary_reduction(
-                    op,
-                    src._thunk,
-                    cls._get_where_thunk(where, dst.shape),
-                    axes,
-                    keepdims,
-                    args,
-                    initial,
-                )
-                dst._thunk.convert(temp._thunk)
-            else:
-                dst._thunk.unary_reduction(
-                    op,
-                    src._thunk,
-                    cls._get_where_thunk(where, dst.shape),
-                    axes,
-                    keepdims,
-                    args,
-                    initial,
-                )
+        if any(type(ax) != int for ax in axes):
+            raise TypeError(
+                "'axis' must be an integer or a tuple of integers, "
+                f"but got {axis}"
+            )
+
+        axes = tuple(ax + src.ndim if ax < 0 else ax for ax in axes)
+
+        if any(ax < 0 for ax in axes):
+            raise ValueError(f"Invalid 'axis' value {axis}")
+
+        out_shape = ()
+        for axis in range(src.ndim):
+            if axis not in axes:
+                out_shape += (src.shape[axis],)
+            elif keepdims:
+                out_shape += (1,)
+
+        # If no output dtype is given, the output has the same dtype as the
+        # input
+        if dtype is None:
+            dtype = src.dtype
+
+        if out is None:
+            out = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where))
+        elif out.shape != out_shape:
+            raise ValueError(
+                f"the output shape mismatch: expected {out_shape} but got "
+                f"{out.shape}"
+            )
+
+        if out.dtype == dtype:
+            result = out
         else:
-            dst._thunk.unary_reduction(
+            result = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where))
+
+        if where:
+            result._thunk.unary_reduction(
                 op,
                 src._thunk,
-                cls._get_where_thunk(where, dst.shape),
+                cls._get_where_thunk(where, result.shape),
                 axes,
                 keepdims,
                 args,
                 initial,
             )
-        return dst
+
+        if result is not out:
+            out._thunk.convert(result._thunk)
+
+        return out
 
     @classmethod
     def _perform_binary_reduction(
diff --git a/cunumeric/module.py b/cunumeric/module.py
index 44f2037802..89adda9e00 100644
--- a/cunumeric/module.py
+++ b/cunumeric/module.py
@@ -4524,7 +4524,6 @@ def count_nonzero(a, axis=None):
         a,
         axis=axis,
         dtype=np.dtype(np.uint64),
-        check_types=False,
     )
 
 

From dafa481fa4be22e44ebc251628d23f25b5922c82 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 13:23:45 -0700
Subject: [PATCH 09/16] Add more tests for any and all

---
 tests/integration/test_logical.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/tests/integration/test_logical.py b/tests/integration/test_logical.py
index b84148e634..4eda4388e6 100644
--- a/tests/integration/test_logical.py
+++ b/tests/integration/test_logical.py
@@ -17,6 +17,7 @@
 import pytest
 
 import cunumeric as num
+from legate.core import LEGATE_MAX_DIM
 
 INPUTS = (
     [-1, 4, 5],
@@ -49,6 +50,31 @@ def test_any_and_all(input):
             assert np.array_equal(out_np, out_num)
 
 
+@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1))
+def test_nd_inputs(ndim):
+    shape = (3,) * ndim
+    in_np = np.random.random(shape)
+    in_num = num.array(in_np)
+
+    for fn in ("any", "all"):
+        fn_np = getattr(np, fn)
+        fn_num = getattr(num, fn)
+        for axis in range(in_num.ndim):
+            out_np = fn_np(in_np, axis=axis)
+            out_num = fn_num(in_num, axis=axis)
+            assert np.array_equal(out_np, out_num)
+
+            out_np = np.empty(out_np.shape, dtype="D")
+            out_num = num.empty(out_num.shape, dtype="D")
+            fn_np(in_np, axis=axis, out=out_np)
+            fn_num(in_num, axis=axis, out=out_num)
+            assert np.array_equal(out_np, out_num)
+
+            out_np = fn_np(in_np[1:], axis=axis)
+            out_num = fn_num(in_num[1:], axis=axis)
+            assert np.array_equal(out_np, out_num)
+
+
 @pytest.mark.skip
 def test_where():
     x = np.array([[True, True, False], [True, True, True]])

From 0983a85268d3ec45a7907e04bd342e4c87611909 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 15:02:57 -0700
Subject: [PATCH 10/16] Add some missing add_boilerplates calls

---
 cunumeric/array.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 955739bc14..20706502aa 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -1542,6 +1542,7 @@ def any(
             where=where,
         )
 
+    @add_boilerplate()
     def argmax(self, axis=None, out=None):
         """a.argmax(axis=None, out=None)
 
@@ -1574,6 +1575,7 @@ def argmax(self, axis=None, out=None):
             out=out,
         )
 
+    @add_boilerplate()
     def argmin(self, axis=None, out=None):
         """a.argmin(axis=None, out=None)
 
@@ -3487,8 +3489,6 @@ def _perform_unary_reduction(
         initial=None,
         where=True,
     ):
-        where = convert_to_predicate_ndarray(where)
-
         # TODO: Need to require initial to be given when the array is empty
         #       or a where mask is given.
         if isinstance(where, ndarray):

From 868b68b7adafeab1060ae5f3cffacb0b980207f2 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 15:28:28 -0700
Subject: [PATCH 11/16] Give more accurate names to some of the template
 parameters

---
 src/cunumeric/unary/scalar_unary_red.cc           |  6 +++---
 src/cunumeric/unary/scalar_unary_red.cu           | 14 +++++++-------
 src/cunumeric/unary/scalar_unary_red_omp.cc       |  6 +++---
 src/cunumeric/unary/scalar_unary_red_template.inl |  4 ++--
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc
index a084943fe3..386a06eba1 100644
--- a/src/cunumeric/unary/scalar_unary_red.cc
+++ b/src/cunumeric/unary/scalar_unary_red.cc
@@ -55,17 +55,17 @@ template <LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::CPU, UnaryRedCode::CONTAINS, CODE, DIM> {
   using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
 
   void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
+                  AccessorRO<RHS, DIM> in,
                   const Store& to_find_scalar,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
                   bool dense) const
   {
     auto result         = LG_OP::identity;
-    const auto to_find  = to_find_scalar.scalar<VAL>();
+    const auto to_find  = to_find_scalar.scalar<RHS>();
     const size_t volume = rect.volume();
     if (dense) {
       auto inptr = in.ptr(rect);
diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu
index e72b8d1b11..5b3500162e 100644
--- a/src/cunumeric/unary/scalar_unary_red.cu
+++ b/src/cunumeric/unary/scalar_unary_red.cu
@@ -29,7 +29,7 @@ template <typename OP,
           typename ReadAcc,
           typename Pitches,
           typename Point,
-          typename VAL>
+          typename LHS>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   reduction_kernel(size_t volume,
                    OP,
@@ -39,7 +39,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
                    Pitches pitches,
                    Point origin,
                    size_t iters,
-                   VAL identity)
+                   LHS identity)
 {
   auto value = identity;
   for (size_t idx = 0; idx < iters; idx++) {
@@ -53,9 +53,9 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   reduce_output(out, value);
 }
 
-template <typename Output, typename ReadAcc, typename Pitches, typename Point, typename VAL>
+template <typename Output, typename ReadAcc, typename Pitches, typename Point, typename RHS>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) contains_kernel(
-  size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, VAL to_find)
+  size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, RHS to_find)
 {
   bool value = false;
   for (size_t idx = 0; idx < iters; idx++) {
@@ -113,10 +113,10 @@ template <LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::CONTAINS, CODE, DIM> {
   using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
 
   void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
+                  AccessorRO<RHS, DIM> in,
                   const Store& to_find_scalar,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
@@ -124,7 +124,7 @@ struct ScalarUnaryRedImplBody<VariantKind::GPU, UnaryRedCode::CONTAINS, CODE, DI
   {
     auto stream = get_cached_stream();
 
-    const auto to_find  = to_find_scalar.scalar<VAL>();
+    const auto to_find  = to_find_scalar.scalar<RHS>();
     const size_t volume = rect.volume();
     const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
     DeferredReduction<SumReduction<bool>> result;
diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc
index db50f15427..4621a2152d 100644
--- a/src/cunumeric/unary/scalar_unary_red_omp.cc
+++ b/src/cunumeric/unary/scalar_unary_red_omp.cc
@@ -73,17 +73,17 @@ template <LegateTypeCode CODE, int DIM>
 struct ScalarUnaryRedImplBody<VariantKind::OMP, UnaryRedCode::CONTAINS, CODE, DIM> {
   using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
   using LG_OP = typename OP::OP;
-  using VAL   = legate_type_of<CODE>;
+  using RHS   = legate_type_of<CODE>;
 
   void operator()(AccessorRD<LG_OP, true, 1> out,
-                  AccessorRO<VAL, DIM> in,
+                  AccessorRO<RHS, DIM> in,
                   const Store& to_find_scalar,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
                   bool dense) const
   {
     auto result            = LG_OP::identity;
-    const auto to_find     = to_find_scalar.scalar<VAL>();
+    const auto to_find     = to_find_scalar.scalar<RHS>();
     const size_t volume    = rect.volume();
     const auto max_threads = omp_get_max_threads();
     ThreadLocalStorage<bool> locals(max_threads);
diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl
index d750ebbb8b..a482854f2f 100644
--- a/src/cunumeric/unary/scalar_unary_red_template.inl
+++ b/src/cunumeric/unary/scalar_unary_red_template.inl
@@ -73,7 +73,7 @@ struct ScalarUnaryRedImpl<KIND, UnaryRedCode::CONTAINS> {
   {
     using OP    = UnaryRedOp<UnaryRedCode::SUM, LegateTypeCode::BOOL_LT>;
     using LG_OP = typename OP::OP;
-    using VAL   = legate_type_of<CODE>;
+    using RHS   = legate_type_of<CODE>;
 
     auto rect = args.in.shape<DIM>();
 
@@ -83,7 +83,7 @@ struct ScalarUnaryRedImpl<KIND, UnaryRedCode::CONTAINS> {
     if (0 == volume) return;
 
     auto out = args.out.reduce_accessor<LG_OP, true, 1>();
-    auto in  = args.in.read_accessor<VAL, DIM>(rect);
+    auto in  = args.in.read_accessor<RHS, DIM>(rect);
 
 #ifndef LEGION_BOUNDS_CHECKS
     // Check to see if this is dense or not

From dbb22229a813684621c0d0228f9171120e576b0b Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 21:04:59 -0700
Subject: [PATCH 12/16] Handle 'dtype' correctly in sum and prod

---
 cunumeric/_ufunc/ufunc.py                |  6 +---
 cunumeric/array.py                       | 46 +++++++++++++++++-------
 cunumeric/module.py                      |  2 +-
 tests/integration/test_reduction_axis.py |  7 ++--
 4 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py
index 899f9728d9..64dec6b61b 100644
--- a/cunumeric/_ufunc/ufunc.py
+++ b/cunumeric/_ufunc/ufunc.py
@@ -663,10 +663,6 @@ def reduce(
             raise NotImplementedError(
                 f"reduction for {self} is not yet implemented"
             )
-        if out is not None:
-            raise NotImplementedError(
-                "reduction for {self} does not take an `out` argument"
-            )
         if not isinstance(where, bool) or not where:
             raise NotImplementedError(
                 "the 'where' keyword is not yet supported"
@@ -682,7 +678,7 @@ def reduce(
             array,
             axis=axis,
             dtype=dtype,
-            # out=out,
+            out=out,
             keepdims=keepdims,
             initial=initial,
             where=where,
diff --git a/cunumeric/array.py b/cunumeric/array.py
index 20706502aa..393eb002ce 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -585,7 +585,7 @@ def __contains__(self, item):
             UnaryRedCode.CONTAINS,
             self,
             axis=None,
-            dtype=bool,
+            res_dtype=bool,
             args=args,
         )
 
@@ -1500,9 +1500,9 @@ def all(
             UnaryRedCode.ALL,
             self,
             axis=axis,
+            res_dtype=bool,
             out=out,
             keepdims=keepdims,
-            dtype=bool,
             initial=initial,
             where=where,
         )
@@ -1535,9 +1535,9 @@ def any(
             UnaryRedCode.ANY,
             self,
             axis=axis,
+            res_dtype=bool,
             out=out,
             keepdims=keepdims,
-            dtype=bool,
             initial=initial,
             where=where,
         )
@@ -1571,7 +1571,7 @@ def argmax(self, axis=None, out=None):
             UnaryRedCode.ARGMAX,
             self,
             axis=axis,
-            dtype=np.dtype(np.int64),
+            res_dtype=np.dtype(np.int64),
             out=out,
         )
 
@@ -1604,7 +1604,7 @@ def argmin(self, axis=None, out=None):
             UnaryRedCode.ARGMIN,
             self,
             axis=axis,
-            dtype=np.dtype(np.int64),
+            res_dtype=np.dtype(np.int64),
             out=out,
         )
 
@@ -2778,6 +2778,7 @@ def prod(
             UnaryRedCode.PROD,
             self_array,
             axis=axis,
+            dtype=dtype,
             out=out,
             keepdims=keepdims,
             initial=initial,
@@ -3051,6 +3052,7 @@ def sum(
             UnaryRedCode.SUM,
             self_array,
             axis=axis,
+            dtype=dtype,
             out=out,
             keepdims=keepdims,
             initial=initial,
@@ -3483,12 +3485,28 @@ def _perform_unary_reduction(
         src,
         axis=None,
         dtype=None,
+        res_dtype=None,
         out=None,
         keepdims=False,
         args=None,
         initial=None,
         where=True,
     ):
+        # When 'res_dtype' is not None, the input and output of the reduction
+        # have different types. Such reduction operators don't take a dtype of
+        # the accumulator
+        if res_dtype is not None:
+            assert dtype is None
+            dtype = src.dtype
+        else:
+            # If 'dtype' exists, that determines both the accumulation dtype
+            # and the output dtype
+            if dtype is not None:
+                res_dtype = dtype
+            else:
+                dtype = src.dtype
+                res_dtype = src.dtype
+
         # TODO: Need to require initial to be given when the array is empty
         #       or a where mask is given.
         if isinstance(where, ndarray):
@@ -3536,23 +3554,25 @@ def _perform_unary_reduction(
             elif keepdims:
                 out_shape += (1,)
 
-        # If no output dtype is given, the output has the same dtype as the
-        # input
-        if dtype is None:
-            dtype = src.dtype
-
         if out is None:
-            out = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where))
+            out = ndarray(
+                shape=out_shape, dtype=res_dtype, inputs=(src, where)
+            )
         elif out.shape != out_shape:
             raise ValueError(
                 f"the output shape mismatch: expected {out_shape} but got "
                 f"{out.shape}"
             )
 
-        if out.dtype == dtype:
+        if dtype != src.dtype:
+            src = src.astype(dtype)
+
+        if out.dtype == res_dtype:
             result = out
         else:
-            result = ndarray(shape=out_shape, dtype=dtype, inputs=(src, where))
+            result = ndarray(
+                shape=out_shape, dtype=res_dtype, inputs=(src, where)
+            )
 
         if where:
             result._thunk.unary_reduction(
diff --git a/cunumeric/module.py b/cunumeric/module.py
index 89adda9e00..3486b7d4d9 100644
--- a/cunumeric/module.py
+++ b/cunumeric/module.py
@@ -4522,8 +4522,8 @@ def count_nonzero(a, axis=None):
     return ndarray._perform_unary_reduction(
         UnaryRedCode.COUNT_NONZERO,
         a,
+        res_dtype=np.dtype(np.uint64),
         axis=axis,
-        dtype=np.dtype(np.uint64),
     )
 
 
diff --git a/tests/integration/test_reduction_axis.py b/tests/integration/test_reduction_axis.py
index bfe67a41cc..7b6ff05553 100644
--- a/tests/integration/test_reduction_axis.py
+++ b/tests/integration/test_reduction_axis.py
@@ -21,8 +21,8 @@
 import cunumeric as cn
 
 
-def _sum(shape, axis, lib):
-    return lib.ones(shape).sum(axis=axis)
+def _sum(shape, axis, lib, dtype=None):
+    return lib.ones(shape).sum(axis=axis, dtype=dtype)
 
 
 # Try various non-square shapes, to nudge the core towards trying many
@@ -31,6 +31,9 @@ def _sum(shape, axis, lib):
 @pytest.mark.parametrize("shape", permutations((3, 4, 5)), ids=str)
 def test_3d(shape, axis):
     assert np.array_equal(_sum(shape, axis, np), _sum(shape, axis, cn))
+    assert np.array_equal(
+        _sum(shape, axis, np, dtype="D"), _sum(shape, axis, cn, dtype="D")
+    )
 
 
 if __name__ == "__main__":

From 8341ded03dc7a42c7cd0b839ff74b83c8181db26 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 22:05:55 -0700
Subject: [PATCH 13/16] Fix the plumbing code on the Python side for argmin and
 argmax

---
 cunumeric/array.py    | 30 ++++++++++++------------------
 cunumeric/deferred.py | 30 +++++++++++++++---------------
 cunumeric/module.py   | 22 ++++++++++++----------
 3 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 393eb002ce..5dbd49ab40 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -1543,7 +1543,7 @@ def any(
         )
 
     @add_boilerplate()
-    def argmax(self, axis=None, out=None):
+    def argmax(self, axis=None, out=None, keepdims=False):
         """a.argmax(axis=None, out=None)
 
         Return indices of the maximum values along the given axis.
@@ -1559,24 +1559,21 @@ def argmax(self, axis=None, out=None):
         Multiple GPUs, Multiple CPUs
 
         """
-        if self.size == 1:
-            return 0
-        if axis is None:
-            axis = self.ndim - 1
-        elif type(axis) != int:
-            raise TypeError("'axis' argument for argmax must be an 'int'")
-        elif axis < 0 or axis >= self.ndim:
-            raise TypeError("invalid 'axis' argument for argmax " + str(axis))
+        if out is not None and out.dtype != np.int64:
+            raise ValueError("output array must have int64 dtype")
+        if axis is not None and not isinstance(axis, int):
+            raise ValueError("axis must be an integer")
         return self._perform_unary_reduction(
             UnaryRedCode.ARGMAX,
             self,
             axis=axis,
             res_dtype=np.dtype(np.int64),
             out=out,
+            keepdims=keepdims,
         )
 
     @add_boilerplate()
-    def argmin(self, axis=None, out=None):
+    def argmin(self, axis=None, out=None, keepdims=False):
         """a.argmin(axis=None, out=None)
 
         Return indices of the minimum values along the given axis.
@@ -1592,20 +1589,17 @@ def argmin(self, axis=None, out=None):
         Multiple GPUs, Multiple CPUs
 
         """
-        if self.size == 1:
-            return 0
-        if axis is None:
-            axis = self.ndim - 1
-        elif type(axis) != int:
-            raise TypeError("'axis' argument for argmin must be an 'int'")
-        elif axis < 0 or axis >= self.ndim:
-            raise TypeError("invalid 'axis' argument for argmin " + str(axis))
+        if out is not None and out.dtype != np.int64:
+            raise ValueError("output array must have int64 dtype")
+        if axis is not None and not isinstance(axis, int):
+            raise ValueError("axis must be an integer")
         return self._perform_unary_reduction(
             UnaryRedCode.ARGMIN,
             self,
             axis=axis,
             res_dtype=np.dtype(np.int64),
             out=out,
+            keepdims=keepdims,
         )
 
     def astype(
diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 5c2c4904a5..a59101745e 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -1686,6 +1686,14 @@ def unary_reduction(
 
         argred = op in (UnaryRedCode.ARGMAX, UnaryRedCode.ARGMIN)
 
+        if argred:
+            argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype)
+            lhs_array = self.runtime.create_empty_thunk(
+                lhs_array.shape,
+                dtype=argred_dtype,
+                inputs=[self],
+            )
+
         # See if we are doing reduction to a point or another region
         if lhs_array.size == 1:
             assert axes is None or len(axes) == (
@@ -1711,14 +1719,6 @@ def unary_reduction(
             task.execute()
 
         else:
-            if argred:
-                argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype)
-                lhs_array = self.runtime.create_empty_thunk(
-                    lhs_array.shape,
-                    dtype=argred_dtype,
-                    inputs=[self],
-                )
-
             # Before we perform region reduction, make sure to have the lhs
             # initialized. If an initial value is given, we use it, otherwise
             # we use the identity of the reduction operator
@@ -1758,13 +1758,13 @@ def unary_reduction(
 
             task.execute()
 
-            if argred:
-                self.unary_op(
-                    UnaryOpCode.GETARG,
-                    lhs_array,
-                    True,
-                    [],
-                )
+        if argred:
+            self.unary_op(
+                UnaryOpCode.GETARG,
+                lhs_array,
+                True,
+                [],
+            )
 
     def isclose(self, rhs1, rhs2, rtol, atol, equal_nan):
         assert not equal_nan
diff --git a/cunumeric/module.py b/cunumeric/module.py
index 3486b7d4d9..710f01f68b 100644
--- a/cunumeric/module.py
+++ b/cunumeric/module.py
@@ -4411,7 +4411,7 @@ def partition(a, kth, axis=-1, kind="introselect", order=None):
 
 
 @add_boilerplate("a")
-def argmax(a, axis=None, out=None):
+def argmax(a, axis=None, out=None, *, keepdims=False):
     """
 
     Returns the indices of the maximum values along an axis.
@@ -4426,6 +4426,10 @@ def argmax(a, axis=None, out=None):
     out : ndarray, optional
         If provided, the result will be inserted into this array. It should
         be of the appropriate shape and dtype.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the array.
 
     Returns
     -------
@@ -4441,14 +4445,11 @@ def argmax(a, axis=None, out=None):
     --------
     Multiple GPUs, Multiple CPUs
     """
-    if out is not None:
-        if out.dtype != np.int64:
-            raise ValueError("output array must have int64 dtype")
-    return a.argmax(axis=axis, out=out)
+    return a.argmax(axis=axis, out=out, keepdims=keepdims)
 
 
 @add_boilerplate("a")
-def argmin(a, axis=None, out=None):
+def argmin(a, axis=None, out=None, *, keepdims=False):
     """
 
     Returns the indices of the minimum values along an axis.
@@ -4463,6 +4464,10 @@ def argmin(a, axis=None, out=None):
     out : ndarray, optional
         If provided, the result will be inserted into this array. It should
         be of the appropriate shape and dtype.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left
+        in the result as dimensions with size one. With this option,
+        the result will broadcast correctly against the array.
 
     Returns
     -------
@@ -4478,10 +4483,7 @@ def argmin(a, axis=None, out=None):
     --------
     Multiple GPUs, Multiple CPUs
     """
-    if out is not None:
-        if out is not None and out.dtype != np.int64:
-            raise ValueError("output array must have int64 dtype")
-    return a.argmin(axis=axis, out=out)
+    return a.argmin(axis=axis, out=out, keepdims=keepdims)
 
 
 # Counting

From 4a658686dcd7cb9fff162f73997b8391d44083a0 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Thu, 19 May 2022 22:52:07 -0700
Subject: [PATCH 14/16] Implement scalar reduction for argmin and argmax

---
 cunumeric/array.py                            |  7 +-
 cunumeric/deferred.py                         |  2 +
 cunumeric/eager.py                            | 19 ++++--
 cunumeric/thunk.py                            |  1 +
 src/cunumeric/arg.h                           |  4 +-
 src/cunumeric/arg.inl                         |  5 --
 src/cunumeric/cuda_help.h                     | 29 +++++++++
 src/cunumeric/unary/scalar_unary_red.cc       | 32 ++++++++-
 src/cunumeric/unary/scalar_unary_red.cu       | 65 ++++++++++++++++++-
 src/cunumeric/unary/scalar_unary_red.h        | 14 +---
 src/cunumeric/unary/scalar_unary_red_omp.cc   | 46 ++++++++++++-
 .../unary/scalar_unary_red_template.inl       | 14 ++--
 src/cunumeric/unary/unary_red_util.h          | 20 ++++++
 .../{test_argmin.py => test_arg_reduce.py}    | 36 +++++-----
 14 files changed, 238 insertions(+), 56 deletions(-)
 rename tests/integration/{test_argmin.py => test_arg_reduce.py} (54%)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index 5dbd49ab40..dcc85fd855 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -3542,9 +3542,9 @@ def _perform_unary_reduction(
             raise ValueError(f"Invalid 'axis' value {axis}")
 
         out_shape = ()
-        for axis in range(src.ndim):
-            if axis not in axes:
-                out_shape += (src.shape[axis],)
+        for dim in range(src.ndim):
+            if dim not in axes:
+                out_shape += (src.shape[dim],)
             elif keepdims:
                 out_shape += (1,)
 
@@ -3573,6 +3573,7 @@ def _perform_unary_reduction(
                 op,
                 src._thunk,
                 cls._get_where_thunk(where, result.shape),
+                axis,
                 axes,
                 keepdims,
                 args,
diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index a59101745e..da125c301e 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -1675,6 +1675,7 @@ def unary_reduction(
         op,
         src,
         where,
+        orig_axis,
         axes,
         keepdims,
         args,
@@ -1713,6 +1714,7 @@ def unary_reduction(
             task.add_reduction(lhs_array.base, _UNARY_RED_TO_REDUCTION_OPS[op])
             task.add_input(rhs_array.base)
             task.add_scalar_arg(op, ty.int32)
+            task.add_scalar_arg(rhs_array.shape, (ty.int64,))
 
             self.add_arguments(task, args)
 
diff --git a/cunumeric/eager.py b/cunumeric/eager.py
index f84310ce46..2f85c4592e 100644
--- a/cunumeric/eager.py
+++ b/cunumeric/eager.py
@@ -700,13 +700,16 @@ def unary_op(self, op, rhs, where, args, multiout=None):
         else:
             raise RuntimeError("unsupported unary op " + str(op))
 
-    def unary_reduction(self, op, rhs, where, axes, keepdims, args, initial):
+    def unary_reduction(
+        self, op, rhs, where, orig_axis, axes, keepdims, args, initial
+    ):
         self.check_eager_args(rhs)
         if self.deferred is not None:
             self.deferred.unary_reduction(
                 op,
                 rhs,
                 where,
+                orig_axis,
                 axes,
                 keepdims,
                 args,
@@ -722,22 +725,24 @@ def unary_reduction(self, op, rhs, where, axes, keepdims, args, initial):
             fn(
                 rhs.array,
                 out=self.array,
-                axis=axes,
+                axis=orig_axis,
                 keepdims=keepdims,
                 where=where
                 if not isinstance(where, EagerArray)
                 else where.array,
             )
         elif op == UnaryRedCode.ARGMAX:
-            assert len(axes) == 1
-            np.argmax(rhs.array, out=self.array, axis=axes[0])
+            np.argmax(
+                rhs.array,
+                out=self.array,
+                axis=orig_axis,
+            )
         elif op == UnaryRedCode.ARGMIN:
-            assert len(axes) == 1
-            np.argmin(rhs.array, out=self.array, axis=axes[0])
+            np.argmin(rhs.array, out=self.array, axis=orig_axis)
         elif op == UnaryRedCode.CONTAINS:
             self.array.fill(args[0] in rhs.array)
         elif op == UnaryRedCode.COUNT_NONZERO:
-            self.array[()] = np.count_nonzero(rhs.array, axis=axes)
+            self.array[()] = np.count_nonzero(rhs.array, axis=orig_axis)
         else:
             raise RuntimeError("unsupported unary reduction op " + str(op))
 
diff --git a/cunumeric/thunk.py b/cunumeric/thunk.py
index 153f133d92..a0d67d584b 100644
--- a/cunumeric/thunk.py
+++ b/cunumeric/thunk.py
@@ -193,6 +193,7 @@ def unary_reduction(
         redop,
         rhs,
         where,
+        orig_axis,
         axes,
         keepdims,
         args,
diff --git a/src/cunumeric/arg.h b/src/cunumeric/arg.h
index c31a839807..7705cdf8b9 100644
--- a/src/cunumeric/arg.h
+++ b/src/cunumeric/arg.h
@@ -25,8 +25,10 @@ namespace cunumeric {
 template <typename T>
 class Argval {
  public:
+  // Calling this constructor manually is unsafe, as the members are left uninitialized.
+  // This constructor exists only to make nvcc happy when we use a shared memory of Argval<T>.
   __CUDA_HD__
-  Argval();
+  Argval() {}
   __CUDA_HD__
   Argval(T value);
   __CUDA_HD__
diff --git a/src/cunumeric/arg.inl b/src/cunumeric/arg.inl
index fff5d1f93f..0f57b42621 100644
--- a/src/cunumeric/arg.inl
+++ b/src/cunumeric/arg.inl
@@ -16,11 +16,6 @@
 
 namespace cunumeric {
 
-template <typename T>
-__CUDA_HD__ Argval<T>::Argval() : arg(LLONG_MAX), arg_value(0)
-{
-}
-
 template <typename T>
 __CUDA_HD__ Argval<T>::Argval(T v) : arg(LLONG_MAX), arg_value(v)
 {
diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h
index 1c0efbb3d4..f0d410bf00 100644
--- a/src/cunumeric/cuda_help.h
+++ b/src/cunumeric/cuda_help.h
@@ -19,6 +19,7 @@
 #include "legate.h"
 #include "core/cuda/cuda_help.h"
 #include "core/cuda/stream_pool.h"
+#include "cunumeric/arg.h"
 #include <cublas_v2.h>
 #include <cusolverDn.h>
 #include <cuda_runtime.h>
@@ -237,6 +238,34 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTIO
   }
 }
 
+// Overload for argval
+// TBD: if compiler optimizes out the shuffle function we defined, we could make it the default
+// version
+template <typename T, typename REDUCTION>
+__device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result,
+                                              Argval<T> value)
+{
+  __shared__ Argval<T> trampoline[THREADS_PER_BLOCK / 32];
+  // Reduce across the warp
+  const int laneid = threadIdx.x & 0x1f;
+  const int warpid = threadIdx.x >> 5;
+  for (int i = 16; i >= 1; i /= 2) {
+    const Argval<T> shuffle_value = shuffle(0xffffffff, value, i, 32);
+    REDUCTION::template fold<true /*exclusive*/>(value, shuffle_value);
+  }
+  // Write warp values into shared memory
+  if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value;
+  __syncthreads();
+  // Output reduction
+  if (threadIdx.x == 0) {
+    for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++)
+      REDUCTION::template fold<true /*exclusive*/>(value, trampoline[i]);
+    result <<= value;
+    // Make sure the result is visible externally
+    __threadfence_system();
+  }
+}
+
 template <typename T, typename REDUCTION>
 __device__ __forceinline__ void reduce_output(Legion::DeferredReduction<REDUCTION> result, T value)
 {
diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc
index 386a06eba1..9bbaeb6cc9 100644
--- a/src/cunumeric/unary/scalar_unary_red.cc
+++ b/src/cunumeric/unary/scalar_unary_red.cc
@@ -28,12 +28,15 @@ struct ScalarUnaryRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
   using LG_OP = typename OP::OP;
   using RHS   = legate_type_of<CODE>;
 
+  template <UnaryRedCode _OP_CODE                              = OP_CODE,
+            std::enable_if_t<!is_arg_reduce<_OP_CODE>::value>* = nullptr>
   void operator()(OP func,
                   AccessorRD<LG_OP, true, 1> out,
                   AccessorRO<RHS, DIM> in,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
-                  bool dense) const
+                  bool dense,
+                  const Point<DIM>& shape) const
   {
     auto result         = LG_OP::identity;
     const size_t volume = rect.volume();
@@ -49,6 +52,33 @@ struct ScalarUnaryRedImplBody<VariantKind::CPU, OP_CODE, CODE, DIM> {
     }
     out.reduce(0, result);
   }
+
+  template <UnaryRedCode _OP_CODE                             = OP_CODE,
+            std::enable_if_t<is_arg_reduce<_OP_CODE>::value>* = nullptr>
+  void operator()(OP func,
+                  AccessorRD<LG_OP, true, 1> out,
+                  AccessorRO<RHS, DIM> in,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense,
+                  const Point<DIM>& shape) const
+  {
+    auto result         = LG_OP::identity;
+    const size_t volume = rect.volume();
+    if (dense) {
+      auto inptr = in.ptr(rect);
+      for (size_t idx = 0; idx < volume; ++idx) {
+        auto p = pitches.unflatten(idx, rect.lo);
+        OP::template fold<true>(result, OP::convert(p, shape, inptr[idx]));
+      }
+    } else {
+      for (size_t idx = 0; idx < volume; ++idx) {
+        auto p = pitches.unflatten(idx, rect.lo);
+        OP::template fold<true>(result, OP::convert(p, shape, in[p]));
+      }
+    }
+    out.reduce(0, result);
+  }
 };
 
 template <LegateTypeCode CODE, int DIM>
diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu
index 5b3500162e..6f2059847c 100644
--- a/src/cunumeric/unary/scalar_unary_red.cu
+++ b/src/cunumeric/unary/scalar_unary_red.cu
@@ -53,6 +53,37 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
   reduce_output(out, value);
 }
 
+template <typename OP,
+          typename LG_OP,
+          typename Output,
+          typename ReadAcc,
+          typename Pitches,
+          typename Point,
+          typename LHS>
+static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+  arg_reduction_kernel(size_t volume,
+                       OP,
+                       LG_OP,
+                       Output out,
+                       ReadAcc in,
+                       Pitches pitches,
+                       Point origin,
+                       size_t iters,
+                       LHS identity,
+                       Point shape)
+{
+  auto value = identity;
+  for (size_t idx = 0; idx < iters; idx++) {
+    const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+    if (offset < volume) {
+      auto point = pitches.unflatten(offset, origin);
+      LG_OP::template fold<true>(value, OP::convert(point, shape, in[point]));
+    }
+  }
+  // Every thread in the thread block must participate in the exchange to get correct results
+  reduce_output(out, value);
+}
+
 template <typename Output, typename ReadAcc, typename Pitches, typename Point, typename RHS>
 static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) contains_kernel(
   size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, RHS to_find)
@@ -82,12 +113,15 @@ struct ScalarUnaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
   using RHS   = legate_type_of<CODE>;
   using LHS   = typename OP::VAL;
 
+  template <UnaryRedCode _OP_CODE                              = OP_CODE,
+            std::enable_if_t<!is_arg_reduce<_OP_CODE>::value>* = nullptr>
   void operator()(OP func,
                   AccessorRD<LG_OP, true, 1> out,
                   AccessorRO<RHS, DIM> in,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
-                  bool dense) const
+                  bool dense,
+                  const Point<DIM>& shape) const
   {
     auto stream = get_cached_stream();
 
@@ -107,6 +141,35 @@ struct ScalarUnaryRedImplBody<VariantKind::GPU, OP_CODE, CODE, DIM> {
     copy_kernel<<<1, 1, 0, stream>>>(result, out);
     CHECK_CUDA_STREAM(stream);
   }
+
+  template <UnaryRedCode _OP_CODE                             = OP_CODE,
+            std::enable_if_t<is_arg_reduce<_OP_CODE>::value>* = nullptr>
+  void operator()(OP func,
+                  AccessorRD<LG_OP, true, 1> out,
+                  AccessorRO<RHS, DIM> in,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense,
+                  const Point<DIM>& shape) const
+  {
+    auto stream = get_cached_stream();
+
+    const size_t volume = rect.volume();
+    const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    DeferredReduction<typename OP::OP> result;
+    size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(LHS);
+
+    if (blocks >= MAX_REDUCTION_CTAS) {
+      const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
+      arg_reduction_kernel<<<MAX_REDUCTION_CTAS, THREADS_PER_BLOCK, shmem_size, stream>>>(
+        volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity, shape);
+    } else
+      arg_reduction_kernel<<<blocks, THREADS_PER_BLOCK, shmem_size, stream>>>(
+        volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity, shape);
+
+    copy_kernel<<<1, 1, 0, stream>>>(result, out);
+    CHECK_CUDA_STREAM(stream);
+  }
 };
 
 template <LegateTypeCode CODE, int DIM>
diff --git a/src/cunumeric/unary/scalar_unary_red.h b/src/cunumeric/unary/scalar_unary_red.h
index f8beab279d..e529abd7f4 100644
--- a/src/cunumeric/unary/scalar_unary_red.h
+++ b/src/cunumeric/unary/scalar_unary_red.h
@@ -25,6 +25,7 @@ struct ScalarUnaryRedArgs {
   const Array& out;
   const Array& in;
   UnaryRedCode op_code;
+  Legion::DomainPoint shape;
   std::vector<legate::Store> args;
 };
 
@@ -43,17 +44,4 @@ class ScalarUnaryRedTask : public CuNumericTask<ScalarUnaryRedTask> {
 #endif
 };
 
-namespace detail {
-template <typename _T, std::enable_if_t<!legate::is_complex<_T>::value>* = nullptr>
-__CUDA_HD__ inline bool convert_to_bool(const _T& in)
-{
-  return bool(in);
-}
-template <typename _T, std::enable_if_t<legate::is_complex<_T>::value>* = nullptr>
-__CUDA_HD__ inline bool convert_to_bool(const _T& in)
-{
-  return bool(in.real());
-}
-}  // namespace detail
-
 }  // namespace cunumeric
diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc
index 4621a2152d..e276b4da5f 100644
--- a/src/cunumeric/unary/scalar_unary_red_omp.cc
+++ b/src/cunumeric/unary/scalar_unary_red_omp.cc
@@ -32,12 +32,15 @@ struct ScalarUnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
   using RHS   = legate_type_of<CODE>;
   using LHS   = typename OP::VAL;
 
+  template <UnaryRedCode _OP_CODE                              = OP_CODE,
+            std::enable_if_t<!is_arg_reduce<_OP_CODE>::value>* = nullptr>
   void operator()(OP func,
                   AccessorRD<LG_OP, true, 1> out,
                   AccessorRO<RHS, DIM> in,
                   const Rect<DIM>& rect,
                   const Pitches<DIM - 1>& pitches,
-                  bool dense) const
+                  bool dense,
+                  const Point<DIM>& shape) const
   {
     auto result            = LG_OP::identity;
     const size_t volume    = rect.volume();
@@ -67,6 +70,47 @@ struct ScalarUnaryRedImplBody<VariantKind::OMP, OP_CODE, CODE, DIM> {
 
     for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]);
   }
+
+  template <UnaryRedCode _OP_CODE                             = OP_CODE,
+            std::enable_if_t<is_arg_reduce<_OP_CODE>::value>* = nullptr>
+  void operator()(OP func,
+                  AccessorRD<LG_OP, true, 1> out,
+                  AccessorRO<RHS, DIM> in,
+                  const Rect<DIM>& rect,
+                  const Pitches<DIM - 1>& pitches,
+                  bool dense,
+                  const Point<DIM>& shape) const
+  {
+    auto result            = LG_OP::identity;
+    const size_t volume    = rect.volume();
+    const auto max_threads = omp_get_max_threads();
+    ThreadLocalStorage<LHS> locals(max_threads);
+    for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity;
+    if (dense) {
+      auto inptr = in.ptr(rect);
+#pragma omp parallel
+      {
+        const int tid = omp_get_thread_num();
+#pragma omp for schedule(static)
+        for (size_t idx = 0; idx < volume; ++idx) {
+          auto p = pitches.unflatten(idx, rect.lo);
+          OP::template fold<true>(locals[tid], OP::convert(p, shape, inptr[idx]));
+        }
+      }
+    } else {
+#pragma omp parallel
+      {
+        const int tid = omp_get_thread_num();
+#pragma omp for schedule(static)
+        for (size_t idx = 0; idx < volume; ++idx) {
+          auto p = pitches.unflatten(idx, rect.lo);
+          OP::template fold<true>(locals[tid], OP::convert(p, shape, in[p]));
+        }
+      }
+    }
+
+    for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]);
+  }
 };
 
 template <LegateTypeCode CODE, int DIM>
diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl
index a482854f2f..d95a1b6d50 100644
--- a/src/cunumeric/unary/scalar_unary_red_template.inl
+++ b/src/cunumeric/unary/scalar_unary_red_template.inl
@@ -54,7 +54,8 @@ struct ScalarUnaryRedImpl {
     bool dense = false;
 #endif
 
-    ScalarUnaryRedImplBody<KIND, OP_CODE, CODE, DIM>()(OP{}, out, in, rect, pitches, dense);
+    ScalarUnaryRedImplBody<KIND, OP_CODE, CODE, DIM>()(
+      OP{}, out, in, rect, pitches, dense, args.shape);
   }
 
   template <LegateTypeCode CODE,
@@ -100,17 +101,12 @@ struct ScalarUnaryRedImpl<KIND, UnaryRedCode::CONTAINS> {
 
 template <VariantKind KIND>
 struct ScalarUnaryRedDispatch {
-  template <UnaryRedCode OP_CODE, std::enable_if_t<!is_arg_reduce<OP_CODE>::value>* = nullptr>
+  template <UnaryRedCode OP_CODE>
   void operator()(ScalarUnaryRedArgs& args) const
   {
     auto dim = std::max(1, args.in.dim());
     double_dispatch(dim, args.in.code(), ScalarUnaryRedImpl<KIND, OP_CODE>{}, args);
   }
-  template <UnaryRedCode OP_CODE, std::enable_if_t<is_arg_reduce<OP_CODE>::value>* = nullptr>
-  void operator()(ScalarUnaryRedArgs& args) const
-  {
-    assert(false);
-  }
 };
 
 template <VariantKind KIND>
@@ -122,8 +118,10 @@ static void scalar_unary_red_template(TaskContext& context)
   std::vector<Store> extra_args;
   for (size_t idx = 1; idx < inputs.size(); ++idx) extra_args.push_back(std::move(inputs[idx]));
 
+  auto op_code = scalars[0].value<UnaryRedCode>();
+  auto shape   = scalars[1].value<DomainPoint>();
   ScalarUnaryRedArgs args{
-    context.reductions()[0], inputs[0], scalars[0].value<UnaryRedCode>(), std::move(extra_args)};
+    context.reductions()[0], inputs[0], op_code, shape, std::move(extra_args)};
   op_dispatch(args.op_code, ScalarUnaryRedDispatch<KIND>{}, args);
 }
 
diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h
index 75a8f86da5..316445a02b 100644
--- a/src/cunumeric/unary/unary_red_util.h
+++ b/src/cunumeric/unary/unary_red_util.h
@@ -261,6 +261,16 @@ struct UnaryRedOp<UnaryRedCode::ARGMAX, TYPE_CODE> {
   {
     return VAL(point[collapsed_dim], rhs);
   }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>& point,
+                                 const Legion::Point<DIM>& shape,
+                                 const RHS& rhs)
+  {
+    int64_t idx = 0;
+    for (int32_t dim = 0; dim < DIM; ++dim) idx = idx * shape[dim] + point[dim];
+    return VAL(idx, rhs);
+  }
 };
 
 template <legate::LegateTypeCode TYPE_CODE>
@@ -284,6 +294,16 @@ struct UnaryRedOp<UnaryRedCode::ARGMIN, TYPE_CODE> {
   {
     return VAL(point[collapsed_dim], rhs);
   }
+
+  template <int32_t DIM>
+  __CUDA_HD__ static VAL convert(const Legion::Point<DIM>& point,
+                                 const Legion::Point<DIM>& shape,
+                                 const RHS& rhs)
+  {
+    int64_t idx = 0;
+    for (int32_t dim = 0; dim < DIM; ++dim) idx = idx * shape[dim] + point[dim];
+    return VAL(idx, rhs);
+  }
 };
 
 }  // namespace cunumeric
diff --git a/tests/integration/test_argmin.py b/tests/integration/test_arg_reduce.py
similarity index 54%
rename from tests/integration/test_argmin.py
rename to tests/integration/test_arg_reduce.py
index 7281a1096a..ab6f2f6daa 100644
--- a/tests/integration/test_argmin.py
+++ b/tests/integration/test_arg_reduce.py
@@ -17,22 +17,26 @@
 import pytest
 
 import cunumeric as num
-
-anp = np.random.randn(4, 5)
-
-
-def test_argmin():
-    a = num.array(anp)
-
-    assert np.array_equal(num.argmin(a, axis=0), np.argmin(anp, axis=0))
-    assert np.array_equal(num.argmin(a, axis=1), np.argmin(anp, axis=1))
-
-
-def test_argmax():
-    a = num.array(anp)
-
-    assert np.array_equal(num.argmax(a, axis=0), np.argmax(anp, axis=0))
-    assert np.array_equal(num.argmax(a, axis=1), np.argmax(anp, axis=1))
+from legate.core import LEGATE_MAX_DIM
+
+
+@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1))
+def test_argmax_and_argmin(ndim):
+    shape = (5,) * ndim
+
+    in_np = np.random.random(shape)
+    in_num = num.array(in_np)
+
+    for fn in ("argmax", "argmin"):
+        fn_np = getattr(np, fn)
+        fn_num = getattr(num, fn)
+        assert np.array_equal(fn_np(in_np), fn_num(in_num))
+        if in_num.ndim == 1:
+            continue
+        for axis in range(in_num.ndim):
+            out_np = fn_np(in_np, axis=axis)
+            out_num = fn_num(in_num, axis=axis)
+            assert np.array_equal(out_np, out_num)
 
 
 if __name__ == "__main__":

From d5a945a8e9c8973c4a0a1e8b8670ebf202d3a1e5 Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Fri, 20 May 2022 10:32:19 -0700
Subject: [PATCH 15/16] Minor fix for scalar reductions on scalar stores

---
 src/cunumeric/unary/scalar_unary_red_template.inl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/cunumeric/unary/scalar_unary_red_template.inl b/src/cunumeric/unary/scalar_unary_red_template.inl
index d95a1b6d50..0d0644a47a 100644
--- a/src/cunumeric/unary/scalar_unary_red_template.inl
+++ b/src/cunumeric/unary/scalar_unary_red_template.inl
@@ -120,6 +120,11 @@ static void scalar_unary_red_template(TaskContext& context)
 
   auto op_code = scalars[0].value<UnaryRedCode>();
   auto shape   = scalars[1].value<DomainPoint>();
+  // If the RHS was a scalar, use (1,) as the shape
+  if (shape.dim == 0) {
+    shape.dim = 1;
+    shape[0]  = 1;
+  }
   ScalarUnaryRedArgs args{
     context.reductions()[0], inputs[0], op_code, shape, std::move(extra_args)};
   op_dispatch(args.op_code, ScalarUnaryRedDispatch<KIND>{}, args);

From c5d7f85afd505eeba09c2d65e1f7b05d16f104ef Mon Sep 17 00:00:00 2001
From: Wonchan Lee <wonchanl@nvidia.com>
Date: Fri, 20 May 2022 16:11:48 -0700
Subject: [PATCH 16/16] Make sure we use the dtype of out when it exists

---
 cunumeric/array.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cunumeric/array.py b/cunumeric/array.py
index dcc85fd855..f9cdd13df3 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -3497,6 +3497,9 @@ def _perform_unary_reduction(
             # and the output dtype
             if dtype is not None:
                 res_dtype = dtype
+            elif out is not None:
+                dtype = out.dtype
+                res_dtype = out.dtype
             else:
                 dtype = src.dtype
                 res_dtype = src.dtype