diff --git a/cunumeric/_ufunc/ufunc.py b/cunumeric/_ufunc/ufunc.py index 899f9728d9..64dec6b61b 100644 --- a/cunumeric/_ufunc/ufunc.py +++ b/cunumeric/_ufunc/ufunc.py @@ -663,10 +663,6 @@ def reduce( raise NotImplementedError( f"reduction for {self} is not yet implemented" ) - if out is not None: - raise NotImplementedError( - "reduction for {self} does not take an `out` argument" - ) if not isinstance(where, bool) or not where: raise NotImplementedError( "the 'where' keyword is not yet supported" @@ -682,7 +678,7 @@ def reduce( array, axis=axis, dtype=dtype, - # out=out, + out=out, keepdims=keepdims, initial=initial, where=where, diff --git a/cunumeric/array.py b/cunumeric/array.py index 467c5d62e9..f9cdd13df3 100644 --- a/cunumeric/array.py +++ b/cunumeric/array.py @@ -585,9 +585,8 @@ def __contains__(self, item): UnaryRedCode.CONTAINS, self, axis=None, - dtype=np.dtype(np.bool_), + res_dtype=bool, args=args, - check_types=False, ) def __copy__(self): @@ -1501,10 +1500,9 @@ def all( UnaryRedCode.ALL, self, axis=axis, - dst=out, + res_dtype=bool, + out=out, keepdims=keepdims, - dtype=np.dtype(np.bool_), - check_types=False, initial=initial, where=where, ) @@ -1537,15 +1535,15 @@ def any( UnaryRedCode.ANY, self, axis=axis, - dst=out, + res_dtype=bool, + out=out, keepdims=keepdims, - dtype=np.dtype(np.bool_), - check_types=False, initial=initial, where=where, ) - def argmax(self, axis=None, out=None): + @add_boilerplate() + def argmax(self, axis=None, out=None, keepdims=False): """a.argmax(axis=None, out=None) Return indices of the maximum values along the given axis. @@ -1561,24 +1559,21 @@ def argmax(self, axis=None, out=None): Multiple GPUs, Multiple CPUs """ - if self.size == 1: - return 0 - if axis is None: - axis = self.ndim - 1 - elif type(axis) != int: - raise TypeError("'axis' argument for argmax must be an 'int'") - elif axis < 0 or axis >= self.ndim: - raise TypeError("invalid 'axis' argument for argmax " + str(axis)) + if out is not None and out.dtype != np.int64: + raise ValueError("output array must have int64 dtype") + if axis is not None and not isinstance(axis, int): + raise ValueError("axis must be an integer") return self._perform_unary_reduction( UnaryRedCode.ARGMAX, self, axis=axis, - dtype=np.dtype(np.int64), - dst=out, - check_types=False, + res_dtype=np.dtype(np.int64), + out=out, + keepdims=keepdims, ) - def argmin(self, axis=None, out=None): + @add_boilerplate() + def argmin(self, axis=None, out=None, keepdims=False): """a.argmin(axis=None, out=None) Return indices of the minimum values along the given axis. @@ -1594,21 +1589,17 @@ def argmin(self, axis=None, out=None): Multiple GPUs, Multiple CPUs """ - if self.size == 1: - return 0 - if axis is None: - axis = self.ndim - 1 - elif type(axis) != int: - raise TypeError("'axis' argument for argmin must be an 'int'") - elif axis < 0 or axis >= self.ndim: - raise TypeError("invalid 'axis' argument for argmin " + str(axis)) + if out is not None and out.dtype != np.int64: + raise ValueError("output array must have int64 dtype") + if axis is not None and not isinstance(axis, int): + raise ValueError("axis must be an integer") return self._perform_unary_reduction( UnaryRedCode.ARGMIN, self, axis=axis, - dtype=np.dtype(np.int64), - dst=out, - check_types=False, + res_dtype=np.dtype(np.int64), + out=out, + keepdims=keepdims, ) def astype( @@ -2590,7 +2581,7 @@ def max( UnaryRedCode.MAX, self, axis=axis, - dst=out, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -2686,7 +2677,7 @@ def min( UnaryRedCode.MIN, self, axis=axis, - dst=out, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -2781,7 +2772,8 @@ def prod( UnaryRedCode.PROD, self_array, axis=axis, - dst=out, + dtype=dtype, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -3054,7 +3046,8 @@ def sum( UnaryRedCode.SUM, self_array, axis=axis, - dst=out, + dtype=dtype, + out=out, keepdims=keepdims, initial=initial, where=where, @@ -3486,13 +3479,31 @@ def _perform_unary_reduction( src, axis=None, dtype=None, - dst=None, + res_dtype=None, + out=None, keepdims=False, args=None, - check_types=True, initial=None, where=True, ): + # When 'res_dtype' is not None, the input and output of the reduction + # have different types. Such reduction operators don't take a dtype of + # the accumulator + if res_dtype is not None: + assert dtype is None + dtype = src.dtype + else: + # If 'dtype' exists, that determines both the accumulation dtype + # and the output dtype + if dtype is not None: + res_dtype = dtype + elif out is not None: + dtype = out.dtype + res_dtype = out.dtype + else: + dtype = src.dtype + res_dtype = src.dtype + # TODO: Need to require initial to be given when the array is empty # or a where mask is given. if isinstance(where, ndarray): @@ -3516,121 +3527,66 @@ def _perform_unary_reduction( "(arg)max/min not supported for complex-type arrays" ) # Compute the output shape - if axis is not None: - to_reduce = set() - if type(axis) == int: - if axis < 0: - axis = len(src.shape) + axis - if axis < 0: - raise ValueError("Illegal 'axis' value") - elif axis >= src.ndim: - raise ValueError("Illegal 'axis' value") - to_reduce.add(axis) - axes = (axis,) - elif type(axis) == tuple: - for ax in axis: - if ax < 0: - ax = len(src.shape) + ax - if ax < 0: - raise ValueError("Illegal 'axis' value") - elif ax >= src.ndim: - raise ValueError("Illegal 'axis' value") - to_reduce.add(ax) - axes = axis - else: - raise TypeError( - "Illegal type passed for 'axis' argument " - + str(type(axis)) - ) - out_shape = () - for dim in range(len(src.shape)): - if dim in to_reduce: - if keepdims: - out_shape += (1,) - else: - out_shape += (src.shape[dim],) - else: - # Collapsing down to a single value in this case - out_shape = () - axes = None - # if src.size == 0: - # return nd - if dst is None: - if dtype is not None: - dst = ndarray( - shape=out_shape, - dtype=dtype, - inputs=(src, where), - ) - else: - dst = ndarray( - shape=out_shape, - dtype=src.dtype, - inputs=(src, where), - ) - else: - if dtype is not None and dtype != dst.dtype: - raise TypeError( - "Output array type does not match requested dtype" - ) - if dst.shape != out_shape: - raise TypeError( - "Output array shape " - + str(dst.shape) - + " does not match expected shape " - + str(out_shape) - ) - # Quick exit - if where is False: - return dst - if check_types and src.dtype != dst.dtype: - out_dtype = cls.find_common_type(src, dst) - if src.dtype != out_dtype: - temp = ndarray( - src.shape, - dtype=out_dtype, - inputs=(src, where), - ) - temp._thunk.convert(src._thunk) - src = temp - if dst.dtype != out_dtype: - temp = ndarray( - dst.shape, - dtype=out_dtype, - inputs=(src, where), - ) + axes = axis + if axes is None: + axes = tuple(range(src.ndim)) + elif not isinstance(axes, tuple): + axes = (axes,) - temp._thunk.unary_reduction( - op, - src._thunk, - cls._get_where_thunk(where, dst.shape), - axes, - keepdims, - args, - initial, - ) - dst._thunk.convert(temp._thunk) - else: - dst._thunk.unary_reduction( - op, - src._thunk, - cls._get_where_thunk(where, dst.shape), - axes, - keepdims, - args, - initial, - ) + if any(type(ax) != int for ax in axes): + raise TypeError( + "'axis' must be an integer or a tuple of integers, " + f"but got {axis}" + ) + + axes = tuple(ax + src.ndim if ax < 0 else ax for ax in axes) + + if any(ax < 0 for ax in axes): + raise ValueError(f"Invalid 'axis' value {axis}") + + out_shape = () + for dim in range(src.ndim): + if dim not in axes: + out_shape += (src.shape[dim],) + elif keepdims: + out_shape += (1,) + + if out is None: + out = ndarray( + shape=out_shape, dtype=res_dtype, inputs=(src, where) + ) + elif out.shape != out_shape: + raise ValueError( + f"the output shape mismatch: expected {out_shape} but got " + f"{out.shape}" + ) + + if dtype != src.dtype: + src = src.astype(dtype) + + if out.dtype == res_dtype: + result = out else: - dst._thunk.unary_reduction( + result = ndarray( + shape=out_shape, dtype=res_dtype, inputs=(src, where) + ) + + if where: + result._thunk.unary_reduction( op, src._thunk, - cls._get_where_thunk(where, dst.shape), + cls._get_where_thunk(where, result.shape), + axis, axes, keepdims, args, initial, ) - return dst + + if result is not out: + out._thunk.convert(result._thunk) + + return out @classmethod def _perform_binary_reduction( diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py index 5c2c4904a5..da125c301e 100644 --- a/cunumeric/deferred.py +++ b/cunumeric/deferred.py @@ -1675,6 +1675,7 @@ def unary_reduction( op, src, where, + orig_axis, axes, keepdims, args, @@ -1686,6 +1687,14 @@ def unary_reduction( argred = op in (UnaryRedCode.ARGMAX, UnaryRedCode.ARGMIN) + if argred: + argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype) + lhs_array = self.runtime.create_empty_thunk( + lhs_array.shape, + dtype=argred_dtype, + inputs=[self], + ) + # See if we are doing reduction to a point or another region if lhs_array.size == 1: assert axes is None or len(axes) == ( @@ -1705,20 +1714,13 @@ def unary_reduction( task.add_reduction(lhs_array.base, _UNARY_RED_TO_REDUCTION_OPS[op]) task.add_input(rhs_array.base) task.add_scalar_arg(op, ty.int32) + task.add_scalar_arg(rhs_array.shape, (ty.int64,)) self.add_arguments(task, args) task.execute() else: - if argred: - argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype) - lhs_array = self.runtime.create_empty_thunk( - lhs_array.shape, - dtype=argred_dtype, - inputs=[self], - ) - # Before we perform region reduction, make sure to have the lhs # initialized. If an initial value is given, we use it, otherwise # we use the identity of the reduction operator @@ -1758,13 +1760,13 @@ def unary_reduction( task.execute() - if argred: - self.unary_op( - UnaryOpCode.GETARG, - lhs_array, - True, - [], - ) + if argred: + self.unary_op( + UnaryOpCode.GETARG, + lhs_array, + True, + [], + ) def isclose(self, rhs1, rhs2, rtol, atol, equal_nan): assert not equal_nan diff --git a/cunumeric/eager.py b/cunumeric/eager.py index f84310ce46..2f85c4592e 100644 --- a/cunumeric/eager.py +++ b/cunumeric/eager.py @@ -700,13 +700,16 @@ def unary_op(self, op, rhs, where, args, multiout=None): else: raise RuntimeError("unsupported unary op " + str(op)) - def unary_reduction(self, op, rhs, where, axes, keepdims, args, initial): + def unary_reduction( + self, op, rhs, where, orig_axis, axes, keepdims, args, initial + ): self.check_eager_args(rhs) if self.deferred is not None: self.deferred.unary_reduction( op, rhs, where, + orig_axis, axes, keepdims, args, @@ -722,22 +725,24 @@ def unary_reduction(self, op, rhs, where, axes, keepdims, args, initial): fn( rhs.array, out=self.array, - axis=axes, + axis=orig_axis, keepdims=keepdims, where=where if not isinstance(where, EagerArray) else where.array, ) elif op == UnaryRedCode.ARGMAX: - assert len(axes) == 1 - np.argmax(rhs.array, out=self.array, axis=axes[0]) + np.argmax( + rhs.array, + out=self.array, + axis=orig_axis, + ) elif op == UnaryRedCode.ARGMIN: - assert len(axes) == 1 - np.argmin(rhs.array, out=self.array, axis=axes[0]) + np.argmin(rhs.array, out=self.array, axis=orig_axis) elif op == UnaryRedCode.CONTAINS: self.array.fill(args[0] in rhs.array) elif op == UnaryRedCode.COUNT_NONZERO: - self.array[()] = np.count_nonzero(rhs.array, axis=axes) + self.array[()] = np.count_nonzero(rhs.array, axis=orig_axis) else: raise RuntimeError("unsupported unary reduction op " + str(op)) diff --git a/cunumeric/module.py b/cunumeric/module.py index 44f2037802..710f01f68b 100644 --- a/cunumeric/module.py +++ b/cunumeric/module.py @@ -4411,7 +4411,7 @@ def partition(a, kth, axis=-1, kind="introselect", order=None): @add_boilerplate("a") -def argmax(a, axis=None, out=None): +def argmax(a, axis=None, out=None, *, keepdims=False): """ Returns the indices of the maximum values along an axis. @@ -4426,6 +4426,10 @@ def argmax(a, axis=None, out=None): out : ndarray, optional If provided, the result will be inserted into this array. It should be of the appropriate shape and dtype. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the array. Returns ------- @@ -4441,14 +4445,11 @@ def argmax(a, axis=None, out=None): -------- Multiple GPUs, Multiple CPUs """ - if out is not None: - if out.dtype != np.int64: - raise ValueError("output array must have int64 dtype") - return a.argmax(axis=axis, out=out) + return a.argmax(axis=axis, out=out, keepdims=keepdims) @add_boilerplate("a") -def argmin(a, axis=None, out=None): +def argmin(a, axis=None, out=None, *, keepdims=False): """ Returns the indices of the minimum values along an axis. @@ -4463,6 +4464,10 @@ def argmin(a, axis=None, out=None): out : ndarray, optional If provided, the result will be inserted into this array. It should be of the appropriate shape and dtype. + keepdims : bool, optional + If this is set to True, the axes which are reduced are left + in the result as dimensions with size one. With this option, + the result will broadcast correctly against the array. Returns ------- @@ -4478,10 +4483,7 @@ def argmin(a, axis=None, out=None): -------- Multiple GPUs, Multiple CPUs """ - if out is not None: - if out is not None and out.dtype != np.int64: - raise ValueError("output array must have int64 dtype") - return a.argmin(axis=axis, out=out) + return a.argmin(axis=axis, out=out, keepdims=keepdims) # Counting @@ -4522,9 +4524,8 @@ def count_nonzero(a, axis=None): return ndarray._perform_unary_reduction( UnaryRedCode.COUNT_NONZERO, a, + res_dtype=np.dtype(np.uint64), axis=axis, - dtype=np.dtype(np.uint64), - check_types=False, ) diff --git a/cunumeric/thunk.py b/cunumeric/thunk.py index 153f133d92..a0d67d584b 100644 --- a/cunumeric/thunk.py +++ b/cunumeric/thunk.py @@ -193,6 +193,7 @@ def unary_reduction( redop, rhs, where, + orig_axis, axes, keepdims, args, diff --git a/src/cunumeric/arg.h b/src/cunumeric/arg.h index c31a839807..7705cdf8b9 100644 --- a/src/cunumeric/arg.h +++ b/src/cunumeric/arg.h @@ -25,8 +25,10 @@ namespace cunumeric { template class Argval { public: + // Calling this constructor manually is unsafe, as the members are left uninitialized. + // This constructor exists only to make nvcc happy when we use a shared memory of Argval. __CUDA_HD__ - Argval(); + Argval() {} __CUDA_HD__ Argval(T value); __CUDA_HD__ diff --git a/src/cunumeric/arg.inl b/src/cunumeric/arg.inl index fff5d1f93f..0f57b42621 100644 --- a/src/cunumeric/arg.inl +++ b/src/cunumeric/arg.inl @@ -16,11 +16,6 @@ namespace cunumeric { -template -__CUDA_HD__ Argval::Argval() : arg(LLONG_MAX), arg_value(0) -{ -} - template __CUDA_HD__ Argval::Argval(T v) : arg(LLONG_MAX), arg_value(v) { diff --git a/src/cunumeric/cuda_help.h b/src/cunumeric/cuda_help.h index 1c0efbb3d4..f0d410bf00 100644 --- a/src/cunumeric/cuda_help.h +++ b/src/cunumeric/cuda_help.h @@ -19,6 +19,7 @@ #include "legate.h" #include "core/cuda/cuda_help.h" #include "core/cuda/stream_pool.h" +#include "cunumeric/arg.h" #include #include #include @@ -237,6 +238,34 @@ __device__ __forceinline__ void reduce_output(Legion::DeferredReduction +__device__ __forceinline__ void reduce_output(Legion::DeferredReduction result, + Argval value) +{ + __shared__ Argval trampoline[THREADS_PER_BLOCK / 32]; + // Reduce across the warp + const int laneid = threadIdx.x & 0x1f; + const int warpid = threadIdx.x >> 5; + for (int i = 16; i >= 1; i /= 2) { + const Argval shuffle_value = shuffle(0xffffffff, value, i, 32); + REDUCTION::template fold(value, shuffle_value); + } + // Write warp values into shared memory + if ((laneid == 0) && (warpid > 0)) trampoline[warpid] = value; + __syncthreads(); + // Output reduction + if (threadIdx.x == 0) { + for (int i = 1; i < (THREADS_PER_BLOCK / 32); i++) + REDUCTION::template fold(value, trampoline[i]); + result <<= value; + // Make sure the result is visible externally + __threadfence_system(); + } +} + template __device__ __forceinline__ void reduce_output(Legion::DeferredReduction result, T value) { diff --git a/src/cunumeric/unary/scalar_unary_red.cc b/src/cunumeric/unary/scalar_unary_red.cc index 2e01700e80..9bbaeb6cc9 100644 --- a/src/cunumeric/unary/scalar_unary_red.cc +++ b/src/cunumeric/unary/scalar_unary_red.cc @@ -26,91 +26,57 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; + template ::value>* = nullptr> void operator()(OP func, AccessorRD out, - AccessorRO in, + AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto result = LG_OP::identity; const size_t volume = rect.volume(); if (dense) { auto inptr = in.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) OP::template fold(result, inptr[idx]); + for (size_t idx = 0; idx < volume; ++idx) + OP::template fold(result, OP::convert(inptr[idx])); } else { for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - OP::template fold(result, in[p]); + OP::template fold(result, OP::convert(in[p])); } } out.reduce(0, result); } -}; -namespace detail { - -template -void logical_operator(bool& result, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) -{ - const size_t volume = rect.volume(); - if (dense) { - auto inptr = in.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) { - bool tmp1 = detail::convert_to_bool(inptr[idx]); - OP::template fold(result, tmp1); - } - } else { - for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches.unflatten(idx, rect.lo); - bool tmp1 = detail::convert_to_bool(in[p]); - OP::template fold(result, tmp1); - } - } -} - -} // namespace detail - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - auto result = LG_OP::identity; - detail::logical_operator(result, in, rect, pitches, dense); - out.reduce(0, result); - } -}; - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, + template ::value>* = nullptr> + void operator()(OP func, + AccessorRD out, + AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const - + bool dense, + const Point& shape) const { - auto result = LG_OP::identity; - detail::logical_operator(result, in, rect, pitches, dense); + auto result = LG_OP::identity; + const size_t volume = rect.volume(); + if (dense) { + auto inptr = in.ptr(rect); + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + OP::template fold(result, OP::convert(p, shape, inptr[idx])); + } + } else { + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + OP::template fold(result, OP::convert(p, shape, in[p])); + } + } out.reduce(0, result); } }; @@ -119,17 +85,17 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD out, - AccessorRO in, + AccessorRO in, const Store& to_find_scalar, const Rect& rect, const Pitches& pitches, bool dense) const { auto result = LG_OP::identity; - const auto to_find = to_find_scalar.scalar(); + const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); if (dense) { auto inptr = in.ptr(rect); @@ -151,33 +117,6 @@ struct ScalarUnaryRedImplBody -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - { - auto result = LG_OP::identity; - const size_t volume = rect.volume(); - if (dense) { - auto inptr = in.ptr(rect); - for (size_t idx = 0; idx < volume; ++idx) result += inptr[idx] != VAL(0); - } else { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - result += in[point] != VAL(0); - } - } - out.reduce(0, result); - } -}; - /*static*/ void ScalarUnaryRedTask::cpu_variant(TaskContext& context) { scalar_unary_red_template(context); diff --git a/src/cunumeric/unary/scalar_unary_red.cu b/src/cunumeric/unary/scalar_unary_red.cu index 796220ee5b..6f2059847c 100644 --- a/src/cunumeric/unary/scalar_unary_red.cu +++ b/src/cunumeric/unary/scalar_unary_red.cu @@ -23,60 +23,77 @@ namespace cunumeric { using namespace Legion; -template + typename LHS> static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduction_kernel(size_t volume, - Op op, + OP, + LG_OP, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, - VAL identity) + LHS identity) { auto value = identity; for (size_t idx = 0; idx < iters; idx++) { const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (offset < volume) { auto point = pitches.unflatten(offset, origin); - Op::template fold(value, in[point]); + LG_OP::template fold(value, OP::convert(in[point])); } } // Every thread in the thread block must participate in the exchange to get correct results reduce_output(out, value); } -template -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) contains_kernel( - size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, VAL to_find) +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) + arg_reduction_kernel(size_t volume, + OP, + LG_OP, + Output out, + ReadAcc in, + Pitches pitches, + Point origin, + size_t iters, + LHS identity, + Point shape) { - bool value = false; + auto value = identity; for (size_t idx = 0; idx < iters; idx++) { const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (offset < volume) { auto point = pitches.unflatten(offset, origin); - SumReduction::fold(value, in[point] == to_find); + LG_OP::template fold(value, OP::convert(point, shape, in[point])); } } // Every thread in the thread block must participate in the exchange to get correct results reduce_output(out, value); } -template -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) count_nonzero_kernel( - size_t volume, Output out, AccessorRO in, Pitches pitches, Point origin, size_t iters) +template +static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) contains_kernel( + size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, RHS to_find) { - uint64_t value = 0; + bool value = false; for (size_t idx = 0; idx < iters; idx++) { const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; if (offset < volume) { auto point = pitches.unflatten(offset, origin); - SumReduction::fold(value, in[point] != VAL(0)); + SumReduction::fold(value, in[point] == to_find); } } // Every thread in the thread block must participate in the exchange to get correct results @@ -93,180 +110,96 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; + using LHS = typename OP::VAL; + template ::value>* = nullptr> void operator()(OP func, AccessorRD out, - AccessorRO in, + AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto stream = get_cached_stream(); const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; DeferredReduction result; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(VAL); + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(LHS); if (blocks >= MAX_REDUCTION_CTAS) { const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; reduction_kernel<<>>( - volume, typename OP::OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity); + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity); } else reduction_kernel<<>>( - volume, typename OP::OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity); + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity); copy_kernel<<<1, 1, 0, stream>>>(result, out); CHECK_CUDA_STREAM(stream); } -}; -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Store& to_find_scalar, + template ::value>* = nullptr> + void operator()(OP func, + AccessorRD out, + AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto stream = get_cached_stream(); - const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - DeferredReduction> result; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool); + DeferredReduction result; + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(LHS); if (blocks >= MAX_REDUCTION_CTAS) { const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; - contains_kernel<<>>( - volume, result, in, pitches, rect.lo, iters, to_find); + arg_reduction_kernel<<>>( + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, iters, LG_OP::identity, shape); } else - contains_kernel<<>>( - volume, result, in, pitches, rect.lo, 1, to_find); + arg_reduction_kernel<<>>( + volume, OP{}, LG_OP{}, result, in, pitches, rect.lo, 1, LG_OP::identity, shape); copy_kernel<<<1, 1, 0, stream>>>(result, out); CHECK_CUDA_STREAM(stream); } }; -namespace detail { - -template -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) logical_kernel( - size_t volume, Output out, ReadAcc in, Pitches pitches, Point origin, size_t iters, VAL identity) -{ - auto value = identity; - for (size_t idx = 0; idx < iters; idx++) { - const size_t offset = (idx * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x; - if (offset < volume) { - auto point = pitches.unflatten(offset, origin); - Op::template fold(value, convert_to_bool(in[point])); - } - } - // Every thread in the thread block must participate in the exchange to get correct results - reduce_output(out, value); -} - -template -void logical_operator_gpu(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) -{ - auto stream = get_cached_stream(); - - const size_t volume = rect.volume(); - const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - DeferredReduction result; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool); - - if (blocks >= MAX_REDUCTION_CTAS) { - const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; - logical_kernel<<>>( - volume, result, in, pitches, rect.lo, iters, LG_OP::identity); - } else - logical_kernel<<>>( - volume, result, in, pitches, rect.lo, 1, LG_OP::identity); - - copy_kernel<<<1, 1, 0, stream>>>(result, out); - CHECK_CUDA_STREAM(stream); -} - -} // namespace detail - template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_gpu>(out, in, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImplBody { +struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_gpu>(out, in, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, + AccessorRO in, + const Store& to_find_scalar, const Rect& rect, const Pitches& pitches, bool dense) const { auto stream = get_cached_stream(); + const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; - DeferredReduction> result; - size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t); + DeferredReduction> result; + size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool); if (blocks >= MAX_REDUCTION_CTAS) { const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS; - count_nonzero_kernel<<>>( - volume, result, in, pitches, rect.lo, iters); + contains_kernel<<>>( + volume, result, in, pitches, rect.lo, iters, to_find); } else - count_nonzero_kernel<<>>( - volume, result, in, pitches, rect.lo, 1); + contains_kernel<<>>( + volume, result, in, pitches, rect.lo, 1, to_find); copy_kernel<<<1, 1, 0, stream>>>(result, out); CHECK_CUDA_STREAM(stream); diff --git a/src/cunumeric/unary/scalar_unary_red.h b/src/cunumeric/unary/scalar_unary_red.h index f8beab279d..e529abd7f4 100644 --- a/src/cunumeric/unary/scalar_unary_red.h +++ b/src/cunumeric/unary/scalar_unary_red.h @@ -25,6 +25,7 @@ struct ScalarUnaryRedArgs { const Array& out; const Array& in; UnaryRedCode op_code; + Legion::DomainPoint shape; std::vector args; }; @@ -43,17 +44,4 @@ class ScalarUnaryRedTask : public CuNumericTask { #endif }; -namespace detail { -template ::value>* = nullptr> -__CUDA_HD__ inline bool convert_to_bool(const _T& in) -{ - return bool(in); -} -template ::value>* = nullptr> -__CUDA_HD__ inline bool convert_to_bool(const _T& in) -{ - return bool(in.real()); -} -} // namespace detail - } // namespace cunumeric diff --git a/src/cunumeric/unary/scalar_unary_red_omp.cc b/src/cunumeric/unary/scalar_unary_red_omp.cc index 6d190cdf51..e276b4da5f 100644 --- a/src/cunumeric/unary/scalar_unary_red_omp.cc +++ b/src/cunumeric/unary/scalar_unary_red_omp.cc @@ -29,19 +29,23 @@ template struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; + using LHS = typename OP::VAL; + template ::value>* = nullptr> void operator()(OP func, AccessorRD out, - AccessorRO in, + AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto result = LG_OP::identity; const size_t volume = rect.volume(); const auto max_threads = omp_get_max_threads(); - ThreadLocalStorage locals(max_threads); + ThreadLocalStorage locals(max_threads); for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity; if (dense) { auto inptr = in.ptr(rect); @@ -49,7 +53,8 @@ struct ScalarUnaryRedImplBody { { const int tid = omp_get_thread_num(); #pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) OP::template fold(locals[tid], inptr[idx]); + for (size_t idx = 0; idx < volume; ++idx) + OP::template fold(locals[tid], OP::convert(inptr[idx])); } } else { #pragma omp parallel @@ -58,42 +63,39 @@ struct ScalarUnaryRedImplBody { #pragma omp for schedule(static) for (size_t idx = 0; idx < volume; ++idx) { auto p = pitches.unflatten(idx, rect.lo); - OP::template fold(locals[tid], in[p]); + OP::template fold(locals[tid], OP::convert(in[p])); } } } for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]); } -}; -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Store& to_find_scalar, + template ::value>* = nullptr> + void operator()(OP func, + AccessorRD out, + AccessorRO in, const Rect& rect, const Pitches& pitches, - bool dense) const + bool dense, + const Point& shape) const { auto result = LG_OP::identity; - const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); const auto max_threads = omp_get_max_threads(); - ThreadLocalStorage locals(max_threads); - for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = false; + ThreadLocalStorage locals(max_threads); + for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity; if (dense) { auto inptr = in.ptr(rect); #pragma omp parallel { const int tid = omp_get_thread_num(); #pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) - if (inptr[idx] == to_find) locals[tid] = true; + for (size_t idx = 0; idx < volume; ++idx) { + auto p = pitches.unflatten(idx, rect.lo); + OP::template fold(locals[tid], OP::convert(p, shape, inptr[idx])); + } } } else { #pragma omp parallel @@ -101,8 +103,8 @@ struct ScalarUnaryRedImplBody(locals[tid], OP::convert(p, shape, in[p])); } } } @@ -111,103 +113,33 @@ struct ScalarUnaryRedImplBody -void logical_operator_omp(AccessorRO in, - AccessorRD out, - const Rect& rect, - const Pitches& pitches, - bool dense) -{ - const size_t volume = rect.volume(); - const auto max_threads = omp_get_max_threads(); - ThreadLocalStorage locals(max_threads); - for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = LG_OP::identity; - if (dense) { - auto inptr = in.ptr(rect); -#pragma omp parallel - { - const int tid = omp_get_thread_num(); -#pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) - OP::template fold(locals[tid], convert_to_bool(inptr[idx])); - } - } else { -#pragma omp parallel - { - const int tid = omp_get_thread_num(); -#pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) { - auto p = pitches.unflatten(idx, rect.lo); - OP::template fold(locals[tid], convert_to_bool(in[p])); - } - } - } - - for (auto idx = 0; idx < max_threads; ++idx) out.reduce(0, locals[idx]); -} - -} // namespace detail - template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_omp(in, out, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImplBody { +struct ScalarUnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD out, - AccessorRO in, - const Rect& rect, - const Pitches& pitches, - bool dense) const - - { - detail::logical_operator_omp(in, out, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD out, - AccessorRO in, + AccessorRO in, + const Store& to_find_scalar, const Rect& rect, const Pitches& pitches, bool dense) const { auto result = LG_OP::identity; + const auto to_find = to_find_scalar.scalar(); const size_t volume = rect.volume(); const auto max_threads = omp_get_max_threads(); - ThreadLocalStorage locals(max_threads); - for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = 0; + ThreadLocalStorage locals(max_threads); + for (auto idx = 0; idx < max_threads; ++idx) locals[idx] = false; if (dense) { auto inptr = in.ptr(rect); #pragma omp parallel { const int tid = omp_get_thread_num(); #pragma omp for schedule(static) - for (size_t idx = 0; idx < volume; ++idx) locals[tid] += inptr[idx] != VAL(0); + for (size_t idx = 0; idx < volume; ++idx) + if (inptr[idx] == to_find) locals[tid] = true; } } else { #pragma omp parallel @@ -216,7 +148,7 @@ struct ScalarUnaryRedImplBody; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; auto rect = args.in.shape(); @@ -44,7 +44,7 @@ struct ScalarUnaryRedImpl { if (0 == volume) return; auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); + auto in = args.in.read_accessor(rect); #ifndef LEGION_BOUNDS_CHECKS // Check to see if this is dense or not @@ -54,7 +54,8 @@ struct ScalarUnaryRedImpl { bool dense = false; #endif - ScalarUnaryRedImplBody()(OP{}, out, in, rect, pitches, dense); + ScalarUnaryRedImplBody()( + OP{}, out, in, rect, pitches, dense, args.shape); } template -struct ScalarUnaryRedImpl { - template - void operator()(ScalarUnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - auto rect = args.in.shape(); - - Pitches pitches; - size_t volume = pitches.flatten(rect); - - if (0 == volume) return; - - auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - // Check to see if this is dense or not - bool dense = in.accessor.is_dense_row_major(rect); -#else - // No dense execution if we're doing bounds checks - bool dense = false; -#endif - - ScalarUnaryRedImplBody()(out, in, rect, pitches, dense); - } -}; - -template -struct ScalarUnaryRedImpl { - template - void operator()(ScalarUnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - auto rect = args.in.shape(); - - Pitches pitches; - size_t volume = pitches.flatten(rect); - - if (0 == volume) return; - - auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - // Check to see if this is dense or not - bool dense = in.accessor.is_dense_row_major(rect); -#else - // No dense execution if we're doing bounds checks - bool dense = false; -#endif - - ScalarUnaryRedImplBody()(out, in, rect, pitches, dense); - } -}; - template struct ScalarUnaryRedImpl { template @@ -135,7 +74,7 @@ struct ScalarUnaryRedImpl { { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; auto rect = args.in.shape(); @@ -145,7 +84,7 @@ struct ScalarUnaryRedImpl { if (0 == volume) return; auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); + auto in = args.in.read_accessor(rect); #ifndef LEGION_BOUNDS_CHECKS // Check to see if this is dense or not @@ -160,51 +99,14 @@ struct ScalarUnaryRedImpl { } }; -template -struct ScalarUnaryRedImpl { - template - void operator()(ScalarUnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - auto rect = args.in.shape(); - - Pitches pitches; - size_t volume = pitches.flatten(rect); - - if (0 == volume) return; - - auto out = args.out.reduce_accessor(); - auto in = args.in.read_accessor(rect); - -#ifndef LEGION_BOUNDS_CHECKS - // Check to see if this is dense or not - bool dense = in.accessor.is_dense_row_major(rect); -#else - // No dense execution if we're doing bounds checks - bool dense = false; -#endif - - ScalarUnaryRedImplBody()( - out, in, rect, pitches, dense); - } -}; - template struct ScalarUnaryRedDispatch { - template ::value>* = nullptr> + template void operator()(ScalarUnaryRedArgs& args) const { auto dim = std::max(1, args.in.dim()); double_dispatch(dim, args.in.code(), ScalarUnaryRedImpl{}, args); } - template ::value>* = nullptr> - void operator()(ScalarUnaryRedArgs& args) const - { - assert(false); - } }; template @@ -216,14 +118,16 @@ static void scalar_unary_red_template(TaskContext& context) std::vector extra_args; for (size_t idx = 1; idx < inputs.size(); ++idx) extra_args.push_back(std::move(inputs[idx])); + auto op_code = scalars[0].value(); + auto shape = scalars[1].value(); + // If the RHS was a scalar, use (1,) as the shape + if (shape.dim == 0) { + shape.dim = 1; + shape[0] = 1; + } ScalarUnaryRedArgs args{ - context.reductions()[0], inputs[0], scalars[0].value(), std::move(extra_args)}; - if (args.op_code == UnaryRedCode::COUNT_NONZERO) { - auto dim = std::max(1, args.in.dim()); - double_dispatch( - dim, args.in.code(), ScalarUnaryRedImpl{}, args); - } else - op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); + context.reductions()[0], inputs[0], op_code, shape, std::move(extra_args)}; + op_dispatch(args.op_code, ScalarUnaryRedDispatch{}, args); } } // namespace cunumeric diff --git a/src/cunumeric/unary/unary_red.cc b/src/cunumeric/unary/unary_red.cc index aef90964aa..8b9b93942c 100644 --- a/src/cunumeric/unary/unary_red.cc +++ b/src/cunumeric/unary/unary_red.cc @@ -24,43 +24,9 @@ using namespace legate; template struct UnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - lhs.reduce(point, rhs[point]); - } - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - OP::template fold(lhs[point], rhs[point]); - } - } -}; - -template -struct ArgRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using RHS = legate_type_of; - using LHS = Argval; void operator()(AccessorRD lhs, AccessorRO rhs, @@ -71,20 +37,7 @@ struct ArgRedImplBody { { for (size_t idx = 0; idx < volume; ++idx) { auto point = pitches.unflatten(idx, rect.lo); - lhs.reduce(point, LHS(point[collapsed_dim], rhs[point])); - } - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - for (size_t idx = 0; idx < volume; ++idx) { - auto point = pitches.unflatten(idx, rect.lo); - OP::template fold(lhs[point], LHS(point[collapsed_dim], rhs[point])); + lhs.reduce(point, OP::convert(point, collapsed_dim, rhs[point])); } } }; diff --git a/src/cunumeric/unary/unary_red.cu b/src/cunumeric/unary/unary_red.cu index 86003c00e5..1cc0d46536 100644 --- a/src/cunumeric/unary/unary_red.cu +++ b/src/cunumeric/unary/unary_red.cu @@ -38,7 +38,7 @@ static constexpr coord_t WARP_SIZE = 32; // have enough elements to be assigned to the threads, we also parallelize on // the collapsing domain. One exceptional case to this strategy is where the collapsing // dimension is the innermost one, in which case we prefer that dimension to the others -// in order to enjoy wrap coalescing. The maximum degree of such parallelism woudl be 32, +// in order to enjoy wrap coalescing. The maximum degree of such parallelism would be 32, // which is the size of a wrap. template struct ThreadBlock { @@ -203,9 +203,8 @@ std::ostream& operator<<(std::ostream& os, const ThreadBlocks& blocks) return os; } -template -static __device__ __forceinline__ Point local_reduce(CTOR ctor, - LHS& result, +template +static __device__ __forceinline__ Point local_reduce(LHS& result, AccessorRO in, LHS identity, const ThreadBlocks& blocks, @@ -218,7 +217,7 @@ static __device__ __forceinline__ Point local_reduce(CTOR ctor, if (!domain.contains(point)) return point; while (point[collapsed_dim] <= domain.hi[collapsed_dim]) { - LHS value = ctor(point, in[point], collapsed_dim); + LHS value = OP::convert(point, collapsed_dim, in[point]); REDOP::template fold(result, value); blocks.next_point(point); } @@ -274,22 +273,7 @@ static __device__ __forceinline__ Point local_reduce(CTOR ctor, return point; } -template -static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) - reduce_with_rw_acc(AccessorRW out, - AccessorRO in, - LHS identity, - ThreadBlocks blocks, - Rect domain, - int32_t collapsed_dim) -{ - auto result = identity; - auto point = local_reduce( - CTOR{}, result, in, identity, blocks, domain, collapsed_dim); - if (result != identity) REDOP::template fold(out[point], result); -} - -template +template static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) reduce_with_rd_acc(AccessorRD out, AccessorRO in, @@ -299,64 +283,17 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM) int32_t collapsed_dim) { auto result = identity; - auto point = local_reduce( - CTOR{}, result, in, identity, blocks, domain, collapsed_dim); + auto point = + local_reduce(result, in, identity, blocks, domain, collapsed_dim); if (result != identity) out.reduce(point, result); } template struct UnaryRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - using CTOR = ValueConstructor; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - auto Kernel = reduce_with_rd_acc; - auto stream = get_cached_stream(); - - ThreadBlocks blocks; - blocks.initialize(rect, collapsed_dim); - - blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); - CHECK_CUDA_STREAM(stream); - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - auto Kernel = reduce_with_rw_acc; - auto stream = get_cached_stream(); - - ThreadBlocks blocks; - blocks.initialize(rect, collapsed_dim); - - blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); - CHECK_CUDA_STREAM(stream); - } -}; - -template -struct ArgRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; using RHS = legate_type_of; - using LHS = Argval; - using CTOR = ArgvalConstructor; + using LHS = typename OP::VAL; void operator()(AccessorRD lhs, AccessorRO rhs, @@ -365,26 +302,7 @@ struct ArgRedImplBody { int collapsed_dim, size_t volume) const { - auto Kernel = reduce_with_rd_acc; - auto stream = get_cached_stream(); - - ThreadBlocks blocks; - blocks.initialize(rect, collapsed_dim); - - blocks.compute_maximum_concurrency(reinterpret_cast(Kernel)); - Kernel<<>>( - lhs, rhs, LG_OP::identity, blocks, rect, collapsed_dim); - CHECK_CUDA_STREAM(stream); - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - auto Kernel = reduce_with_rw_acc; + auto Kernel = reduce_with_rd_acc; auto stream = get_cached_stream(); ThreadBlocks blocks; diff --git a/src/cunumeric/unary/unary_red_omp.cc b/src/cunumeric/unary/unary_red_omp.cc index 1cdb40b44c..6696c0f7b0 100644 --- a/src/cunumeric/unary/unary_red_omp.cc +++ b/src/cunumeric/unary/unary_red_omp.cc @@ -79,10 +79,10 @@ template struct UnaryRedImplBody { using OP = UnaryRedOp; using LG_OP = typename OP::OP; - using VAL = legate_type_of; + using RHS = legate_type_of; void operator()(AccessorRD lhs, - AccessorRO rhs, + AccessorRO rhs, const Rect& rect, const Pitches& pitches, int collapsed_dim, @@ -95,69 +95,7 @@ struct UnaryRedImplBody { for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { auto point = splitter.combine(o_idx, i_idx, rect.lo); - lhs.reduce(point, rhs[point]); - } - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - Splitter splitter; - auto split = splitter.split(rect, collapsed_dim); - -#pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) - for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - OP::template fold(lhs[point], rhs[point]); - } - } -}; - -template -struct ArgRedImplBody { - using OP = UnaryRedOp; - using LG_OP = typename OP::OP; - using VAL = legate_type_of; - using ARGVAL = Argval; - - void operator()(AccessorRD lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - Splitter splitter; - auto split = splitter.split(rect, collapsed_dim); - -#pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) - for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - lhs.reduce(point, ARGVAL(point[collapsed_dim], rhs[point])); - } - } - - void operator()(AccessorRW lhs, - AccessorRO rhs, - const Rect& rect, - const Pitches& pitches, - int collapsed_dim, - size_t volume) const - { - Splitter splitter; - auto split = splitter.split(rect, collapsed_dim); - -#pragma omp parallel for schedule(static) - for (size_t o_idx = 0; o_idx < split.outer; ++o_idx) - for (size_t i_idx = 0; i_idx < split.inner; ++i_idx) { - auto point = splitter.combine(o_idx, i_idx, rect.lo); - OP::template fold(lhs[point], ARGVAL(point[collapsed_dim], rhs[point])); + lhs.reduce(point, OP::convert(point, collapsed_dim, rhs[point])); } } }; diff --git a/src/cunumeric/unary/unary_red_template.inl b/src/cunumeric/unary/unary_red_template.inl index 309dec413d..7283949642 100644 --- a/src/cunumeric/unary/unary_red_template.inl +++ b/src/cunumeric/unary/unary_red_template.inl @@ -26,9 +26,6 @@ using namespace legate; template struct UnaryRedImplBody; -template -struct ArgRedImplBody; - template struct UnaryRedImpl { template ; - using VAL = legate_type_of; + using RHS = legate_type_of; Pitches pitches; auto rect = args.rhs.shape(); @@ -45,7 +42,7 @@ struct UnaryRedImpl { if (volume == 0) return; - auto rhs = args.rhs.read_accessor(rect); + auto rhs = args.rhs.read_accessor(rect); auto lhs = args.lhs.reduce_accessor(rect); UnaryRedImplBody()( @@ -61,52 +58,14 @@ struct UnaryRedImpl { } }; -template -struct ArgRedImpl { - template 1) && UnaryRedOp::valid>* = nullptr> - void operator()(UnaryRedArgs& args) const - { - using OP = UnaryRedOp; - using VAL = legate_type_of; - using ARGVAL = Argval; - - Pitches pitches; - auto rect = args.rhs.shape(); - auto volume = pitches.flatten(rect); - - if (volume == 0) return; - - auto rhs = args.rhs.read_accessor(rect); - - auto lhs = args.lhs.reduce_accessor(rect); - ArgRedImplBody()(lhs, rhs, rect, pitches, args.collapsed_dim, volume); - } - - template ::valid>* = nullptr> - void operator()(UnaryRedArgs& args) const - { - assert(false); - } -}; - template struct UnaryRedDispatch { - template ::value>* = nullptr> + template void operator()(UnaryRedArgs& args) const { auto dim = std::max(1, args.rhs.dim()); return double_dispatch(dim, args.rhs.code(), UnaryRedImpl{}, args); } - template ::value>* = nullptr> - void operator()(UnaryRedArgs& args) const - { - auto dim = std::max(1, args.rhs.dim()); - return double_dispatch(dim, args.rhs.code(), ArgRedImpl{}, args); - } }; template diff --git a/src/cunumeric/unary/unary_red_util.h b/src/cunumeric/unary/unary_red_util.h index 233190731a..316445a02b 100644 --- a/src/cunumeric/unary/unary_red_util.h +++ b/src/cunumeric/unary/unary_red_util.h @@ -58,6 +58,8 @@ constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&.. return f.template operator()(std::forward(args)...); case UnaryRedCode::CONTAINS: return f.template operator()(std::forward(args)...); + case UnaryRedCode::COUNT_NONZERO: + return f.template operator()(std::forward(args)...); case UnaryRedCode::MAX: return f.template operator()(std::forward(args)...); case UnaryRedCode::MIN: @@ -72,26 +74,6 @@ constexpr decltype(auto) op_dispatch(UnaryRedCode op_code, Functor f, Fnargs&&.. return f.template operator()(std::forward(args)...); } -template -struct ValueConstructor { - __CUDA_HD__ inline constexpr T operator()(const Legion::Point&, - const T& value, - int32_t) const - { - return value; - } -}; - -template -struct ArgvalConstructor { - __CUDA_HD__ inline constexpr Argval operator()(const Legion::Point& point, - const T& value, - int32_t collapsed_dim) const - { - return Argval(point[collapsed_dim], value); - } -}; - template struct UnaryRedOp { static constexpr bool valid = false; @@ -99,144 +81,229 @@ struct UnaryRedOp { template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = bool; using OP = Legion::ProdReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } -}; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs != RHS(0); + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); } }; template struct UnaryRedOp { + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; + + using RHS = legate::legate_type_of; + using VAL = bool; + using OP = Legion::SumReduction; + + template + __CUDA_HD__ static void fold(VAL& a, VAL b) + { + OP::template fold(a, b); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs != RHS(0); + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs != RHS(0); } +}; + +template +struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = uint64_t; using OP = Legion::SumReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return static_cast(rhs != RHS(0)); + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return static_cast(rhs != RHS(0)); } }; template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::MaxReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } -}; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs; + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::MinReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } -}; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs; + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::ProdReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } -}; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs; + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template struct UnaryRedOp { static constexpr bool valid = true; - using VAL = legate::legate_type_of; + using RHS = legate::legate_type_of; + using VAL = RHS; using OP = Legion::SumReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } + + template + __CUDA_HD__ static VAL convert(const Legion::Point&, int32_t, const RHS& rhs) + { + return rhs; + } + + __CUDA_HD__ static VAL convert(const RHS& rhs) { return rhs; } }; template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; - using VAL = Argval>; - using OP = ArgmaxReduction>; + using RHS = legate::legate_type_of; + using VAL = Argval; + using OP = ArgmaxReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } -}; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + int32_t collapsed_dim, + const RHS& rhs) + { + return VAL(point[collapsed_dim], rhs); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + const Legion::Point& shape, + const RHS& rhs) + { + int64_t idx = 0; + for (int32_t dim = 0; dim < DIM; ++dim) idx = idx * shape[dim] + point[dim]; + return VAL(idx, rhs); + } }; template struct UnaryRedOp { - static constexpr bool valid = true; + static constexpr bool valid = TYPE_CODE != legate::LegateTypeCode::COMPLEX128_LT; - using VAL = Argval>; - using OP = ArgminReduction>; + using RHS = legate::legate_type_of; + using VAL = Argval; + using OP = ArgminReduction; template - __CUDA_HD__ static void fold(VAL& rhs1, VAL rhs2) + __CUDA_HD__ static void fold(VAL& a, VAL b) { - OP::template fold(rhs1, rhs2); + OP::template fold(a, b); } -}; -template <> -struct UnaryRedOp { - static constexpr bool valid = false; + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + int32_t collapsed_dim, + const RHS& rhs) + { + return VAL(point[collapsed_dim], rhs); + } + + template + __CUDA_HD__ static VAL convert(const Legion::Point& point, + const Legion::Point& shape, + const RHS& rhs) + { + int64_t idx = 0; + for (int32_t dim = 0; dim < DIM; ++dim) idx = idx * shape[dim] + point[dim]; + return VAL(idx, rhs); + } }; } // namespace cunumeric diff --git a/tests/integration/test_argmin.py b/tests/integration/test_arg_reduce.py similarity index 54% rename from tests/integration/test_argmin.py rename to tests/integration/test_arg_reduce.py index 7281a1096a..ab6f2f6daa 100644 --- a/tests/integration/test_argmin.py +++ b/tests/integration/test_arg_reduce.py @@ -17,22 +17,26 @@ import pytest import cunumeric as num - -anp = np.random.randn(4, 5) - - -def test_argmin(): - a = num.array(anp) - - assert np.array_equal(num.argmin(a, axis=0), np.argmin(anp, axis=0)) - assert np.array_equal(num.argmin(a, axis=1), np.argmin(anp, axis=1)) - - -def test_argmax(): - a = num.array(anp) - - assert np.array_equal(num.argmax(a, axis=0), np.argmax(anp, axis=0)) - assert np.array_equal(num.argmax(a, axis=1), np.argmax(anp, axis=1)) +from legate.core import LEGATE_MAX_DIM + + +@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +def test_argmax_and_argmin(ndim): + shape = (5,) * ndim + + in_np = np.random.random(shape) + in_num = num.array(in_np) + + for fn in ("argmax", "argmin"): + fn_np = getattr(np, fn) + fn_num = getattr(num, fn) + assert np.array_equal(fn_np(in_np), fn_num(in_num)) + if in_num.ndim == 1: + continue + for axis in range(in_num.ndim): + out_np = fn_np(in_np, axis=axis) + out_num = fn_num(in_num, axis=axis) + assert np.array_equal(out_np, out_num) if __name__ == "__main__": diff --git a/tests/integration/test_logical.py b/tests/integration/test_logical.py index 1ef81d48c2..4eda4388e6 100644 --- a/tests/integration/test_logical.py +++ b/tests/integration/test_logical.py @@ -17,54 +17,62 @@ import pytest import cunumeric as num - - -def test_any_basic(): - assert num.array_equal(num.any([-1, 4, 5]), np.any([-1, 4, 5])) - - x = [5, 10, 0, 100] - cx = num.array(x) - assert num.array_equal(num.any(cx), np.any(x)) - - y = [[0, 0], [0, 0]] - cy = num.array(y) - assert num.array_equal(num.any(cy), np.any(y)) - - -def test_any_axis(): - x = np.array([[True, True, False], [True, True, True]]) - cx = num.array(x) - - assert num.array_equal(num.any(cx), np.any(x)) - assert num.array_equal(num.any(cx, axis=0), np.any(x, axis=0)) - - -def test_all_basic(): - assert num.array_equal(num.all([-1, 4, 5]), np.all([-1, 4, 5])) - - x = [5, 10, 0, 100] - cx = num.array(x) - assert num.array_equal(num.all(cx), np.all(x)) - - y = [[0, 0], [0, 0]] - cy = num.array(y) - assert num.array_equal(num.all(cy), np.all(y)) - - -def test_all_axis(): - x = np.array([[True, True, False], [True, True, True]]) - cx = num.array(x) - - assert num.array_equal(num.all(cx), np.all(x)) - assert num.array_equal(num.all(cx, axis=0), np.all(x, axis=0)) - - -def test_nan(): - assert num.equal(num.all(num.nan), np.all(np.nan)) - assert num.equal(num.any(num.nan), np.any(np.nan)) - - assert num.array_equal(num.all(num.nan), np.all(np.nan)) - assert num.array_equal(num.any(num.nan), np.any(np.nan)) +from legate.core import LEGATE_MAX_DIM + +INPUTS = ( + [-1, 4, 5], + [5, 10, 0, 100], + [[0, 0], [0, 0]], + [[True, True, False], [True, True, True]], + [[False, True, False]], + [[0.0, 1.0, 0.0]], + [[1, 0 + 1j, 1 + 1j]], + [[1, 0 + 1j, 0 + 0j]], + [np.nan], +) + + +@pytest.mark.parametrize("input", INPUTS) +def test_any_and_all(input): + in_np = np.array(input) + # cuNumeric doesn't support reductions for complex128 + if in_np.dtype.kind == "c": + in_np = in_np.astype("F") + in_num = num.array(in_np) + + for fn in ("any", "all"): + fn_np = getattr(np, fn) + fn_num = getattr(num, fn) + assert np.array_equal(fn_np(in_np), fn_num(in_num)) + for axis in range(in_num.ndim): + out_np = fn_np(in_np, axis=axis) + out_num = fn_num(in_num, axis=axis) + assert np.array_equal(out_np, out_num) + + +@pytest.mark.parametrize("ndim", range(LEGATE_MAX_DIM + 1)) +def test_nd_inputs(ndim): + shape = (3,) * ndim + in_np = np.random.random(shape) + in_num = num.array(in_np) + + for fn in ("any", "all"): + fn_np = getattr(np, fn) + fn_num = getattr(num, fn) + for axis in range(in_num.ndim): + out_np = fn_np(in_np, axis=axis) + out_num = fn_num(in_num, axis=axis) + assert np.array_equal(out_np, out_num) + + out_np = np.empty(out_np.shape, dtype="D") + out_num = num.empty(out_num.shape, dtype="D") + fn_np(in_np, axis=axis, out=out_np) + fn_num(in_num, axis=axis, out=out_num) + assert np.array_equal(out_np, out_num) + + out_np = fn_np(in_np[1:], axis=axis) + out_num = fn_num(in_num[1:], axis=axis) + assert np.array_equal(out_np, out_num) @pytest.mark.skip diff --git a/tests/integration/test_nonzero.py b/tests/integration/test_nonzero.py index e09340c8be..27538ae003 100644 --- a/tests/integration/test_nonzero.py +++ b/tests/integration/test_nonzero.py @@ -98,12 +98,11 @@ def test_axis(): x_np, axis=(0, 1, 2) ) - # TODO: Put this back once we have per-axis count_nonzero - # for axis in range(3): - # assert_equal( - # num.count_nonzero(x, axis=axis), - # np.count_nonzero(x_np, axis=axis), - # ) + for axis in range(3): + assert_equal( + num.count_nonzero(x, axis=axis), + np.count_nonzero(x_np, axis=axis), + ) def test_deprecated_0d(): diff --git a/tests/integration/test_reduction_axis.py b/tests/integration/test_reduction_axis.py index bfe67a41cc..7b6ff05553 100644 --- a/tests/integration/test_reduction_axis.py +++ b/tests/integration/test_reduction_axis.py @@ -21,8 +21,8 @@ import cunumeric as cn -def _sum(shape, axis, lib): - return lib.ones(shape).sum(axis=axis) +def _sum(shape, axis, lib, dtype=None): + return lib.ones(shape).sum(axis=axis, dtype=dtype) # Try various non-square shapes, to nudge the core towards trying many @@ -31,6 +31,9 @@ def _sum(shape, axis, lib): @pytest.mark.parametrize("shape", permutations((3, 4, 5)), ids=str) def test_3d(shape, axis): assert np.array_equal(_sum(shape, axis, np), _sum(shape, axis, cn)) + assert np.array_equal( + _sum(shape, axis, np, dtype="D"), _sum(shape, axis, cn, dtype="D") + ) if __name__ == "__main__":