diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index abc2665a2d0..a8f839c33ea 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -255,6 +255,28 @@ struct SumState { } }; +template <> +struct SumState { + using SumType = typename FindAccumulatorType::Type; + using ThisType = SumState; + + ThisType& operator+=(const ThisType& rhs) { + this->count += rhs.count; + this->sum += rhs.sum; + return *this; + } + + public: + void Consume(const Array& input) { + const BooleanArray& array = static_cast(input); + count += array.length() - array.null_count(); + sum += array.true_count(); + } + + size_t count = 0; + typename SumType::c_type sum = 0; +}; + template struct SumImpl : public ScalarAggregator { using ArrayType = typename TypeTraits::ArrayType; @@ -311,6 +333,11 @@ struct SumLikeInit { return Status::NotImplemented("No sum implemented"); } + Status Visit(const BooleanType&) { + state.reset(new KernelClass()); + return Status::OK(); + } + template enable_if_number Visit(const Type&) { state.reset(new KernelClass()); @@ -339,6 +366,30 @@ std::unique_ptr MeanInit(KernelContext* ctx, const KernelInitArgs& template struct MinMaxState {}; +template +struct MinMaxState> { + using ThisType = MinMaxState; + using T = typename ArrowType::c_type; + + ThisType& operator+=(const ThisType& rhs) { + this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; + this->min = this->min && rhs.min; + this->max = this->max || rhs.max; + return *this; + } + + void MergeOne(T value) { + this->min = this->min && value; + this->max = this->max || value; + } + + T min = true; + T max = false; + bool has_nulls = false; + bool has_values = false; +}; + template struct MinMaxState> { using ThisType = MinMaxState; @@ -346,6 +397,7 @@ struct MinMaxState> { ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; this->min = std::min(this->min, rhs.min); this->max = std::max(this->max, rhs.max); return *this; @@ -359,6 +411,7 @@ struct MinMaxState> { T min = std::numeric_limits::max(); T max = std::numeric_limits::min(); bool has_nulls = false; + bool has_values = false; }; template @@ -368,6 +421,7 @@ struct MinMaxState> { ThisType& operator+=(const ThisType& rhs) { this->has_nulls |= rhs.has_nulls; + this->has_values |= rhs.has_values; this->min = std::fmin(this->min, rhs.min); this->max = std::fmax(this->max, rhs.max); return *this; @@ -381,6 +435,7 @@ struct MinMaxState> { T min = std::numeric_limits::infinity(); T max = -std::numeric_limits::infinity(); bool has_nulls = false; + bool has_values = false; }; template @@ -397,24 +452,26 @@ struct MinMaxImpl : public ScalarAggregator { ArrayType arr(batch[0].array()); - local.has_nulls = arr.null_count() > 0; + const auto null_count = arr.null_count(); + local.has_nulls = null_count > 0; + local.has_values = (arr.length() - null_count) > 0; + if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { this->state = local; return; } - const auto values = arr.raw_values(); - if (arr.null_count() > 0) { + if (local.has_nulls) { BitmapReader reader(arr.null_bitmap_data(), arr.offset(), arr.length()); for (int64_t i = 0; i < arr.length(); i++) { if (reader.IsSet()) { - local.MergeOne(values[i]); + local.MergeOne(arr.Value(i)); } reader.Next(); } } else { for (int64_t i = 0; i < arr.length(); i++) { - local.MergeOne(values[i]); + local.MergeOne(arr.Value(i)); } } this->state = local; @@ -429,7 +486,8 @@ struct MinMaxImpl : public ScalarAggregator { using ScalarType = typename TypeTraits::ScalarType; std::vector> values; - if (state.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { + if (!state.has_values || + (state.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL)) { // (null, null) values = {std::make_shared(), std::make_shared()}; } else { @@ -444,6 +502,33 @@ struct MinMaxImpl : public ScalarAggregator { MinMaxState state; }; +struct BooleanMinMaxImpl : public MinMaxImpl { + using MinMaxImpl::MinMaxImpl; + + void Consume(KernelContext*, const ExecBatch& batch) override { + StateType local; + ArrayType arr(batch[0].array()); + + const auto arr_length = arr.length(); + const auto null_count = arr.null_count(); + const auto valid_count = arr_length - null_count; + + local.has_nulls = null_count > 0; + local.has_values = valid_count > 0; + if (local.has_nulls && options.null_handling == MinMaxOptions::OUTPUT_NULL) { + this->state = local; + return; + } + + const auto true_count = arr.true_count(); + const auto false_count = valid_count - true_count; + local.max = true_count > 0; + local.min = false_count == 0; + + this->state = local; + } +}; + struct MinMaxInitState { std::unique_ptr state; KernelContext* ctx; @@ -463,6 +548,11 @@ struct MinMaxInitState { return Status::NotImplemented("No sum implemented"); } + Status Visit(const BooleanType&) { + state.reset(new BooleanMinMaxImpl(out_type, options)); + return Status::OK(); + } + template enable_if_number Visit(const Type&) { state.reset(new MinMaxImpl(out_type, options)); @@ -525,18 +615,21 @@ void RegisterScalarAggregateBasic(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared("sum", Arity::Unary()); + AddBasicAggKernels(SumInit, {boolean()}, int64(), func.get()); AddBasicAggKernels(SumInit, SignedIntTypes(), int64(), func.get()); AddBasicAggKernels(SumInit, UnsignedIntTypes(), uint64(), func.get()); AddBasicAggKernels(SumInit, FloatingPointTypes(), float64(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); func = std::make_shared("mean", Arity::Unary()); + AddBasicAggKernels(MeanInit, {boolean()}, float64(), func.get()); AddBasicAggKernels(MeanInit, NumericTypes(), float64(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); static auto default_minmax_options = MinMaxOptions::Defaults(); func = std::make_shared("minmax", Arity::Unary(), &default_minmax_options); + AddMinMaxKernels(MinMaxInit, {boolean()}, func.get()); AddMinMaxKernels(MinMaxInit, NumericTypes(), func.get()); DCHECK_OK(registry->AddFunction(std::move(func))); } diff --git a/cpp/src/arrow/compute/kernels/aggregate_internal.h b/cpp/src/arrow/compute/kernels/aggregate_internal.h index de3584588ea..5f2f50c0b06 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_internal.h +++ b/cpp/src/arrow/compute/kernels/aggregate_internal.h @@ -27,6 +27,11 @@ namespace compute { template struct FindAccumulatorType {}; +template +struct FindAccumulatorType> { + using Type = UInt64Type; +}; + template struct FindAccumulatorType> { using Type = Int64Type; diff --git a/cpp/src/arrow/compute/kernels/aggregate_test.cc b/cpp/src/arrow/compute/kernels/aggregate_test.cc index 8cbb987356f..72531cf0c5f 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_test.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_test.cc @@ -127,6 +127,41 @@ void ValidateSum(const Array& array) { ValidateSum(array, NaiveSum(array)); } +using UnaryOp = Result(const Datum&, ExecContext*); + +template +void ValidateBooleanAgg(const std::string& json, + const std::shared_ptr& expected) { + auto array = ArrayFromJSON(boolean(), json); + auto exp = Datum(expected); + ASSERT_OK_AND_ASSIGN(Datum result, Op(array, nullptr)); + ASSERT_TRUE(result.Equals(exp)); +} + +TEST(TestBooleanAggregation, Sum) { + ValidateBooleanAgg("[]", std::make_shared()); + ValidateBooleanAgg("[null]", std::make_shared()); + ValidateBooleanAgg("[null, false]", std::make_shared(0)); + ValidateBooleanAgg("[true]", std::make_shared(1)); + ValidateBooleanAgg("[true, false, true]", std::make_shared(2)); + ValidateBooleanAgg("[true, false, true, true, null]", + std::make_shared(3)); +} + +TEST(TestBooleanAggregation, Mean) { + ValidateBooleanAgg("[]", std::make_shared()); + ValidateBooleanAgg("[null]", std::make_shared()); + ValidateBooleanAgg("[null, false]", std::make_shared(0)); + ValidateBooleanAgg("[true]", std::make_shared(1)); + ValidateBooleanAgg("[true, false, true, false]", + std::make_shared(0.5)); + ValidateBooleanAgg("[true, null]", std::make_shared(1)); + ValidateBooleanAgg("[true, null, false, true, true]", + std::make_shared(0.75)); + ValidateBooleanAgg("[true, null, false, false, false]", + std::make_shared(0.25)); +} + template class TestNumericSumKernel : public ::testing::Test {}; @@ -346,10 +381,10 @@ TYPED_TEST(TestRandomNumericMeanKernel, RandomArrayMean) { /// template -class TestNumericMinMaxKernel : public ::testing::Test { +class TestPrimitiveMinMaxKernel : public ::testing::Test { using Traits = TypeTraits; using ArrayType = typename Traits::ArrayType; - using c_type = typename ArrayType::value_type; + using c_type = typename ArrowType::c_type; using ScalarType = typename Traits::ScalarType; public: @@ -401,15 +436,57 @@ class TestNumericMinMaxKernel : public ::testing::Test { }; template -class TestFloatingMinMaxKernel : public TestNumericMinMaxKernel {}; +class TestIntegerMinMaxKernel : public TestPrimitiveMinMaxKernel {}; + +template +class TestFloatingMinMaxKernel : public TestPrimitiveMinMaxKernel {}; + +class TestBooleanMinMaxKernel : public TestPrimitiveMinMaxKernel {}; + +TEST_F(TestBooleanMinMaxKernel, Basics) { + MinMaxOptions options; + std::vector chunked_input0 = {"[]", "[]"}; + std::vector chunked_input1 = {"[true, true, null]", "[true, null]"}; + std::vector chunked_input2 = {"[false, false, false]", "[false]"}; + std::vector chunked_input3 = {"[true, null]", "[null, false]"}; + + // SKIP nulls by default + this->AssertMinMaxIsNull("[]", options); + this->AssertMinMaxIsNull("[null, null, null]", options); + this->AssertMinMaxIs("[false, false, false]", false, false, options); + this->AssertMinMaxIs("[false, false, false, null]", false, false, options); + this->AssertMinMaxIs("[true, null, true, true]", true, true, options); + this->AssertMinMaxIs("[true, null, true, true]", true, true, options); + this->AssertMinMaxIs("[true, null, false, true]", false, true, options); + this->AssertMinMaxIsNull(chunked_input0, options); + this->AssertMinMaxIs(chunked_input1, true, true, options); + this->AssertMinMaxIs(chunked_input2, false, false, options); + this->AssertMinMaxIs(chunked_input3, false, true, options); + + options = MinMaxOptions(MinMaxOptions::OUTPUT_NULL); + this->AssertMinMaxIsNull("[]", options); + this->AssertMinMaxIsNull("[null, null, null]", options); + this->AssertMinMaxIsNull("[false, null, false]", options); + this->AssertMinMaxIsNull("[true, null]", options); + this->AssertMinMaxIs("[true, true, true]", true, true, options); + this->AssertMinMaxIs("[false, false]", false, false, options); + this->AssertMinMaxIs("[false, true]", false, true, options); + this->AssertMinMaxIsNull(chunked_input0, options); + this->AssertMinMaxIsNull(chunked_input1, options); + this->AssertMinMaxIs(chunked_input2, false, false, options); + this->AssertMinMaxIsNull(chunked_input3, options); +} -TYPED_TEST_SUITE(TestNumericMinMaxKernel, IntegralArrowTypes); -TYPED_TEST(TestNumericMinMaxKernel, Basics) { +TYPED_TEST_SUITE(TestIntegerMinMaxKernel, IntegralArrowTypes); +TYPED_TEST(TestIntegerMinMaxKernel, Basics) { MinMaxOptions options; std::vector chunked_input1 = {"[5, 1, 2, 3, 4]", "[9, 1, null, 3, 4]"}; std::vector chunked_input2 = {"[5, null, 2, 3, 4]", "[9, 1, 2, 3, 4]"}; std::vector chunked_input3 = {"[5, 1, 2, 3, null]", "[9, 1, null, 3, 4]"}; + // SKIP nulls by default + this->AssertMinMaxIsNull("[]", options); + this->AssertMinMaxIsNull("[null, null, null]", options); this->AssertMinMaxIs("[5, 1, 2, 3, 4]", 1, 5, options); this->AssertMinMaxIs("[5, null, 2, 3, 4]", 2, 5, options); this->AssertMinMaxIs(chunked_input1, 1, 9, options); diff --git a/r/R/compute.R b/r/R/compute.R index 9b18f22ae27..f28c53a6b01 100644 --- a/r/R/compute.R +++ b/r/R/compute.R @@ -58,15 +58,10 @@ scalar_aggregate <- function(FUN, ..., na.rm = FALSE) { if (FUN %in% c("mean", "sum")) { # Arrow sum/mean function always drops NAs so handle that here # https://issues.apache.org/jira/browse/ARROW-9054 - return(Scalar$create(NA_integer_, type = a$type)) + return(Scalar$create(NA_real_)) } } - if (inherits(a$type, "Boolean")) { - # Bool sum/mean not implemented so cast to int - # https://issues.apache.org/jira/browse/ARROW-9055 - a <- a$cast(int8()) - } Scalar$create(call_function(FUN, a, options = list(na.rm = na.rm))) } diff --git a/r/tests/testthat/test-compute.R b/r/tests/testthat/test-compute.R index 3d4b5f102b1..811c27d05e5 100644 --- a/r/tests/testthat/test-compute.R +++ b/r/tests/testthat/test-compute.R @@ -33,9 +33,10 @@ test_that("sum.Array", { expect_is(sum(na, na.rm = TRUE), "Scalar") expect_identical(as.numeric(sum(na, na.rm = TRUE)), sum(floats, na.rm = TRUE)) - bools <- c(TRUE, TRUE, FALSE) + bools <- c(TRUE, NA, TRUE, FALSE) b <- Array$create(bools) expect_identical(as.integer(sum(b)), sum(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) }) test_that("sum.ChunkedArray", { @@ -73,9 +74,10 @@ test_that("mean.Array", { expect_is(mean(na, na.rm = TRUE), "Scalar") expect_identical(as.vector(mean(na, na.rm = TRUE)), mean(floats, na.rm = TRUE)) - bools <- c(TRUE, TRUE, FALSE) + bools <- c(TRUE, NA, TRUE, FALSE) b <- Array$create(bools) expect_identical(as.vector(mean(b)), mean(bools)) + expect_identical(as.integer(sum(b, na.rm = TRUE)), sum(bools, na.rm = TRUE)) }) test_that("mean.ChunkedArray", { @@ -113,5 +115,6 @@ test_that("min/max.Array", { bools <- c(TRUE, TRUE, FALSE) b <- Array$create(bools) - expect_identical(as.vector(min(b)), min(bools)) + # R is inconsistent here: typeof(min(NA)) == "integer", not "logical" + expect_identical(as.vector(min(b)), as.logical(min(bools))) })