diff --git a/cpp/src/arrow/compute/kernels/aggregate_basic.cc b/cpp/src/arrow/compute/kernels/aggregate_basic.cc index 14f9be3f93e..2f9f3d7a61e 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_basic.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_basic.cc @@ -137,9 +137,26 @@ struct SumState { ThisType local; const auto values = array.raw_values(); const int64_t length = array.length(); - for (int64_t i = 0; i < length; i++) { + + constexpr int64_t kRoundFactor = 8; + const int64_t length_rounded = BitUtil::RoundDown(length, kRoundFactor); + typename SumType::c_type sum_rounded[kRoundFactor] = {0}; + + // Unrolled the loop to add the results in parallel + for (int64_t i = 0; i < length_rounded; i += kRoundFactor) { + for (int64_t k = 0; k < kRoundFactor; k++) { + sum_rounded[k] += values[i + k]; + } + } + for (int64_t k = 0; k < kRoundFactor; k++) { + local.sum += sum_rounded[k]; + } + + // The trailing part + for (int64_t i = length_rounded; i < length; ++i) { local.sum += values[i]; } + local.count = length; return local; } diff --git a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc index 46d726d4071..101284ec6b0 100644 --- a/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc +++ b/cpp/src/arrow/compute/kernels/aggregate_benchmark.cc @@ -305,12 +305,14 @@ BENCHMARK_TEMPLATE(ReferenceSum, SumBitmapVectorizeUnroll) ->Apply(BenchmarkSetArgs); #endif // ARROW_WITH_BENCHMARKS_REFERENCE +template static void SumKernel(benchmark::State& state) { - const int64_t array_size = state.range(0) / sizeof(int64_t); + using CType = typename TypeTraits::CType; + + const int64_t array_size = state.range(0) / sizeof(CType); const double null_percent = static_cast(state.range(1)) / 100.0; auto rand = random::RandomArrayGenerator(1923); - auto array = std::static_pointer_cast>( - rand.Int64(array_size, -100, 100, null_percent)); + auto array = rand.Numeric(array_size, -100, 100, null_percent); for (auto _ : state) { 
ABORT_NOT_OK(Sum(array).status()); @@ -318,10 +320,19 @@ static void SumKernel(benchmark::State& state) { state.counters["size"] = static_cast(state.range(0)); state.counters["null_percent"] = static_cast(state.range(1)); - state.SetBytesProcessed(state.iterations() * array_size * sizeof(int64_t)); + state.SetBytesProcessed(state.iterations() * array_size * sizeof(CType)); } -BENCHMARK(SumKernel)->Apply(RegressionSetArgs); +#define SUM_KERNEL_BENCHMARK(FuncName, Type) \ + static void FuncName(benchmark::State& state) { SumKernel(state); } \ + BENCHMARK(FuncName)->Apply(RegressionSetArgs) + +SUM_KERNEL_BENCHMARK(SumKernelFloat, FloatType); +SUM_KERNEL_BENCHMARK(SumKernelDouble, DoubleType); +SUM_KERNEL_BENCHMARK(SumKernelInt8, Int8Type); +SUM_KERNEL_BENCHMARK(SumKernelInt16, Int16Type); +SUM_KERNEL_BENCHMARK(SumKernelInt32, Int32Type); +SUM_KERNEL_BENCHMARK(SumKernelInt64, Int64Type); } // namespace compute } // namespace arrow