Skip to content

Commit f7b3619

Browse files
committed
Add initial Unique benchmarks for int64, strings
Change-Id: I0c2eb14f1cd8c63a79fe2a3da308c76ac19a7384
1 parent 5107e93 commit f7b3619

3 files changed

Lines changed: 128 additions & 5 deletions

File tree

cpp/src/arrow/compute/compute-benchmark.cc

Lines changed: 125 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,131 @@ static void BM_BuildStringDictionary(
8181
state.SetBytesProcessed(state.iterations() * total_bytes);
8282
}
8383

84-
BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
85-
BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
84+
template <typename Type>
85+
struct HashParams {
86+
using T = typename Type::c_type;
87+
88+
double null_percent;
89+
90+
void GenerateTestData(const int64_t length, const int64_t num_unique,
91+
std::shared_ptr<Array>* arr) const {
92+
std::vector<int64_t> draws;
93+
std::vector<T> values;
94+
std::vector<bool> is_valid;
95+
test::randint<int64_t>(length, 0, num_unique, &draws);
96+
for (int64_t draw : draws) {
97+
values.push_back(draw);
98+
}
99+
100+
if (this->null_percent > 0) {
101+
test::random_is_valid(length, this->null_percent, &is_valid);
102+
ArrayFromVector<Type, T>(is_valid, values, arr);
103+
} else {
104+
ArrayFromVector<Type, T>(values, arr);
105+
}
106+
}
107+
108+
int64_t GetBytesProcessed(int64_t length) const { return length * sizeof(T); }
109+
};
110+
111+
template <>
112+
struct HashParams<StringType> {
113+
double null_percent;
114+
int32_t byte_width;
115+
void GenerateTestData(const int64_t length, const int64_t num_unique,
116+
std::shared_ptr<Array>* arr) const {
117+
std::vector<int64_t> draws;
118+
test::randint<int64_t>(length, 0, num_unique, &draws);
119+
120+
const int64_t total_bytes = this->byte_width * num_unique;
121+
std::vector<uint8_t> uniques(total_bytes);
122+
const uint32_t seed = 0;
123+
test::random_bytes(total_bytes, seed, uniques.data());
124+
125+
std::vector<bool> is_valid;
126+
if (this->null_percent > 0) {
127+
test::random_is_valid(length, this->null_percent, &is_valid);
128+
}
129+
130+
StringBuilder builder;
131+
for (int64_t i = 0; i < length; ++i) {
132+
if (this->null_percent == 0 || is_valid[i]) {
133+
ABORT_NOT_OK(builder.Append(uniques.data() + this->byte_width * draws[i],
134+
this->byte_width));
135+
} else {
136+
ABORT_NOT_OK(builder.AppendNull());
137+
}
138+
}
139+
ABORT_NOT_OK(builder.Finish(arr));
140+
}
141+
142+
int64_t GetBytesProcessed(int64_t length) const { return length * byte_width; }
143+
};
144+
145+
template <typename ParamType>
146+
void BenchUnique(benchmark::State& state, const ParamType& params, int64_t length,
147+
int64_t num_unique) {
148+
std::shared_ptr<Array> arr;
149+
params.GenerateTestData(length, num_unique, &arr);
150+
151+
FunctionContext ctx;
152+
while (state.KeepRunning()) {
153+
std::shared_ptr<Array> out;
154+
ABORT_NOT_OK(Unique(&ctx, Datum(arr), &out));
155+
}
156+
state.SetBytesProcessed(params.GetBytesProcessed(length));
157+
}
158+
159+
template <typename ParamType>
160+
void BenchDictionaryEncode(benchmark::State& state, const ParamType& params,
161+
int64_t length, int64_t num_unique) {
162+
std::shared_ptr<Array> arr;
163+
params.GenerateTestData(length, num_unique, &arr);
164+
165+
FunctionContext ctx;
166+
while (state.KeepRunning()) {
167+
Datum out;
168+
ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out));
169+
}
170+
state.SetBytesProcessed(params.GetBytesProcessed(length));
171+
}
172+
173+
// Unique over int64 input generated with null_percent = 0.
// range(0) is the array length, range(1) the num_unique argument.
static void BM_UniqueInt64NoNulls(benchmark::State& state) {
  const HashParams<Int64Type> params{0};
  const int64_t length = state.range(0);
  const int64_t num_unique = state.range(1);
  BenchUnique(state, params, length, num_unique);
}
176+
177+
// Unique over int64 input generated with null_percent = 0.05.
// range(0) is the array length, range(1) the num_unique argument.
static void BM_UniqueInt64WithNulls(benchmark::State& state) {
  const HashParams<Int64Type> params{0.05};
  const int64_t length = state.range(0);
  const int64_t num_unique = state.range(1);
  BenchUnique(state, params, length, num_unique);
}
180+
181+
// Unique over string input: each value is 10 bytes wide, null_percent = 0.05.
// range(0) is the array length, range(1) the num_unique argument.
static void BM_UniqueString10bytes(benchmark::State& state) {
  const HashParams<StringType> params{0.05, 10};
  const int64_t length = state.range(0);
  const int64_t num_unique = state.range(1);
  BenchUnique(state, params, length, num_unique);
}
185+
186+
// Unique over string input: each value is 100 bytes wide, null_percent = 0.05.
// range(0) is the array length, range(1) the num_unique argument.
static void BM_UniqueString100bytes(benchmark::State& state) {
  const HashParams<StringType> params{0.05, 100};
  const int64_t length = state.range(0);
  const int64_t num_unique = state.range(1);
  BenchUnique(state, params, length, num_unique);
}
190+
191+
// Dictionary-builder benchmarks: run each for at least one second and report
// in microseconds.
BENCHMARK(BM_BuildDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(BM_BuildStringDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);

// Input length (first benchmark argument) shared by all hash benchmarks.
constexpr int64_t kHashBenchmarkLength = 1 << 24;

// Registers the common {length, num_unique} argument pairs on a benchmark:
// 50, 1024, 10240 and 1048576 for num_unique, each at kHashBenchmarkLength.
// Results are reported in microseconds of real time, with at least one second
// of measurement per configuration.
#define ADD_HASH_ARGS(BENCH)                           \
  BENCH->Args({kHashBenchmarkLength, 50})              \
      ->Args({kHashBenchmarkLength, 1 << 10})          \
      ->Args({kHashBenchmarkLength, 10 * (1 << 10)})   \
      ->Args({kHashBenchmarkLength, 1 << 20})          \
      ->MinTime(1.0)                                   \
      ->Unit(benchmark::kMicrosecond)                  \
      ->UseRealTime()

ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls));
ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
86209

87210
} // namespace compute
88211
} // namespace arrow

cpp/src/arrow/compute/compute-test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -869,8 +869,8 @@ TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) {
869869
return;
870870
}
871871

872-
const int64_t kTotalValues = 10000;
873-
const int64_t kRepeats = 10;
872+
const int64_t kTotalValues = 1000000;
873+
const int64_t kRepeats = 5;
874874

875875
vector<T> values;
876876
vector<T> uniques;

cpp/src/arrow/compute/kernels/hash.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
260260
COMPUTE_HASH; \
261261
while (kHashSlotEmpty != new_hash_slots[j]) { \
262262
++j; \
263-
if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \
263+
if (ARROW_PREDICT_FALSE(j == new_size)) { \
264264
j = 0; \
265265
} \
266266
} \

0 commit comments

Comments
 (0)