Skip to content

Commit aa3328d

Browse files
committed
ARROW-1844: [C++] Add initial Unique benchmarks for int64, variable-length strings
I also fixed a bug that this surfaced in the hash table resize (unit test coverage was not adequate).

Now we have

```
$ ./release/compute-benchmark
Run on (8 X 4200.16 MHz CPU s)
2017-11-28 18:33:53
Benchmark Time CPU Iterations
-------------------------------------------------------------------------------------------------
BM_BuildDictionary/min_time:1.000 1352 us 1352 us 1038 2.88639GB/s
BM_BuildStringDictionary/min_time:1.000 3994 us 3994 us 351 75.5809MB/s
BM_UniqueInt64NoNulls/16M/50/min_time:1.000/real_time 35814 us 35816 us 39 3.49023GB/s
BM_UniqueInt64NoNulls/16M/1024/min_time:1.000/real_time 119656 us 119660 us 12 1069.73MB/s
BM_UniqueInt64NoNulls/16M/10k/min_time:1.000/real_time 174924 us 174930 us 8 731.747MB/s
BM_UniqueInt64NoNulls/16M/1024k/min_time:1.000/real_time 448425 us 448440 us 3 285.443MB/s
BM_UniqueInt64WithNulls/16M/50/min_time:1.000/real_time 49511 us 49513 us 29 2.52468GB/s
BM_UniqueInt64WithNulls/16M/1024/min_time:1.000/real_time 134519 us 134523 us 10 951.541MB/s
BM_UniqueInt64WithNulls/16M/10k/min_time:1.000/real_time 191331 us 191336 us 7 668.999MB/s
BM_UniqueInt64WithNulls/16M/1024k/min_time:1.000/real_time 533597 us 533613 us 3 239.882MB/s
BM_UniqueString10bytes/16M/50/min_time:1.000/real_time 150731 us 150736 us 9 1061.5MB/s
BM_UniqueString10bytes/16M/1024/min_time:1.000/real_time 256929 us 256938 us 5 622.739MB/s
BM_UniqueString10bytes/16M/10k/min_time:1.000/real_time 414412 us 414426 us 3 386.09MB/s
BM_UniqueString10bytes/16M/1024k/min_time:1.000/real_time 1744253 us 1744308 us 1 91.7298MB/s
BM_UniqueString100bytes/16M/50/min_time:1.000/real_time 563890 us 563909 us 2 2.77093GB/s
BM_UniqueString100bytes/16M/1024/min_time:1.000/real_time 704695 us 704720 us 2 2.21727GB/s
BM_UniqueString100bytes/16M/10k/min_time:1.000/real_time 995685 us 995721 us 2 1.56927GB/s
BM_UniqueString100bytes/16M/1024k/min_time:1.000/real_time 3584108 us 3584230 us 1 446.415MB/s
```

We can also refactor the hash table implementations without worrying too much about whether we're making things slower.

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #1370 from wesm/ARROW-1844 and squashes the following commits:

638f1a1 [Wes McKinney] Decrease resize load factor to 0.5
2885c64 [Wes McKinney] Multiply bytes processed by state.iterations()
f7b3619 [Wes McKinney] Add initial Unique benchmarks for int64, strings
1 parent bc70994 commit aa3328d

3 files changed

Lines changed: 129 additions & 6 deletions

File tree

cpp/src/arrow/compute/compute-benchmark.cc

Lines changed: 125 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,131 @@ static void BM_BuildStringDictionary(
8181
state.SetBytesProcessed(state.iterations() * total_bytes);
8282
}
8383

84-
BENCHMARK(BM_BuildDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
85-
BENCHMARK(BM_BuildStringDictionary)->Repetitions(3)->Unit(benchmark::kMicrosecond);
84+
// Generates benchmark input and reports bytes processed for array types that
// expose a `c_type`; elements are stored as `Type::c_type`.
template <typename Type>
85+
struct HashParams {
86+
using T = typename Type::c_type;
87+
88+
// Forwarded to test::random_is_valid when > 0; otherwise the array is built
// without a validity vector.
// NOTE(review): callers pass values such as 0.05, so despite the name this is
// presumably a fraction rather than a percentage -- confirm against
// test::random_is_valid.
double null_percent;
89+
90+
// Fills `*arr` with `length` values, each obtained from a
// test::randint<int64_t>(length, 0, num_unique, ...) draw and converted to T.
// NOTE(review): whether randint's upper bound is inclusive is not visible
// here, so the number of distinct values may be num_unique or
// num_unique + 1 -- confirm.
void GenerateTestData(const int64_t length, const int64_t num_unique,
91+
std::shared_ptr<Array>* arr) const {
92+
std::vector<int64_t> draws;
93+
std::vector<T> values;
94+
std::vector<bool> is_valid;
95+
test::randint<int64_t>(length, 0, num_unique, &draws);
96+
for (int64_t draw : draws) {
97+
// Implicit conversion from int64_t to T.
values.push_back(draw);
98+
}
99+
100+
// Only generate a validity vector when nulls were requested.
if (this->null_percent > 0) {
101+
test::random_is_valid(length, this->null_percent, &is_valid);
102+
ArrayFromVector<Type, T>(is_valid, values, arr);
103+
} else {
104+
ArrayFromVector<Type, T>(values, arr);
105+
}
106+
}
107+
108+
// Returns length * sizeof(T); null slots are counted like any other slot.
int64_t GetBytesProcessed(int64_t length) const { return length * sizeof(T); }
109+
};
110+
111+
// Specialization for StringType: every generated string has the same length
// (`byte_width`) and is sliced out of a shared pool of random bytes.
template <>
112+
struct HashParams<StringType> {
113+
// Forwarded to test::random_is_valid when > 0; 0 means no nulls are appended.
double null_percent;
114+
// Length in bytes of every generated string.
int32_t byte_width;
115+
// Builds a string array of `length` entries into `*arr`. Entry i is either
// null or the byte_width-sized slice of the pool at offset
// byte_width * draws[i].
void GenerateTestData(const int64_t length, const int64_t num_unique,
116+
std::shared_ptr<Array>* arr) const {
117+
std::vector<int64_t> draws;
118+
test::randint<int64_t>(length, 0, num_unique, &draws);
119+
120+
// The pool holds exactly num_unique slices of byte_width bytes each.
const int64_t total_bytes = this->byte_width * num_unique;
121+
std::vector<uint8_t> uniques(total_bytes);
122+
const uint32_t seed = 0;
123+
test::random_bytes(total_bytes, seed, uniques.data());
124+
125+
std::vector<bool> is_valid;
126+
if (this->null_percent > 0) {
127+
test::random_is_valid(length, this->null_percent, &is_valid);
128+
}
129+
130+
StringBuilder builder;
131+
for (int64_t i = 0; i < length; ++i) {
132+
// The `null_percent == 0` test short-circuits so that `is_valid`, which is
// left empty in that case, is never indexed. A negative null_percent would
// also leave it empty but still index it.
// NOTE(review): if test::randint's upper bound is inclusive, draws[i] can
// equal num_unique and the Append below reads byte_width bytes past the end
// of `uniques` -- confirm the bound and, if so, draw from
// [0, num_unique - 1].
if (this->null_percent == 0 || is_valid[i]) {
133+
ABORT_NOT_OK(builder.Append(uniques.data() + this->byte_width * draws[i],
134+
this->byte_width));
135+
} else {
136+
ABORT_NOT_OK(builder.AppendNull());
137+
}
138+
}
139+
ABORT_NOT_OK(builder.Finish(arr));
140+
}
141+
142+
// Returns length * byte_width; null slots are counted as byte_width bytes too.
int64_t GetBytesProcessed(int64_t length) const { return length * byte_width; }
143+
};
144+
145+
// Benchmarks the Unique kernel. The input array is generated once, before the
// timed loop, by params.GenerateTestData(length, num_unique, ...); every
// iteration then runs Unique on that same array.
template <typename ParamType>
146+
void BenchUnique(benchmark::State& state, const ParamType& params, int64_t length,
147+
int64_t num_unique) {
148+
std::shared_ptr<Array> arr;
149+
params.GenerateTestData(length, num_unique, &arr);
150+
151+
FunctionContext ctx;
152+
while (state.KeepRunning()) {
153+
std::shared_ptr<Array> out;
154+
ABORT_NOT_OK(Unique(&ctx, Datum(arr), &out));
155+
}
156+
// Total bytes = bytes per pass over the input times the number of iterations.
state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length));
157+
}
158+
159+
// Benchmarks the DictionaryEncode kernel. The input array is generated once,
// before the timed loop, and every iteration encodes that same array.
// NOTE(review): none of the benchmarks registered in the visible part of this
// file call this helper.
template <typename ParamType>
160+
void BenchDictionaryEncode(benchmark::State& state, const ParamType& params,
161+
int64_t length, int64_t num_unique) {
162+
std::shared_ptr<Array> arr;
163+
params.GenerateTestData(length, num_unique, &arr);
164+
165+
FunctionContext ctx;
166+
while (state.KeepRunning()) {
167+
Datum out;
168+
ABORT_NOT_OK(DictionaryEncode(&ctx, Datum(arr), &out));
169+
}
170+
// Total bytes = bytes per pass over the input times the number of iterations.
state.SetBytesProcessed(state.iterations() * params.GetBytesProcessed(length));
171+
}
172+
173+
// Unique over int64 values with null_percent = 0. state.range(0) is the array
// length and state.range(1) is the num_unique argument.
static void BM_UniqueInt64NoNulls(benchmark::State& state) {
174+
BenchUnique(state, HashParams<Int64Type>{0}, state.range(0), state.range(1));
175+
}
176+
177+
// Unique over int64 values with null_percent = 0.05. state.range(0) is the
// array length and state.range(1) is the num_unique argument.
static void BM_UniqueInt64WithNulls(benchmark::State& state) {
178+
BenchUnique(state, HashParams<Int64Type>{0.05}, state.range(0), state.range(1));
179+
}
180+
181+
// Unique over strings with null_percent = 0.05. state.range(0) is the array
// length and state.range(1) is the num_unique argument.
static void BM_UniqueString10bytes(benchmark::State& state) {
182+
// Byte strings with 10 bytes each
183+
BenchUnique(state, HashParams<StringType>{0.05, 10}, state.range(0), state.range(1));
184+
}
185+
186+
// Unique over strings with null_percent = 0.05. state.range(0) is the array
// length and state.range(1) is the num_unique argument.
static void BM_UniqueString100bytes(benchmark::State& state) {
187+
// Byte strings with 100 bytes each
188+
BenchUnique(state, HashParams<StringType>{0.05, 100}, state.range(0), state.range(1));
189+
}
190+
191+
// Register the dictionary-building benchmarks with MinTime(1.0) and results
// reported in microseconds.
BENCHMARK(BM_BuildDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
192+
BENCHMARK(BM_BuildStringDictionary)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
193+
194+
// Array length used as the first argument of every hash benchmark:
// 1 << 24 = 16,777,216 elements.
constexpr int64_t kHashBenchmarkLength = 1 << 24;
195+
196+
// Applies the shared configuration to a benchmark registration `WHAT`: four
// (length, num_unique) argument pairs with num_unique = 50, 1024, 10240 and
// 1048576, plus MinTime(1.0), microsecond units and UseRealTime().
// `10 * 1 << 10` parses as `(10 * 1) << 10` because `*` binds tighter than
// `<<`, which yields 10240.
#define ADD_HASH_ARGS(WHAT) \
197+
WHAT->Args({kHashBenchmarkLength, 50}) \
198+
->Args({kHashBenchmarkLength, 1 << 10}) \
199+
->Args({kHashBenchmarkLength, 10 * 1 << 10}) \
200+
->Args({kHashBenchmarkLength, 1 << 20}) \
201+
->MinTime(1.0) \
202+
->Unit(benchmark::kMicrosecond) \
203+
->UseRealTime()
204+
205+
// Register each Unique benchmark with the shared argument sets and settings
// from ADD_HASH_ARGS.
ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls));
206+
ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64WithNulls));
207+
ADD_HASH_ARGS(BENCHMARK(BM_UniqueString10bytes));
208+
ADD_HASH_ARGS(BENCHMARK(BM_UniqueString100bytes));
86209

87210
} // namespace compute
88211
} // namespace arrow

cpp/src/arrow/compute/compute-test.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -869,8 +869,8 @@ TYPED_TEST(TestHashKernelPrimitive, PrimitiveResizeTable) {
869869
return;
870870
}
871871

872-
const int64_t kTotalValues = 10000;
873-
const int64_t kRepeats = 10;
872+
const int64_t kTotalValues = 1000000;
873+
const int64_t kRepeats = 5;
874874

875875
vector<T> values;
876876
vector<T> uniques;

cpp/src/arrow/compute/kernels/hash.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ typedef int32_t hash_slot_t;
4343
// Sentinel slot value (the largest int32_t) that marks a hash table slot as
// unoccupied.
static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<int32_t>::max();
4444

4545
// The maximum load factor for the hash table before resizing.
46-
static constexpr double kMaxHashTableLoad = 0.7;
46+
static constexpr double kMaxHashTableLoad = 0.5;
4747

4848
enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
4949

@@ -260,7 +260,7 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
260260
COMPUTE_HASH; \
261261
while (kHashSlotEmpty != new_hash_slots[j]) { \
262262
++j; \
263-
if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \
263+
if (ARROW_PREDICT_FALSE(j == new_size)) { \
264264
j = 0; \
265265
} \
266266
} \

0 commit comments

Comments
 (0)