@@ -81,8 +81,131 @@ static void BM_BuildStringDictionary(
8181 state.SetBytesProcessed (state.iterations () * total_bytes);
8282}
8383
84- BENCHMARK (BM_BuildDictionary)->Repetitions (3 )->Unit(benchmark::kMicrosecond );
85- BENCHMARK (BM_BuildStringDictionary)->Repetitions (3 )->Unit(benchmark::kMicrosecond );
84+ template <typename Type>
85+ struct HashParams {
86+ using T = typename Type::c_type;
87+
88+ double null_percent;
89+
90+ void GenerateTestData (const int64_t length, const int64_t num_unique,
91+ std::shared_ptr<Array>* arr) const {
92+ std::vector<int64_t > draws;
93+ std::vector<T> values;
94+ std::vector<bool > is_valid;
95+ test::randint<int64_t >(length, 0 , num_unique, &draws);
96+ for (int64_t draw : draws) {
97+ values.push_back (draw);
98+ }
99+
100+ if (this ->null_percent > 0 ) {
101+ test::random_is_valid (length, this ->null_percent , &is_valid);
102+ ArrayFromVector<Type, T>(is_valid, values, arr);
103+ } else {
104+ ArrayFromVector<Type, T>(values, arr);
105+ }
106+ }
107+
108+ int64_t GetBytesProcessed (int64_t length) const { return length * sizeof (T); }
109+ };
110+
111+ template <>
112+ struct HashParams <StringType> {
113+ double null_percent;
114+ int32_t byte_width;
115+ void GenerateTestData (const int64_t length, const int64_t num_unique,
116+ std::shared_ptr<Array>* arr) const {
117+ std::vector<int64_t > draws;
118+ test::randint<int64_t >(length, 0 , num_unique, &draws);
119+
120+ const int64_t total_bytes = this ->byte_width * num_unique;
121+ std::vector<uint8_t > uniques (total_bytes);
122+ const uint32_t seed = 0 ;
123+ test::random_bytes (total_bytes, seed, uniques.data ());
124+
125+ std::vector<bool > is_valid;
126+ if (this ->null_percent > 0 ) {
127+ test::random_is_valid (length, this ->null_percent , &is_valid);
128+ }
129+
130+ StringBuilder builder;
131+ for (int64_t i = 0 ; i < length; ++i) {
132+ if (this ->null_percent == 0 || is_valid[i]) {
133+ ABORT_NOT_OK (builder.Append (uniques.data () + this ->byte_width * draws[i],
134+ this ->byte_width ));
135+ } else {
136+ ABORT_NOT_OK (builder.AppendNull ());
137+ }
138+ }
139+ ABORT_NOT_OK (builder.Finish (arr));
140+ }
141+
142+ int64_t GetBytesProcessed (int64_t length) const { return length * byte_width; }
143+ };
144+
145+ template <typename ParamType>
146+ void BenchUnique (benchmark::State& state, const ParamType& params, int64_t length,
147+ int64_t num_unique) {
148+ std::shared_ptr<Array> arr;
149+ params.GenerateTestData (length, num_unique, &arr);
150+
151+ FunctionContext ctx;
152+ while (state.KeepRunning ()) {
153+ std::shared_ptr<Array> out;
154+ ABORT_NOT_OK (Unique (&ctx, Datum (arr), &out));
155+ }
156+ state.SetBytesProcessed (params.GetBytesProcessed (length));
157+ }
158+
159+ template <typename ParamType>
160+ void BenchDictionaryEncode (benchmark::State& state, const ParamType& params,
161+ int64_t length, int64_t num_unique) {
162+ std::shared_ptr<Array> arr;
163+ params.GenerateTestData (length, num_unique, &arr);
164+
165+ FunctionContext ctx;
166+ while (state.KeepRunning ()) {
167+ Datum out;
168+ ABORT_NOT_OK (DictionaryEncode (&ctx, Datum (arr), &out));
169+ }
170+ state.SetBytesProcessed (params.GetBytesProcessed (length));
171+ }
172+
173+ static void BM_UniqueInt64NoNulls (benchmark::State& state) {
174+ BenchUnique (state, HashParams<Int64Type>{0 }, state.range (0 ), state.range (1 ));
175+ }
176+
177+ static void BM_UniqueInt64WithNulls (benchmark::State& state) {
178+ BenchUnique (state, HashParams<Int64Type>{0.05 }, state.range (0 ), state.range (1 ));
179+ }
180+
181+ static void BM_UniqueString10bytes (benchmark::State& state) {
182+ // Byte strings with 10 bytes each
183+ BenchUnique (state, HashParams<StringType>{0.05 , 10 }, state.range (0 ), state.range (1 ));
184+ }
185+
186+ static void BM_UniqueString100bytes (benchmark::State& state) {
187+ // Byte strings with 100 bytes each
188+ BenchUnique (state, HashParams<StringType>{0.05 , 100 }, state.range (0 ), state.range (1 ));
189+ }
190+
191+ BENCHMARK (BM_BuildDictionary)->MinTime (1.0 )->Unit(benchmark::kMicrosecond );
192+ BENCHMARK (BM_BuildStringDictionary)->MinTime (1.0 )->Unit(benchmark::kMicrosecond );
193+
194+ constexpr int64_t kHashBenchmarkLength = 1 << 24 ;
195+
196+ #define ADD_HASH_ARGS (WHAT ) \
197+ WHAT->Args ({kHashBenchmarkLength , 50 }) \
198+ ->Args({kHashBenchmarkLength , 1 << 10 }) \
199+ ->Args({kHashBenchmarkLength , 10 * 1 << 10 }) \
200+ ->Args({kHashBenchmarkLength , 1 << 20 }) \
201+ ->MinTime(1.0 ) \
202+ ->Unit(benchmark::kMicrosecond ) \
203+ ->UseRealTime()
204+
205+ ADD_HASH_ARGS(BENCHMARK(BM_UniqueInt64NoNulls));
206+ ADD_HASH_ARGS (BENCHMARK(BM_UniqueInt64WithNulls));
207+ ADD_HASH_ARGS (BENCHMARK(BM_UniqueString10bytes));
208+ ADD_HASH_ARGS (BENCHMARK(BM_UniqueString100bytes));
86209
87210} // namespace compute
88211} // namespace arrow
0 commit comments