Skip to content

Commit db49fbb

Browse files
committed
Ubsan excluding flatbuffers
1 parent fec7a0e commit db49fbb

9 files changed

Lines changed: 2015 additions & 1464 deletions

File tree

cpp/src/arrow/util/bpacking.h

Lines changed: 1969 additions & 1440 deletions
Large diffs are not rendered by default.

cpp/src/arrow/util/hashing.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
150150
uint32_t x, y;
151151
hash_t hx, hy;
152152
// XXX those are unaligned accesses. Should we have a facility for that?
153-
x = *reinterpret_cast<const uint32_t*>(p + n - 4);
154-
y = *reinterpret_cast<const uint32_t*>(p);
153+
x = util::SafeLoadAs<uint32_t>(p + n - 4);
154+
y = util::SafeLoadAs<uint32_t>(p);
155155
hx = ScalarHelper<uint32_t, AlgNum>::ComputeHash(x);
156156
hy = ScalarHelper<uint32_t, AlgNum ^ 1>::ComputeHash(y);
157157
return n ^ hx ^ hy;
@@ -160,8 +160,8 @@ hash_t ComputeStringHash(const void* data, int64_t length) {
160160
// Apply the same principle as above
161161
uint64_t x, y;
162162
hash_t hx, hy;
163-
x = *reinterpret_cast<const uint64_t*>(p + n - 8);
164-
y = *reinterpret_cast<const uint64_t*>(p);
163+
x = util::SafeLoadAs<uint64_t>(p + n - 8);
164+
y = util::SafeLoadAs<uint64_t>(p);
165165
hx = ScalarHelper<uint64_t, AlgNum>::ComputeHash(x);
166166
hy = ScalarHelper<uint64_t, AlgNum ^ 1>::ComputeHash(y);
167167
return n ^ hx ^ hy;

cpp/src/arrow/util/ubsan.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,5 +49,21 @@ inline T* MakeNonNull(T* maybe_null) {
4949
return reinterpret_cast<T*>(&internal::non_null_filler);
5050
}
5151

52+
template <typename T>
53+
inline typename std::enable_if<std::is_integral<T>::value, T>::type SafeLoadAs(
54+
const uint8_t* unaligned) {
55+
typename std::remove_const<T>::type ret;
56+
std::memcpy(&ret, unaligned, sizeof(T));
57+
return ret;
58+
}
59+
60+
template <typename T>
61+
inline typename std::enable_if<std::is_integral<T>::value, T>::type SafeLoad(
62+
const T* unaligned) {
63+
typename std::remove_const<T>::type ret;
64+
std::memcpy(&ret, unaligned, sizeof(T));
65+
return ret;
66+
}
67+
5268
} // namespace util
5369
} // namespace arrow

cpp/src/parquet/arrow/reader.cc

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ namespace arrow {
8383

8484
using ::arrow::BitUtil::FromBigEndian;
8585
using ::arrow::internal::SafeLeftShift;
86+
using ::arrow::util::SafeLoadAs;
8687

8788
template <typename ArrowType>
8889
using ArrayType = typename ::arrow::TypeTraits<ArrowType>::ArrayType;
@@ -1212,38 +1213,37 @@ static uint64_t BytesToInteger(const uint8_t* bytes, int32_t start, int32_t stop
12121213
case 1:
12131214
return bytes[start];
12141215
case 2:
1215-
return FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start));
1216+
return FromBigEndian(SafeLoadAs<uint16_t>(bytes + start));
12161217
case 3: {
1217-
const uint64_t first_two_bytes =
1218-
FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start));
1218+
const uint64_t first_two_bytes = FromBigEndian(SafeLoadAs<uint16_t>(bytes + start));
12191219
const uint64_t last_byte = bytes[stop - 1];
12201220
return first_two_bytes << 8 | last_byte;
12211221
}
12221222
case 4:
1223-
return FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
1223+
return FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
12241224
case 5: {
12251225
const uint64_t first_four_bytes =
1226-
FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
1226+
FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
12271227
const uint64_t last_byte = bytes[stop - 1];
12281228
return first_four_bytes << 8 | last_byte;
12291229
}
12301230
case 6: {
12311231
const uint64_t first_four_bytes =
1232-
FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
1232+
FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
12331233
const uint64_t last_two_bytes =
1234-
FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start + 4));
1234+
FromBigEndian(SafeLoadAs<uint16_t>(bytes + start + 4));
12351235
return first_four_bytes << 16 | last_two_bytes;
12361236
}
12371237
case 7: {
12381238
const uint64_t first_four_bytes =
1239-
FromBigEndian(*reinterpret_cast<const uint32_t*>(bytes + start));
1239+
FromBigEndian(SafeLoadAs<uint32_t>(bytes + start));
12401240
const uint64_t second_two_bytes =
1241-
FromBigEndian(*reinterpret_cast<const uint16_t*>(bytes + start + 4));
1241+
FromBigEndian(SafeLoadAs<uint16_t>(bytes + start + 4));
12421242
const uint64_t last_byte = bytes[stop - 1];
12431243
return first_four_bytes << 24 | second_two_bytes << 8 | last_byte;
12441244
}
12451245
case 8:
1246-
return FromBigEndian(*reinterpret_cast<const uint64_t*>(bytes + start));
1246+
return FromBigEndian(SafeLoadAs<uint64_t>(bytes + start));
12471247
default: {
12481248
DCHECK(false);
12491249
return UINT64_MAX;

cpp/src/parquet/arrow/writer.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,8 +211,9 @@ inline void ArrowTimestampToImpalaTimestamp(const int64_t time, Int96* impala_ti
211211
(*impala_timestamp).value[2] = (uint32_t)julian_days;
212212

213213
int64_t last_day_units = time % UnitPerDay;
214-
int64_t* impala_last_day_nanos = reinterpret_cast<int64_t*>(impala_timestamp);
215-
*impala_last_day_nanos = last_day_units * NanosecondsPerUnit;
214+
auto last_day_nanos = last_day_units * NanosecondsPerUnit;
215+
// Strage might be unaligned, so use mempcy instead of reinterpret_cast
216+
std::memcpy(impala_timestamp, &last_day_nanos, sizeof(int64_t));
216217
}
217218

218219
constexpr int64_t kSecondsInNanos = INT64_C(1000000000);

cpp/src/parquet/column_reader.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "arrow/util/compression.h"
2828
#include "arrow/util/logging.h"
2929
#include "arrow/util/rle-encoding.h"
30+
#include "arrow/util/ubsan.h"
3031

3132
#include "parquet/column_page.h"
3233
#include "parquet/encoding.h"
@@ -50,7 +51,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
5051
bit_width_ = BitUtil::Log2(max_level + 1);
5152
switch (encoding) {
5253
case Encoding::RLE: {
53-
num_bytes = *reinterpret_cast<const int32_t*>(data);
54+
num_bytes = arrow::util::SafeLoadAs<int32_t>(data);
5455
const uint8_t* decoder_data = data + sizeof(int32_t);
5556
if (!rle_decoder_) {
5657
rle_decoder_.reset(

cpp/src/parquet/encoding.cc

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "arrow/util/logging.h"
3030
#include "arrow/util/rle-encoding.h"
3131
#include "arrow/util/string_view.h"
32+
#include "arrow/util/ubsan.h"
3233

3334
#include "parquet/exception.h"
3435
#include "parquet/platform.h"
@@ -609,7 +610,7 @@ inline int DecodePlain<ByteArray>(const uint8_t* data, int64_t data_size, int nu
609610
int bytes_decoded = 0;
610611
int increment;
611612
for (int i = 0; i < num_values; ++i) {
612-
uint32_t len = out[i].len = *reinterpret_cast<const uint32_t*>(data);
613+
uint32_t len = out[i].len = arrow::util::SafeLoadAs<uint32_t>(data);
613614
increment = static_cast<int>(sizeof(uint32_t) + len);
614615
if (data_size < increment) ParquetException::EofException();
615616
out[i].ptr = data + sizeof(uint32_t);
@@ -719,7 +720,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
719720
int bytes_decoded = 0;
720721
while (i < num_values) {
721722
if (bit_reader.IsSet()) {
722-
uint32_t len = *reinterpret_cast<const uint32_t*>(data);
723+
uint32_t len = arrow::util::SafeLoadAs<uint32_t>(data);
723724
increment = static_cast<int>(sizeof(uint32_t) + len);
724725
if (data_size < increment) {
725726
ParquetException::EofException();
@@ -752,7 +753,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
752753
int bytes_decoded = 0;
753754

754755
while (i < num_values) {
755-
uint32_t len = *reinterpret_cast<const uint32_t*>(data);
756+
uint32_t len = arrow::util::SafeLoadAs<uint32_t>(data);
756757
int increment = static_cast<int>(sizeof(uint32_t) + len);
757758
if (data_size < increment) ParquetException::EofException();
758759
builder->Append(data + sizeof(uint32_t), len);
@@ -1103,7 +1104,7 @@ class DeltaLengthByteArrayDecoder : public DecoderImpl,
11031104
virtual void SetData(int num_values, const uint8_t* data, int len) {
11041105
num_values_ = num_values;
11051106
if (len == 0) return;
1106-
int total_lengths_len = *reinterpret_cast<const int*>(data);
1107+
int total_lengths_len = arrow::util::SafeLoadAs<int32_t>(data);
11071108
data += 4;
11081109
this->len_decoder_.SetData(num_values, data, total_lengths_len);
11091110
data_ = data + total_lengths_len;
@@ -1145,7 +1146,7 @@ class DeltaByteArrayDecoder : public DecoderImpl,
11451146
virtual void SetData(int num_values, const uint8_t* data, int len) {
11461147
num_values_ = num_values;
11471148
if (len == 0) return;
1148-
int prefix_len_length = *reinterpret_cast<const int*>(data);
1149+
int prefix_len_length = arrow::util::SafeLoadAs<int32_t>(data);
11491150
data += 4;
11501151
len -= 4;
11511152
prefix_len_decoder_.SetData(num_values, data, prefix_len_length);

cpp/src/parquet/file_reader.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "arrow/io/file.h"
2929
#include "arrow/status.h"
3030
#include "arrow/util/logging.h"
31+
#include "arrow/util/ubsan.h"
3132

3233
#include "parquet/column_reader.h"
3334
#include "parquet/column_scanner.h"
@@ -179,7 +180,7 @@ class SerializedFile : public ParquetFileReader::Contents {
179180
throw ParquetException("Invalid parquet file. Corrupt footer.");
180181
}
181182

182-
uint32_t metadata_len = *reinterpret_cast<const uint32_t*>(
183+
uint32_t metadata_len = arrow::util::SafeLoadAs<uint32_t>(
183184
reinterpret_cast<const uint8_t*>(footer_buffer->data()) + footer_read_size -
184185
kFooterSize);
185186
int64_t metadata_start = file_size - kFooterSize - metadata_len;

cpp/src/plasma/common.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
#include <limits>
2121

22+
#include "arrow/util/ubsan.h"
23+
2224
#include "plasma/plasma_generated.h"
2325

2426
namespace fb = plasma::flatbuf;
@@ -64,7 +66,7 @@ uint64_t MurmurHash64A(const void* key, int len, unsigned int seed) {
6466
const uint64_t* end = data + (len / 8);
6567

6668
while (data != end) {
67-
uint64_t k = *data++;
69+
uint64_t k = arrow::util::SafeLoad(data++);
6870

6971
k *= m;
7072
k ^= k >> r;

0 commit comments

Comments
 (0)