From b0b950442fa00da60555efa89ee2cdbc48d109bc Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Wed, 12 Oct 2022 09:24:04 -0400 Subject: [PATCH 1/3] ARROW-18004: [C++] ExecBatch conversion to RecordBatch may go out of bounds --- cpp/src/arrow/compute/exec.cc | 9 ++++++++- cpp/src/arrow/compute/exec_test.cc | 13 +++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 466b3d5dd4a7..09e499d29614 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -156,6 +156,9 @@ Result<ExecBatch> ExecBatch::Make(std::vector<Datum> values) { Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch( std::shared_ptr<Schema> schema, MemoryPool* pool) const { + if (static_cast<size_t>(schema->num_fields()) > values.size()) { + return Status::Invalid("mismatching schema size"); + } ArrayVector columns(schema->num_fields()); for (size_t i = 0; i < columns.size(); ++i) { @@ -163,8 +166,12 @@ Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch( if (value.is_array()) { columns[i] = value.make_array(); continue; + } else if (value.is_scalar()) { + ARROW_ASSIGN_OR_RAISE(columns[i], + MakeArrayFromScalar(*value.scalar(), length, pool)); + } else { + DCHECK(false); } - ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool)); } return RecordBatch::Make(std::move(schema), length, std::move(columns)); diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index eac18f194d25..50a9de6f0b17 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -55,6 +55,19 @@ using ::arrow::internal::BitmapEquals; using ::arrow::internal::CopyBitmap; using ::arrow::internal::CountSetBits; +TEST(ExecBatch, Basics) { + auto i32_array = ArrayFromJSON(int32(), "[0, 1, 2]"); + auto utf8_array = ArrayFromJSON(utf8(), R"(["a", "b", "c"])"); + ExecBatch exec_batch({Datum(i32_array), Datum(utf8_array)}, 3); + auto right_schema = schema({field("a", int32()), field("b", utf8())}); + 
ASSERT_OK_AND_ASSIGN(auto right_record_batch, exec_batch.ToRecordBatch(right_schema)); + auto accept_schema = schema({field("a", int32())}); + ASSERT_OK_AND_ASSIGN(auto accept_record_batch, exec_batch.ToRecordBatch(accept_schema)); + auto reject_schema = + schema({field("a", int32()), field("b", utf8()), field("c", float64())}); + ASSERT_RAISES(Invalid, exec_batch.ToRecordBatch(reject_schema)); +} + TEST(ExecContext, BasicWorkings) { { ExecContext ctx; From bfe747f8e0c7d593b0c5cd31442968144161b38e Mon Sep 17 00:00:00 2001 From: Yaron Gvili Date: Wed, 12 Oct 2022 12:25:51 -0400 Subject: [PATCH 2/3] requested fixes --- cpp/src/arrow/compute/exec.cc | 5 +++-- cpp/src/arrow/compute/exec_test.cc | 11 ++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/compute/exec.cc b/cpp/src/arrow/compute/exec.cc index 09e499d29614..8f577539e34a 100644 --- a/cpp/src/arrow/compute/exec.cc +++ b/cpp/src/arrow/compute/exec.cc @@ -157,7 +157,7 @@ Result<ExecBatch> ExecBatch::Make(std::vector<Datum> values) { Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch( std::shared_ptr<Schema> schema, MemoryPool* pool) const { if (static_cast<size_t>(schema->num_fields()) > values.size()) { - return Status::Invalid("mismatching schema size"); + return Status::Invalid("ExecBatch::ToRecordBatch mismatching schema size"); } ArrayVector columns(schema->num_fields()); @@ -170,7 +170,8 @@ Result<std::shared_ptr<RecordBatch>> ExecBatch::ToRecordBatch( ARROW_ASSIGN_OR_RAISE(columns[i], MakeArrayFromScalar(*value.scalar(), length, pool)); } else { - DCHECK(false); + return Status::TypeError("ExecBatch::ToRecordBatch value ", i, " with unsupported ", + "value kind ", ::arrow::ToString(value.kind())); } } diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index 50a9de6f0b17..586f23fa71f3 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -35,6 +35,7 @@ #include "arrow/compute/kernel.h" #include "arrow/compute/registry.h" #include "arrow/memory_pool.h" +#include 
"arrow/record_batch.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/type.h" @@ -55,7 +56,7 @@ using ::arrow::internal::BitmapEquals; using ::arrow::internal::CopyBitmap; using ::arrow::internal::CountSetBits; -TEST(ExecBatch, Basics) { +TEST(ExecBatch, ToRecordBatch) { auto i32_array = ArrayFromJSON(int32(), "[0, 1, 2]"); auto utf8_array = ArrayFromJSON(utf8(), R"(["a", "b", "c"])"); ExecBatch exec_batch({Datum(i32_array), Datum(utf8_array)}, 3); @@ -66,6 +67,14 @@ TEST(ExecBatch, Basics) { auto reject_schema = schema({field("a", int32()), field("b", utf8()), field("c", float64())}); ASSERT_RAISES(Invalid, exec_batch.ToRecordBatch(reject_schema)); + + auto null_schema = schema({field("a", null())}); + // right-size but wrong-kind (null instead of array) schema yields invalid record batch + ASSERT_OK_AND_ASSIGN(auto null_record_batch, exec_batch.ToRecordBatch(null_schema)); + ASSERT_NOT_OK(null_record_batch->ValidateFull()); + // wrong-kind exec batch leads to a type error + ExecBatch miskinded_batch({Datum()}, 0); + ASSERT_RAISES(TypeError, miskinded_batch.ToRecordBatch(null_schema)); } TEST(ExecContext, BasicWorkings) { From df0a10c5370a11b8994d3cd06164fb80e054f268 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 13 Oct 2022 17:12:41 +0200 Subject: [PATCH 3/3] Rework tests --- cpp/src/arrow/compute/exec_test.cc | 31 +++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/compute/exec_test.cc b/cpp/src/arrow/compute/exec_test.cc index 586f23fa71f3..7f29a673d935 100644 --- a/cpp/src/arrow/compute/exec_test.cc +++ b/cpp/src/arrow/compute/exec_test.cc @@ -60,20 +60,37 @@ TEST(ExecBatch, ToRecordBatch) { auto i32_array = ArrayFromJSON(int32(), "[0, 1, 2]"); auto utf8_array = ArrayFromJSON(utf8(), R"(["a", "b", "c"])"); ExecBatch exec_batch({Datum(i32_array), Datum(utf8_array)}, 3); + auto right_schema = schema({field("a", int32()), field("b", utf8())}); ASSERT_OK_AND_ASSIGN(auto 
right_record_batch, exec_batch.ToRecordBatch(right_schema)); - auto accept_schema = schema({field("a", int32())}); - ASSERT_OK_AND_ASSIGN(auto accept_record_batch, exec_batch.ToRecordBatch(accept_schema)); + ASSERT_OK(right_record_batch->ValidateFull()); + auto expected_batch = RecordBatchFromJSON(right_schema, R"([ + {"a": 0, "b": "a"}, + {"a": 1, "b": "b"}, + {"a": 2, "b": "c"} + ])"); + AssertBatchesEqual(*right_record_batch, *expected_batch); + + // With a scalar column + auto utf8_scalar = ScalarFromJSON(utf8(), R"("z")"); + exec_batch = ExecBatch({Datum(i32_array), Datum(utf8_scalar)}, 3); + ASSERT_OK_AND_ASSIGN(right_record_batch, exec_batch.ToRecordBatch(right_schema)); + ASSERT_OK(right_record_batch->ValidateFull()); + expected_batch = RecordBatchFromJSON(right_schema, R"([ + {"a": 0, "b": "z"}, + {"a": 1, "b": "z"}, + {"a": 2, "b": "z"} + ])"); + AssertBatchesEqual(*right_record_batch, *expected_batch); + + // Wrong number of fields in schema auto reject_schema = schema({field("a", int32()), field("b", utf8()), field("c", float64())}); ASSERT_RAISES(Invalid, exec_batch.ToRecordBatch(reject_schema)); - auto null_schema = schema({field("a", null())}); - // right-size but wrong-kind (null instead of array) schema yields invalid record batch - ASSERT_OK_AND_ASSIGN(auto null_record_batch, exec_batch.ToRecordBatch(null_schema)); - ASSERT_NOT_OK(null_record_batch->ValidateFull()); - // wrong-kind exec batch leads to a type error + // Wrong-kind exec batch (not really valid, but test it here anyway) ExecBatch miskinded_batch({Datum()}, 0); + auto null_schema = schema({field("a", null())}); ASSERT_RAISES(TypeError, miskinded_batch.ToRecordBatch(null_schema)); }