GH-18481: [C++] prefer casting literal over casting field ref (#15180)

westonpace · web-flow · commit b56b91e1ef4c · 2023-02-03T21:55:05.000-08:00
I ran into this problem while trying to work out partition pruning in the new scan node.  I feel like this is a somewhat naive approach but it seems to work.

I think it would fail for a function whose `DispatchBest` resolves to an n-ary kernel with non-equal argument types.  For example, if there was a function foo(int8, int32) and it had a dispatch best of some kind.

Authored-by: Weston Pace <weston.pace@gmail.com>
Signed-off-by: Weston Pace <weston.pace@gmail.com>
diff --git a/cpp/src/arrow/compute/exec/expression.cc b/cpp/src/arrow/compute/exec/expression.cc
@@ -221,7 +221,11 @@ bool Expression::Equals(const Expression& other) const {
   }
 
   if (auto lit = literal()) {
-    return lit->Equals(*other.literal());
+    // Scalar NaN values do not compare equal to each other, but two NaN literals
+    // should be treated as the same expression (i.e. the expressions are equal
+    // even though the values are not)
+    EqualOptions equal_options = EqualOptions::Defaults().nans_equal(true);
+    return lit->scalar()->Equals(other.literal()->scalar(), equal_options);
   }
 
   if (auto ref = field_ref()) {
@@ -368,6 +372,158 @@ bool Expression::IsSatisfiable() const {
 
 namespace {
 
+// Returns the narrowest type that can hold the scalar `value` without losing
+// information, staying within the same type family:
+//   - signed integers narrow to smaller signed integers, unsigned to unsigned
+//   - float64 narrows to float32 when the value round-trips exactly (or is
+//     NaN / +-Inf)
+//   - large_utf8 / large_binary narrow to utf8 / binary when the payload length
+//     fits in int32
+//   - timestamps narrow to the coarsest unit that divides the value evenly,
+//     keeping the original timezone
+// Any other type is returned unchanged.
+//
+// `value` must hold a scalar.
+TypeHolder SmallestTypeFor(const arrow::Datum& value) {
+  switch (value.type()->id()) {
+    case Type::INT8:
+      return int8();
+    case Type::UINT8:
+      return uint8();
+    case Type::INT16: {
+      int16_t i16 = value.scalar_as<Int16Scalar>().value;
+      if (i16 <= std::numeric_limits<int8_t>::max() &&
+          i16 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      return int16();
+    }
+    case Type::UINT16: {
+      uint16_t ui16 = value.scalar_as<UInt16Scalar>().value;
+      if (ui16 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      return uint16();
+    }
+    case Type::INT32: {
+      int32_t i32 = value.scalar_as<Int32Scalar>().value;
+      if (i32 <= std::numeric_limits<int8_t>::max() &&
+          i32 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      if (i32 <= std::numeric_limits<int16_t>::max() &&
+          i32 >= std::numeric_limits<int16_t>::min()) {
+        return int16();
+      }
+      return int32();
+    }
+    case Type::UINT32: {
+      uint32_t ui32 = value.scalar_as<UInt32Scalar>().value;
+      if (ui32 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      if (ui32 <= std::numeric_limits<uint16_t>::max()) {
+        return uint16();
+      }
+      return uint32();
+    }
+    case Type::INT64: {
+      int64_t i64 = value.scalar_as<Int64Scalar>().value;
+      if (i64 <= std::numeric_limits<int8_t>::max() &&
+          i64 >= std::numeric_limits<int8_t>::min()) {
+        return int8();
+      }
+      if (i64 <= std::numeric_limits<int16_t>::max() &&
+          i64 >= std::numeric_limits<int16_t>::min()) {
+        return int16();
+      }
+      if (i64 <= std::numeric_limits<int32_t>::max() &&
+          i64 >= std::numeric_limits<int32_t>::min()) {
+        return int32();
+      }
+      return int64();
+    }
+    case Type::UINT64: {
+      uint64_t ui64 = value.scalar_as<UInt64Scalar>().value;
+      if (ui64 <= std::numeric_limits<uint8_t>::max()) {
+        return uint8();
+      }
+      if (ui64 <= std::numeric_limits<uint16_t>::max()) {
+        return uint16();
+      }
+      if (ui64 <= std::numeric_limits<uint32_t>::max()) {
+        return uint32();
+      }
+      return uint64();
+    }
+    case Type::DOUBLE: {
+      double doub = value.scalar_as<DoubleScalar>().value;
+      if (!std::isfinite(doub)) {
+        // Special values can be float
+        return float32();
+      }
+      // Converting a finite double that lies outside the float range is
+      // undefined behavior, so rule that out before the round-trip test.
+      if (std::fabs(doub) <= std::numeric_limits<float>::max() &&
+          static_cast<double>(static_cast<float>(doub)) == doub) {
+        return float32();
+      }
+      return float64();
+    }
+    case Type::LARGE_STRING: {
+      // A null scalar may carry no buffer at all; it trivially fits in utf8.
+      const auto& buffer = value.scalar_as<LargeStringScalar>().value;
+      if (buffer == nullptr || buffer->size() <= std::numeric_limits<int32_t>::max()) {
+        return utf8();
+      }
+      return large_utf8();
+    }
+    case Type::LARGE_BINARY: {
+      // A null scalar may carry no buffer at all; it trivially fits in binary.
+      const auto& buffer = value.scalar_as<LargeBinaryScalar>().value;
+      if (buffer == nullptr || buffer->size() <= std::numeric_limits<int32_t>::max()) {
+        return binary();
+      }
+      return large_binary();
+    }
+    case Type::TIMESTAMP: {
+      const auto& ts_type = checked_pointer_cast<TimestampType>(value.type());
+      // Keep the value signed: timestamps before the epoch are negative and an
+      // unsigned conversion would wrap before the divisibility checks below.
+      const int64_t ts = value.scalar_as<TimestampScalar>().value;
+      // Preserve the timezone so the narrowed type stays comparable with
+      // timezone-aware columns.
+      const auto& tz = ts_type->timezone();
+      switch (ts_type->unit()) {
+        case TimeUnit::SECOND:
+          return value.type();
+        case TimeUnit::MILLI:
+          if (ts % 1000 == 0) {
+            return timestamp(TimeUnit::SECOND, tz);
+          }
+          return value.type();
+        case TimeUnit::MICRO:
+          if (ts % 1000000 == 0) {
+            return timestamp(TimeUnit::SECOND, tz);
+          }
+          if (ts % 1000 == 0) {
+            return timestamp(TimeUnit::MILLI, tz);
+          }
+          return value.type();
+        case TimeUnit::NANO:
+          if (ts % 1000000000 == 0) {
+            return timestamp(TimeUnit::SECOND, tz);
+          }
+          if (ts % 1000000 == 0) {
+            return timestamp(TimeUnit::MILLI, tz);
+          }
+          if (ts % 1000 == 0) {
+            return timestamp(TimeUnit::MICRO, tz);
+          }
+          return value.type();
+        default:
+          return value.type();
+      }
+    }
+    default:
+      return value.type();
+  }
+}
+
+// Computes the argument types to use for DispatchBest when an exact kernel
+// match failed. Scalar literals are reported with the narrowest type that can
+// represent their value (see SmallestTypeFor) so that DispatchBest, which
+// generally up-casts, ends up casting the literal instead of the other
+// argument. Every other expression is reported with its bound type.
+//
+// All expressions in `exprs` must already be bound.
+inline std::vector<TypeHolder> GetTypesWithSmallestLiteralRepresentation(
+    const std::vector<Expression>& exprs) {
+  std::vector<TypeHolder> types(exprs.size());
+  for (size_t i = 0; i < exprs.size(); ++i) {
+    DCHECK(exprs[i].IsBound());
+    const Datum* literal = exprs[i].literal();
+    if (literal != nullptr && literal->is_scalar()) {
+      types[i] = SmallestTypeFor(*literal);
+    } else {
+      // Non-literals and non-scalar literals keep their bound type. Leaving
+      // the slot default-constructed would hand a null type to DispatchBest.
+      types[i] = exprs[i].type();
+    }
+  }
+  return types;
+}
+
 // Produce a bound Expression from unbound Call and bound arguments.
 Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_casts,
                                     compute::ExecContext* exec_context) {
@@ -377,9 +533,18 @@ Result<Expression> BindNonRecursive(Expression::Call call, bool insert_implicit_
   std::vector<TypeHolder> types = GetTypes(call.arguments);
   ARROW_ASSIGN_OR_RAISE(call.function, GetFunction(call, exec_context));
 
-  if (!insert_implicit_casts) {
-    ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchExact(types));
+  // First try and bind exactly
+  Result<const Kernel*> maybe_exact_match = call.function->DispatchExact(types);
+  if (maybe_exact_match.ok()) {
+    call.kernel = *maybe_exact_match;
   } else {
+    if (!insert_implicit_casts) {
+      return maybe_exact_match.status();
+    }
+    // If exact binding fails, and we are allowed to cast, then prefer casting literals
+    // first.  Since DispatchBest generally prefers up-casting the best way to do this is
+    // first down-cast the literals as much as possible
+    types = GetTypesWithSmallestLiteralRepresentation(call.arguments);
     ARROW_ASSIGN_OR_RAISE(call.kernel, call.function->DispatchBest(&types));
 
     for (size_t i = 0; i < types.size(); ++i) {
diff --git a/cpp/src/arrow/compute/exec/expression_test.cc b/cpp/src/arrow/compute/exec/expression_test.cc
@@ -60,6 +60,7 @@ const std::shared_ptr<Schema> kBoringSchema = schema({
     field("dict_i32", dictionary(int32(), int32())),
     field("ts_ns", timestamp(TimeUnit::NANO)),
     field("ts_s", timestamp(TimeUnit::SECOND)),
+    field("binary", binary()),
 });
 
 #define EXPECT_OK ARROW_EXPECT_OK
@@ -330,6 +331,23 @@ TEST(Expression, Equality) {
   EXPECT_EQ(literal(1), literal(1));
   EXPECT_NE(literal(1), literal(2));
 
+  // NaN literals (of the same type) should be equal.  This allows, for example,
+  // the expression x == NaN to equal itself.
+  auto double_nan_literal = literal(std::numeric_limits<double>::quiet_NaN());
+  auto float_nan_literal = literal(std::numeric_limits<float>::quiet_NaN());
+  EXPECT_EQ(double_nan_literal, double_nan_literal);
+  EXPECT_NE(double_nan_literal, float_nan_literal);
+  // The literals may be equal but the values should not be
+  Expression nans_eq = equal(double_nan_literal, double_nan_literal);
+  ASSERT_OK_AND_ASSIGN(nans_eq, nans_eq.Bind(*kBoringSchema));
+  ASSERT_OK_AND_ASSIGN(Datum nans_eq_rsp, ExecuteScalarExpression(nans_eq, ExecBatch()));
+  EXPECT_FALSE(nans_eq_rsp.scalar_as<BooleanScalar>().value);
+  if (std::numeric_limits<double>::has_signaling_NaN) {
+    // We intentionally do not care about signaling and may even discard it on conversion.
+    EXPECT_EQ(literal(std::numeric_limits<double>::quiet_NaN()),
+              literal(std::numeric_limits<double>::signaling_NaN()));
+  }
+
   EXPECT_EQ(field_ref("a"), field_ref("a"));
   EXPECT_NE(field_ref("a"), field_ref("b"));
   EXPECT_NE(field_ref("a"), literal(2));
@@ -593,8 +611,36 @@ TEST(Expression, BindWithImplicitCasts) {
     ExpectBindsTo(cmp(field_ref("dict_str"), field_ref("str")),
                   cmp(cast(field_ref("dict_str"), utf8()), field_ref("str")));
 
+    // Should prefer the literal
+    ExpectBindsTo(cmp(field_ref("dict_i32"), literal(int64_t(4))),
+                  cmp(field_ref("dict_i32"), literal(int32_t(4))));
     ExpectBindsTo(cmp(field_ref("dict_i32"), literal(int64_t(4))),
-                  cmp(cast(field_ref("dict_i32"), int64()), literal(int64_t(4))));
+                  cmp(field_ref("dict_i32"), literal(int32_t(4))));
+    ExpectBindsTo(cmp(field_ref("ts_s"),
+                      literal(std::make_shared<TimestampScalar>(0, TimeUnit::NANO))),
+                  cmp(field_ref("ts_s"),
+                      literal(std::make_shared<TimestampScalar>(0, TimeUnit::SECOND))));
+    ExpectBindsTo(
+        cmp(field_ref("binary"), literal(std::make_shared<LargeBinaryScalar>("foo"))),
+        cmp(field_ref("binary"), literal(std::make_shared<BinaryScalar>("foo"))));
+
+    // We will not implicitly cast a literal from signed to unsigned or vice versa
+    ExpectBindsTo(cmp(field_ref("i8"), literal(uint8_t(4))),
+                  cmp(cast(field_ref("i8"), int16()), literal(int16_t(4))));
+    ExpectBindsTo(cmp(field_ref("u32"), literal(int64_t(4))),
+                  cmp(cast(field_ref("u32"), int64()), literal(int64_t(4))));
+
+    // NaN / Inf can be float or double as needed
+    ExpectBindsTo(
+        cmp(field_ref("f32"), literal(std::numeric_limits<double>::quiet_NaN())),
+        cmp(field_ref("f32"), literal(std::numeric_limits<float>::quiet_NaN())));
+    ExpectBindsTo(cmp(field_ref("f32"), literal(std::numeric_limits<double>::infinity())),
+                  cmp(field_ref("f32"), literal(std::numeric_limits<float>::infinity())));
+
+    // Bit of an odd case, both fields are cast
+    ExpectBindsTo(cmp(field_ref("i32"), literal(std::make_shared<DoubleScalar>(10.0))),
+                  cmp(cast(field_ref("i32"), float32()),
+                      literal(std::make_shared<FloatScalar>(10.0f))));
   }
 
   compute::SetLookupOptions in_a{ArrayFromJSON(utf8(), R"(["a"])")};
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.cc b/cpp/src/arrow/compute/kernels/codegen_internal.cc
@@ -338,12 +338,11 @@ Status CastBinaryDecimalArgs(DecimalPromotion promotion, std::vector<TypeHolder>
   const DataType& right_type = *(*types)[1];
   DCHECK(is_decimal(left_type.id()) || is_decimal(right_type.id()));
 
-  // decimal + float = float
-  if (is_floating(left_type.id())) {
-    (*types)[1] = (*types)[0];
-    return Status::OK();
-  } else if (is_floating(right_type.id())) {
-    (*types)[0] = (*types)[1];
+  // decimal + float64 = float64
+  // decimal + float32 is roughly float64 + float32 so we choose float64
+  if (is_floating(left_type.id()) || is_floating(right_type.id())) {
+    (*types)[0] = float64();
+    (*types)[1] = float64();
     return Status::OK();
   }
 
diff --git a/cpp/src/arrow/compute/kernels/codegen_internal_test.cc b/cpp/src/arrow/compute/kernels/codegen_internal_test.cc
@@ -34,7 +34,7 @@ TEST(TestDispatchBest, CastBinaryDecimalArgs) {
 
   // Any float -> all float
   for (auto mode : modes) {
-    args = {decimal128(3, 2), float64()};
+    args = {decimal128(3, 2), float32(), float64()};
     ASSERT_OK(CastBinaryDecimalArgs(mode, &args));
     AssertTypeEqual(*args[0], *float64());
     AssertTypeEqual(*args[1], *float64());
diff --git a/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc b/cpp/src/arrow/compute/kernels/scalar_arithmetic_test.cc
@@ -1763,9 +1763,9 @@ TEST_F(TestBinaryArithmeticDecimal, DispatchBest) {
     for (std::string suffix : {"", "_checked"}) {
       name += suffix;
 
-      CheckDispatchBest(name, {decimal128(1, 0), float32()}, {float32(), float32()});
+      CheckDispatchBest(name, {decimal128(1, 0), float32()}, {float64(), float64()});
       CheckDispatchBest(name, {decimal256(1, 0), float64()}, {float64(), float64()});
-      CheckDispatchBest(name, {float32(), decimal256(1, 0)}, {float32(), float32()});
+      CheckDispatchBest(name, {float32(), decimal256(1, 0)}, {float64(), float64()});
       CheckDispatchBest(name, {float64(), decimal128(1, 0)}, {float64(), float64()});
     }
   }
diff --git a/cpp/src/arrow/engine/substrait/serde_test.cc b/cpp/src/arrow/engine/substrait/serde_test.cc
@@ -3757,7 +3757,7 @@ TEST(Substrait, NestedProjectWithMultiFieldExpressions) {
 
   ASSERT_OK_AND_ASSIGN(auto buf, internal::SubstraitFromJSON("Plan", substrait_json));
 
-  auto output_schema = schema({field("A", float64()), field("B", float64())});
+  auto output_schema = schema({field("A", float32()), field("B", float32())});
   auto expected_table = TableFromJSON(output_schema, {R"([
       [20, 20],
       [30, 30],
diff --git a/r/tests/testthat/test-type.R b/r/tests/testthat/test-type.R
@@ -280,7 +280,9 @@ test_that("infer_type() gets the right type for Expression", {
   expect_equal(y$type(), infer_type(y))
   expect_equal(infer_type(y), float64())
   expect_equal(add_xy$type(), infer_type(add_xy))
-  expect_equal(infer_type(add_xy), float64())
+  # even though 10 is a float64, arrow will clamp it to the narrowest
+  # type that can exactly represent it when building expressions
+  expect_equal(infer_type(add_xy), float32())
 })
 
 test_that("infer_type() infers type for POSIXlt", {

Original file line number	Diff line number	Diff line change
`@@ -1763,9 +1763,9 @@ TEST_F(TestBinaryArithmeticDecimal, DispatchBest) {`
`1763`	`1763`	`for (std::string suffix : {"", "_checked"}) {`
`1764`	`1764`	`name += suffix;`
`1765`	`1765`
`1766`		`- CheckDispatchBest(name, {decimal128(1, 0), float32()}, {float32(), float32()});`
	`1766`	`+ CheckDispatchBest(name, {decimal128(1, 0), float32()}, {float64(), float64()});`
`1767`	`1767`	`CheckDispatchBest(name, {decimal256(1, 0), float64()}, {float64(), float64()});`
`1768`		`- CheckDispatchBest(name, {float32(), decimal256(1, 0)}, {float32(), float32()});`
	`1768`	`+ CheckDispatchBest(name, {float32(), decimal256(1, 0)}, {float64(), float64()});`
`1769`	`1769`	`CheckDispatchBest(name, {float64(), decimal128(1, 0)}, {float64(), float64()});`
`1770`	`1770`	`}`
`1771`	`1771`	`}`