Skip to content

Commit 64e8559

Browse files
committed
Make Like a top-level Expr
1 parent 191d8b7 commit 64e8559

16 files changed

Lines changed: 360 additions & 95 deletions

File tree

datafusion/core/src/execution/context.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1439,6 +1439,8 @@ impl SessionState {
14391439
rules.push(Arc::new(TypeCoercion::new()));
14401440
rules.push(Arc::new(LimitPushDown::new()));
14411441
rules.push(Arc::new(SingleDistinctToGroupBy::new()));
1442+
// TODO: add a config option so this rule can be turned off, since it is new
1443+
rules.push(Arc::new(TypeCoercion::default()));
14421444

14431445
let mut physical_optimizers: Vec<Arc<dyn PhysicalOptimizerRule + Sync + Send>> = vec![
14441446
Arc::new(AggregateStatistics::new()),
@@ -1589,6 +1591,7 @@ impl SessionState {
15891591
) -> Result<Arc<dyn ExecutionPlan>> {
15901592
let planner = self.query_planner.clone();
15911593
let logical_plan = self.optimize(logical_plan)?;
1594+
println!("optimized plan [2]: {:?}", logical_plan);
15921595
planner.create_physical_plan(&logical_plan, self).await
15931596
}
15941597
}

datafusion/core/src/physical_plan/planner.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1682,6 +1682,8 @@ mod tests {
16821682
use datafusion_expr::expr::GroupingSet;
16831683
use datafusion_expr::sum;
16841684
use datafusion_expr::{col, lit};
1685+
use datafusion_optimizer::type_coercion::TypeCoercion;
1686+
use datafusion_optimizer::{OptimizerConfig, OptimizerRule};
16851687
use fmt::Debug;
16861688
use std::collections::HashMap;
16871689
use std::convert::TryFrom;
@@ -1859,7 +1861,19 @@ mod tests {
18591861
col("c1").like(col("c2")),
18601862
];
18611863
for case in cases {
1862-
let logical_plan = test_csv_scan().await?.project(vec![case.clone()]);
1864+
let logical_plan = test_csv_scan()
1865+
.await?
1866+
.project(vec![case.clone()])
1867+
.and_then(|b| b.build())
1868+
.and_then(|plan| {
1869+
// this test was expecting type coercion/validation errors before the optimizer
1870+
// had run due to the legacy approach of type coercion but now we need to run the
1871+
// optimizer here
1872+
let type_coercion = TypeCoercion::default();
1873+
let mut config = OptimizerConfig::new();
1874+
type_coercion.optimize(&plan, &mut config)
1875+
});
1876+
18631877
let message = format!(
18641878
"Expression {:?} expected to error due to impossible coercion",
18651879
case

datafusion/core/tests/sql/mod.rs

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -751,29 +751,32 @@ async fn try_execute_to_batches(
751751
/// Execute query and return results as a Vec of RecordBatches
752752
async fn execute_to_batches(ctx: &SessionContext, sql: &str) -> Vec<RecordBatch> {
753753
let msg = format!("Creating logical plan for '{}'", sql);
754-
let plan = ctx
754+
let logical_plan = ctx
755755
.create_logical_plan(sql)
756756
.map_err(|e| format!("{:?} at {}", e, msg))
757757
.unwrap();
758-
let logical_schema = plan.schema();
758+
let logical_schema = logical_plan.schema();
759759

760-
let msg = format!("Optimizing logical plan for '{}': {:?}", sql, plan);
761-
let plan = ctx
762-
.optimize(&plan)
760+
let msg = format!("Optimizing logical plan for '{}': {:?}", sql, logical_plan);
761+
let optimized_logical_plan = ctx
762+
.optimize(&logical_plan)
763763
.map_err(|e| format!("{:?} at {}", e, msg))
764764
.unwrap();
765-
let optimized_logical_schema = plan.schema();
766-
767-
let msg = format!("Creating physical plan for '{}': {:?}", sql, plan);
768-
let plan = ctx
769-
.create_physical_plan(&plan)
765+
println!("optimized plan [1]: {:?}", optimized_logical_plan);
766+
let optimized_logical_schema = optimized_logical_plan.schema();
767+
768+
// creating a physical plan will call `optimize` again so we pass in the
769+
// unoptimized logical plan here
770+
let msg = format!("Creating physical plan for '{}': {:?}", sql, logical_plan);
771+
let physical_plan = ctx
772+
.create_physical_plan(&logical_plan)
770773
.await
771774
.map_err(|e| format!("{:?} at {}", e, msg))
772775
.unwrap();
773776

774-
let msg = format!("Executing physical plan for '{}': {:?}", sql, plan);
777+
let msg = format!("Executing physical plan for '{}': {:?}", sql, physical_plan);
775778
let task_ctx = ctx.task_ctx();
776-
let results = collect(plan, task_ctx)
779+
let results = collect(physical_plan, task_ctx)
777780
.await
778781
.map_err(|e| format!("{:?} at {}", e, msg))
779782
.unwrap();

datafusion/expr/src/binary_rule.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,6 @@ pub fn binary_operator_data_type(
4242
| Operator::NotEq
4343
| Operator::And
4444
| Operator::Or
45-
| Operator::Like
46-
| Operator::NotLike
4745
| Operator::Lt
4846
| Operator::Gt
4947
| Operator::GtEq
@@ -95,8 +93,6 @@ pub fn coerce_types(
9593
| Operator::Gt
9694
| Operator::GtEq
9795
| Operator::LtEq => comparison_coercion(lhs_type, rhs_type),
98-
// "like" operators operate on strings and always return a boolean
99-
Operator::Like | Operator::NotLike => like_coercion(lhs_type, rhs_type),
10096
// date +/- interval returns date
10197
Operator::Plus | Operator::Minus
10298
if (*lhs_type == DataType::Date32
@@ -504,7 +500,7 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
504500

505501
/// coercion rules for like operations.
506502
/// This is a union of string coercion rules and dictionary coercion rules
507-
fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
503+
pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
508504
string_coercion(lhs_type, rhs_type)
509505
.or_else(|| dictionary_coercion(lhs_type, rhs_type, false))
510506
.or_else(|| null_coercion(lhs_type, rhs_type))

datafusion/expr/src/expr.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -444,12 +444,22 @@ impl Expr {
444444

445445
/// Return `self LIKE other`
446446
pub fn like(self, other: Expr) -> Expr {
447-
binary_expr(self, Operator::Like, other)
447+
Expr::Like {
448+
negated: false,
449+
expr: Box::new(self),
450+
pattern: Box::new(other),
451+
escape_char: None,
452+
}
448453
}
449454

450455
/// Return `self NOT LIKE other`
451456
pub fn not_like(self, other: Expr) -> Expr {
452-
binary_expr(self, Operator::NotLike, other)
457+
Expr::Like {
458+
negated: true,
459+
expr: Box::new(self),
460+
pattern: Box::new(other),
461+
escape_char: None,
462+
}
453463
}
454464

455465
/// Return `self AS name` alias expression
@@ -505,6 +515,7 @@ impl Not for Expr {
505515
type Output = Self;
506516

507517
fn not(self) -> Self::Output {
518+
// TODO file issue for extending this to other similar expressions
508519
match self {
509520
Expr::Like {
510521
negated,

datafusion/expr/src/expr_fn.rs

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ use crate::expr::GroupingSet;
2121
use crate::{
2222
aggregate_function, built_in_function, conditional_expressions::CaseBuilder, lit,
2323
logical_plan::Subquery, AccumulatorFunctionImplementation, AggregateUDF,
24-
BuiltinScalarFunction, Expr, LogicalPlan, Operator, ReturnTypeFunction,
25-
ScalarFunctionImplementation, ScalarUDF, Signature, StateTypeFunction, Volatility,
24+
BuiltinScalarFunction, Expr, ExprSchemable, LogicalPlan, Operator,
25+
ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, Signature,
26+
StateTypeFunction, Volatility,
2627
};
2728
use arrow::datatypes::DataType;
28-
use datafusion_common::Result;
29+
use datafusion_common::{DFSchema, Result};
2930
use std::sync::Arc;
3031

3132
/// Create a column expression based on a qualified or unqualified column name
@@ -259,6 +260,23 @@ pub fn cast(expr: Expr, data_type: DataType) -> Expr {
259260
}
260261
}
261262

263+
/// Cast `expr` to `data_type`, returning `expr` unchanged when it already has that type in `input_schema`
264+
pub fn cast_if_needed(
265+
expr: Expr,
266+
data_type: &DataType,
267+
input_schema: &DFSchema,
268+
) -> Result<Expr> {
269+
let t = expr.get_type(input_schema)?;
270+
if &t == data_type {
271+
Ok(expr)
272+
} else {
273+
Ok(Expr::Cast {
274+
expr: Box::new(expr),
275+
data_type: data_type.clone(),
276+
})
277+
}
278+
}
279+
262280
/// Create a convenience function representing a unary scalar function
263281
macro_rules! unary_scalar_expr {
264282
($ENUM:ident, $FUNC:ident, $DOC:expr) => {

datafusion/expr/src/operator.rs

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,6 @@ pub enum Operator {
5151
And,
5252
/// Logical OR, like `||`
5353
Or,
54-
/// Matches a wildcard pattern
55-
Like,
56-
/// Does not match a wildcard pattern
57-
NotLike,
5854
/// IS DISTINCT FROM
5955
IsDistinctFrom,
6056
/// IS NOT DISTINCT FROM
@@ -90,8 +86,6 @@ impl Operator {
9086
Operator::LtEq => Some(Operator::Gt),
9187
Operator::Gt => Some(Operator::LtEq),
9288
Operator::GtEq => Some(Operator::Lt),
93-
Operator::Like => Some(Operator::NotLike),
94-
Operator::NotLike => Some(Operator::Like),
9589
Operator::IsDistinctFrom => Some(Operator::IsNotDistinctFrom),
9690
Operator::IsNotDistinctFrom => Some(Operator::IsDistinctFrom),
9791
Operator::Plus
@@ -130,8 +124,6 @@ impl fmt::Display for Operator {
130124
Operator::Modulo => "%",
131125
Operator::And => "AND",
132126
Operator::Or => "OR",
133-
Operator::Like => "LIKE",
134-
Operator::NotLike => "NOT LIKE",
135127
Operator::RegexMatch => "~",
136128
Operator::RegexIMatch => "~*",
137129
Operator::RegexNotMatch => "!~",

datafusion/optimizer/src/simplify_expressions.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,6 +201,18 @@ fn negate_clause(expr: Expr) -> Expr {
201201
};
202202
}
203203
match op {
204+
// not (A is distinct from B) ===> (A is not distinct from B)
205+
Operator::IsDistinctFrom => Expr::BinaryExpr {
206+
left,
207+
op: Operator::IsNotDistinctFrom,
208+
right,
209+
},
210+
// not (A is not distinct from B) ===> (A is distinct from B)
211+
Operator::IsNotDistinctFrom => Expr::BinaryExpr {
212+
left,
213+
op: Operator::IsDistinctFrom,
214+
right,
215+
},
204216
// not (A and B) ===> (not A) or (not B)
205217
Operator::And => {
206218
let left = negate_clause(*left);
@@ -2210,7 +2222,7 @@ mod tests {
22102222
.unwrap()
22112223
.build()
22122224
.unwrap();
2213-
let expected = "Filter: #test.a NOT LIKE #test.b AS NOT test.a LIKE test.b\
2225+
let expected = "Filter: #test.a NOT LIKE #test.b\
22142226
\n TableScan: test";
22152227

22162228
assert_optimized_plan_eq(&plan, expected);
@@ -2232,7 +2244,7 @@ mod tests {
22322244
.unwrap()
22332245
.build()
22342246
.unwrap();
2235-
let expected = "Filter: #test.a LIKE #test.b AS NOT test.a NOT LIKE test.b\
2247+
let expected = "Filter: #test.a LIKE #test.b\
22362248
\n TableScan: test";
22372249

22382250
assert_optimized_plan_eq(&plan, expected);

datafusion/physical-expr/src/expressions/binary.rs

Lines changed: 3 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ use arrow::compute::kernels::arithmetic::{
2828
multiply_scalar, subtract, subtract_scalar,
2929
};
3030
use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene};
31+
use arrow::compute::kernels::comparison::regexp_is_match_utf8;
32+
use arrow::compute::kernels::comparison::regexp_is_match_utf8_scalar;
3133
use arrow::compute::kernels::comparison::{
3234
eq_dyn_binary_scalar, gt_dyn_binary_scalar, gt_eq_dyn_binary_scalar,
3335
lt_dyn_binary_scalar, lt_eq_dyn_binary_scalar, neq_dyn_binary_scalar,
@@ -47,10 +49,6 @@ use arrow::compute::kernels::comparison::{
4749
use arrow::compute::kernels::comparison::{
4850
eq_scalar, gt_eq_scalar, gt_scalar, lt_eq_scalar, lt_scalar, neq_scalar,
4951
};
50-
use arrow::compute::kernels::comparison::{like_utf8, nlike_utf8, regexp_is_match_utf8};
51-
use arrow::compute::kernels::comparison::{
52-
like_utf8_scalar, nlike_utf8_scalar, regexp_is_match_utf8_scalar,
53-
};
5452

5553
use adapter::{eq_dyn, gt_dyn, gt_eq_dyn, lt_dyn, lt_eq_dyn, neq_dyn};
5654
use arrow::compute::kernels::concat_elements::concat_elements_utf8;
@@ -323,19 +321,6 @@ macro_rules! compute_op {
323321
}};
324322
}
325323

326-
macro_rules! binary_string_array_op_scalar {
327-
($LEFT:expr, $RIGHT:expr, $OP:ident, $OP_TYPE:expr) => {{
328-
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
329-
DataType::Utf8 => compute_utf8_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $OP_TYPE),
330-
other => Err(DataFusionError::Internal(format!(
331-
"Data type {:?} not supported for scalar operation '{}' on string array",
332-
other, stringify!($OP)
333-
))),
334-
};
335-
Some(result)
336-
}};
337-
}
338-
339324
macro_rules! binary_string_array_op {
340325
($LEFT:expr, $RIGHT:expr, $OP:ident) => {{
341326
match $LEFT.data_type() {
@@ -623,7 +608,7 @@ impl PhysicalExpr for BinaryExpr {
623608
}
624609

625610
/// unwrap underlying (non dictionary) value, if any, to pass to a scalar kernel
626-
fn unwrap_dict_value(v: ScalarValue) -> ScalarValue {
611+
pub fn unwrap_dict_value(v: ScalarValue) -> ScalarValue {
627612
if let ScalarValue::Dictionary(_key_type, v) = v {
628613
unwrap_dict_value(*v)
629614
} else {
@@ -713,12 +698,6 @@ impl BinaryExpr {
713698
Operator::NotEq => {
714699
binary_array_op_dyn_scalar!(array, scalar.clone(), neq, bool_type)
715700
}
716-
Operator::Like => {
717-
binary_string_array_op_scalar!(array, scalar.clone(), like, bool_type)
718-
}
719-
Operator::NotLike => {
720-
binary_string_array_op_scalar!(array, scalar.clone(), nlike, bool_type)
721-
}
722701
Operator::Plus => {
723702
binary_primitive_array_op_scalar!(array, scalar.clone(), add)
724703
}
@@ -818,8 +797,6 @@ impl BinaryExpr {
818797
right_data_type: &DataType,
819798
) -> Result<ArrayRef> {
820799
match &self.op {
821-
Operator::Like => binary_string_array_op!(left, right, like),
822-
Operator::NotLike => binary_string_array_op!(left, right, nlike),
823800
Operator::Lt => lt_dyn(&left, &right),
824801
Operator::LtEq => lt_eq_dyn(&left, &right),
825802
Operator::Gt => gt_dyn(&left, &right),
@@ -1115,18 +1092,6 @@ mod tests {
11151092
DataType::Float32,
11161093
vec![2f32]
11171094
);
1118-
test_coercion!(
1119-
StringArray,
1120-
DataType::Utf8,
1121-
vec!["hello world", "world"],
1122-
StringArray,
1123-
DataType::Utf8,
1124-
vec!["%hello%", "%hello%"],
1125-
Operator::Like,
1126-
BooleanArray,
1127-
DataType::Boolean,
1128-
vec![true, false]
1129-
);
11301095
test_coercion!(
11311096
StringArray,
11321097
DataType::Utf8,

0 commit comments

Comments
 (0)