From b8c8f63fbedc74f62b9eda3a25fa50a2f94e2856 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 18 Apr 2025 10:58:22 -0500 Subject: [PATCH 01/10] predicate pruning: support dictionaries --- datafusion/physical-optimizer/src/pruning.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 1dd168f181676..186c8b67157ac 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1215,6 +1215,18 @@ fn is_compare_op(op: Operator) -> bool { // For example, casts from string to numbers is not correct. // Because the "13" is less than "3" with UTF8 comparison order. fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Result<()> { + // Unrwavel dictionaries, these are always supported + let from_type = match from_type { + DataType::Dictionary(_, t) => return verify_support_type_for_prune(t.as_ref(), to_type), + _ => from_type, + }; + let to_type = match to_type { + DataType::Dictionary(_, t) => return verify_support_type_for_prune(from_type, t.as_ref()), + _ => to_type, + }; + if from_type == to_type { + return Ok(()); + } // TODO: support other data type for prunable cast or try cast if matches!( from_type, @@ -1544,7 +1556,10 @@ fn build_predicate_expression( Ok(builder) => builder, // allow partial failure in predicate expression generation // this can still produce a useful predicate when multiple conditions are joined using AND - Err(_) => return unhandled_hook.handle(expr), + Err(e) => { + println!("Error building pruning expression: {e}"); + return unhandled_hook.handle(expr) + }, }; build_statistics_expr(&mut expr_builder) From 13b093fb280d7ecf489004618ac0dd250cb549d8 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 18 Apr 2025 11:15:19 -0500 Subject: [PATCH 02/10] more types --- datafusion/physical-optimizer/src/pruning.rs | 64 +++++++++++++++++--- 1 file changed, 57 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 186c8b67157ac..5951ba74b6714 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1215,29 +1215,79 @@ fn is_compare_op(op: Operator) -> bool { // For example, casts from string to numbers is not correct. // Because the "13" is less than "3" with UTF8 comparison order. fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Result<()> { - // Unrwavel dictionaries, these are always supported + // Dictionary casts are always supported as long as the value types are supported let from_type = match from_type { - DataType::Dictionary(_, t) => return verify_support_type_for_prune(t.as_ref(), to_type), + DataType::Dictionary(_, t) => { + return verify_support_type_for_prune(t.as_ref(), to_type) + } _ => from_type, }; let to_type = match to_type { - DataType::Dictionary(_, t) => return verify_support_type_for_prune(from_type, t.as_ref()), + DataType::Dictionary(_, t) => { + return verify_support_type_for_prune(from_type, t.as_ref()) + } _ => to_type, }; + // If the types are the same (e.g. after unpacking a dictionary) they are supported if from_type == to_type { return Ok(()); } // TODO: support other data type for prunable cast or try cast if matches!( + // String -> String casts are suppoted from_type, - DataType::Int8 + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) && matches!( + to_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) { + return Ok(()); + } else if matches!( + // Numeric -> Numeric casts are supported + from_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Decimal128(_, _) + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + ) && matches!( + to_type, + DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Int8 | DataType::Int16 | DataType::Int32 | DataType::Int64 | DataType::Decimal128(_, _) + | DataType::Float16 + | DataType::Float32 + | DataType::Float64 + ) { + Ok(()) + } else if matches!( + // Temporal -> Temporal casts are supported + from_type, + DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) ) && matches!( to_type, - DataType::Int8 | DataType::Int32 | DataType::Int64 | DataType::Decimal128(_, _) + DataType::Date32 + | DataType::Date64 + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Timestamp(_, _) ) { Ok(()) } else { @@ -1558,8 +1608,8 @@ fn build_predicate_expression( // this can still produce a useful predicate when multiple conditions are joined using AND Err(e) => { println!("Error building pruning expression: {e}"); - return unhandled_hook.handle(expr) - }, + return unhandled_hook.handle(expr); + } }; build_statistics_expr(&mut expr_builder) From 4e0706f28270d1acb7abbea943d2c8d02a32066b Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 18 Apr 2025 11:32:45 -0500 Subject: [PATCH 03/10] clippy --- datafusion/physical-optimizer/src/pruning.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 5951ba74b6714..4ab672c25f9ec 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1232,17 +1232,15 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re if from_type == to_type { return Ok(()); } - // TODO: support other data type for prunable cast or try cast - if matches!( + let supported_string_cast = (matches!( // String -> String casts are suppoted from_type, DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View ) && matches!( to_type, DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View - ) { - return Ok(()); - } else if matches!( + )); + let supported_numeric_cast = (matches!( // Numeric -> Numeric casts are supported from_type, DataType::UInt8 @@ -1271,9 +1269,8 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re | DataType::Float16 | DataType::Float32 | DataType::Float64 - ) { - Ok(()) - } else if matches!( + )); + let supported_temporal_cast = (matches!( // Temporal -> Temporal casts are supported from_type, DataType::Date32 @@ -1288,7 +1285,10 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re | DataType::Time32(_) | DataType::Time64(_) | DataType::Timestamp(_, _) - ) { + )); + + // TODO: support other data type for prunable cast or try cast + if supported_string_cast || supported_numeric_cast || supported_temporal_cast { Ok(()) } else { plan_err!( From 67e923f6848e51d140cca5add31cb702287d5a21 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 18 Apr 2025 11:49:07 -0500 Subject: [PATCH 04/10] add tests --- datafusion/physical-optimizer/src/pruning.rs | 75 +++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 4ab672c25f9ec..bcb5275f29050 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -3071,7 +3071,7 @@ mod tests { } #[test] - fn row_group_predicate_cast() -> Result<()> { + fn row_group_predicate_cast_int_int() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 1 AND 1 <= CAST(c1_max@1 AS Int64)"; @@ -3108,6 +3108,79 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_cast_int_float() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Float64) <= 1 AND 1 <= CAST(c1_max@1 AS Float64)"; + + // test cast(c1 as float64) = 1 + // test column on the left + let expr = + cast(col("c1"), DataType::Float64).eq(lit(ScalarValue::Float64(Some(1.0)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = + lit(ScalarValue::Float64(Some(1.0))).eq(cast(col("c1"), DataType::Float64)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_string_string() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Utf8) <= 1 AND 1 <= CAST(c1_max@1 AS Utf8)"; + + // test cast(c1 as string) = '1' + // test column on the left + let expr = cast(col("c1"), DataType::Utf8) + .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("1".to_string()))) + .eq(cast(col("c1"), DataType::Utf8)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_timestamp_date() -> Result<()> { + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Date32) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Date32)"; + + // test cast(c1 as date) = '1970-01-01' + // test column on the left + let expr = + cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(0)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = + lit(ScalarValue::Date32(Some(0))).eq(cast(col("c1"), DataType::Date32)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_cast_list() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); From 7ee1a38b570a885fde856b000917431f57430c2c Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Fri, 18 Apr 2025 11:52:19 -0500 Subject: [PATCH 05/10] add tests --- datafusion/physical-optimizer/src/pruning.rs | 73 +++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index bcb5275f29050..8fdf91938b4bd 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1607,7 +1607,7 @@ fn build_predicate_expression( // allow partial failure in predicate expression generation // this can still produce a useful predicate when multiple conditions are joined using AND Err(e) => { - println!("Error building pruning expression: {e}"); + dbg!(format!("Error building pruning expression: {e}")); return unhandled_hook.handle(expr); } }; @@ -3181,6 +3181,77 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_cast_timestamp_string() -> Result<()> { + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Timestamp(TimeUnit::Nanosecond, None), + false, + )]); + let expected_expr = "true"; + + // test cast(c1 as string) = '1970-01-01' + // test column on the left + let expr = cast(col("c1"), DataType::Utf8) + .eq(lit(ScalarValue::Utf8(Some("1970-01-01".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("1970-01-01".to_string()))) + .eq(cast(col("c1"), DataType::Utf8)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_string_int() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); + let expected_expr = "true"; + + // test cast(c1 as int) = 1 + // test column on the left + let expr = cast(col("c1"), DataType::Int32).eq(lit(ScalarValue::Int32(Some(1)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Int32(Some(1))).eq(cast(col("c1"), DataType::Int32)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_cast_int_string() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); + let expected_expr = "true"; + + // test cast(c1 as string) = '1' + // test column on the left + let expr = cast(col("c1"), DataType::Utf8) + .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("1".to_string()))) + .eq(cast(col("c1"), DataType::Utf8)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_cast_list() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); From 1b7c4db347b8f42d32e47bb6ef1fde840c06b3ab Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 19 Apr 2025 08:24:41 -0500 Subject: [PATCH 06/10] simplify to dicts --- datafusion/physical-optimizer/src/pruning.rs | 50 ++------------------ 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 8fdf91938b4bd..00f8eb307fdc2 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1232,6 +1232,7 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re if from_type == to_type { return Ok(()); } + // TODO: add an `is_string()` method to DataType let supported_string_cast = (matches!( // String -> String casts are suppoted from_type, @@ -1240,55 +1241,10 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re to_type, DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View )); - let supported_numeric_cast = (matches!( - // Numeric -> Numeric casts are supported - from_type, - DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Decimal128(_, _) - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - ) && matches!( - to_type, - DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Decimal128(_, _) - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - )); - let supported_temporal_cast = (matches!( - // Temporal -> Temporal casts are supported - from_type, - DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Timestamp(_, _) - ) && matches!( - to_type, - DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Timestamp(_, _) - )); + let supported_integer_cast = from_type.is_integer() && to_type.is_integer(); // TODO: support other data type for prunable cast or try cast - if supported_string_cast || supported_numeric_cast || supported_temporal_cast { + if supported_string_cast || supported_integer_cast { Ok(()) } else { plan_err!( From 293e36081c6a5705f51c615d5212f96049d7051d Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 19 Apr 2025 08:28:41 -0500 Subject: [PATCH 07/10] revert most changes --- datafusion/physical-optimizer/src/pruning.rs | 91 +++----------------- 1 file changed, 12 insertions(+), 79 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 00f8eb307fdc2..e853abb2954dc 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1241,10 +1241,20 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re to_type, DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View )); - let supported_integer_cast = from_type.is_integer() && to_type.is_integer(); + let supported_numeric_cast = matches!( + from_type, + DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::Decimal128(_, _) + ) && matches!( + to_type, + DataType::Int8 | DataType::Int32 | DataType::Int64 | DataType::Decimal128(_, _) + ); // TODO: support other data type for prunable cast or try cast - if supported_string_cast || supported_integer_cast { + if supported_string_cast || supported_numeric_cast { Ok(()) } else { plan_err!( @@ -3064,29 +3074,6 @@ mod tests { Ok(()) } - #[test] - fn row_group_predicate_cast_int_float() -> Result<()> { - let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Float64) <= 1 AND 1 <= CAST(c1_max@1 AS Float64)"; - - // test cast(c1 as float64) = 1 - // test column on the left - let expr = - cast(col("c1"), DataType::Float64).eq(lit(ScalarValue::Float64(Some(1.0)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = - lit(ScalarValue::Float64(Some(1.0))).eq(cast(col("c1"), DataType::Float64)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - #[test] fn row_group_predicate_cast_string_string() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); @@ -3110,60 +3097,6 @@ mod tests { Ok(()) } - #[test] - fn row_group_predicate_cast_timestamp_date() -> Result<()> { - let schema = Schema::new(vec![Field::new( - "c1", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - )]); - let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Date32) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Date32)"; - - // test cast(c1 as date) = '1970-01-01' - // test column on the left - let expr = - cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(0)))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = - lit(ScalarValue::Date32(Some(0))).eq(cast(col("c1"), DataType::Date32)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - - #[test] - fn row_group_predicate_cast_timestamp_string() -> Result<()> { - let schema = Schema::new(vec![Field::new( - "c1", - DataType::Timestamp(TimeUnit::Nanosecond, None), - false, - )]); - let expected_expr = "true"; - - // test cast(c1 as string) = '1970-01-01' - // test column on the left - let expr = cast(col("c1"), DataType::Utf8) - .eq(lit(ScalarValue::Utf8(Some("1970-01-01".to_string())))); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - // test column on the right - let expr = lit(ScalarValue::Utf8(Some("1970-01-01".to_string()))) - .eq(cast(col("c1"), DataType::Utf8)); - let predicate_expr = - test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); - assert_eq!(predicate_expr.to_string(), expected_expr); - - Ok(()) - } - #[test] fn row_group_predicate_cast_string_int() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); From 5e91ba8c944bf872966bca8bc11c7ef77b58041b Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 19 Apr 2025 19:22:35 -0500 Subject: [PATCH 08/10] just check for strings, more tests --- datafusion/physical-optimizer/src/pruning.rs | 103 ++++++++++++++----- 1 file changed, 77 insertions(+), 26 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index e853abb2954dc..2b48039c0350b 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1210,6 +1210,13 @@ fn is_compare_op(op: Operator) -> bool { ) } +fn is_string_type(data_type: &DataType) -> bool { + matches!( + data_type, + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + ) +} + // The pruning logic is based on the comparing the min/max bounds. // Must make sure the two type has order. // For example, casts from string to numbers is not correct. @@ -1232,29 +1239,10 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re if from_type == to_type { return Ok(()); } - // TODO: add an `is_string()` method to DataType - let supported_string_cast = (matches!( - // String -> String casts are suppoted - from_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View - ) && matches!( - to_type, - DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View - )); - let supported_numeric_cast = matches!( - from_type, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Decimal128(_, _) - ) && matches!( - to_type, - DataType::Int8 | DataType::Int32 | DataType::Int64 | DataType::Decimal128(_, _) - ); - - // TODO: support other data type for prunable cast or try cast - if supported_string_cast || supported_numeric_cast { + // If both types are strings or both are not strings (number, timestamp, etc) + // then we can compare them. + // PruningPredicate does not support casting of strings to numbers and such. + if is_string_type(from_type) == is_string_type(to_type) { Ok(()) } else { plan_err!( @@ -3079,7 +3067,6 @@ mod tests { let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Utf8) <= 1 AND 1 <= CAST(c1_max@1 AS Utf8)"; - // test cast(c1 as string) = '1' // test column on the left let expr = cast(col("c1"), DataType::Utf8) .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); @@ -3102,7 +3089,6 @@ mod tests { let schema = Schema::new(vec![Field::new("c1", DataType::Utf8View, false)]); let expected_expr = "true"; - // test cast(c1 as int) = 1 // test column on the left let expr = cast(col("c1"), DataType::Int32).eq(lit(ScalarValue::Int32(Some(1)))); let predicate_expr = @@ -3123,7 +3109,6 @@ mod tests { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); let expected_expr = "true"; - // test cast(c1 as string) = '1' // test column on the left let expr = cast(col("c1"), DataType::Utf8) .eq(lit(ScalarValue::Utf8(Some("1".to_string())))); @@ -3141,6 +3126,72 @@ mod tests { Ok(()) } + #[test] + fn row_group_predicate_date_date() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Date64) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Date64)"; + + // test column on the left + let expr = cast(col("c1"), DataType::Date64) + .eq(lit(ScalarValue::Date64(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Date64(Some(123))) + .eq(cast(col("c1"), DataType::Date64)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_date_string() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Utf8, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = cast(col("c1"), DataType::Date32) + .eq(lit(ScalarValue::Date32(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Date32(Some(123))) + .eq(cast(col("c1"), DataType::Date32)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_string_date() -> Result<()> { + let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = cast(col("c1"), DataType::Utf8) + .eq(lit(ScalarValue::Utf8(Some("2024-01-01".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("2024-01-01".to_string()))) + .eq(cast(col("c1"), DataType::Utf8)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + #[test] fn row_group_predicate_cast_list() -> Result<()> { let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]); From 797f1ac6ef78f8effe9759e761faea57eb1c5aa1 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 19 Apr 2025 19:44:46 -0500 Subject: [PATCH 09/10] more tests --- datafusion/physical-optimizer/src/pruning.rs | 171 ++++++++++++++++++- 1 file changed, 163 insertions(+), 8 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 2b48039c0350b..4be4044d034b0 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -3132,15 +3132,170 @@ mod tests { let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Date64) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Date64)"; // test column on the left - let expr = cast(col("c1"), DataType::Date64) - .eq(lit(ScalarValue::Date64(Some(123)))); + let expr = + cast(col("c1"), DataType::Date64).eq(lit(ScalarValue::Date64(Some(123)))); let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); // test column on the right - let expr = lit(ScalarValue::Date64(Some(123))) - .eq(cast(col("c1"), DataType::Date64)); + let expr = + lit(ScalarValue::Date64(Some(123))).eq(cast(col("c1"), DataType::Date64)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_string_date() -> Result<()> { + // Test with Dictionary for the literal + let schema = Schema::new(vec![Field::new("c1", DataType::Date32, false)]); + let expected_expr = "true"; + + // test column on the left + let expr = cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + ) + .eq(lit(ScalarValue::Utf8(Some("2024-01-01".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = lit(ScalarValue::Utf8(Some("2024-01-01".to_string()))).eq(cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + )); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_date_dict_string() -> Result<()> { + // Test with Dictionary for the column + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + false, + )]); + let expected_expr = "true"; + + // test column on the left + let expr = + cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + // test column on the right + let expr = + lit(ScalarValue::Date32(Some(123))).eq(cast(col("c1"), DataType::Date32)); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_dict_same_value_type() -> Result<()> { + // Test with Dictionary types that have the same value type but different key types + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Utf8)), + false, + )]); + + // Direct comparison with no cast + let expr = col("c1").eq(lit(ScalarValue::Utf8(Some("test".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + let expected_expr = + "c1_null_count@2 != row_count@3 AND c1_min@0 <= test AND test <= c1_max@1"; + assert_eq!(predicate_expr.to_string(), expected_expr); + + // Test with column cast to a dictionary with different key type + let expr = cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Utf8)), + ) + .eq(lit(ScalarValue::Utf8(Some("test".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Dictionary(UInt16, Utf8)) <= test AND test <= CAST(c1_max@1 AS Dictionary(UInt16, Utf8))"; + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_dict_different_value_type() -> Result<()> { + // Test with Dictionary types that have different value types + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Int32)), + false, + )]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Int64) <= 123 AND 123 <= CAST(c1_max@1 AS Int64)"; + + // Test with literal of a different type + let expr = + cast(col("c1"), DataType::Int64).eq(lit(ScalarValue::Int64(Some(123)))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_nested_dict() -> Result<()> { + // Test with nested Dictionary types + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary( + Box::new(DataType::UInt8), + Box::new(DataType::Dictionary( + Box::new(DataType::UInt16), + Box::new(DataType::Utf8), + )), + ), + false, + )]); + let expected_expr = + "c1_null_count@2 != row_count@3 AND c1_min@0 <= test AND test <= c1_max@1"; + + // Test with a simple literal + let expr = col("c1").eq(lit(ScalarValue::Utf8(Some("test".to_string())))); + let predicate_expr = + test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); + assert_eq!(predicate_expr.to_string(), expected_expr); + + Ok(()) + } + + #[test] + fn row_group_predicate_dict_date_dict_date() -> Result<()> { + // Test with dictionary-wrapped date types for both sides + let schema = Schema::new(vec![Field::new( + "c1", + DataType::Dictionary(Box::new(DataType::UInt8), Box::new(DataType::Date32)), + false, + )]); + let expected_expr = "c1_null_count@2 != row_count@3 AND CAST(c1_min@0 AS Dictionary(UInt16, Date64)) <= 1970-01-01 AND 1970-01-01 <= CAST(c1_max@1 AS Dictionary(UInt16, Date64))"; + + // Test with a cast to a different date type + let expr = cast( + col("c1"), + DataType::Dictionary(Box::new(DataType::UInt16), Box::new(DataType::Date64)), + ) + .eq(lit(ScalarValue::Date64(Some(123)))); let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); @@ -3154,15 +3309,15 @@ mod tests { let expected_expr = "true"; // test column on the left - let expr = cast(col("c1"), DataType::Date32) - .eq(lit(ScalarValue::Date32(Some(123)))); + let expr = + cast(col("c1"), DataType::Date32).eq(lit(ScalarValue::Date32(Some(123)))); let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); // test column on the right - let expr = lit(ScalarValue::Date32(Some(123))) - .eq(cast(col("c1"), DataType::Date32)); + let expr = + lit(ScalarValue::Date32(Some(123))).eq(cast(col("c1"), DataType::Date32)); let predicate_expr = test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new()); assert_eq!(predicate_expr.to_string(), expected_expr); From d579fda82a2a2d548d60565114f80fd60439260c Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Sat, 19 Apr 2025 19:47:58 -0500 Subject: [PATCH 10/10] remove unecessary now confusing clause --- datafusion/physical-optimizer/src/pruning.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/datafusion/physical-optimizer/src/pruning.rs b/datafusion/physical-optimizer/src/pruning.rs index 4be4044d034b0..b62b72ac6dd7d 100644 --- a/datafusion/physical-optimizer/src/pruning.rs +++ b/datafusion/physical-optimizer/src/pruning.rs @@ -1235,10 +1235,6 @@ fn verify_support_type_for_prune(from_type: &DataType, to_type: &DataType) -> Re } _ => to_type, }; - // If the types are the same (e.g. after unpacking a dictionary) they are supported - if from_type == to_type { - return Ok(()); - } // If both types are strings or both are not strings (number, timestamp, etc) // then we can compare them. // PruningPredicate does not support casting of strings to numbers and such.