diff --git a/Cargo.lock b/Cargo.lock index 4bc6a6728fe17..6921c5b080e55 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2389,6 +2389,7 @@ dependencies = [ "arrow", "async-trait", "chrono", + "criterion", "ctor", "datafusion-common", "datafusion-expr", diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 61d101aab3f8e..60358d20e2a1a 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -55,6 +55,7 @@ regex-syntax = "0.8.0" [dev-dependencies] async-trait = { workspace = true } +criterion = { workspace = true } ctor = { workspace = true } datafusion-functions-aggregate = { workspace = true } datafusion-functions-window = { workspace = true } @@ -62,3 +63,7 @@ datafusion-functions-window-common = { workspace = true } datafusion-sql = { workspace = true } env_logger = { workspace = true } insta = { workspace = true } + +[[bench]] +name = "projection_unnecessary" +harness = false diff --git a/datafusion/optimizer/benches/projection_unnecessary.rs b/datafusion/optimizer/benches/projection_unnecessary.rs new file mode 100644 index 0000000000000..100ee97542ebb --- /dev/null +++ b/datafusion/optimizer/benches/projection_unnecessary.rs @@ -0,0 +1,79 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::datatypes::{DataType, Field, Schema}; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use datafusion_common::ToDFSchema; +use datafusion_common::{Column, TableReference}; +use datafusion_expr::{logical_plan::LogicalPlan, projection_schema, Expr}; +use datafusion_optimizer::optimize_projections::is_projection_unnecessary; +use std::sync::Arc; + +fn is_projection_unnecessary_old( + input: &LogicalPlan, + proj_exprs: &[Expr], +) -> datafusion_common::Result { + // First check if all expressions are trivial (cheaper operation than `projection_schema`) + if !proj_exprs + .iter() + .all(|expr| matches!(expr, Expr::Column(_) | Expr::Literal(_))) + { + return Ok(false); + } + let proj_schema = projection_schema(input, proj_exprs)?; + Ok(&proj_schema == input.schema()) +} + +fn create_plan_with_many_exprs(num_exprs: usize) -> (LogicalPlan, Vec) { + // Create schema with many fields + let fields = (0..num_exprs) + .map(|i| Field::new(format!("col{}", i), DataType::Int32, false)) + .collect::>(); + let schema = Schema::new(fields); + + // Create table scan + let table_scan = LogicalPlan::EmptyRelation(datafusion_expr::EmptyRelation { + produce_one_row: true, + schema: Arc::new(schema.clone().to_dfschema().unwrap()), + }); + + // Create projection expressions (just column references) + let exprs = (0..num_exprs) + .map(|i| Expr::Column(Column::new(None::, format!("col{}", i)))) + .collect(); + + (table_scan, exprs) +} + +fn benchmark_is_projection_unnecessary(c: &mut Criterion) { + let (plan, exprs) = create_plan_with_many_exprs(1000); + + let mut group = c.benchmark_group("projection_unnecessary_comparison"); + + group.bench_function("is_projection_unnecessary_new", |b| { + b.iter(|| black_box(is_projection_unnecessary(&plan, &exprs).unwrap())) + }); + + group.bench_function("is_projection_unnecessary_old", |b| { + b.iter(|| black_box(is_projection_unnecessary_old(&plan, &exprs).unwrap())) + }); + + group.finish(); +} + +criterion_group!(benches, benchmark_is_projection_unnecessary); +criterion_main!(benches); diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index a070a998ea445..4452b2d4ce034 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -31,8 +31,7 @@ use datafusion_common::{ use datafusion_expr::expr::Alias; use datafusion_expr::Unnest; use datafusion_expr::{ - logical_plan::LogicalPlan, projection_schema, Aggregate, Distinct, Expr, Projection, - TableScan, Window, + logical_plan::LogicalPlan, Aggregate, Distinct, Expr, Projection, TableScan, Window, }; use crate::optimize_projections::required_indices::RequiredIndices; @@ -785,13 +784,24 @@ fn rewrite_projection_given_requirements( /// Projection is unnecessary, when /// - input schema of the projection, output schema of the projection are same, and /// - all projection expressions are either Column or Literal -fn is_projection_unnecessary(input: &LogicalPlan, proj_exprs: &[Expr]) -> Result { - // First check if all expressions are trivial (cheaper operation than `projection_schema`) - if !proj_exprs.iter().all(is_expr_trivial) { +pub fn is_projection_unnecessary( + input: &LogicalPlan, + proj_exprs: &[Expr], +) -> Result { + // First check if the number of expressions is equal to the number of fields in the input schema. + if proj_exprs.len() != input.schema().fields().len() { return Ok(false); } - let proj_schema = projection_schema(input, proj_exprs)?; - Ok(&proj_schema == input.schema()) + Ok(input.schema().iter().zip(proj_exprs.iter()).all( + |((field_relation, field_name), expr)| { + // Check if the expression is a column and if it matches the field name + if let Expr::Column(col) = expr { + col.relation.as_ref() == field_relation && col.name.eq(field_name.name()) + } else { + false + } + }, + )) } #[cfg(test)]