diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 50e479af36204..e993f1ceee880 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -22,11 +22,12 @@ use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, Dependency, Diagnostic, Result, Span, }; -use datafusion_expr::expr::{ - NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction, +use datafusion_expr::{ + expr, + expr::{NullTreatment, ScalarFunction, Unnest, WildcardOptions, WindowFunction}, + planner::{PlannerResult, RawAggregateExpr, RawWindowExpr}, + Expr, ExprSchemable, SortExpr, WindowFrame, WindowFunctionDefinition, }; -use datafusion_expr::planner::{PlannerResult, RawAggregateExpr, RawWindowExpr}; -use datafusion_expr::{expr, Expr, ExprSchemable, WindowFrame, WindowFunctionDefinition}; use sqlparser::ast::{ DuplicateTreatment, Expr as SQLExpr, Function as SQLFunction, FunctionArg, FunctionArgExpr, FunctionArgumentClause, FunctionArgumentList, FunctionArguments, @@ -212,6 +213,9 @@ impl FunctionArgs { } } +// Helper type for extracting WITHIN GROUP ordering and prepended args +type WithinGroupExtraction = (Vec, Vec, Vec>); + impl SqlToRel<'_, S> { pub(super) fn sql_function_to_expr( &self, @@ -490,31 +494,30 @@ impl SqlToRel<'_, S> { let (mut args, mut arg_names) = self.function_args_to_expr_with_names(args, schema, planner_context)?; - let order_by = if fm.supports_within_group_clause() { - let within_group = self.order_by_to_sort_expr( - within_group, - schema, - planner_context, - false, - None, - )?; - - // Add the WITHIN GROUP ordering expressions to the front of the argument list - // So function(arg) WITHIN GROUP (ORDER BY x) becomes function(x, arg) - if !within_group.is_empty() { - // Prepend None arg names for each WITHIN GROUP expression - let within_group_count = within_group.len(); - arg_names = std::iter::repeat_n(None, within_group_count) - .chain(arg_names) - .collect(); - - args = within_group - .iter() - .map(|sort| sort.expr.clone()) - .chain(args) - .collect::>(); - } - within_group + // UDAFs must opt-in via `supports_within_group_clause()` to + // accept a WITHIN GROUP clause. + let supports_within_group = fm.supports_within_group_clause(); + + if !within_group.is_empty() && !supports_within_group { + return plan_err!( + "WITHIN GROUP is only supported for ordered-set aggregate functions" + ); + } + + // If the UDAF supports WITHIN GROUP, convert the ordering into + // sort expressions and prepend them as unnamed function args. + let order_by = if supports_within_group { + let (within_group_sorts, new_args, new_arg_names) = self + .extract_and_prepend_within_group_args( + within_group, + args, + arg_names, + schema, + planner_context, + )?; + args = new_args; + arg_names = new_arg_names; + within_group_sorts } else { let order_by = if !order_by.is_empty() { order_by @@ -807,6 +810,38 @@ impl SqlToRel<'_, S> { Ok((exprs, names)) } + fn extract_and_prepend_within_group_args( + &self, + within_group: Vec, + mut args: Vec, + mut arg_names: Vec>, + schema: &DFSchema, + planner_context: &mut PlannerContext, + ) -> Result { + let within_group = self.order_by_to_sort_expr( + within_group, + schema, + planner_context, + false, + None, + )?; + + if !within_group.is_empty() { + let within_group_count = within_group.len(); + arg_names = std::iter::repeat_n(None, within_group_count) + .chain(arg_names) + .collect(); + + args = within_group + .iter() + .map(|sort| sort.expr.clone()) + .chain(args) + .collect::>(); + } + + Ok((within_group, args, arg_names)) + } + pub(crate) fn check_unnest_arg(arg: &Expr, schema: &DFSchema) -> Result<()> { // Check argument type, array types are supported match arg.get_type(schema)? { diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 96d9f23522f1f..b69352159a963 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -38,10 +38,12 @@ use datafusion_sql::{ use crate::common::{CustomExprPlanner, CustomTypePlanner, MockSessionState}; use datafusion_functions::core::planner::CoreFunctionPlanner; use datafusion_functions_aggregate::{ - approx_median::approx_median_udaf, count::count_udaf, min_max::max_udaf, - min_max::min_udaf, + approx_median::approx_median_udaf, + average::avg_udaf, + count::count_udaf, + grouping::grouping_udaf, + min_max::{max_udaf, min_udaf}, }; -use datafusion_functions_aggregate::{average::avg_udaf, grouping::grouping_udaf}; use datafusion_functions_nested::make_array::make_array_udf; use datafusion_functions_window::{rank::rank_udwf, row_number::row_number_udwf}; use insta::{allow_duplicates, assert_snapshot}; @@ -233,6 +235,22 @@ fn parse_ident_normalization_4() { ); } +#[test] +fn within_group_rejected_for_non_ordered_set_udaf() { + // MIN is order-sensitive by nature but does not implement the + // ordered-set `WITHIN GROUP` opt-in. The planner must reject + // explicit `WITHIN GROUP` syntax for functions that do not + // advertise `supports_within_group_clause()`. + let sql = "SELECT min(c1) WITHIN GROUP (ORDER BY c1) FROM person"; + let err = logical_plan(sql) + .expect_err("expected planning to fail for MIN WITHIN GROUP") + .to_string(); + assert_contains!( + err, + "WITHIN GROUP is only supported for ordered-set aggregate functions" + ); +} + #[test] fn parse_ident_normalization_5() { let sql = "SELECT AGE FROM PERSON"; diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 132922e42d38d..0445df90f682e 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -129,6 +129,16 @@ CREATE TABLE group_median_table_nullable ( # Error tests ####### +statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions +SELECT SUM(c2) WITHIN GROUP (ORDER BY c2) FROM aggregate_test_100 + +# WITHIN GROUP rejected for non-ordered-set UDAF +# MIN does not implement ordered-set semantics (`supports_within_group_clause()`), +# so the planner should reject the WITHIN GROUP syntax. +statement error DataFusion error: Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions +SELECT MIN(c) WITHIN GROUP (ORDER BY c) FROM (VALUES (1),(2)) as t(c); + + # https://github.com/apache/datafusion/issues/3353 statement error DataFusion error: Schema error: Schema contains duplicate unqualified field name "approx_distinct\(aggregate_test_100\.c9\)" SELECT approx_distinct(c9) count_c9, approx_distinct(cast(c9 as varchar)) count_c9_str FROM aggregate_test_100 @@ -7867,17 +7877,15 @@ VALUES ---- x 1 -query ? +query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions SELECT array_agg(a_varchar) WITHIN GROUP (ORDER BY a_varchar) FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar); ----- -[a, a, c, d] -query ? + +query error Error during planning: WITHIN GROUP is only supported for ordered-set aggregate functions SELECT array_agg(DISTINCT a_varchar) WITHIN GROUP (ORDER BY a_varchar) FROM (VALUES ('a'), ('d'), ('c'), ('a')) t(a_varchar); ----- -[a, c, d] + query error Error during planning: ORDER BY and WITHIN GROUP clauses cannot be used together in the same aggregate function SELECT array_agg(a_varchar order by a_varchar) WITHIN GROUP (ORDER BY a_varchar) diff --git a/dev/update_function_docs.sh b/dev/update_function_docs.sh index 6ed760bd22ff4..63f4f2475c471 100755 --- a/dev/update_function_docs.sh +++ b/dev/update_function_docs.sh @@ -78,6 +78,36 @@ FROM employees; ``` Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`. + +## WITHIN GROUP / Ordered-set aggregates + +Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the +aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns +`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to +use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail +during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions". + +Currently, the built-in aggregate functions that support `WITHIN GROUP` are: + +- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`) +- `approx_percentile_cont` — approximate percentile using the t-digest algorithm +- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm + +Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and +use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion. + +Example (ordered-set aggregate): + +```sql +percentile_cont(0.5) WITHIN GROUP (ORDER BY value) +``` + +Example (invalid usage — planner will error): + +```sql +-- This will fail: SUM is not an ordered-set aggregate +SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t; +``` EOF echo "Running CLI and inserting aggregate function docs table" diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 7cc84f424be45..8288923008606 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -25,6 +25,27 @@ You can see the current [status of the `52.0.0` release here](https://github.com/apache/datafusion/issues/18566) +### Planner now requires explicit opt-in for WITHIN GROUP syntax + +The SQL planner now enforces the aggregate UDF contract more strictly: the +`WITHIN GROUP (ORDER BY ...)` syntax is accepted only if the aggregate UDAF +explicitly advertises support by returning `true` from +`AggregateUDFImpl::supports_within_group_clause()`. + +Previously the planner forwarded a `WITHIN GROUP` clause to order-sensitive +aggregates even when they did not implement ordered-set semantics, which could +cause queries such as `SUM(x) WITHIN GROUP (ORDER BY x)` to plan successfully. +This behavior was too permissive and has been changed to match PostgreSQL and +the documented semantics. + +Migration: If your UDAF intentionally implements ordered-set semantics and +wants to accept the `WITHIN GROUP` SQL syntax, update your implementation to +return `true` from `supports_within_group_clause()` and handle the ordering +semantics in your accumulator implementation. If your UDAF is merely +order-sensitive (but not an ordered-set aggregate), do not advertise +`supports_within_group_clause()` and clients should use alternative function +signatures (for example, explicit ordering as a function argument) instead. + ### `AggregateUDFImpl::supports_null_handling_clause` now defaults to `false` This method specifies whether an aggregate function allows `IGNORE NULLS`/`RESPECT NULLS` diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index f17e09f2ce9d0..ba9c6ae12477b 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -48,6 +48,36 @@ FROM employees; Note: When no rows pass the filter, `COUNT` returns `0` while `SUM`/`AVG`/`MIN`/`MAX` return `NULL`. +## WITHIN GROUP / Ordered-set aggregates + +Some aggregate functions accept the SQL `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering the +aggregate relies on. In DataFusion this is opt-in: only aggregate functions whose implementation returns +`true` from `AggregateUDFImpl::supports_within_group_clause()` accept the `WITHIN GROUP` clause. Attempting to +use `WITHIN GROUP` with a regular aggregate (for example, `SELECT SUM(x) WITHIN GROUP (ORDER BY x)`) will fail +during planning with an error: "WITHIN GROUP is only supported for ordered-set aggregate functions". + +Currently, the built-in aggregate functions that support `WITHIN GROUP` are: + +- `percentile_cont` — exact percentile aggregate (also available as `percentile_cont(column, percentile)`) +- `approx_percentile_cont` — approximate percentile using the t-digest algorithm +- `approx_percentile_cont_with_weight` — approximate weighted percentile using the t-digest algorithm + +Note: rank-like functions such as `rank()`, `dense_rank()`, and `percent_rank()` are window functions and +use the `OVER (...)` clause; they are not ordered-set aggregates that accept `WITHIN GROUP` in DataFusion. + +Example (ordered-set aggregate): + +```sql +percentile_cont(0.5) WITHIN GROUP (ORDER BY value) +``` + +Example (invalid usage — planner will error): + +```sql +-- This will fail: SUM is not an ordered-set aggregate +SELECT SUM(x) WITHIN GROUP (ORDER BY x) FROM t; +``` + ## General Functions - [array_agg](#array_agg)