Skip to content

Commit 714c5df

Browse files
author
yangzhong
committed
Refine the statistics estimation for the limit and aggregate operator
1 parent a2bebe6 commit 714c5df

2 files changed

Lines changed: 30 additions & 9 deletions

File tree

datafusion/core/src/physical_plan/aggregates/mod.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -505,7 +505,11 @@ impl ExecutionPlan for AggregateExec {
505505
..Default::default()
506506
}
507507
}
508-
_ => Statistics::default(),
508+
_ => Statistics {
509+
num_rows: self.input.statistics().num_rows,
510+
is_exact: false,
511+
..Default::default()
512+
},
509513
}
510514
}
511515
}

datafusion/core/src/physical_plan/limit.rs

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,16 @@ impl ExecutionPlan for GlobalLimitExec {
194194
fn statistics(&self) -> Statistics {
195195
let input_stats = self.input.statistics();
196196
let skip = self.skip;
197+
let max_row_num = self
198+
.fetch
199+
.map(|fetch| {
200+
if fetch >= usize::MAX - skip {
201+
usize::MAX
202+
} else {
203+
fetch + skip
204+
}
205+
})
206+
.unwrap_or(usize::MAX);
197207
match input_stats {
198208
Statistics {
199209
num_rows: Some(nr), ..
@@ -205,22 +215,25 @@ impl ExecutionPlan for GlobalLimitExec {
205215
is_exact: input_stats.is_exact,
206216
..Default::default()
207217
}
208-
} else if nr - skip <= self.fetch.unwrap_or(usize::MAX) {
218+
} else if nr <= max_row_num {
209219
// if the input does not reach the "fetch" globally, return input stats
210220
input_stats
211-
} else if nr - skip > self.fetch.unwrap_or(usize::MAX) {
221+
} else {
212222
// if the input is greater than the "fetch", the num_row will be the "fetch",
213223
// but we won't be able to predict the other statistics
214224
Statistics {
215-
num_rows: self.fetch,
225+
num_rows: Some(max_row_num),
216226
is_exact: input_stats.is_exact,
217227
..Default::default()
218228
}
219-
} else {
220-
Statistics::default()
221229
}
222230
}
223-
_ => Statistics::default(),
231+
_ => Statistics {
232+
// the result output row number will always be no greater than the limit number
233+
num_rows: Some(max_row_num),
234+
is_exact: false,
235+
..Default::default()
236+
},
224237
}
225238
}
226239
}
@@ -358,8 +371,12 @@ impl ExecutionPlan for LocalLimitExec {
358371
is_exact: input_stats.is_exact,
359372
..Default::default()
360373
},
361-
// if we don't know the input size, we can't predict the limit's behaviour
362-
_ => Statistics::default(),
374+
_ => Statistics {
375+
// the result output row number will always be no greater than the limit number
376+
num_rows: Some(self.fetch * self.output_partitioning().partition_count()),
377+
is_exact: false,
378+
..Default::default()
379+
},
363380
}
364381
}
365382
}

0 commit comments

Comments
 (0)