Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7a94690
Replace `KeyGroupedPartitioning` with `KeyedPartitioning`, add new `G…
peter-toth Feb 11, 2026
2762976
code cleanup
peter-toth Feb 21, 2026
0fb5be6
more code cleanup
peter-toth Feb 21, 2026
b8b2faa
no need to sort partition keys when building grouped `KeyedPartitioning`
peter-toth Feb 24, 2026
4873513
fix `BatchScanExec.equals()`
peter-toth Feb 24, 2026
6f7b980
remove `originalPartitionKeys` from `KeyedPartitioning` because it is…
peter-toth Feb 24, 2026
dc72b0d
minor name and docs fix
peter-toth Feb 24, 2026
84c3afa
fix `BatchScanExec` canonicalization
peter-toth Feb 25, 2026
5b7677c
more code cleanup and docs fixes
peter-toth Feb 25, 2026
2586620
partially clustered distribution no longer requires `canApplyPartialC…
peter-toth Feb 25, 2026
90a49a3
more code cleanup and comments
peter-toth Feb 26, 2026
a0a8a3d
BatchScanExec code cleanup
peter-toth Feb 27, 2026
d04ceab
Merge branch 'master' into SPARK-55535-refactor-kgp-and-spj
peter-toth Mar 3, 2026
853e6b7
make `isGrouped` precomputed
peter-toth Mar 3, 2026
f5baf76
change `toGrouped` to sort partition keys
peter-toth Mar 3, 2026
174fe90
use `InternalRowComparableWrapper` partition keys in `KeyedPartitioning`
peter-toth Mar 3, 2026
fa43291
fix dead variable
peter-toth Mar 3, 2026
1b6bb29
`GroupPartitionsExec` support columnar execution
peter-toth Mar 3, 2026
94054cb
fix `KeyedPartitioning.isGrouped` calculations
peter-toth Mar 3, 2026
b43017d
fix `applyGroupPartitions` documentation
peter-toth Mar 3, 2026
490f782
address review findings
peter-toth Mar 3, 2026
2a7e0b3
Update sql/core/src/main/scala/org/apache/spark/sql/execution/exchang…
peter-toth Mar 3, 2026
200fdc0
minor test fix
peter-toth Mar 4, 2026
4a904ad
empty partitioned table test
peter-toth Mar 4, 2026
5a4ecd1
additional checks for runtime filter tests
peter-toth Mar 4, 2026
b3d34ef
minor improvements
peter-toth Mar 4, 2026
326915b
refactor `GroupedPartitions`, document what `KeyedPartitioning.satisf…
peter-toth Mar 5, 2026
8526dc1
fix `KeyedPartitioning.isGrouped` when `expectedPartitionKeys` is set
peter-toth Mar 5, 2026
32b563f
add empty groupPartitions test case, fix test spark tags, cleanup SP…
peter-toth Mar 5, 2026
7951dc6
fix BroadcastDistribution in EnsureRequirements
peter-toth Mar 6, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/Partitioner.scala
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ private[spark] class PartitionIdPassthrough(override val numPartitions: Int) ext
/**
* A [[org.apache.spark.Partitioner]] that partitions all records using a partition value map.
* The `valueMap` is a map that contains tuples of (partition value, partition id). It is generated
* by [[org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning]], used to partition
* by [[org.apache.spark.sql.catalyst.plans.physical.KeyedPartitioning]], used to partition
* the other side of a join to make sure records with same partition value are in the same
* partition.
*/
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@

package org.apache.spark.sql.catalyst.util

import scala.collection.mutable

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BaseOrdering, Expression, Murmur3HashFunction, RowOrdering}
import org.apache.spark.sql.connector.read.{HasPartitionKey, InputPartition}
Expand Down Expand Up @@ -101,31 +99,6 @@ object InternalRowComparableWrapper {
new InternalRowComparableWrapper(partitionRow, partitionExpression.map(_.dataType))
}

/**
 * Combines the partition rows of two sides into one de-duplicated sequence of comparable keys.
 *
 * Every row is wrapped with a single shared factory derived from the data types of
 * `partitionExpression`, then collected into a hash set per side.
 *
 * @param leftPartitioning partition rows of the left side
 * @param rightPartitioning partition rows of the right side
 * @param partitionExpression expressions whose data types describe the layout of the rows
 * @param intersect when true only keys present on both sides are kept; otherwise (the default)
 *                  keys present on either side are kept
 * @return the distinct merged keys, in the iteration order of the resulting hash set
 */
def mergePartitions(
    leftPartitioning: Seq[InternalRow],
    rightPartitioning: Seq[InternalRow],
    partitionExpression: Seq[Expression],
    intersect: Boolean = false): Seq[InternalRowComparableWrapper] = {
  // Build the wrapper factory once so both sides share it.
  val toComparable =
    getInternalRowComparableWrapperFactory(partitionExpression.map(_.dataType))

  // Collects the distinct comparable keys of one side, adding them in row order.
  def distinctKeys(rows: Seq[InternalRow]): mutable.HashSet[InternalRowComparableWrapper] = {
    val keys = new mutable.HashSet[InternalRowComparableWrapper]
    rows.map(toComparable).foreach(key => keys.add(key))
    keys
  }

  val leftKeys = distinctKeys(leftPartitioning)
  val rightKeys = distinctKeys(rightPartitioning)

  val merged = if (intersect) leftKeys.intersect(rightKeys) else leftKeys.union(rightKeys)
  merged.toSeq
}

/** Creates a shared factory method for a given row schema to avoid excessive cache lookups. */
def getInternalRowComparableWrapperFactory(
dataTypes: Seq[DataType]): InternalRow => InternalRowComparableWrapper = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@

package org.apache.spark.sql.catalyst.util

import scala.collection.mutable

import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning
import org.apache.spark.sql.connector.catalog.PartitionInternalRow
import org.apache.spark.sql.types.IntegerType

Expand All @@ -41,30 +41,28 @@ object InternalRowComparableWrapperBenchmark extends BenchmarkBase {
val partitionNum = 200_000
val bucketNum = 4096
val day = 20240401
val partitions = (0 until partitionNum).map { i =>
val partitionKeys = (0 until partitionNum).map { i =>
val bucketId = i % bucketNum
PartitionInternalRow.apply(Array(day, bucketId));
}
val benchmark = new Benchmark("internal row comparable wrapper", partitionNum, output = output)

val comparableKeyWrapperFactory =
InternalRowComparableWrapper.getInternalRowComparableWrapperFactory(
Seq(IntegerType, IntegerType))
val comparablePartitionKeys = partitionKeys.map(comparableKeyWrapperFactory)

benchmark.addCase("toSet") { _ =>
val internalRowComparableWrapperFactory =
InternalRowComparableWrapper.getInternalRowComparableWrapperFactory(
Seq(IntegerType, IntegerType))
val distinct = partitions
.map(internalRowComparableWrapperFactory)
.toSet
val distinct = comparablePartitionKeys.toSet

assert(distinct.size == bucketNum)
}

benchmark.addCase("mergePartitions") { _ =>
// just to mock the data types
val expressions = (Seq(Literal(day, IntegerType), Literal(0, IntegerType)))
val leftKeySet = mutable.HashSet.from(comparablePartitionKeys)
val rightKeySet = mutable.HashSet.from(comparablePartitionKeys)
val merged = leftKeySet.union(rightKeySet)

val leftPartitioning = KeyGroupedPartitioning(expressions, bucketNum, partitions)
val rightPartitioning = KeyGroupedPartitioning(expressions, bucketNum, partitions)
val merged = InternalRowComparableWrapper.mergePartitions(
leftPartitioning.partitionValues, rightPartitioning.partitionValues, expressions)
assert(merged.size == bucketNum)
}

Expand Down

This file was deleted.

Loading