-
Notifications
You must be signed in to change notification settings - Fork 3.3k
Spark 4.1: Implement SupportsReportOrdering DSv2 API #14948
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
e53c944
c858020
51b4150
4f599ec
b21387e
10d713a
179056d
62fcecd
8fafe5a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,275 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one | ||
| * or more contributor license agreements. See the NOTICE file | ||
| * distributed with this work for additional information | ||
| * regarding copyright ownership. The ASF licenses this file | ||
| * to you under the Apache License, Version 2.0 (the | ||
| * "License"); you may not use this file except in compliance | ||
| * with the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, | ||
| * software distributed under the License is distributed on an | ||
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| * KIND, either express or implied. See the License for the | ||
| * specific language governing permissions and limitations | ||
| * under the License. | ||
| */ | ||
| package org.apache.iceberg.spark.source; | ||
|
|
||
| import java.io.IOException; | ||
| import java.io.UncheckedIOException; | ||
| import java.util.Arrays; | ||
| import java.util.Collections; | ||
| import java.util.Comparator; | ||
| import java.util.List; | ||
| import org.apache.iceberg.BaseScanTaskGroup; | ||
| import org.apache.iceberg.FileScanTask; | ||
| import org.apache.iceberg.ScanTaskGroup; | ||
| import org.apache.iceberg.Schema; | ||
| import org.apache.iceberg.SortField; | ||
| import org.apache.iceberg.SortOrder; | ||
| import org.apache.iceberg.SortOrderComparators; | ||
| import org.apache.iceberg.StructLike; | ||
| import org.apache.iceberg.Table; | ||
| import org.apache.iceberg.io.CloseableGroup; | ||
| import org.apache.iceberg.io.CloseableIterable; | ||
| import org.apache.iceberg.io.CloseableIterator; | ||
| import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
| import org.apache.iceberg.relocated.com.google.common.collect.Lists; | ||
| import org.apache.iceberg.spark.SparkSchemaUtil; | ||
| import org.apache.iceberg.spark.source.metrics.TaskNumDeletes; | ||
| import org.apache.iceberg.spark.source.metrics.TaskNumSplits; | ||
| import org.apache.iceberg.types.TypeUtil; | ||
| import org.apache.iceberg.types.Types; | ||
| import org.apache.iceberg.util.SortedMerge; | ||
| import org.apache.spark.sql.catalyst.InternalRow; | ||
| import org.apache.spark.sql.catalyst.ProjectingInternalRow; | ||
| import org.apache.spark.sql.connector.metric.CustomTaskMetric; | ||
| import org.apache.spark.sql.connector.read.PartitionReader; | ||
| import org.apache.spark.sql.types.StructType; | ||
| import org.slf4j.Logger; | ||
| import org.slf4j.LoggerFactory; | ||
| import scala.collection.JavaConverters; | ||
|
|
||
| /** | ||
| * A {@link PartitionReader} that reads multiple sorted files and merges them into a single sorted | ||
| * stream using a k-way heap merge ({@link SortedMerge}). | ||
| * | ||
| * <p>This reader is used when {@code preserve-data-ordering} is enabled and the task group contains | ||
| * multiple files that all have the same sort order. | ||
| * | ||
| * <p>Sort key columns absent from the requested projection are temporarily added to the read schema | ||
| * so that {@link SortOrderComparators} can access them during the merge. The extra columns are | ||
| * stripped from each row before it is returned to Spark. | ||
| */ | ||
| class MergingSortedRowDataReader implements PartitionReader<InternalRow> { | ||
| private static final Logger LOG = LoggerFactory.getLogger(MergingSortedRowDataReader.class); | ||
|
|
||
| private final CloseableGroup resources; | ||
| private final CloseableIterator<InternalRow> mergedIterator; | ||
| private final List<RowDataReader> fileReaders; | ||
| // non-null only when sort key columns were added to the read schema beyond what Spark projected | ||
| private final ProjectingInternalRow projectingRow; | ||
| private InternalRow current; | ||
|
|
||
| /** | ||
|  * Creates a reader that merges every file of the partition's task group into a single stream | ||
|  * ordered by the table sort order. | ||
|  * | ||
|  * <p>If building the merged iterator fails, the merge and all per-file readers are closed before | ||
|  * the failure is rethrown. A caller never receives a reference to a reader whose constructor | ||
|  * threw, so nothing else could release them. | ||
|  * | ||
|  * @param partition source of the table, task group, projection, file IO and read flags | ||
|  * @throws IllegalStateException if the table sort order is unsorted or the task group has fewer | ||
|  *     than two files | ||
|  */ | ||
| MergingSortedRowDataReader(SparkInputPartition partition) { | ||
|   Table table = partition.table(); | ||
|   ScanTaskGroup<FileScanTask> taskGroup = partition.taskGroup(); | ||
|   Schema projection = partition.projection(); | ||
|   SortOrder sortOrder = table.sortOrder(); | ||
|
|
||
|   int numFiles = taskGroup.tasks().size(); | ||
|
|
||
|   Preconditions.checkState( | ||
|       sortOrder.isSorted(), "Cannot create merging reader for unsorted table %s", table.name()); | ||
|   Preconditions.checkState( | ||
|       numFiles > 1, "Merging reader requires multiple files, got %s", numFiles); | ||
|
|
||
|   LOG.info( | ||
|       "Creating merging reader for {} files with sort order {} in table {}", | ||
|       numFiles, | ||
|       sortOrder.orderId(), | ||
|       table.name()); | ||
|
|
||
|   // Augment the projected schema with any sort key columns Spark did not request so that | ||
|   // SortOrderComparators can access every sort key field during the merge. | ||
|   Schema mergeReadSchema = mergeReadSchema(projection, sortOrder, table); | ||
|   this.projectingRow = buildProjectingRow(projection, mergeReadSchema); | ||
|
|
||
|   this.resources = new CloseableGroup(); | ||
|   // One reader per file: every task is wrapped in its own single-task group. | ||
|   this.fileReaders = | ||
|       taskGroup.tasks().stream() | ||
|           .map( | ||
|               task -> | ||
|                   new RowDataReader( | ||
|                       table, | ||
|                       partition.io(), | ||
|                       new BaseScanTaskGroup<>(Collections.singletonList(task)), | ||
|                       mergeReadSchema, | ||
|                       partition.isCaseSensitive(), | ||
|                       partition.cacheDeleteFilesOnExecutors())) | ||
|           .toList(); | ||
|   // Wrap each reader as a CloseableIterable and feed into SortedMerge. | ||
|   List<CloseableIterable<InternalRow>> fileIterables = | ||
|       fileReaders.stream().map(this::readerToIterable).toList(); | ||
|   SortedMerge<InternalRow> sortedMerge = | ||
|       new SortedMerge<>(buildComparator(mergeReadSchema, sortOrder), fileIterables); | ||
|   resources.addCloseable(sortedMerge); | ||
|
|
||
|   try { | ||
|     this.mergedIterator = sortedMerge.iterator(); | ||
|   } catch (RuntimeException e) { | ||
|     // NOTE(review): assumes SortedMerge#iterator may already start reading the files and that | ||
|     // closing a RowDataReader more than once is harmless; confirm against both classes. | ||
|     try { | ||
|       resources.close(); | ||
|     } catch (IOException | RuntimeException closeFailure) { | ||
|       e.addSuppressed(closeFailure); | ||
|     } | ||
|     for (RowDataReader reader : fileReaders) { | ||
|       try { | ||
|         reader.close(); | ||
|       } catch (IOException | RuntimeException closeFailure) { | ||
|         e.addSuppressed(closeFailure); | ||
|       } | ||
|     } | ||
|     throw e; | ||
|   } | ||
| } | ||
|
|
||
| /** | ||
|  * Adapts a {@link RowDataReader} to a {@link CloseableIterable} for use with {@link SortedMerge}. | ||
|  * Each row is copied before it enters the priority queue because Spark's Parquet/ORC readers | ||
|  * reuse {@link InternalRow} instances for performance. | ||
|  * | ||
|  * <p>Closing the returned iterable is a no-op; the reader is closed through the iterator's {@code | ||
|  * close()}. The iterator reads ahead by one {@code reader.next()} call and caches the answer, so | ||
|  * repeated {@code hasNext()} calls do not advance the reader. | ||
|  * | ||
|  * @param reader the single-file reader to expose as an iterable | ||
|  * @return an iterable whose iterators walk the rows of {@code reader} | ||
|  */ | ||
| private CloseableIterable<InternalRow> readerToIterable(RowDataReader reader) { | ||
|   return CloseableIterable.withNoopClose( | ||
|       () -> | ||
|           new CloseableIterator<>() { | ||
|             // true once reader.next() has been called for the row that next() will return | ||
|             private boolean advanced = false; | ||
|             // result of the most recent reader.next() call | ||
|             private boolean hasNext = false; | ||
|
|
||
|             @Override | ||
|             public boolean hasNext() { | ||
|               if (!advanced) { | ||
|                 try { | ||
|                   hasNext = reader.next(); | ||
|                   advanced = true; | ||
|                 } catch (IOException e) { | ||
|                   throw new UncheckedIOException("Failed to advance reader", e); | ||
|                 } | ||
|               } | ||
|               return hasNext; | ||
|             } | ||
|
|
||
|             @Override | ||
|             public InternalRow next() { | ||
|               // Honor the Iterator contract: fail loudly instead of copying whatever row the | ||
|               // exhausted reader still holds. | ||
|               if (!hasNext()) { | ||
|                 throw new java.util.NoSuchElementException("No more rows in file reader"); | ||
|               } | ||
|               advanced = false; | ||
|               return reader.get().copy(); | ||
|             } | ||
|
|
||
|             @Override | ||
|             public void close() throws IOException { | ||
|               reader.close(); | ||
|             } | ||
|           }); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean next() throws IOException { | ||
| if (!mergedIterator.hasNext()) { | ||
| return false; | ||
| } | ||
|
|
||
| InternalRow merged = mergedIterator.next(); | ||
| if (projectingRow == null) { | ||
| this.current = merged; | ||
| } else { | ||
| projectingRow.project(merged); | ||
| this.current = projectingRow; | ||
| } | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| /** | ||
|  * Returns the row selected by the most recent successful {@link #next()} call. | ||
|  * | ||
|  * @return the current row, or {@code null} if {@code next()} has not yet produced one | ||
|  */ | ||
| @Override | ||
| public InternalRow get() { | ||
|   return this.current; | ||
| } | ||
|
|
||
| /** | ||
|  * Closes everything registered with this reader's closeable group, which includes the sorted | ||
|  * merge added by the constructor. | ||
|  * | ||
|  * @throws IOException if closing a registered resource fails | ||
|  */ | ||
| @Override | ||
| public void close() throws IOException { | ||
|   this.resources.close(); | ||
| } | ||
|
|
||
| @Override | ||
| public CustomTaskMetric[] currentMetricsValues() { | ||
| long totalDeletes = | ||
| fileReaders.stream() | ||
| .flatMap(reader -> Arrays.stream(reader.currentMetricsValues())) | ||
| .filter(metric -> metric instanceof TaskNumDeletes) | ||
| .mapToLong(CustomTaskMetric::value) | ||
| .sum(); | ||
| return new CustomTaskMetric[] { | ||
| new TaskNumSplits(fileReaders.size()), new TaskNumDeletes(totalDeletes) | ||
| }; | ||
| } | ||
|
|
||
| /** | ||
|  * Creates the comparator that orders {@link InternalRow}s by the given sort order during the | ||
|  * merge. The comparison itself is delegated to {@link SortOrderComparators}, which handles all | ||
|  * transform types (identity, bucket, truncate), ASC/DESC directions, and null ordering. | ||
|  * | ||
|  * <p>Two {@link InternalRowWrapper} instances are allocated once and re-pointed at the rows of | ||
|  * each comparison, so the returned comparator holds mutable state. | ||
|  * | ||
|  * @param mergeReadSchema schema of the rows being compared; must expose every sort key column | ||
|  * @param sortOrder sort order that defines the comparison | ||
|  * @return a comparator over rows laid out according to {@code mergeReadSchema} | ||
|  */ | ||
| private static Comparator<InternalRow> buildComparator( | ||
|     Schema mergeReadSchema, SortOrder sortOrder) { | ||
|   StructType sparkStruct = SparkSchemaUtil.convert(mergeReadSchema); | ||
|   Comparator<StructLike> bySortKey = SortOrderComparators.forSchema(mergeReadSchema, sortOrder); | ||
|   Types.StructType icebergStruct = mergeReadSchema.asStruct(); | ||
|   InternalRowWrapper lhs = new InternalRowWrapper(sparkStruct, icebergStruct); | ||
|   InternalRowWrapper rhs = new InternalRowWrapper(sparkStruct, icebergStruct); | ||
|   return (first, second) -> bySortKey.compare(lhs.wrap(first), rhs.wrap(second)); | ||
| } | ||
|
|
||
| /** | ||
|  * Builds the {@link ProjectingInternalRow} that maps rows read with the wider merge schema back | ||
|  * onto the columns Spark asked for. | ||
|  * | ||
|  * @param projection the columns requested by Spark | ||
|  * @param mergeSchema the schema actually read, possibly extended with sort key columns | ||
|  * @return the projecting row, or {@code null} when both schemas have the same number of | ||
|  *     top-level columns and no remapping is needed | ||
|  * @throws IllegalStateException if a projected field id is missing from {@code mergeSchema} | ||
|  */ | ||
| private static ProjectingInternalRow buildProjectingRow(Schema projection, Schema mergeSchema) { | ||
|   List<Types.NestedField> requested = projection.columns(); | ||
|   List<Types.NestedField> available = mergeSchema.columns(); | ||
|   if (requested.size() == available.size()) { | ||
|     return null; | ||
|   } | ||
|
|
||
|   // Elements are boxed ints; the list is typed as Object for the Scala conversion below. | ||
|   List<Object> ordinals = Lists.newArrayListWithCapacity(requested.size()); | ||
|   for (Types.NestedField wanted : requested) { | ||
|     int fieldId = wanted.fieldId(); | ||
|     int ordinal = -1; | ||
|     // first position in the merge schema that carries the same field id | ||
|     for (int pos = 0; pos < available.size() && ordinal < 0; pos++) { | ||
|       if (available.get(pos).fieldId() == fieldId) { | ||
|         ordinal = pos; | ||
|       } | ||
|     } | ||
|     Preconditions.checkState( | ||
|         ordinal >= 0, | ||
|         "Projection field id=%s not found in merge read schema — this is a bug", | ||
|         fieldId); | ||
|     ordinals.add(ordinal); | ||
|   } | ||
|
|
||
|   StructType projectedStruct = SparkSchemaUtil.convert(projection); | ||
|   return new ProjectingInternalRow( | ||
|       projectedStruct, JavaConverters.asScala(ordinals).toIndexedSeq()); | ||
| } | ||
|
|
||
| /** | ||
| * Returns the schema to use when reading each file. This is the requested {@code projection} | ||
| * augmented with any sort key columns that are not already present, so the merge comparator can | ||
| * access every sort key field regardless of what Spark projected. | ||
| * | ||
| * <p>For each sort field, the source column id is looked up in the projection. If it is absent, | ||
| * the column is taken from the table schema and appended. A source id that is also absent from | ||
| * the table schema is skipped without an error. | ||
| * | ||
| * <p>NOTE(review): the behavior for sort keys that live inside a struct is not evident from this | ||
| * method alone; confirm before relying on it for nested fields. | ||
| * | ||
| * @param projection the columns requested by Spark | ||
| * @param sortOrder the sort order whose source columns must be readable | ||
| * @param table the table whose schema supplies the missing columns | ||
| * @return {@code projection} itself when nothing is missing, otherwise the projection joined with | ||
| *     a schema made of the missing columns | ||
| */ | ||
| private static Schema mergeReadSchema(Schema projection, SortOrder sortOrder, Table table) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this work for nested schema like struct?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I added a test for nested structure and it failed. Thanks for pointing this out. I have some ideas on how we can make it work for nested structs but I would like to make that change as a follow up. For this PR, if the sort key is a nested field, then we do not report the ordering. I added a |
||
| Schema tableSchema = table.schema(); | ||
| List<Types.NestedField> missingFields = Lists.newArrayList(); | ||
|
|
||
| for (SortField sortField : sortOrder.fields()) { | ||
| int fieldId = sortField.sourceId(); | ||
| if (projection.findField(fieldId) == null) { | ||
| Types.NestedField tableField = tableSchema.findField(fieldId); | ||
| if (tableField != null) { | ||
| missingFields.add(tableField); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if (missingFields.isEmpty()) { | ||
| return projection; | ||
| } | ||
|
|
||
| return TypeUtil.join(projection, new Schema(missingFields)); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Issue found by Codex (I verified):
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm, this is an interesting one I had not thought of. I'm trying to understand the InputFileBlockHolder more.
One option is to track which file each merged row came from and update
InputFileBlockHolder in MergingSortedRowDataReader.next(). Each sub-reader wraps a single file, so we can capture the file path when building the iterables and tag each row with its source. What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.