
Commit 7ac2b54

ZiyaZa authored and cloud-fan committed
[SPARK-53535][SQL][FOLLOWUP] Fix findCheapestGroupField returning invalid Map
### What changes were proposed in this pull request?

This PR fixes a bug from #52557, where we read an additional field if all the requested fields of a struct are missing from the Parquet file. We used to always pick the cheapest leaf column of the struct. However, if this leaf was inside a Map column, we would generate an invalid Map type like the following:

```
optional group _1 (MAP) {
  repeated group key_value {
    required boolean key;
  }
}
```

Since there is no `value` field in this group, we would fail later when trying to convert this Parquet type to a Spark type. This PR changes the additional-field selection logic to enforce selecting a field from both the key and the value of the map, which now gives us a type like the following:

```
optional group _1 (MAP) {
  repeated group key_value {
    required boolean key;
    optional group value {
      optional int32 _2;
    }
  }
}
```

### Why are the changes needed?

To fix a critical bug where we would throw an exception when reading a Parquet file.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52758 from ZiyaZa/fix-missing-struct-with-map.

Authored-by: Ziya Mukhtarov <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
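To make the two schema shapes concrete, here is a minimal, hypothetical sketch (not part of this commit) that builds both of them with parquet-java's `MessageTypeParser`. Note that the parser itself accepts the one-sided map group; it is Spark's later Parquet-to-Catalyst schema conversion that rejects it:

```scala
import org.apache.parquet.schema.MessageTypeParser

object MapGroupShapes {
  // The shape the old logic could request: a MAP group whose key_value
  // carries only a key. Parsing succeeds; Spark's conversion to a Catalyst
  // type later fails because a map needs both a key and a value.
  val invalid = MessageTypeParser.parseMessageType(
    """message spark_schema {
      |  optional group _1 (MAP) {
      |    repeated group key_value {
      |      required boolean key;
      |    }
      |  }
      |}""".stripMargin)

  // The shape the fixed logic requests: one cheap field from the key side
  // and one from the value side, so the map group stays well-formed.
  val valid = MessageTypeParser.parseMessageType(
    """message spark_schema {
      |  optional group _1 (MAP) {
      |    repeated group key_value {
      |      required boolean key;
      |      optional group value {
      |        optional int32 _2;
      |      }
      |    }
      |  }
      |}""".stripMargin)

  def main(args: Array[String]): Unit = {
    println(invalid)
    println(valid)
  }
}
```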
1 parent 27180f0; commit 7ac2b54

File tree: 3 files changed (+375, -27 lines)


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadSupport.scala

Lines changed: 52 additions & 24 deletions
```diff
@@ -28,7 +28,7 @@ import org.apache.parquet.hadoop.api.{InitContext, ReadSupport}
 import org.apache.parquet.hadoop.api.ReadSupport.ReadContext
 import org.apache.parquet.io.api.RecordMaterializer
 import org.apache.parquet.schema._
-import org.apache.parquet.schema.LogicalTypeAnnotation.ListLogicalTypeAnnotation
+import org.apache.parquet.schema.LogicalTypeAnnotation.{ListLogicalTypeAnnotation, MapKeyValueTypeAnnotation, MapLogicalTypeAnnotation}
 import org.apache.parquet.schema.Type.Repetition
 
 import org.apache.spark.internal.Logging
@@ -502,36 +502,64 @@ object ParquetReadSupport extends Logging {
   }
 
   /**
-   * Finds the leaf node under a given file schema node that is likely to be cheapest to fetch.
-   * Keeps this leaf node inside the same parent hierarchy. This is used when all struct fields in
-   * the requested schema are missing. Uses a very simple heuristic based on the parquet type.
+   * Finds the leaf node(s) under a given file schema node that is likely to be cheapest to fetch.
+   * Note that multiple leaves can be selected if a map type is deemed to be the cheapest to fetch,
+   * because for each map in the hierarchy, we need to fetch something from both the key and value
+   * types. This function keeps the leaf node(s) inside the same parent hierarchy. This is used when
+   * all struct fields in the requested schema are missing. Uses a very simple heuristic based on
+   * the parquet type.
    */
   private def findCheapestGroupField(parentGroupType: GroupType): Type = {
     def findCheapestGroupFieldRecurse(curType: Type, repLevel: Int = 0): (Type, Int, Int) = {
       curType match {
         case groupType: GroupType =>
-          var (bestType, bestRepLevel, bestCost) = (Option.empty[Type], 0, 0)
-          for (field <- groupType.getFields.asScala) {
-            val newRepLevel = repLevel + (if (field.isRepetition(Repetition.REPEATED)) 1 else 0)
-            // Never take a field at a deeper repetition level, since it's likely to have more data.
-            // Don't do safety checks because we should already have done them when traversing the
-            // schema for the first time.
-            if (bestType.isEmpty || newRepLevel <= bestRepLevel) {
-              val (childType, childRepLevel, childCost) =
-                findCheapestGroupFieldRecurse(field, newRepLevel)
-              // Always prefer elements with a lower repetition level, since more nesting of arrays
-              // is likely to result in more data. At the same repetition level, prefer the smaller
-              // type.
-              if (bestType.isEmpty || childRepLevel < bestRepLevel ||
-                  (childRepLevel == bestRepLevel && childCost < bestCost)) {
-                // This is the new best path.
-                bestType = Some(childType)
-                bestRepLevel = childRepLevel
-                bestCost = childCost
-              }
-            }
-          }
-          (groupType.withNewFields(bestType.get), bestRepLevel, bestCost)
+          groupType.getLogicalTypeAnnotation match {
+            case _: MapLogicalTypeAnnotation | _: MapKeyValueTypeAnnotation =>
+              // For maps, we need to ensure we read something from both the key and value types, as
+              // otherwise the GroupType we return from here would contain only one child (either
+              // the key or the value, but not both), which is invalid when the GroupType has a map
+              // logical annotation. This would later cause failures when converting the row we read
+              // to Spark type.
+              // Below code is adapted from ParquetSchemaConverter.convertGroupField
+              ParquetSchemaConverter.checkConversionRequirement(
+                groupType.getFieldCount == 1 && !groupType.getType(0).isPrimitive,
+                s"Invalid map type: $groupType")
+
+              val keyValueType = groupType.getFields.get(0).asGroupType()
+              ParquetSchemaConverter.checkConversionRequirement(
+                keyValueType.isRepetition(Repetition.REPEATED) && keyValueType.getFieldCount == 2,
+                s"Invalid map type: $groupType")
+
+              val keyResult = findCheapestGroupFieldRecurse(keyValueType.getType(0), repLevel + 1)
+              val valueResult = findCheapestGroupFieldRecurse(keyValueType.getType(1), repLevel + 1)
+              (
+                groupType.withNewFields(keyValueType.withNewFields(keyResult._1, valueResult._1)),
+                keyResult._2.max(valueResult._2), // Take max repetition level
+                keyResult._3 + valueResult._3 // Add up the costs of reading both fields
+              )
+            case _ =>
+              var (bestType, bestRepLevel, bestCost) = (Option.empty[Type], 0, 0)
+              for (field <- groupType.getFields.asScala) {
+                val newRepLevel = repLevel + (if (field.isRepetition(Repetition.REPEATED)) 1 else 0)
+                // Never take a field at a deeper repetition level, since it's likely to have more
+                // data.
+                if (bestType.isEmpty || newRepLevel <= bestRepLevel) {
+                  val (childType, childRepLevel, childCost) =
+                    findCheapestGroupFieldRecurse(field, newRepLevel)
+                  // Always prefer elements with a lower repetition level, since more nesting of
+                  // arrays is likely to result in more data. At the same repetition level, prefer
+                  // the smaller type.
+                  if (bestType.isEmpty || childRepLevel < bestRepLevel ||
+                      (childRepLevel == bestRepLevel && childCost < bestCost)) {
+                    // This is the new best path.
+                    bestType = Some(childType)
+                    bestRepLevel = childRepLevel
+                    bestCost = childCost
+                  }
+                }
+              }
+              (groupType.withNewFields(bestType.get), bestRepLevel, bestCost)
+          }
         case primitiveType: PrimitiveType =>
           val cost = primitiveType.getPrimitiveTypeName match {
             case PrimitiveType.PrimitiveTypeName.BOOLEAN => 1
```
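To illustrate the selection logic above, here is a simplified, self-contained Scala sketch over a toy schema model. The `Node` ADT and the per-leaf costs other than BOOLEAN = 1 are assumptions for illustration, not Spark's actual types or cost table. It mirrors the two rules of the real code: prefer the lowest repetition level, break ties by cost, and for maps keep a selection on both the key and the value side:

```scala
// Toy model of the findCheapestGroupField heuristic (illustrative only).
sealed trait Node { def repeated: Boolean }
case class Leaf(name: String, cost: Int, repeated: Boolean = false) extends Node
case class Group(name: String, fields: List[Node], repeated: Boolean = false) extends Node
case class MapNode(name: String, key: Node, value: Node, repeated: Boolean = false) extends Node

object CheapestField {
  /** Returns (pruned subtree keeping only the chosen leaves, repetition level, cost). */
  def cheapest(node: Node, repLevel: Int = 0): (Node, Int, Int) = {
    val level = repLevel + (if (node.repeated) 1 else 0)
    node match {
      case l: Leaf => (l, level, l.cost)
      case m: MapNode =>
        // A valid map must keep something on both the key and the value side;
        // the key_value group is repeated, hence the +1.
        val (k, kRep, kCost) = cheapest(m.key, level + 1)
        val (v, vRep, vCost) = cheapest(m.value, level + 1)
        (m.copy(key = k, value = v), kRep.max(vRep), kCost + vCost)
      case g: Group =>
        // Prefer the lowest repetition level, then the lowest cost. (The real
        // code additionally prunes branches that cannot beat the current best.)
        val (best, bRep, bCost) =
          g.fields.map(cheapest(_, level)).minBy { case (_, r, c) => (r, c) }
        (g.copy(fields = List(best)), bRep, bCost)
    }
  }

  def main(args: Array[String]): Unit = {
    // Mirrors the second test below: a cheap boolean-keyed map beats a
    // repeated string field, because both end up at repetition level 1 but
    // the map's total cost (1 + 4, assumed byte sizes) is lower than the
    // string's (8, assumed).
    val schema = Group("root", List(
      MapNode("m", Leaf("key", 1), Group("value", List(Leaf("_2", 4)))),
      Leaf("arrayElem", 8, repeated = true)))
    println(cheapest(schema))
  }
}
```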

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala

Lines changed: 95 additions & 0 deletions
```diff
@@ -833,6 +833,101 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSparkSession
     }
   }
 
+  test("SPARK-53535: vectorized reader: missing all struct fields, struct with map field only") {
+    val data = Seq(
+      Row(Row(Map("key1" -> 1)), 100),
+      Row(Row(Map("key2" -> 2)), 100),
+      Row(null, 100)
+    )
+
+    val tableSchema = new StructType()
+      .add("_1", new StructType()
+        .add("_1", MapType(StringType, IntegerType, valueContainsNull = true)))
+      .add("_2", IntegerType)
+
+    val readSchema = new StructType()
+      .add("_1", new StructType()
+        .add("_101", IntegerType))
+
+    withTempPath { path =>
+      val file = path.getCanonicalPath
+      spark.createDataFrame(data.asJava, tableSchema).write.partitionBy("_2").parquet(file)
+
+      for {
+        offheapEnabled <- Seq(true, false)
+        returnNullStructIfAllFieldsMissing <- Seq(true, false)
+      } {
+        withSQLConf(
+          SQLConf.LEGACY_PARQUET_RETURN_NULL_STRUCT_IF_ALL_FIELDS_MISSING.key ->
+            returnNullStructIfAllFieldsMissing.toString,
+          SQLConf.PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key -> "true",
+          SQLConf.COLUMN_VECTOR_OFFHEAP_ENABLED.key -> offheapEnabled.toString
+        ) {
+          val expectedAnswer = if (!returnNullStructIfAllFieldsMissing) {
+            Row(Row(null), 100) :: Row(Row(null), 100) :: Row(null, 100) :: Nil
+          } else {
+            Row(null, 100) :: Row(null, 100) :: Row(null, 100) :: Nil
+          }
+
+          withAllParquetReaders {
+            checkAnswer(spark.read.schema(readSchema).parquet(file), expectedAnswer)
+          }
+        }
+      }
+    }
+  }
+
+  test("SPARK-53535: vectorized reader: missing all struct fields, " +
+      "struct with cheap map and more expensive array field") {
+    val data = Seq(
+      Row(Row(Map(false -> Row("expensive", 1)), Seq("test1")), 100),
+      Row(Row(Map(true -> Row("expensive", 2)), Seq("test2")), 100),
+      Row(null, 100)
+    )
+
+    val tableSchema = new StructType()
+      .add("_1", new StructType()
+        .add("_1", MapType(
+          BooleanType,
+          new StructType()
+            .add("_1", StringType)
+            .add("_2", IntegerType),
+          valueContainsNull = true))
+        .add("_2", ArrayType(StringType, containsNull = true)))
+      .add("_2", IntegerType)
+
+    val readSchema = new StructType()
+      .add("_1", new StructType()
+        .add("_101", IntegerType))
+
+    withTempPath { path =>
+      val file = path.getCanonicalPath
+      spark.createDataFrame(data.asJava, tableSchema).write.partitionBy("_2").parquet(file)
+
+      for {
+        offheapEnabled <- Seq(true, false)
+        returnNullStructIfAllFieldsMissing <- Seq(true, false)
+      } {
+        withSQLConf(
+          SQLConf.LEGACY_PARQUET_RETURN_NULL_STRUCT_IF_ALL_FIELDS_MISSING.key ->
+            returnNullStructIfAllFieldsMissing.toString,
+          SQLConf.PARQUET_VECTORIZED_READER_NESTED_COLUMN_ENABLED.key -> "true",
+          SQLConf.COLUMN_VECTOR_OFFHEAP_ENABLED.key -> offheapEnabled.toString
+        ) {
+          val expectedAnswer = if (!returnNullStructIfAllFieldsMissing) {
+            Row(Row(null), 100) :: Row(Row(null), 100) :: Row(null, 100) :: Nil
+          } else {
+            Row(null, 100) :: Row(null, 100) :: Row(null, 100) :: Nil
+          }
+
+          withAllParquetReaders {
+            checkAnswer(spark.read.schema(readSchema).parquet(file), expectedAnswer)
+          }
+        }
+      }
+    }
+  }
+
   test("vectorized reader: missing some struct fields") {
     Seq(true, false).foreach { offheapEnabled =>
       withSQLConf(
```
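The failure these tests guard against can also be reproduced with public APIs alone. Below is a minimal standalone sketch adapted from the first test above; the object name and output path are illustrative. Before this fix, the read could throw while converting the invalid one-sided map group; with the fix it returns null structs:

```scala
import scala.jdk.CollectionConverters._

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._

object MissingStructFieldRepro {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").getOrCreate()

    // A struct whose only field is a map, plus a row where the struct is null.
    val tableSchema = new StructType()
      .add("_1", new StructType()
        .add("_1", MapType(StringType, IntegerType)))
    val data = Seq(Row(Row(Map("key1" -> 1))), Row(null))

    val path = "/tmp/spark-53535-repro" // illustrative location
    spark.createDataFrame(data.asJava, tableSchema)
      .write.mode("overwrite").parquet(path)

    // Request only a field (_101) that is missing from the struct: the reader
    // still fetches one cheap real column (to tell whether the struct itself
    // is null), and before this fix it could pick just the map's key side,
    // producing an invalid Parquet map type.
    val readSchema = new StructType()
      .add("_1", new StructType().add("_101", IntegerType))
    spark.read.schema(readSchema).parquet(path).show()

    spark.stop()
  }
}
```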
