Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import org.apache.spark.network.util.ByteUnit
import org.apache.spark.sql.catalyst.analysis.Resolver
import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode
import org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator
import org.apache.spark.sql.types.StringType
import org.apache.spark.util.Utils

////////////////////////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -459,6 +460,36 @@ object SQLConf {
.intConf
.createWithDefault(4096)

// Adaptive enlargement of file-split sizes for columnar formats (Parquet/ORC), so that
// column pruning does not leave tasks reading far less than the configured partition size.
val IS_COLUMNAR_PARTITION_ADAPTIVE_ENABLED = buildConf("spark.sql.columnar.adaptiveFileSplit")
  .doc("When this is enabled, the partition size will be adaptively enlarged when reading " +
    "from columnar storage to make sure the actual input bytes of each task are close to " +
    "a proper partition size. When a user sets spark.sql.files.maxPartitionBytes, they " +
    "may expect it to be the upper bound of a single partition, with each task reading at " +
    "most this amount of data. For row-based files, that is right. But for columnar storage, " +
    "such as Parquet or ORC, each task may read much less data because of column pruning. " +
    "For example, a 1024 MB file may contain 10 columns, 5 of which are integer while " +
    "another 5 are long. If this is a row-based file, there will be 8 tasks, each of which " +
    "will read 128 MB. If this is a Parquet or ORC file and the job only reads a single " +
    "long column, there will also be 8 tasks, but each task will read much less than " +
    "128 MB because of column pruning.")
  .booleanConf
  .createWithDefault(false)

// Assumed per-value width (in bytes) of a struct column when estimating how much of a
// columnar file a pruned scan will actually read. Complex types have no reliable fixed
// defaultSize, so the estimate is user-configurable; StringType.defaultSize is used as a
// rough starting default. NOTE(review): presumably tuned per dataset — confirm with callers.
val COLUMNAR_STRUCT_LENGTH = buildConf("spark.sql.columnar.struct.length")
.doc("Set the default size of struct column")
.intConf
.createWithDefault(StringType.defaultSize)

// Assumed per-value width (in bytes) of a map column for the same split-size estimation.
val COLUMNAR_MAP_LENGTH = buildConf("spark.sql.columnar.map.length")
.doc("Set the default size of map column")
.intConf
.createWithDefault(StringType.defaultSize)

// Assumed per-value width (in bytes) of an array column for the same split-size estimation.
val COLUMNAR_ARRAY_LENGTH = buildConf("spark.sql.columnar.array.length")
.doc("Set the default size of array column")
.intConf
.createWithDefault(StringType.defaultSize)

val ORC_COMPRESSION = buildConf("spark.sql.orc.compression.codec")
.doc("Sets the compression codec used when writing ORC files. If either `compression` or " +
"`orc.compress` is specified in the table-specific options/properties, the precedence " +
Expand Down Expand Up @@ -1714,6 +1745,15 @@ class SQLConf extends Serializable with Logging {

def parquetRecordFilterEnabled: Boolean = getConf(PARQUET_RECORD_FILTER_ENABLED)

// Whether adaptive split-size enlargement for columnar formats is enabled
// (see spark.sql.columnar.adaptiveFileSplit).
def isColumnarStorageSplitSizeAdaptiveEnabled: Boolean =
getConf(IS_COLUMNAR_PARTITION_ADAPTIVE_ENABLED)

// Configured estimate (bytes) for the width of a struct column during split-size estimation.
def columnarStructTypeLength: Int = getConf(COLUMNAR_STRUCT_LENGTH)

// Configured estimate (bytes) for the width of a map column during split-size estimation.
def columnarMapTypeLength: Int = getConf(COLUMNAR_MAP_LENGTH)

// Configured estimate (bytes) for the width of an array column during split-size estimation.
def columnarArrayTypeLength: Int = getConf(COLUMNAR_ARRAY_LENGTH)

def inMemoryPartitionPruning: Boolean = getConf(IN_MEMORY_PARTITION_PRUNING)

def offHeapColumnVectorEnabled: Boolean = getConf(COLUMN_VECTOR_OFFHEAP_ENABLED)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,11 @@ import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning, UnknownPartitioning}
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => ParquetSource}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.sources.{BaseRelation, Filter}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.BitSet

Expand Down Expand Up @@ -425,12 +426,44 @@ case class FileSourceScanExec(
fsRelation: HadoopFsRelation): RDD[InternalRow] = {
val defaultMaxSplitBytes =
fsRelation.sparkSession.sessionState.conf.filesMaxPartitionBytes
val openCostInBytes = fsRelation.sparkSession.sessionState.conf.filesOpenCostInBytes
var openCostInBytes = fsRelation.sparkSession.sessionState.conf.filesOpenCostInBytes
val defaultParallelism = fsRelation.sparkSession.sparkContext.defaultParallelism
val totalBytes = selectedPartitions.flatMap(_.files.map(_.getLen + openCostInBytes)).sum
val bytesPerCore = totalBytes / defaultParallelism

val maxSplitBytes = Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore))
var maxSplitBytes = Math.min(defaultMaxSplitBytes, Math.max(openCostInBytes, bytesPerCore))

if(fsRelation.sparkSession.sessionState.conf.isColumnarStorageSplitSizeAdaptiveEnabled &&
(fsRelation.fileFormat.isInstanceOf[ParquetSource] ||
fsRelation.fileFormat.isInstanceOf[OrcFileFormat])) {
if (relation.dataSchema.map(_.dataType).forall(dataType =>
dataType.isInstanceOf[CalendarIntervalType] || dataType.isInstanceOf[StructType]
|| dataType.isInstanceOf[MapType] || dataType.isInstanceOf[NullType]
|| dataType.isInstanceOf[AtomicType] || dataType.isInstanceOf[ArrayType])) {

def getTypeLength(dataType: DataType): Int = {
if (dataType.isInstanceOf[StructType]) {
fsRelation.sparkSession.sessionState.conf.columnarStructTypeLength
} else if (dataType.isInstanceOf[ArrayType]) {
fsRelation.sparkSession.sessionState.conf.columnarArrayTypeLength
} else if (dataType.isInstanceOf[MapType]) {
fsRelation.sparkSession.sessionState.conf.columnarMapTypeLength
} else {
dataType.defaultSize
}
}

val selectedColumnSize = requiredSchema.map(_.dataType).map(getTypeLength(_))
.reduceOption(_ + _).getOrElse(StringType.defaultSize)
val totalColumnSize = relation.dataSchema.map(_.dataType).map(getTypeLength(_))
.reduceOption(_ + _).getOrElse(StringType.defaultSize)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The type based estimation is very rough. This is still hard for end users to decide the initial size.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gatorsmile The goal of this change is not to make it easier for users to set the partition size. Instead, when a user sets the partition size, this change tries its best to make sure the read size is close to the value the user set. Without this change, when a user sets the partition size to 128 MB, the actual read size may be 1 MB or even smaller because of column pruning.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think his point is that the estimation is super rough which I agree with .. I am less sure if we should go ahead or not partially by this reason as well.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@HyukjinKwon I agree that the estimation is rough, especially for complex types. For AtomicType it works better, and at least it takes column pruning into consideration.

val multiplier = totalColumnSize / selectedColumnSize
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems here you can only get the ratio of selected columns to total columns. The actual type sizes are not put into consideration.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are many data types: CalendarIntervalType, StructType, MapType, NullType, UserDefinedType, AtomicType (TimestampType, StringType, HiveStringType, BooleanType, DateType, BinaryType, NumericType), ObjectType, and ArrayType. For AtomicType, the size is fixed to the defaultSize. For complex types, such as StructType, MapType, and ArrayType, the size is variable, so I made it configurable with a default value. With the data type sizes, the multiplier is not merely the ratio of the number of selected columns to the total number of columns, but the ratio of the total size of the selected columns to the total size of all columns.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya As defined in getTypeLength, user can define the complex types' length as per the data statistics. And the length for AtomicType can be determined by AtomicType.defaultSize. So the multiplier is the ratio of the total length of the selected columns to the total length of all columns.

def getTypeLength (dataType : DataType) : Int = {
if (dataType.isInstanceOf[StructType]) {
fsRelation.sparkSession.sessionState.conf.parquetStructTypeLength
} else if (dataType.isInstanceOf[ArrayType]) {
fsRelation.sparkSession.sessionState.conf.parquetArrayTypeLength
} else if (dataType.isInstanceOf[MapType]) {
fsRelation.sparkSession.sessionState.conf.parquetMapTypeLength
} else {
dataType.defaultSize
}
}

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@viirya Now it also support ORC. Please help to review

maxSplitBytes = maxSplitBytes * multiplier
openCostInBytes = openCostInBytes * multiplier
}
}


logInfo(s"Planning scan with bin packing, max size: $maxSplitBytes bytes, " +
s"open cost is considered as scanning $openCostInBytes bytes.")

Expand Down