From 112ce2d34d0d039711777351c1ab8e74629fc8e6 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 20 Mar 2018 16:30:44 +0100
Subject: [PATCH 01/48] Checks column names are compatible to provided schema

---
 .../sql/execution/datasources/csv/CSVSuite.scala    | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 4398e547d9217..772e66e0e7ffa 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1279,4 +1279,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil
     )
   }
+
+  test("Check column names during schema validation") {
+    withTempPath { path =>
+      import collection.JavaConverters._
+      val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
+      val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
+      odf.write.option("header", "true").csv(path.getCanonicalPath)
+      val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
+      val idf = spark.read.schema(ischema).option("header", "true").csv(path.getCanonicalPath)
+
+      checkAnswer(idf.select('f1), odf.select('f1))
+    }
+  }
 }

From a85ccce23c3c5ee69ff321303ad830c71dd05931 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 20 Mar 2018 21:51:03 +0100
Subject: [PATCH 02/48] Checking header is matched to schema in per-line mode

---
 .../datasources/csv/CSVDataSource.scala       | 21 +++++++++++++-----
 .../datasources/csv/CSVFileFormat.scala       |  3 ++-
 .../datasources/csv/UnivocityParser.scala     | 22 +++++++++++++++++++
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 4870d75fc5f08..f7be8a2c764dc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -50,7 +50,9 @@ abstract class CSVDataSource extends Serializable {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow]
+      schema: StructType, // Schema of projection
+      dataSchema: StructType // Schema of data in csv files
+  ): Iterator[InternalRow]
 
   /**
    * Infers the schema from `inputPaths` files.
@@ -127,7 +129,8 @@ object TextInputCSVDataSource extends CSVDataSource {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow] = {
+      schema: StructType,
+      dataSchema: StructType): Iterator[InternalRow] = {
     val lines = {
       val linesReader = new HadoopFileLinesReader(file, conf)
       Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => linesReader.close()))
@@ -136,8 +139,15 @@ object TextInputCSVDataSource extends CSVDataSource {
       }
     }
 
-    val shouldDropHeader = parser.options.headerFlag && file.start == 0
-    UnivocityParser.parseIterator(lines, shouldDropHeader, parser, schema)
+    val hasHeader = parser.options.headerFlag && file.start == 0
+    if (hasHeader) {
+      lines.take(1).foreach { header =>
+        UnivocityParser.checkHeaderBySchema(parser, dataSchema, header)
+      }
+    }
+
+    UnivocityParser.parseIterator(
+      conf, lines, shouldDropHeader = hasHeader, parser, schema, file.filePath)
   }
 
   override def infer(
@@ -204,7 +214,8 @@ object MultiLineCSVDataSource extends CSVDataSource {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow] = {
+      schema: StructType,
+      dataSchema: StructType): Iterator[InternalRow] = {
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
       parser.options.headerFlag,
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
index e20977a4ec79f..4888d7632ab9c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
@@ -129,7 +129,8 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
         StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         parsedOptions)
-      CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema)
+      CSVDataSource(parsedOptions)
+        .readFile(conf, file, parser, requiredSchema, dataSchema)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 3d6cc30f2ba83..4842c62a5d852 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -312,4 +312,26 @@ private[csv] object UnivocityParser {
       parser.options.columnNameOfCorruptRecord)
     filteredLines.flatMap(safeParser.parse)
   }
+
+  def checkHeaderBySchema(
+      parser: UnivocityParser,
+      schema: StructType,
+      header: String
+  ): Unit = {
+    val columnNames = parser.tokenizer.parseLine(header)
+    val fieldNames = schema.map(_.name)
+    val isMatched = fieldNames.zip(columnNames).forall { pair =>
+      val (nameInSchema, nameInHeader) = pair
+      nameInSchema == nameInHeader
+    }
+    if (!isMatched) {
+      throw new IllegalArgumentException(
+        s"""
+           |Fields in the header of csv file are not matched to field names of the schema:
+           | Header: ${header}
+           | Schema: ${fieldNames.mkString(",")}
+           |""".stripMargin
+      )
+    }
+  }
 }

From 75e15345b6a5a9e807375fdf465dccfce4ea62c7 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 20 Mar 2018 22:36:56 +0100
Subject: [PATCH 03/48] Extract header and check that it is matched to schema

---
 .../datasources/csv/CSVDataSource.scala       |  6 ++++-
 .../execution/datasources/csv/CSVUtils.scala  | 22 +++++++++++--------
 .../datasources/csv/UnivocityParser.scala     | 11 +---------
 .../execution/datasources/csv/CSVSuite.scala  | 13 ++++++++---
 4 files changed, 29 insertions(+), 23 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index f7be8a2c764dc..764ee8a118d31 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -141,7 +141,11 @@ object TextInputCSVDataSource extends CSVDataSource {
 
     val hasHeader = parser.options.headerFlag && file.start == 0
     if (hasHeader) {
-      lines.take(1).foreach { header =>
+      // Checking header field names are matched to fields in schema.
+      // The header is removed from lines.
+      // Note: if there are only comments in the first block, the header would probably
+      // be not extracted.
+      CSVUtils.extractHeader(lines, parser.options).foreach { header =>
         UnivocityParser.checkHeaderBySchema(parser, dataSchema, header)
       }
     }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
index 72b053d2092ca..3a2c5e6653615 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala
@@ -67,12 +67,8 @@ object CSVUtils {
     }
   }
 
-  /**
-   * Drop header line so that only data can remain.
-   * This is similar with `filterHeaderLine` above and currently being used in CSV reading path.
-   */
-  def dropHeaderLine(iter: Iterator[String], options: CSVOptions): Iterator[String] = {
-    val nonEmptyLines = if (options.isCommentSet) {
+  def skipComments(iter: Iterator[String], options: CSVOptions): Iterator[String] = {
+    if (options.isCommentSet) {
       val commentPrefix = options.comment.toString
       iter.dropWhile { line =>
         line.trim.isEmpty || line.trim.startsWith(commentPrefix)
@@ -80,11 +76,19 @@ object CSVUtils {
     } else {
       iter.dropWhile(_.trim.isEmpty)
     }
-
-    if (nonEmptyLines.hasNext) nonEmptyLines.drop(1)
-    iter
   }
 
+  /**
+   * Extracts header and moves iterator forward so that only data remains in it
+   */
+  def extractHeader(iter: Iterator[String], options: CSVOptions): Option[String] = {
+    val nonEmptyLines = skipComments(iter, options)
+    if (nonEmptyLines.hasNext) {
+      Some(nonEmptyLines.next())
+    } else {
+      None
+    }
+  }
   /**
    * Helper method that converts string representation of a character to actual character.
    * It handles some Java escaped strings and throws exception if given string is longer than one
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 4842c62a5d852..9516b0cb72c67 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -289,21 +289,12 @@ private[csv] object UnivocityParser {
    */
   def parseIterator(
       lines: Iterator[String],
-      shouldDropHeader: Boolean,
       parser: UnivocityParser,
       schema: StructType): Iterator[InternalRow] = {
     val options = parser.options
 
-    val linesWithoutHeader = if (shouldDropHeader) {
-      // Note that if there are only comments in the first block, the header would probably
-      // be not dropped.
-      CSVUtils.dropHeaderLine(lines, options)
-    } else {
-      lines
-    }
-
     val filteredLines: Iterator[String] =
-      CSVUtils.filterCommentAndEmpty(linesWithoutHeader, options)
+      CSVUtils.filterCommentAndEmpty(lines, options)
 
     val safeParser = new FailureSafeParser[String](
       input => Seq(parser.parse(input)),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 772e66e0e7ffa..35245583026c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1287,9 +1287,16 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
       odf.write.option("header", "true").csv(path.getCanonicalPath)
       val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
-      val idf = spark.read.schema(ischema).option("header", "true").csv(path.getCanonicalPath)
-
-      checkAnswer(idf.select('f1), odf.select('f1))
+      val exception = intercept[SparkException] {
+         spark.read
+           .schema(ischema)
+           .option("header", "true")
+           .csv(path.getCanonicalPath)
+           .collect()
+      }
+      assert(exception.getMessage.contains(
+        "Fields in the header of csv file are not matched to field names of the schema"
+      ))
     }
   }
 }

From 8eb45b8b634ba2c9b641de12e09f17c63240ccc4 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 21 Mar 2018 11:57:30 +0100
Subject: [PATCH 04/48] Checking column names in header in multiLine mode

---
 .../datasources/csv/CSVDataSource.scala       | 13 ++--
 .../datasources/csv/CSVOptions.scala          |  2 +-
 .../datasources/csv/UnivocityParser.scala     | 62 ++++++++++++-------
 .../execution/datasources/csv/CSVSuite.scala  | 37 ++++++-----
 4 files changed, 70 insertions(+), 44 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 764ee8a118d31..51255efc86470 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -145,9 +145,8 @@ object TextInputCSVDataSource extends CSVDataSource {
       // The header is removed from lines.
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
-      CSVUtils.extractHeader(lines, parser.options).foreach { header =>
-        UnivocityParser.checkHeaderBySchema(parser, dataSchema, header)
-      }
+      val checkHeader = UnivocityParser.checkHeader(parser, dataSchema, _: String)
+      CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_))
     }
 
     UnivocityParser.parseIterator(
@@ -224,7 +223,10 @@ object MultiLineCSVDataSource extends CSVDataSource {
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
       parser.options.headerFlag,
       parser,
-      schema)
+      schema,
+      file.filePath,
+      checkHeader = UnivocityParser.checkHeaderColumnNames(parser, dataSchema, _)
+    )
   }
 
   override def infer(
@@ -232,11 +234,13 @@ object MultiLineCSVDataSource extends CSVDataSource {
       inputPaths: Seq[FileStatus],
       parsedOptions: CSVOptions): StructType = {
     val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
+    val checkHeader = (_: Array[String]) => ()
     csv.flatMap { lines =>
       val path = new Path(lines.getPath())
       UnivocityParser.tokenizeStream(
         CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path),
         shouldDropHeader = false,
+        checkHeader,
         new CsvParser(parsedOptions.asParserSettings))
     }.take(1).headOption match {
       case Some(firstRow) =>
@@ -248,6 +252,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
               lines.getConfiguration,
               new Path(lines.getPath())),
             parsedOptions.headerFlag,
+            checkHeader,
             new CsvParser(parsedOptions.asParserSettings))
         }
         CSVInferSchema.infer(tokenRDD, header, parsedOptions)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index c16790630ce17..d5a5a73c45268 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -149,7 +149,7 @@ class CSVOptions(
   val inputBufferSize = 128
 
   val isCommentSet = this.comment != '\u0000'
-
+  
   def asWriterSettings: CsvWriterSettings = {
     val writerSettings = new CsvWriterSettings()
     val format = writerSettings.getFormat
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 9516b0cb72c67..75f6b2f25bcce 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -237,8 +237,9 @@ private[csv] object UnivocityParser {
   def tokenizeStream(
       inputStream: InputStream,
       shouldDropHeader: Boolean,
+      checkHeader: Array[String] => Unit,
       tokenizer: CsvParser): Iterator[Array[String]] = {
-    convertStream(inputStream, shouldDropHeader, tokenizer)(tokens => tokens)
+    convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader)(tokens => tokens)
   }
 
   /**
@@ -248,14 +249,16 @@ private[csv] object UnivocityParser {
       inputStream: InputStream,
       shouldDropHeader: Boolean,
       parser: UnivocityParser,
-      schema: StructType): Iterator[InternalRow] = {
+      schema: StructType,
+      filePath: String,
+      checkHeader: Array[String] => Unit): Iterator[InternalRow] = {
     val tokenizer = parser.tokenizer
     val safeParser = new FailureSafeParser[Array[String]](
       input => Seq(parser.convert(input)),
       parser.options.parseMode,
       schema,
       parser.options.columnNameOfCorruptRecord)
-    convertStream(inputStream, shouldDropHeader, tokenizer) { tokens =>
+    convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens =>
       safeParser.parse(tokens)
     }.flatten
   }
@@ -263,11 +266,14 @@ private[csv] object UnivocityParser {
   private def convertStream[T](
       inputStream: InputStream,
       shouldDropHeader: Boolean,
-      tokenizer: CsvParser)(convert: Array[String] => T) = new Iterator[T] {
+      tokenizer: CsvParser,
+      checkHeader: Array[String] => Unit
+  )(convert: Array[String] => T) = new Iterator[T] {
     tokenizer.beginParsing(inputStream)
     private var nextRecord = {
       if (shouldDropHeader) {
-        tokenizer.parseNext()
+        val header = tokenizer.parseNext()
+        checkHeader(header)
       }
       tokenizer.parseNext()
     }
@@ -304,25 +310,35 @@ private[csv] object UnivocityParser {
     filteredLines.flatMap(safeParser.parse)
   }
 
-  def checkHeaderBySchema(
-      parser: UnivocityParser,
-      schema: StructType,
-      header: String
+  def checkHeaderColumnNames(
+    parser: UnivocityParser,
+    schema: StructType,
+    columnNames: Array[String]
   ): Unit = {
-    val columnNames = parser.tokenizer.parseLine(header)
-    val fieldNames = schema.map(_.name)
-    val isMatched = fieldNames.zip(columnNames).forall { pair =>
-      val (nameInSchema, nameInHeader) = pair
-      nameInSchema == nameInHeader
-    }
-    if (!isMatched) {
-      throw new IllegalArgumentException(
-        s"""
-           |Fields in the header of csv file are not matched to field names of the schema:
-           | Header: ${header}
-           | Schema: ${fieldNames.mkString(",")}
-           |""".stripMargin
-      )
+    if (columnNames != null) {
+      val fieldNames = schema.map(_.name)
+      val isMatched = fieldNames.zip(columnNames).forall { pair =>
+        val (nameInSchema, nameInHeader) = pair
+        nameInSchema == nameInHeader
+      }
+      if (!isMatched) {
+        throw new IllegalArgumentException(
+          s"""
+             |Fields in the header of csv file are not matched to field names of the schema:
+             | Header: ${columnNames.mkString(",")}
+
+             | Schema: ${fieldNames.mkString(",")}
+
+
+             |""".
+            stripMargin
+        )
+      }
     }
   }
+
+  def checkHeader(parser: UnivocityParser, schema: StructType, header: String): Unit = {
+    val columnNames = parser.tokenizer.parseLine(header)
+    checkHeaderColumnNames(parser, schema, columnNames)
+  }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 35245583026c1..1007a85d3b1a3 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1280,23 +1280,28 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     )
   }
 
-  test("Check column names during schema validation") {
-    withTempPath { path =>
-      import collection.JavaConverters._
-      val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
-      val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
-      odf.write.option("header", "true").csv(path.getCanonicalPath)
-      val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
-      val exception = intercept[SparkException] {
-         spark.read
-           .schema(ischema)
-           .option("header", "true")
-           .csv(path.getCanonicalPath)
-           .collect()
+  def checkHeader(multiLine: String): Unit = {
+    test(s"Check column names during schema validation - multiLine = $multiLine") {
+      withTempPath { path =>
+        import collection.JavaConverters._
+        val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
+        val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
+        odf.write.option("header", "true").csv(path.getCanonicalPath)
+        val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
+        val exception = intercept[SparkException] {
+          spark.read
+            .schema(ischema)
+            .option("header", "true")
+            .option("multiLine", multiLine)
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(exception.getMessage.contains(
+          "Fields in the header of csv file are not matched to field names of the schema"
+        ))
       }
-      assert(exception.getMessage.contains(
-        "Fields in the header of csv file are not matched to field names of the schema"
-      ))
     }
   }
+
+  List("false", "true").foreach(checkHeader(_))
 }

From 9b1a9862531b8d3fb3cffce75126413ca9a844b9 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 21 Mar 2018 12:13:17 +0100
Subject: [PATCH 05/48] Adding the checkHeader option with true by default

---
 .../execution/datasources/csv/CSVOptions.scala   |  4 +++-
 .../datasources/csv/UnivocityParser.scala        | 16 +++++-----------
 .../sql/execution/datasources/csv/CSVSuite.scala |  3 ++-
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index d5a5a73c45268..1f7986fb59864 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -149,7 +149,9 @@ class CSVOptions(
   val inputBufferSize = 128
 
   val isCommentSet = this.comment != '\u0000'
-  
+
+  val checkHeader = getBool("checkHeader", true)
+
   def asWriterSettings: CsvWriterSettings = {
     val writerSettings = new CsvWriterSettings()
     val format = writerSettings.getFormat
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 75f6b2f25bcce..89815bbd56518 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -315,7 +315,7 @@ private[csv] object UnivocityParser {
     schema: StructType,
     columnNames: Array[String]
   ): Unit = {
-    if (columnNames != null) {
+    if (parser.options.checkHeader && columnNames != null) {
       val fieldNames = schema.map(_.name)
       val isMatched = fieldNames.zip(columnNames).forall { pair =>
         val (nameInSchema, nameInHeader) = pair
@@ -323,22 +323,16 @@ private[csv] object UnivocityParser {
       }
       if (!isMatched) {
         throw new IllegalArgumentException(
-          s"""
-             |Fields in the header of csv file are not matched to field names of the schema:
-             | Header: ${columnNames.mkString(",")}
-
-             | Schema: ${fieldNames.mkString(",")}
-
-
-             |""".
-            stripMargin
+          s"""|Fields in the header of csv file are not matched to field names of the schema:
+              | Header: ${columnNames.mkString(",")}
+              | Schema: ${fieldNames.mkString(",")}""".stripMargin
         )
       }
     }
   }
 
   def checkHeader(parser: UnivocityParser, schema: StructType, header: String): Unit = {
-    val columnNames = parser.tokenizer.parseLine(header)
+    lazy val columnNames = parser.tokenizer.parseLine(header)
     checkHeaderColumnNames(parser, schema, columnNames)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 1007a85d3b1a3..42d2cd99c40ac 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1291,8 +1291,9 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
         val exception = intercept[SparkException] {
           spark.read
             .schema(ischema)
-            .option("header", "true")
             .option("multiLine", multiLine)
+            .option("header", "true")
+            .option("checkHeader", "true")
             .csv(path.getCanonicalPath)
             .collect()
         }

From 64426332b2ab42a1cd9c5a05a77e90332572bbec Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 21 Mar 2018 12:25:31 +0100
Subject: [PATCH 06/48] Fix csv test by changing headers or disabling header
 checking

---
 .../sql/execution/datasources/csv/CSVSuite.scala     | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 42d2cd99c40ac..1be8e79305d35 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -252,7 +252,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
           |(yearMade double, makeName string, modelName string, priceTag decimal,
           | comments string, grp string)
           |USING csv
-          |OPTIONS (path "${testFile(carsTsvFile)}", header "true", delimiter "\t")
+          |OPTIONS (
+          |  path "${testFile(carsTsvFile)}",
+          |  header "true", checkHeader "false",
+          |   delimiter "\t"
+          |)
          """.stripMargin.replaceAll("\n", " "))
 
       assert(
@@ -275,7 +279,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
   test("test for blank column names on read and select columns") {
     val cars = spark.read
       .format("csv")
-      .options(Map("header" -> "true", "inferSchema" -> "true"))
+      .options(Map("header" -> "true", "checkHeader" -> "false", "inferSchema" -> "true"))
       .load(testFile(carsBlankColName))
 
     assert(cars.select("customer").collect().size == 2)
@@ -348,7 +352,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       spark.sql(
         s"""
           |CREATE TEMPORARY VIEW carsTable
-          |(yearMade double, makeName string, modelName string, comments string, blank string)
+          |(year double, make string, model string, comment string, blank string)
           |USING csv
           |OPTIONS (path "${testFile(carsFile)}", header "true")
          """.stripMargin.replaceAll("\n", " "))
@@ -356,7 +360,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       val cars = spark.table("carsTable")
       verifyCars(cars, withHeader = true, checkHeader = false, checkValues = false)
       assert(
-        cars.schema.fieldNames === Array("yearMade", "makeName", "modelName", "comments", "blank"))
+        cars.schema.fieldNames === Array("year", "make", "model", "comment", "blank"))
     }
   }
 

From 9440d8a5c097a1d8e111b397fbda9e54751b7a84 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 21 Mar 2018 12:36:21 +0100
Subject: [PATCH 07/48] Adding comment for the checkHeader option

---
 .../spark/sql/execution/datasources/csv/CSVOptions.scala      | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 1f7986fb59864..2030c05b9ff86 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -150,6 +150,10 @@ class CSVOptions(
 
   val isCommentSet = this.comment != '\u0000'
 
+  /**
+   * The option enables checks of headers in csv files. In particular, column names
+   * are matched to field names of provided schema.
+   */
   val checkHeader = getBool("checkHeader", true)
 
   def asWriterSettings: CsvWriterSettings = {

From 9f91ce73c5c313a9c51067a81e395e9385016ec5 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 21 Mar 2018 12:42:48 +0100
Subject: [PATCH 08/48] Added comments

---
 .../spark/sql/execution/datasources/csv/CSVDataSource.scala    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 51255efc86470..3f9cbc0ed0838 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -141,7 +141,7 @@ object TextInputCSVDataSource extends CSVDataSource {
 
     val hasHeader = parser.options.headerFlag && file.start == 0
     if (hasHeader) {
-      // Checking header field names are matched to fields in schema.
+      // Checking that column names in the header are matched to field names of the schema.
       // The header is removed from lines.
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
@@ -234,6 +234,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
       inputPaths: Seq[FileStatus],
       parsedOptions: CSVOptions): StructType = {
     val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
+    // The header is not checked because there is no schema against with it could be check
     val checkHeader = (_: Array[String]) => ()
     csv.flatMap { lines =>
       val path = new Path(lines.getPath())

From 0878f7aad3c074e63ac3ab1d6e471ce8b988f278 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 21 Mar 2018 13:09:20 +0100
Subject: [PATCH 09/48] Adding a space between column names

---
 .../spark/sql/execution/datasources/csv/CSVDataSource.scala   | 2 +-
 .../spark/sql/execution/datasources/csv/UnivocityParser.scala | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 3f9cbc0ed0838..dc1306716fd87 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -142,7 +142,7 @@ object TextInputCSVDataSource extends CSVDataSource {
     val hasHeader = parser.options.headerFlag && file.start == 0
     if (hasHeader) {
       // Checking that column names in the header are matched to field names of the schema.
-      // The header is removed from lines.
+      // The header will be removed from lines.
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
       val checkHeader = UnivocityParser.checkHeader(parser, dataSchema, _: String)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 89815bbd56518..1e93ac5b6ba46 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -324,8 +324,8 @@ private[csv] object UnivocityParser {
       if (!isMatched) {
         throw new IllegalArgumentException(
           s"""|Fields in the header of csv file are not matched to field names of the schema:
-              | Header: ${columnNames.mkString(",")}
-              | Schema: ${fieldNames.mkString(",")}""".stripMargin
+              | Header: ${columnNames.mkString(", ")}
+              | Schema: ${fieldNames.mkString(", ")}""".stripMargin
         )
       }
     }

From a341dd79c976df59fc8bffb272449973a09b86fe Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 21 Mar 2018 16:15:14 +0100
Subject: [PATCH 10/48] Fix a test: checking name duplication in schemas

---
 .../org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
index 14b1feb2adc20..39c05a88fa132 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
@@ -764,7 +764,8 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
           // If `inferSchema` is true, a CSV format is duplicate-safe (See SPARK-16896)
           var testDir = Utils.createTempDir(src.getAbsolutePath)
           Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text(testDir.getAbsolutePath)
-          val df = spark.read.format("csv").option("inferSchema", true).option("header", true)
+          val df = spark.read.format("csv").option("inferSchema", true)
+            .option("header", true).option("checkHeader", "false")
             .load(testDir.getAbsolutePath)
           checkAnswer(df, Row(1, 1))
           checkReadPartitionColumnDuplication("csv", c0, c1, src)

From 98c27eaa80cf3fae11092d78f22122688e4041a4 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 23 Mar 2018 22:04:57 +0100
Subject: [PATCH 11/48] Fixing the test and adding ticket number to test's
 title

---
 .../spark/sql/execution/datasources/csv/CSVDataSource.scala    | 3 +--
 .../spark/sql/execution/datasources/csv/UnivocityParser.scala  | 1 +
 .../apache/spark/sql/execution/datasources/csv/CSVSuite.scala  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index dc1306716fd87..3e1ce82f3aab4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -149,8 +149,7 @@ object TextInputCSVDataSource extends CSVDataSource {
       CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_))
     }
 
-    UnivocityParser.parseIterator(
-      conf, lines, shouldDropHeader = hasHeader, parser, schema, file.filePath)
+    UnivocityParser.parseIterator(lines, parser, schema)
   }
 
   override def infer(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 1e93ac5b6ba46..74df38dcc8d5a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -307,6 +307,7 @@ private[csv] object UnivocityParser {
       parser.options.parseMode,
       schema,
       parser.options.columnNameOfCorruptRecord)
+
     filteredLines.flatMap(safeParser.parse)
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 1be8e79305d35..53b9b4b4b830a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1285,7 +1285,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
   }
 
   def checkHeader(multiLine: String): Unit = {
-    test(s"Check column names during schema validation - multiLine = $multiLine") {
+    test(s"SPARK-23786: Checking column names against schema ($multiLine)") {
       withTempPath { path =>
         import collection.JavaConverters._
         val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)

From 811df6fa7b17ff12bdd70318cf330a0f54815397 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 23 Mar 2018 22:10:20 +0100
Subject: [PATCH 12/48] Refactoring - removing unneeded parameter

---
 .../spark/sql/execution/datasources/csv/CSVDataSource.scala    | 1 -
 .../spark/sql/execution/datasources/csv/CSVFileFormat.scala    | 3 +--
 .../spark/sql/execution/datasources/csv/UnivocityParser.scala  | 1 -
 3 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 3e1ce82f3aab4..d2996a27d06bd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -223,7 +223,6 @@ object MultiLineCSVDataSource extends CSVDataSource {
       parser.options.headerFlag,
       parser,
       schema,
-      file.filePath,
       checkHeader = UnivocityParser.checkHeaderColumnNames(parser, dataSchema, _)
     )
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
index 4888d7632ab9c..37ed8d5e017d8 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
@@ -129,8 +129,7 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
         StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         parsedOptions)
-      CSVDataSource(parsedOptions)
-        .readFile(conf, file, parser, requiredSchema, dataSchema)
+      CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema)
     }
   }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 74df38dcc8d5a..beef78946e73f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -250,7 +250,6 @@ private[csv] object UnivocityParser {
       shouldDropHeader: Boolean,
       parser: UnivocityParser,
       schema: StructType,
-      filePath: String,
       checkHeader: Array[String] => Unit): Iterator[InternalRow] = {
     val tokenizer = parser.tokenizer
     val safeParser = new FailureSafeParser[Array[String]](

From 691cfbcf1c6c926bb0ed666db59af6070b6a1372 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sun, 25 Mar 2018 15:01:44 +0200
Subject: [PATCH 13/48] Output filename in the exception

---
 .../datasources/csv/CSVDataSource.scala         | 17 ++++++++++++++---
 .../datasources/csv/UnivocityParser.scala       | 15 +++++++++++----
 2 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index d2996a27d06bd..75af28d06dada 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -145,7 +145,12 @@ object TextInputCSVDataSource extends CSVDataSource {
       // The header will be removed from lines.
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
-      val checkHeader = UnivocityParser.checkHeader(parser, dataSchema, _: String)
+      val checkHeader = UnivocityParser.checkHeader(
+        parser,
+        dataSchema,
+        _: String,
+        file.filePath
+      )
       CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_))
     }
 
@@ -218,13 +223,19 @@ object MultiLineCSVDataSource extends CSVDataSource {
       parser: UnivocityParser,
       schema: StructType,
       dataSchema: StructType): Iterator[InternalRow] = {
+    val checkHeader = UnivocityParser.checkHeaderColumnNames(
+      parser,
+      dataSchema,
+      _: Array[String],
+      file.filePath
+    )
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
       parser.options.headerFlag,
       parser,
       schema,
-      checkHeader = UnivocityParser.checkHeaderColumnNames(parser, dataSchema, _)
-    )
+      file.filePath,
+      checkHeader)
   }
 
   override def infer(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index beef78946e73f..a10548f2dfd2f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -313,7 +313,8 @@ private[csv] object UnivocityParser {
   def checkHeaderColumnNames(
     parser: UnivocityParser,
     schema: StructType,
-    columnNames: Array[String]
+    columnNames: Array[String],
+    fileName: String
   ): Unit = {
     if (parser.options.checkHeader && columnNames != null) {
       val fieldNames = schema.map(_.name)
@@ -325,14 +326,20 @@ private[csv] object UnivocityParser {
         throw new IllegalArgumentException(
           s"""|Fields in the header of csv file are not matched to field names of the schema:
               | Header: ${columnNames.mkString(", ")}
-              | Schema: ${fieldNames.mkString(", ")}""".stripMargin
+              | Schema: ${fieldNames.mkString(", ")}
+              |CSV file: $fileName""".stripMargin
         )
       }
     }
   }
 
-  def checkHeader(parser: UnivocityParser, schema: StructType, header: String): Unit = {
+  def checkHeader(
+      parser: UnivocityParser,
+      schema: StructType,
+      header: String,
+      fileName: String
+  ): Unit = {
     lazy val columnNames = parser.tokenizer.parseLine(header)
-    checkHeaderColumnNames(parser, schema, columnNames)
+    checkHeaderColumnNames(parser, schema, columnNames, fileName)
   }
 }

From efb01051a59c948e943d211abadff08d3dac6acf Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sun, 25 Mar 2018 16:53:37 +0200
Subject: [PATCH 14/48] PySpark: adding a test and checkHeader parameter

---
 python/pyspark/sql/readwriter.py |  8 ++++++--
 python/pyspark/sql/tests.py      | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index e5288636c596e..9845ce8d45512 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -335,7 +335,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
             negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
             maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
-            columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None):
+            columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
+            checkHeader=None):
         """Loads a CSV file and returns the result as a  :class:`DataFrame`.
 
         This function will go through the input once to determine the input schema if
@@ -360,6 +361,9 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                         character. By default (None), it is disabled.
         :param header: uses the first line as names of columns. If None is set, it uses the
                        default value, ``false``.
+        :param checkHeader: compares column names in the header with field names in the schema
+                            and outputs an error if names are not matched.
+                            If None is set, it uses the default value, ``true``.
         :param inferSchema: infers the input schema automatically from data. It requires one extra
                        pass over the data. If None is set, it uses the default value, ``false``.
         :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from
@@ -436,7 +440,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             maxCharsPerColumn=maxCharsPerColumn,
             maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,
             columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine,
-            charToEscapeQuoteEscaping=charToEscapeQuoteEscaping)
+            charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, checkHeader=checkHeader)
         if isinstance(path, basestring):
             path = [path]
         if type(path) == list:
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 967cc83166f3f..5bf1759690fe7 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -2974,6 +2974,21 @@ def test_create_dateframe_from_pandas_with_dst(self):
                 os.environ['TZ'] = orig_env_tz
             time.tzset()
 
+    def test_checking_csv_header(self):
+        tmpPath = tempfile.mkdtemp()
+        shutil.rmtree(tmpPath)
+        self.spark.createDataFrame([[1, 1000],[2000, 2]]).\
+            toDF('f1', 'f2').write.option("header", "true").csv(tmpPath)
+        schema = StructType([
+            StructField('f2', IntegerType(), nullable=True),
+            StructField('f1', IntegerType(), nullable=True)])
+        df = self.spark.read.option('header', 'true').schema(schema).csv(tmpPath)
+        self.assertRaisesRegexp(
+            Exception,
+            "Fields in the header of csv file are not matched to field names of the schema",
+            lambda: df.collect())
+        shutil.rmtree(tmpPath)
+
 
 class HiveSparkSubmitTests(SparkSubmitTests):
 

From c9f5e1491e82455abb1baecec86d4ad91dd6fabf Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sun, 25 Mar 2018 17:27:52 +0200
Subject: [PATCH 15/48] Removing unneeded parameter - fileName

---
 .../spark/sql/execution/datasources/csv/CSVDataSource.scala      | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 75af28d06dada..ae08c97975fb7 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -234,7 +234,6 @@ object MultiLineCSVDataSource extends CSVDataSource {
       parser.options.headerFlag,
       parser,
       schema,
-      file.filePath,
       checkHeader)
   }
 

From e1958388678534a424931cfea08c63ee2c3df102 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sun, 25 Mar 2018 17:28:50 +0200
Subject: [PATCH 16/48] Fix for pycodestyle checks

---
 python/pyspark/sql/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 5bf1759690fe7..13804078574b9 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -2977,7 +2977,7 @@ def test_create_dateframe_from_pandas_with_dst(self):
     def test_checking_csv_header(self):
         tmpPath = tempfile.mkdtemp()
         shutil.rmtree(tmpPath)
-        self.spark.createDataFrame([[1, 1000],[2000, 2]]).\
+        self.spark.createDataFrame([[1, 1000], [2000, 2]]).\
             toDF('f1', 'f2').write.option("header", "true").csv(tmpPath)
         schema = StructType([
             StructField('f2', IntegerType(), nullable=True),

From d6d370d98f3f12c9c53fd784c0b6e8f9d3d28f54 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sun, 25 Mar 2018 17:32:17 +0200
Subject: [PATCH 17/48] Adding description of the checkHeader option

---
 .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala   | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 1a5e47508c070..4f04bd7ddc0c5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -524,6 +524,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`comment` (default empty string): sets a single character used for skipping lines
    * beginning with this character. By default, it is disabled.</li>
    * <li>`header` (default `false`): uses the first line as names of columns.</li>
+   * <li>`checkHeader` (default `true`): compares column names in the header with field names
+   * in the schema and outputs an error if names are not matched.</li>
    * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
    * requires one extra pass over the data.</li>
    * <li>`ignoreLeadingWhiteSpace` (default `false`): a flag indicating whether or not leading

From acd6d2e1c381936f8982e70aac6c85d9e85de63e Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Mon, 26 Mar 2018 22:44:56 +0200
Subject: [PATCH 18/48] Improving error messages and handling the case when
 header size is not equal to schema size

---
 .../datasources/csv/UnivocityParser.scala     | 38 ++++++++++++++-----
 .../execution/datasources/csv/CSVSuite.scala  | 34 ++++++++++++++++-
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index a10548f2dfd2f..7b43e9a0533ff 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -317,19 +317,37 @@ private[csv] object UnivocityParser {
     fileName: String
   ): Unit = {
     if (parser.options.checkHeader && columnNames != null) {
-      val fieldNames = schema.map(_.name)
-      val isMatched = fieldNames.zip(columnNames).forall { pair =>
-        val (nameInSchema, nameInHeader) = pair
-        nameInSchema == nameInHeader
-      }
-      if (!isMatched) {
-        throw new IllegalArgumentException(
-          s"""|Fields in the header of csv file are not matched to field names of the schema:
-              | Header: ${columnNames.mkString(", ")}
-              | Schema: ${fieldNames.mkString(", ")}
+      val fieldNames = schema.map(_.name).toIndexedSeq
+      val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
+      var error: Option[String] = None
+
+      if (headerLen == schemaSize) {
+        var i = 0
+        while (error.isEmpty && i < headerLen) {
+          val nameInSchema = fieldNames(i).toLowerCase
+          val nameInHeader = columnNames(i).toLowerCase
+          if (nameInHeader != nameInSchema) {
+            error = Some(
+              s"""|CSV file header does not contain the expected fields.
+                  | Header: ${columnNames.mkString(", ")}
+                  | Schema: ${fieldNames.mkString(", ")}
+                  |Expected: $nameInSchema but found: $nameInHeader
+                  |CSV file: $fileName""".stripMargin
+            )
+          }
+          i += 1
+        }
+      } else {
+        error = Some(
+          s"""|Number of column in CSV header is not equal to number of fields in the schema:
+              | Header length: $headerLen, schema size: $schemaSize
               |CSV file: $fileName""".stripMargin
         )
       }
+
+      error.headOption.foreach { msg =>
+        throw new IllegalArgumentException(msg)
+      }
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 53b9b4b4b830a..e28ba8089c6a9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1302,7 +1302,39 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
             .collect()
         }
         assert(exception.getMessage.contains(
-          "Fields in the header of csv file are not matched to field names of the schema"
+          "CSV file header does not contain the expected fields"
+        ))
+
+        val shortSchema = new StructType().add("f1", DoubleType)
+        val exceptionForShortSchema = intercept[SparkException] {
+          spark.read
+            .schema(shortSchema)
+            .option("multiLine", multiLine)
+            .option("header", "true")
+            .option("checkHeader", "true")
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(exceptionForShortSchema.getMessage.contains(
+          "Number of column in CSV header is not equal to number of fields in the schema"
+        ))
+
+        val longSchema = new StructType()
+          .add("f1", DoubleType)
+          .add("f2", DoubleType)
+          .add("f3", DoubleType)
+
+        val exceptionForLongSchema = intercept[SparkException] {
+          spark.read
+            .schema(longSchema)
+            .option("multiLine", multiLine)
+            .option("header", "true")
+            .option("checkHeader", "true")
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(exceptionForLongSchema.getMessage.contains(
+          "Header length: 2, schema size: 3"
         ))
       }
     }

From 13892fddc7007c95ebfc3d2fcb789401fc78426f Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Mon, 26 Mar 2018 22:55:09 +0200
Subject: [PATCH 19/48] Refactoring: check header by calling an uniVocity
 method

---
 .../sql/execution/datasources/csv/CSVDataSource.scala  | 10 +++-------
 .../execution/datasources/csv/UnivocityParser.scala    |  2 +-
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index ae08c97975fb7..3bf6fc41d0a84 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -145,13 +145,9 @@ object TextInputCSVDataSource extends CSVDataSource {
       // The header will be removed from lines.
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
-      val checkHeader = UnivocityParser.checkHeader(
-        parser,
-        dataSchema,
-        _: String,
-        file.filePath
-      )
-      CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_))
+      CSVUtils.extractHeader(lines, parser.options).foreach { header =>
+        UnivocityParser.checkHeader(header, parser, dataSchema, file.filePath)
+      }
     }
 
     UnivocityParser.parseIterator(lines, parser, schema)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 7b43e9a0533ff..237be35a277ac 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -352,9 +352,9 @@ private[csv] object UnivocityParser {
   }
 
   def checkHeader(
+      header: String,
       parser: UnivocityParser,
       schema: StructType,
-      header: String,
       fileName: String
   ): Unit = {
     lazy val columnNames = parser.tokenizer.parseLine(header)

From 476b5176a489a06eb892f06111fb5a3e7f474324 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Mon, 26 Mar 2018 23:40:30 +0200
Subject: [PATCH 20/48] Refactoring: convert val to def

---
 .../execution/datasources/csv/CSVDataSource.scala   | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 3bf6fc41d0a84..6f1610ad85b21 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -219,12 +219,10 @@ object MultiLineCSVDataSource extends CSVDataSource {
       parser: UnivocityParser,
       schema: StructType,
       dataSchema: StructType): Iterator[InternalRow] = {
-    val checkHeader = UnivocityParser.checkHeaderColumnNames(
-      parser,
-      dataSchema,
-      _: Array[String],
-      file.filePath
-    )
+    def checkHeader(header: Array[String]): Unit = {
+      UnivocityParser.checkHeaderColumnNames(parser, dataSchema, header, file.filePath)
+    }
+
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
       parser.options.headerFlag,
@@ -239,7 +237,8 @@ object MultiLineCSVDataSource extends CSVDataSource {
       parsedOptions: CSVOptions): StructType = {
     val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
     // The header is not checked because there is no schema against with it could be check
-    val checkHeader = (_: Array[String]) => ()
+    def checkHeader(header: Array[String]): Unit = ()
+
     csv.flatMap { lines =>
       val path = new Path(lines.getPath())
       UnivocityParser.tokenizeStream(

From f8167e45d3c72ca47f67f5567086f0085099f90f Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Mon, 26 Mar 2018 23:43:16 +0200
Subject: [PATCH 21/48] Parse header only if the checkHeader option is true

---
 .../sql/execution/datasources/csv/UnivocityParser.scala    | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 237be35a277ac..4f823145d395c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -357,7 +357,10 @@ private[csv] object UnivocityParser {
       schema: StructType,
       fileName: String
   ): Unit = {
-    lazy val columnNames = parser.tokenizer.parseLine(header)
-    checkHeaderColumnNames(parser, schema, columnNames, fileName)
+    if (parser.options.checkHeader) {
+      val columnNames = parser.tokenizer.parseLine(header)
+
+      checkHeaderColumnNames(parser, schema, columnNames, fileName)
+    }
   }
 }

From d068f6c44801ac8c89ef38f41692ac7c5a0d909e Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Mon, 26 Mar 2018 23:57:58 +0200
Subject: [PATCH 22/48] Moving header checks to CSVDataSource

---
 .../datasources/csv/CSVDataSource.scala       | 62 ++++++++++++++++++-
 .../datasources/csv/UnivocityParser.scala     | 56 +----------------
 2 files changed, 61 insertions(+), 57 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 6f1610ad85b21..214585afb8d9e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -120,6 +120,61 @@ object CSVDataSource {
       TextInputCSVDataSource
     }
   }
+
+  def checkHeaderColumnNames(
+    schema: StructType,
+    columnNames: Array[String],
+    fileName: String,
+    checkHeaderFlag: Boolean
+  ): Unit = {
+    if (checkHeaderFlag && columnNames != null) {
+      val fieldNames = schema.map(_.name).toIndexedSeq
+      val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
+      var error: Option[String] = None
+
+      if (headerLen == schemaSize) {
+        var i = 0
+        while (error.isEmpty && i < headerLen) {
+          val nameInSchema = fieldNames(i).toLowerCase
+          val nameInHeader = columnNames(i).toLowerCase
+          if (nameInHeader != nameInSchema) {
+            error = Some(
+              s"""|CSV file header does not contain the expected fields.
+                  | Header: ${columnNames.mkString(", ")}
+                  | Schema: ${fieldNames.mkString(", ")}
+                  |Expected: $nameInSchema but found: $nameInHeader
+                  |CSV file: $fileName""".stripMargin
+            )
+          }
+          i += 1
+        }
+      } else {
+        error = Some(
+          s"""|Number of column in CSV header is not equal to number of fields in the schema:
+              | Header length: $headerLen, schema size: $schemaSize
+              |CSV file: $fileName""".stripMargin
+        )
+      }
+
+      error.headOption.foreach { msg =>
+        throw new IllegalArgumentException(msg)
+      }
+    }
+  }
+
+  def checkHeader(
+    header: String,
+    parser: CsvParser,
+    schema: StructType,
+    fileName: String,
+    checkHeaderFlag: Boolean
+  ): Unit = {
+    if (checkHeaderFlag) {
+      val columnNames = parser.parseLine(header)
+
+      checkHeaderColumnNames(schema, columnNames, fileName, checkHeaderFlag)
+    }
+  }
 }
 
 object TextInputCSVDataSource extends CSVDataSource {
@@ -146,7 +201,8 @@ object TextInputCSVDataSource extends CSVDataSource {
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
       CSVUtils.extractHeader(lines, parser.options).foreach { header =>
-        UnivocityParser.checkHeader(header, parser, dataSchema, file.filePath)
+        CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema,
+          file.filePath, parser.options.checkHeader)
       }
     }
 
@@ -220,7 +276,9 @@ object MultiLineCSVDataSource extends CSVDataSource {
       schema: StructType,
       dataSchema: StructType): Iterator[InternalRow] = {
     def checkHeader(header: Array[String]): Unit = {
-      UnivocityParser.checkHeaderColumnNames(parser, dataSchema, header, file.filePath)
+      CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath,
+        parser.options.checkHeader
+      )
     }
 
     UnivocityParser.parseStream(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 4f823145d395c..d5ae6874d4dda 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -47,7 +47,7 @@ class UnivocityParser(
   // A `ValueConverter` is responsible for converting the given value to a desired type.
   private type ValueConverter = String => Any
 
-  private val tokenizer = new CsvParser(options.asParserSettings)
+  val tokenizer = new CsvParser(options.asParserSettings)
 
   private val row = new GenericInternalRow(requiredSchema.length)
 
@@ -309,58 +309,4 @@ private[csv] object UnivocityParser {
 
     filteredLines.flatMap(safeParser.parse)
   }
-
-  def checkHeaderColumnNames(
-    parser: UnivocityParser,
-    schema: StructType,
-    columnNames: Array[String],
-    fileName: String
-  ): Unit = {
-    if (parser.options.checkHeader && columnNames != null) {
-      val fieldNames = schema.map(_.name).toIndexedSeq
-      val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
-      var error: Option[String] = None
-
-      if (headerLen == schemaSize) {
-        var i = 0
-        while (error.isEmpty && i < headerLen) {
-          val nameInSchema = fieldNames(i).toLowerCase
-          val nameInHeader = columnNames(i).toLowerCase
-          if (nameInHeader != nameInSchema) {
-            error = Some(
-              s"""|CSV file header does not contain the expected fields.
-                  | Header: ${columnNames.mkString(", ")}
-                  | Schema: ${fieldNames.mkString(", ")}
-                  |Expected: $nameInSchema but found: $nameInHeader
-                  |CSV file: $fileName""".stripMargin
-            )
-          }
-          i += 1
-        }
-      } else {
-        error = Some(
-          s"""|Number of column in CSV header is not equal to number of fields in the schema:
-              | Header length: $headerLen, schema size: $schemaSize
-              |CSV file: $fileName""".stripMargin
-        )
-      }
-
-      error.headOption.foreach { msg =>
-        throw new IllegalArgumentException(msg)
-      }
-    }
-  }
-
-  def checkHeader(
-      header: String,
-      parser: UnivocityParser,
-      schema: StructType,
-      fileName: String
-  ): Unit = {
-    if (parser.options.checkHeader) {
-      val columnNames = parser.tokenizer.parseLine(header)
-
-      checkHeaderColumnNames(parser, schema, columnNames, fileName)
-    }
-  }
 }

From 08cfcf43f4564195bdd83218d2183a2dae4d4f2c Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 27 Mar 2018 00:04:55 +0200
Subject: [PATCH 23/48] Making uniVocity wrapper unaware of header

---
 .../datasources/csv/CSVDataSource.scala       |  2 +-
 .../datasources/csv/UnivocityParser.scala     | 23 ++++++++++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 214585afb8d9e..3c906e46199e5 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -301,7 +301,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
       val path = new Path(lines.getPath())
       UnivocityParser.tokenizeStream(
         CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path),
-        shouldDropHeader = false,
+        dropFirstRecord = false,
         checkHeader,
         new CsvParser(parsedOptions.asParserSettings))
     }.take(1).headOption match {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index d5ae6874d4dda..ef0d5688f8464 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -236,10 +236,10 @@ private[csv] object UnivocityParser {
    */
   def tokenizeStream(
       inputStream: InputStream,
-      shouldDropHeader: Boolean,
-      checkHeader: Array[String] => Unit,
+      dropFirstRecord: Boolean,
+      checkFirstRecord: Array[String] => Unit,
       tokenizer: CsvParser): Iterator[Array[String]] = {
-    convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader)(tokens => tokens)
+    convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord)(tokens => tokens)
   }
 
   /**
@@ -247,32 +247,33 @@ private[csv] object UnivocityParser {
    */
   def parseStream(
       inputStream: InputStream,
-      shouldDropHeader: Boolean,
+      dropFirstRecord: Boolean,
       parser: UnivocityParser,
       schema: StructType,
-      checkHeader: Array[String] => Unit): Iterator[InternalRow] = {
+      filePath: String,
+      checkFirstRecord: Array[String] => Unit): Iterator[InternalRow] = {
     val tokenizer = parser.tokenizer
     val safeParser = new FailureSafeParser[Array[String]](
       input => Seq(parser.convert(input)),
       parser.options.parseMode,
       schema,
       parser.options.columnNameOfCorruptRecord)
-    convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens =>
+    convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord) { tokens =>
       safeParser.parse(tokens)
     }.flatten
   }
 
   private def convertStream[T](
       inputStream: InputStream,
-      shouldDropHeader: Boolean,
+      dropFirstRecord: Boolean,
       tokenizer: CsvParser,
-      checkHeader: Array[String] => Unit
+      checkFirstRecord: Array[String] => Unit
   )(convert: Array[String] => T) = new Iterator[T] {
     tokenizer.beginParsing(inputStream)
     private var nextRecord = {
-      if (shouldDropHeader) {
-        val header = tokenizer.parseNext()
-        checkHeader(header)
+      if (dropFirstRecord) {
+        val firstRecord = tokenizer.parseNext()
+        checkFirstRecord(firstRecord)
       }
       tokenizer.parseNext()
     }

From f6a1694608204f0923e2dec1c79b97e5fdda0154 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Tue, 27 Mar 2018 15:44:23 +0200
Subject: [PATCH 24/48] Fix the test: error mesage was changed

---
 python/pyspark/sql/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 13804078574b9..ace5eafa8820c 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -2985,7 +2985,7 @@ def test_checking_csv_header(self):
         df = self.spark.read.option('header', 'true').schema(schema).csv(tmpPath)
         self.assertRaisesRegexp(
             Exception,
-            "Fields in the header of csv file are not matched to field names of the schema",
+            "CSV file header does not contain the expected fields",
             lambda: df.collect())
         shutil.rmtree(tmpPath)
 

From adbedf39194f72a873db10a42ac826cf0568513d Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sat, 31 Mar 2018 22:48:37 +0200
Subject: [PATCH 25/48] Revert CSV tests as it was before the option was
 introduced

---
 .../datasources/csv/CSVDataSource.scala        |  6 +++---
 .../execution/datasources/csv/CSVOptions.scala |  6 +++---
 .../execution/datasources/csv/CSVSuite.scala   | 18 +++++++-----------
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 3c906e46199e5..ce508406749f4 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -201,8 +201,8 @@ object TextInputCSVDataSource extends CSVDataSource {
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
       CSVUtils.extractHeader(lines, parser.options).foreach { header =>
-        CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema,
-          file.filePath, parser.options.checkHeader)
+        CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
+          checkHeaderFlag = !parser.options.enforceSchema)
       }
     }
 
@@ -277,7 +277,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
       dataSchema: StructType): Iterator[InternalRow] = {
     def checkHeader(header: Array[String]): Unit = {
       CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath,
-        parser.options.checkHeader
+        checkHeaderFlag = !parser.options.enforceSchema
       )
     }
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 2030c05b9ff86..7927c117d7280 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -151,10 +151,10 @@ class CSVOptions(
   val isCommentSet = this.comment != '\u0000'
 
   /**
-   * The option enables checks of headers in csv files. In particular, column names
-   * are matched to field names of provided schema.
+   * Forcibly apply the specified or inferred schema to datasource files.
+   * If the option is enabled, headers of CSV files will be ignored.
    */
-  val checkHeader = getBool("checkHeader", true)
+  val enforceSchema = getBool("enforceSchema", true)
 
   def asWriterSettings: CsvWriterSettings = {
     val writerSettings = new CsvWriterSettings()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index e28ba8089c6a9..cdbe0f724e8ae 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -252,11 +252,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
           |(yearMade double, makeName string, modelName string, priceTag decimal,
           | comments string, grp string)
           |USING csv
-          |OPTIONS (
-          |  path "${testFile(carsTsvFile)}",
-          |  header "true", checkHeader "false",
-          |   delimiter "\t"
-          |)
+          |OPTIONS (path "${testFile(carsTsvFile)}", header "true", delimiter "\t")
          """.stripMargin.replaceAll("\n", " "))
 
       assert(
@@ -279,7 +275,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
   test("test for blank column names on read and select columns") {
     val cars = spark.read
       .format("csv")
-      .options(Map("header" -> "true", "checkHeader" -> "false", "inferSchema" -> "true"))
+      .options(Map("header" -> "true", "inferSchema" -> "true"))
       .load(testFile(carsBlankColName))
 
     assert(cars.select("customer").collect().size == 2)
@@ -352,7 +348,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       spark.sql(
         s"""
           |CREATE TEMPORARY VIEW carsTable
-          |(year double, make string, model string, comment string, blank string)
+          |(yearMade double, makeName string, modelName string, comments string, blank string)
           |USING csv
           |OPTIONS (path "${testFile(carsFile)}", header "true")
          """.stripMargin.replaceAll("\n", " "))
@@ -360,7 +356,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       val cars = spark.table("carsTable")
       verifyCars(cars, withHeader = true, checkHeader = false, checkValues = false)
       assert(
-        cars.schema.fieldNames === Array("year", "make", "model", "comment", "blank"))
+        cars.schema.fieldNames === Array("yearMade", "makeName", "modelName", "comments", "blank"))
     }
   }
 
@@ -1297,7 +1293,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
             .schema(ischema)
             .option("multiLine", multiLine)
             .option("header", "true")
-            .option("checkHeader", "true")
+            .option("enforceSchema", "false")
             .csv(path.getCanonicalPath)
             .collect()
         }
@@ -1311,7 +1307,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
             .schema(shortSchema)
             .option("multiLine", multiLine)
             .option("header", "true")
-            .option("checkHeader", "true")
+            .option("enforceSchema", "false")
             .csv(path.getCanonicalPath)
             .collect()
         }
@@ -1329,7 +1325,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
             .schema(longSchema)
             .option("multiLine", multiLine)
             .option("header", "true")
-            .option("checkHeader", "true")
+            .option("enforceSchema", "false")
             .csv(path.getCanonicalPath)
             .collect()
         }

From 0904daf52b1d94467f75bc6703d3d81747208e77 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sat, 31 Mar 2018 23:24:00 +0200
Subject: [PATCH 26/48] Renaming checkHeader to enforceSchema

---
 python/pyspark/sql/readwriter.py                       | 10 +++++-----
 python/pyspark/sql/tests.py                            |  3 ++-
 .../scala/org/apache/spark/sql/DataFrameReader.scala   |  4 ++--
 .../spark/sql/test/DataFrameReaderWriterSuite.scala    |  3 +--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 9845ce8d45512..6f437b47053bc 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -336,7 +336,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
             maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
             columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
-            checkHeader=None):
+            enforceSchema=None):
         """Loads a CSV file and returns the result as a  :class:`DataFrame`.
 
         This function will go through the input once to determine the input schema if
@@ -361,11 +361,11 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                         character. By default (None), it is disabled.
         :param header: uses the first line as names of columns. If None is set, it uses the
                        default value, ``false``.
-        :param checkHeader: compares column names in the header with field names in the schema
-                            and outputs an error if names are not matched.
-                            If None is set, it uses the default value, ``true``.
         :param inferSchema: infers the input schema automatically from data. It requires one extra
                        pass over the data. If None is set, it uses the default value, ``false``.
+        :param enforceSchema: Forcibly apply the specified or inferred schema to CSV
+                              files. If None is set, it uses the default value, ``true``.
+                              In that case, CSV headers are ignored.
         :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
                                         uses the default value, ``false``.
@@ -440,7 +440,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             maxCharsPerColumn=maxCharsPerColumn,
             maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,
             columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine,
-            charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, checkHeader=checkHeader)
+            charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema)
         if isinstance(path, basestring):
             path = [path]
         if type(path) == list:
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index ace5eafa8820c..db51ac37096e8 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -2982,7 +2982,8 @@ def test_checking_csv_header(self):
         schema = StructType([
             StructField('f2', IntegerType(), nullable=True),
             StructField('f1', IntegerType(), nullable=True)])
-        df = self.spark.read.option('header', 'true').schema(schema).csv(tmpPath)
+        df = self.spark.read.option('header', 'true').schema(schema).\
+            csv(tmpPath, enforceSchema=False)
         self.assertRaisesRegexp(
             Exception,
             "CSV file header does not contain the expected fields",
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 4f04bd7ddc0c5..2668a1c756820 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -524,8 +524,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`comment` (default empty string): sets a single character used for skipping lines
    * beginning with this character. By default, it is disabled.</li>
    * <li>`header` (default `false`): uses the first line as names of columns.</li>
-   * <li>`checkHeader` (default `true`): compares column names in the header with field names
-   * in the schema and outputs an error if names are not matched.</li>
+   * <li>`enforceSchema` (default `true`): Forcibly apply the specified or inferred schema to
+   * datasource files. If it is set, CSV headers are ignored. </li>
    * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
    * requires one extra pass over the data.</li>
    * <li>`ignoreLeadingWhiteSpace` (default `false`): a flag indicating whether or not leading
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
index 39c05a88fa132..14b1feb2adc20 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala
@@ -764,8 +764,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be
           // If `inferSchema` is true, a CSV format is duplicate-safe (See SPARK-16896)
           var testDir = Utils.createTempDir(src.getAbsolutePath)
           Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text(testDir.getAbsolutePath)
-          val df = spark.read.format("csv").option("inferSchema", true)
-            .option("header", true).option("checkHeader", "false")
+          val df = spark.read.format("csv").option("inferSchema", true).option("header", true)
             .load(testDir.getAbsolutePath)
           checkAnswer(df, Row(1, 1))
           checkReadPartitionColumnDuplication("csv", c0, c1, src)

From 191b415e77bbc8fe662c19faae627e40787c5e23 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Sun, 1 Apr 2018 10:07:32 +0200
Subject: [PATCH 27/48] Pass required parameter

---
 .../spark/sql/execution/datasources/csv/CSVDataSource.scala  | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index ce508406749f4..98533a8977a3a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -283,10 +283,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
 
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
-      parser.options.headerFlag,
-      parser,
-      schema,
-      checkHeader)
+      parser.options.headerFlag, parser, schema, file.filePath, checkHeader)
   }
 
   override def infer(

From ab9c514c2567795ff07aa1e25fac16ca62e994b6 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 13 Apr 2018 12:48:06 +0200
Subject: [PATCH 28/48] Addressing Xiao Li's review comments

---
 .../apache/spark/sql/DataFrameReader.scala    |   7 +-
 .../datasources/csv/CSVDataSource.scala       |  42 +++---
 .../datasources/csv/UnivocityParser.scala     |   8 +-
 .../execution/datasources/csv/CSVSuite.scala  | 128 ++++++++++--------
 4 files changed, 98 insertions(+), 87 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 3d39d9d809499..6b7a9ab5b7b71 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -526,8 +526,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`comment` (default empty string): sets a single character used for skipping lines
    * beginning with this character. By default, it is disabled.</li>
    * <li>`header` (default `false`): uses the first line as names of columns.</li>
-   * <li>`enforceSchema` (default `true`): Forcibly apply the specified or inferred schema to
-   * datasource files. If it is set, CSV headers are ignored. </li>
+   * <li>`enforceSchema` (default `true`): If it is set to `true`, the specified or inferred schema
+   * will be forcibly applied to datasource files and headers in CSV files will be ignored.
+   * If the option is set to `false`, the schema will be validated against headers in CSV files if
+   * the `header` option is set to `true`. The validation is performed in column ordering aware and
+   * case sensitive manner.</li>
    * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
    * requires one extra pass over the data.</li>
    * <li>`ignoreLeadingWhiteSpace` (default `false`): a flag indicating whether or not leading
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 98533a8977a3a..6f6f3a8d5bfec 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -27,13 +27,14 @@ import org.apache.hadoop.io.{LongWritable, Text}
 import org.apache.hadoop.mapred.TextInputFormat
 import org.apache.hadoop.mapreduce.Job
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
-
 import org.apache.spark.TaskContext
+
 import org.apache.spark.input.{PortableDataStream, StreamInputFormat}
 import org.apache.spark.rdd.{BinaryFileRDD, RDD}
 import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.datasources._
+import org.apache.spark.sql.execution.datasources.csv.CSVDataSource.checkHeaderColumnNames
 import org.apache.spark.sql.execution.datasources.text.TextFileFormat
 import org.apache.spark.sql.types.StructType
 
@@ -50,9 +51,9 @@ abstract class CSVDataSource extends Serializable {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType, // Schema of projection
-      dataSchema: StructType // Schema of data in csv files
-  ): Iterator[InternalRow]
+      schema: StructType,
+      // Actual schema of data in the csv file
+      dataSchema: StructType): Iterator[InternalRow]
 
   /**
    * Infers the schema from `inputPaths` files.
@@ -121,12 +122,8 @@ object CSVDataSource {
     }
   }
 
-  def checkHeaderColumnNames(
-    schema: StructType,
-    columnNames: Array[String],
-    fileName: String,
-    checkHeaderFlag: Boolean
-  ): Unit = {
+  def checkHeaderColumnNames(schema: StructType, columnNames: Array[String], fileName: String,
+      checkHeaderFlag: Boolean): Unit = {
     if (checkHeaderFlag && columnNames != null) {
       val fieldNames = schema.map(_.name).toIndexedSeq
       val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
@@ -161,20 +158,6 @@ object CSVDataSource {
       }
     }
   }
-
-  def checkHeader(
-    header: String,
-    parser: CsvParser,
-    schema: StructType,
-    fileName: String,
-    checkHeaderFlag: Boolean
-  ): Unit = {
-    if (checkHeaderFlag) {
-      val columnNames = parser.parseLine(header)
-
-      checkHeaderColumnNames(schema, columnNames, fileName, checkHeaderFlag)
-    }
-  }
 }
 
 object TextInputCSVDataSource extends CSVDataSource {
@@ -201,7 +184,7 @@ object TextInputCSVDataSource extends CSVDataSource {
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
       CSVUtils.extractHeader(lines, parser.options).foreach { header =>
-        CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
+        checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
           checkHeaderFlag = !parser.options.enforceSchema)
       }
     }
@@ -264,6 +247,13 @@ object TextInputCSVDataSource extends CSVDataSource {
       sparkSession.createDataset(rdd)(Encoders.STRING)
     }
   }
+
+  def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String,
+      checkHeaderFlag: Boolean): Unit = {
+    if (checkHeaderFlag) {
+      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag)
+    }
+  }
 }
 
 object MultiLineCSVDataSource extends CSVDataSource {
@@ -283,7 +273,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
 
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
-      parser.options.headerFlag, parser, schema, file.filePath, checkHeader)
+      parser.options.headerFlag, parser, schema, checkHeader)
   }
 
   override def infer(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index ef0d5688f8464..d0e25a4c3349c 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -250,7 +250,6 @@ private[csv] object UnivocityParser {
       dropFirstRecord: Boolean,
       parser: UnivocityParser,
       schema: StructType,
-      filePath: String,
       checkFirstRecord: Array[String] => Unit): Iterator[InternalRow] = {
     val tokenizer = parser.tokenizer
     val safeParser = new FailureSafeParser[Array[String]](
@@ -267,8 +266,8 @@ private[csv] object UnivocityParser {
       inputStream: InputStream,
       dropFirstRecord: Boolean,
       tokenizer: CsvParser,
-      checkFirstRecord: Array[String] => Unit
-  )(convert: Array[String] => T) = new Iterator[T] {
+      checkFirstRecord: Array[String] => Unit)(
+      convert: Array[String] => T) = new Iterator[T] {
     tokenizer.beginParsing(inputStream)
     private var nextRecord = {
       if (dropFirstRecord) {
@@ -299,8 +298,7 @@ private[csv] object UnivocityParser {
       schema: StructType): Iterator[InternalRow] = {
     val options = parser.options
 
-    val filteredLines: Iterator[String] =
-      CSVUtils.filterCommentAndEmpty(lines, options)
+    val filteredLines: Iterator[String] = CSVUtils.filterCommentAndEmpty(lines, options)
 
     val safeParser = new FailureSafeParser[String](
       input => Seq(parser.parse(input)),
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index cdbe0f724e8ae..e42774cb45942 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -30,7 +30,6 @@ import org.apache.hadoop.io.compress.GzipCodec
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, UDT}
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
-import org.apache.spark.sql.functions.{col, regexp_replace}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils}
 import org.apache.spark.sql.types._
@@ -1280,61 +1279,82 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     )
   }
 
-  def checkHeader(multiLine: String): Unit = {
-    test(s"SPARK-23786: Checking column names against schema ($multiLine)") {
-      withTempPath { path =>
-        import collection.JavaConverters._
-        val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
-        val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
-        odf.write.option("header", "true").csv(path.getCanonicalPath)
-        val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
-        val exception = intercept[SparkException] {
-          spark.read
-            .schema(ischema)
-            .option("multiLine", multiLine)
-            .option("header", "true")
-            .option("enforceSchema", "false")
-            .csv(path.getCanonicalPath)
-            .collect()
-        }
-        assert(exception.getMessage.contains(
-          "CSV file header does not contain the expected fields"
-        ))
-
-        val shortSchema = new StructType().add("f1", DoubleType)
-        val exceptionForShortSchema = intercept[SparkException] {
-          spark.read
-            .schema(shortSchema)
-            .option("multiLine", multiLine)
-            .option("header", "true")
-            .option("enforceSchema", "false")
-            .csv(path.getCanonicalPath)
-            .collect()
-        }
-        assert(exceptionForShortSchema.getMessage.contains(
-          "Number of column in CSV header is not equal to number of fields in the schema"
-        ))
-
-        val longSchema = new StructType()
-          .add("f1", DoubleType)
-          .add("f2", DoubleType)
-          .add("f3", DoubleType)
-
-        val exceptionForLongSchema = intercept[SparkException] {
-          spark.read
-            .schema(longSchema)
-            .option("multiLine", multiLine)
-            .option("header", "true")
-            .option("enforceSchema", "false")
-            .csv(path.getCanonicalPath)
-            .collect()
-        }
-        assert(exceptionForLongSchema.getMessage.contains(
-          "Header length: 2, schema size: 3"
-        ))
+  def checkHeader(multiLine: Boolean): Unit = {
+    withTempPath { path =>
+      import collection.JavaConverters._
+      val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
+      val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
+      odf.write.option("header", true).csv(path.getCanonicalPath)
+      val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
+      val exception = intercept[SparkException] {
+        spark.read
+          .schema(ischema)
+          .option("multiLine", multiLine)
+          .option("header", true)
+          .option("enforceSchema", false)
+          .csv(path.getCanonicalPath)
+          .collect()
+      }
+      assert(exception.getMessage.contains(
+        "CSV file header does not contain the expected fields"
+      ))
+
+      val shortSchema = new StructType().add("f1", DoubleType)
+      val exceptionForShortSchema = intercept[SparkException] {
+        spark.read
+          .schema(shortSchema)
+          .option("multiLine", multiLine)
+          .option("header", true)
+          .option("enforceSchema", false)
+          .csv(path.getCanonicalPath)
+          .collect()
       }
+      assert(exceptionForShortSchema.getMessage.contains(
+        "Number of column in CSV header is not equal to number of fields in the schema"
+      ))
+
+      val longSchema = new StructType()
+        .add("f1", DoubleType)
+        .add("f2", DoubleType)
+        .add("f3", DoubleType)
+
+      val exceptionForLongSchema = intercept[SparkException] {
+        spark.read
+          .schema(longSchema)
+          .option("multiLine", multiLine)
+          .option("header", true)
+          .option("enforceSchema", false)
+          .csv(path.getCanonicalPath)
+          .collect()
+      }
+      assert(exceptionForLongSchema.getMessage.contains(
+        "Header length: 2, schema size: 3"
+      ))
     }
   }
 
-  List("false", "true").foreach(checkHeader(_))
+  test(s"SPARK-23786: Checking column names against schema in the multiline mode") {
+    checkHeader(multiLine = true)
+  }
+
+  test(s"SPARK-23786: Checking column names against schema in the per-line mode") {
+    checkHeader(multiLine = false)
+  }
+
+  test("SPARK-23786: CSV header must not be checked if it doesn't exist") {
+    withTempPath { path =>
+      import collection.JavaConverters._
+      val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
+      val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
+      odf.write.option("header", false).csv(path.getCanonicalPath)
+      val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
+      val idf = spark.read
+          .schema(ischema)
+          .option("header", false)
+          .option("enforceSchema", false)
+          .csv(path.getCanonicalPath)
+
+      checkAnswer(idf, odf)
+    }
+  }
 }

From 0405863ca209b8bd9cf3950c3d4d122ac718928b Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 13 Apr 2018 12:57:40 +0200
Subject: [PATCH 29/48] Making header validation case sensitive

---
 .../execution/datasources/csv/CSVDataSource.scala  |  3 +--
 .../sql/execution/datasources/csv/CSVSuite.scala   | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 6f6f3a8d5bfec..e77117618cd55 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -132,8 +132,7 @@ object CSVDataSource {
       if (headerLen == schemaSize) {
         var i = 0
         while (error.isEmpty && i < headerLen) {
-          val nameInSchema = fieldNames(i).toLowerCase
-          val nameInHeader = columnNames(i).toLowerCase
+          val (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i))
           if (nameInHeader != nameInSchema) {
             error = Some(
               s"""|CSV file header does not contain the expected fields.
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index e42774cb45942..cbe5fb9165446 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1330,6 +1330,20 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       assert(exceptionForLongSchema.getMessage.contains(
         "Header length: 2, schema size: 3"
       ))
+
+      val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType)
+      val caseSensitiveException = intercept[SparkException] {
+        spark.read
+          .schema(caseSensitiveSchema)
+          .option("multiLine", multiLine)
+          .option("header", true)
+          .option("enforceSchema", false)
+          .csv(path.getCanonicalPath)
+          .collect()
+      }
+      assert(caseSensitiveException.getMessage.contains(
+        "CSV file header does not contain the expected fields"
+      ))
     }
   }
 

From 714c66d730a7e0e49016734dd3d771dcd991911e Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 13 Apr 2018 13:54:25 +0200
Subject: [PATCH 30/48] Describing enforceSchema in PySpark's csv method

---
 python/pyspark/sql/readwriter.py                         | 9 ++++++---
 .../sql/execution/datasources/csv/CSVDataSource.scala    | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 52c42449de4d9..bd2d2188eb890 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -365,9 +365,12 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                        default value, ``false``.
         :param inferSchema: infers the input schema automatically from data. It requires one extra
                        pass over the data. If None is set, it uses the default value, ``false``.
-        :param enforceSchema: Forcibly apply the specified or inferred schema to CSV
-                              files. If None is set, it uses the default value, ``true``.
-                              In that case, CSV headers are ignored.
+        :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be
+                              forcibly applied to datasource files and headers in CSV files will be
+                              ignored. If the option is set to ``false``, the schema will be
+                              validated against headers in CSV files if the ``header`` option is set
+                              to ``true``. The validation is performed in column ordering aware and
+                              case sensitive manner. If None is set, ``true`` is used by default.
         :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
                                         uses the default value, ``false``.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index e77117618cd55..444df7c1f8878 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -27,8 +27,8 @@ import org.apache.hadoop.io.{LongWritable, Text}
 import org.apache.hadoop.mapred.TextInputFormat
 import org.apache.hadoop.mapreduce.Job
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
-import org.apache.spark.TaskContext
 
+import org.apache.spark.TaskContext
 import org.apache.spark.input.{PortableDataStream, StreamInputFormat}
 import org.apache.spark.rdd.{BinaryFileRDD, RDD}
 import org.apache.spark.sql.{Dataset, Encoders, SparkSession}

From 78d9f66cc575c62fced967820d60b756e82f77fb Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 13 Apr 2018 21:21:48 +0200
Subject: [PATCH 31/48] Respect to caseSensitive parameter

---
 .../datasources/csv/CSVDataSource.scala       |  39 +++--
 .../datasources/csv/CSVFileFormat.scala       |   4 +-
 .../execution/datasources/csv/CSVSuite.scala  | 145 ++++++++++--------
 3 files changed, 104 insertions(+), 84 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 444df7c1f8878..fc80a7006d1fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -53,7 +53,8 @@ abstract class CSVDataSource extends Serializable {
       parser: UnivocityParser,
       schema: StructType,
       // Actual schema of data in the csv file
-      dataSchema: StructType): Iterator[InternalRow]
+      dataSchema: StructType,
+      caseSensitive: Boolean): Iterator[InternalRow]
 
   /**
    * Infers the schema from `inputPaths` files.
@@ -123,7 +124,7 @@ object CSVDataSource {
   }
 
   def checkHeaderColumnNames(schema: StructType, columnNames: Array[String], fileName: String,
-      checkHeaderFlag: Boolean): Unit = {
+      checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = {
     if (checkHeaderFlag && columnNames != null) {
       val fieldNames = schema.map(_.name).toIndexedSeq
       val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
@@ -132,7 +133,11 @@ object CSVDataSource {
       if (headerLen == schemaSize) {
         var i = 0
         while (error.isEmpty && i < headerLen) {
-          val (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i))
+          var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i))
+          if (caseSensitive == false) {
+            nameInSchema = nameInSchema.toLowerCase
+            nameInHeader = nameInHeader.toLowerCase
+          }
           if (nameInHeader != nameInSchema) {
             error = Some(
               s"""|CSV file header does not contain the expected fields.
@@ -162,12 +167,9 @@ object CSVDataSource {
 object TextInputCSVDataSource extends CSVDataSource {
   override val isSplitable: Boolean = true
 
-  override def readFile(
-      conf: Configuration,
-      file: PartitionedFile,
-      parser: UnivocityParser,
-      schema: StructType,
-      dataSchema: StructType): Iterator[InternalRow] = {
+  override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser,
+      schema: StructType, dataSchema: StructType,
+      caseSensitive: Boolean): Iterator[InternalRow] = {
     val lines = {
       val linesReader = new HadoopFileLinesReader(file, conf)
       Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => linesReader.close()))
@@ -184,7 +186,7 @@ object TextInputCSVDataSource extends CSVDataSource {
       // be not extracted.
       CSVUtils.extractHeader(lines, parser.options).foreach { header =>
         checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
-          checkHeaderFlag = !parser.options.enforceSchema)
+          checkHeaderFlag = !parser.options.enforceSchema, caseSensitive)
       }
     }
 
@@ -248,9 +250,10 @@ object TextInputCSVDataSource extends CSVDataSource {
   }
 
   def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String,
-      checkHeaderFlag: Boolean): Unit = {
+      checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = {
     if (checkHeaderFlag) {
-      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag)
+      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag,
+        caseSensitive)
     }
   }
 }
@@ -258,16 +261,12 @@ object TextInputCSVDataSource extends CSVDataSource {
 object MultiLineCSVDataSource extends CSVDataSource {
   override val isSplitable: Boolean = false
 
-  override def readFile(
-      conf: Configuration,
-      file: PartitionedFile,
-      parser: UnivocityParser,
-      schema: StructType,
-      dataSchema: StructType): Iterator[InternalRow] = {
+  override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser,
+      schema: StructType, dataSchema: StructType,
+      caseSensitive: Boolean): Iterator[InternalRow] = {
     def checkHeader(header: Array[String]): Unit = {
       CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath,
-        checkHeaderFlag = !parser.options.enforceSchema
-      )
+        checkHeaderFlag = !parser.options.enforceSchema, caseSensitive)
     }
 
     UnivocityParser.parseStream(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
index 37ed8d5e017d8..53231843c5145 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
@@ -122,6 +122,7 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
           "df.filter($\"_corrupt_record\".isNotNull).count()."
       )
     }
+    val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis
 
     (file: PartitionedFile) => {
       val conf = broadcastedHadoopConf.value.value
@@ -129,7 +130,8 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
         StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         parsedOptions)
-      CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema)
+      CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema,
+        caseSensitive)
     }
   }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index cbe5fb9165446..af981a1be04bb 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1280,70 +1280,72 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
   }
 
   def checkHeader(multiLine: Boolean): Unit = {
-    withTempPath { path =>
-      import collection.JavaConverters._
-      val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
-      val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
-      odf.write.option("header", true).csv(path.getCanonicalPath)
-      val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
-      val exception = intercept[SparkException] {
-        spark.read
-          .schema(ischema)
-          .option("multiLine", multiLine)
-          .option("header", true)
-          .option("enforceSchema", false)
-          .csv(path.getCanonicalPath)
-          .collect()
-      }
-      assert(exception.getMessage.contains(
-        "CSV file header does not contain the expected fields"
-      ))
-
-      val shortSchema = new StructType().add("f1", DoubleType)
-      val exceptionForShortSchema = intercept[SparkException] {
-        spark.read
-          .schema(shortSchema)
-          .option("multiLine", multiLine)
-          .option("header", true)
-          .option("enforceSchema", false)
-          .csv(path.getCanonicalPath)
-          .collect()
-      }
-      assert(exceptionForShortSchema.getMessage.contains(
-        "Number of column in CSV header is not equal to number of fields in the schema"
-      ))
-
-      val longSchema = new StructType()
-        .add("f1", DoubleType)
-        .add("f2", DoubleType)
-        .add("f3", DoubleType)
-
-      val exceptionForLongSchema = intercept[SparkException] {
-        spark.read
-          .schema(longSchema)
-          .option("multiLine", multiLine)
-          .option("header", true)
-          .option("enforceSchema", false)
-          .csv(path.getCanonicalPath)
-          .collect()
-      }
-      assert(exceptionForLongSchema.getMessage.contains(
-        "Header length: 2, schema size: 3"
-      ))
-
-      val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType)
-      val caseSensitiveException = intercept[SparkException] {
-        spark.read
-          .schema(caseSensitiveSchema)
-          .option("multiLine", multiLine)
-          .option("header", true)
-          .option("enforceSchema", false)
-          .csv(path.getCanonicalPath)
-          .collect()
+    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
+      withTempPath { path =>
+        import collection.JavaConverters._
+        val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
+        val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
+        odf.write.option("header", true).csv(path.getCanonicalPath)
+        val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
+        val exception = intercept[SparkException] {
+          spark.read
+            .schema(ischema)
+            .option("multiLine", multiLine)
+            .option("header", true)
+            .option("enforceSchema", false)
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(exception.getMessage.contains(
+          "CSV file header does not contain the expected fields"
+        ))
+
+        val shortSchema = new StructType().add("f1", DoubleType)
+        val exceptionForShortSchema = intercept[SparkException] {
+          spark.read
+            .schema(shortSchema)
+            .option("multiLine", multiLine)
+            .option("header", true)
+            .option("enforceSchema", false)
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(exceptionForShortSchema.getMessage.contains(
+          "Number of column in CSV header is not equal to number of fields in the schema"
+        ))
+
+        val longSchema = new StructType()
+          .add("f1", DoubleType)
+          .add("f2", DoubleType)
+          .add("f3", DoubleType)
+
+        val exceptionForLongSchema = intercept[SparkException] {
+          spark.read
+            .schema(longSchema)
+            .option("multiLine", multiLine)
+            .option("header", true)
+            .option("enforceSchema", false)
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(exceptionForLongSchema.getMessage.contains(
+          "Header length: 2, schema size: 3"
+        ))
+
+        val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType)
+        val caseSensitiveException = intercept[SparkException] {
+          spark.read
+            .schema(caseSensitiveSchema)
+            .option("multiLine", multiLine)
+            .option("header", true)
+            .option("enforceSchema", false)
+            .csv(path.getCanonicalPath)
+            .collect()
+        }
+        assert(caseSensitiveException.getMessage.contains(
+          "CSV file header does not contain the expected fields"
+        ))
       }
-      assert(caseSensitiveException.getMessage.contains(
-        "CSV file header does not contain the expected fields"
-      ))
     }
   }
 
@@ -1371,4 +1373,21 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       checkAnswer(idf, odf)
     }
   }
+
+  test("Ignore column name case if spark.sql.caseSensitive is false") {
+    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
+      withTempPath { path =>
+        import collection.JavaConverters._
+        val oschema = new StructType().add("A", StringType)
+        val odf = spark.createDataFrame(List(Row("0")).asJava, oschema)
+        odf.write.option("header", true).csv(path.getCanonicalPath)
+        val ischema = new StructType().add("a", StringType)
+        val idf = spark.read.schema(ischema)
+          .option("header", true)
+          .option("enforceSchema", false)
+          .csv(path.getCanonicalPath)
+        checkAnswer(idf, odf)
+      }
+    }
+  }
 }

From b43a7c7ec50e03aaf4990e9bbb6989cdb2c076ef Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 13 Apr 2018 22:06:36 +0200
Subject: [PATCH 32/48] Check header on csv parsing from dataset of strings

---
 .../apache/spark/sql/DataFrameReader.scala    |  7 +++++++
 .../datasources/csv/CSVDataSource.scala       | 20 +++++++++----------
 .../execution/datasources/csv/CSVSuite.scala  | 12 ++++++++++-
 3 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 6b7a9ab5b7b71..bbc2e4d1344d1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -21,6 +21,8 @@ import java.util.{Locale, Properties}
 
 import scala.collection.JavaConverters._
 
+import com.univocity.parsers.csv.CsvParser
+
 import org.apache.spark.Partition
 import org.apache.spark.annotation.InterfaceStability
 import org.apache.spark.api.java.JavaRDD
@@ -486,6 +488,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
 
     val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine =>
+      if (parsedOptions.enforceSchema == false) {
+        CSVDataSource.checkHeader(firstLine, new CsvParser(parsedOptions.asParserSettings),
+          actualSchema, csvDataset.getClass.getCanonicalName, checkHeaderFlag = true,
+          sparkSession.sessionState.conf.caseSensitiveAnalysis)
+      }
       filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions))
     }.getOrElse(filteredLines.rdd)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index fc80a7006d1fc..73898fc23679e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -143,7 +143,7 @@ object CSVDataSource {
               s"""|CSV file header does not contain the expected fields.
                   | Header: ${columnNames.mkString(", ")}
                   | Schema: ${fieldNames.mkString(", ")}
-                  |Expected: $nameInSchema but found: $nameInHeader
+                  |Expected: ${columnNames(i)} but found: ${fieldNames(i)}
                   |CSV file: $fileName""".stripMargin
             )
           }
@@ -162,6 +162,14 @@ object CSVDataSource {
       }
     }
   }
+
+  def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String,
+    checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = {
+    if (checkHeaderFlag) {
+      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag,
+        caseSensitive)
+    }
+  }
 }
 
 object TextInputCSVDataSource extends CSVDataSource {
@@ -185,7 +193,7 @@ object TextInputCSVDataSource extends CSVDataSource {
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
       CSVUtils.extractHeader(lines, parser.options).foreach { header =>
-        checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
+        CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
           checkHeaderFlag = !parser.options.enforceSchema, caseSensitive)
       }
     }
@@ -248,14 +256,6 @@ object TextInputCSVDataSource extends CSVDataSource {
       sparkSession.createDataset(rdd)(Encoders.STRING)
     }
   }
-
-  def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String,
-      checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = {
-    if (checkHeaderFlag) {
-      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag,
-        caseSensitive)
-    }
-  }
 }
 
 object MultiLineCSVDataSource extends CSVDataSource {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index af981a1be04bb..b06480d7ab35a 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1374,7 +1374,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
     }
   }
 
-  test("Ignore column name case if spark.sql.caseSensitive is false") {
+  test("SPARK-23786: Ignore column name case if spark.sql.caseSensitive is false") {
     withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
       withTempPath { path =>
         import collection.JavaConverters._
@@ -1390,4 +1390,14 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       }
     }
   }
+
+  test("SPARK-23786: check header on parsing of dataset of strings") {
+    val ds = Seq("columnA,columnB", "1.0,1000.0").toDS()
+    val ischema = new StructType().add("columnB", DoubleType).add("columnA", DoubleType)
+    val exception = intercept[IllegalArgumentException] {
+      spark.read.schema(ischema).option("header", true).option("enforceSchema", false).csv(ds)
+    }
+
+    assert(exception.getMessage.contains("CSV file header does not contain the expected fields"))
+  }
 }

From 9b2d403085f45b0d975cc3e7d5ac559aa81e0c64 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 18 Apr 2018 12:13:33 +0200
Subject: [PATCH 33/48] Make Scala style checker happy

---
 .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index cd9829750aec8..56826ad12f4b6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -21,8 +21,8 @@ import java.util.{Locale, Properties}
 
 import scala.collection.JavaConverters._
 
-import com.univocity.parsers.csv.CsvParser
 import com.fasterxml.jackson.databind.ObjectMapper
+import com.univocity.parsers.csv.CsvParser
 
 import org.apache.spark.Partition
 import org.apache.spark.annotation.InterfaceStability

From 21f8b10dda4b0ef71ba69cc6147d1cf8614812f1 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Mon, 14 May 2018 15:06:02 +0200
Subject: [PATCH 34/48] Removing a space to make Scala style checker happy.

---
 .../apache/spark/sql/execution/datasources/csv/CSVSuite.scala   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 524c0c81f779d..7704526811fc1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1368,7 +1368,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
       checkAnswer(computed, expected)
     }
   }
- 
+
   def checkHeader(multiLine: Boolean): Unit = {
     withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
       withTempPath { path =>

From e3b4275d71d2230b9833d92435157a54cdc0b7e0 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Wed, 16 May 2018 16:37:56 +0200
Subject: [PATCH 35/48] Addressing review comments

---
 python/pyspark/sql/readwriter.py              |  5 +-
 python/pyspark/sql/streaming.py               | 12 +++-
 python/pyspark/sql/tests.py                   | 26 +++----
 .../apache/spark/sql/DataFrameReader.scala    |  6 +-
 .../datasources/csv/CSVDataSource.scala       | 67 ++++++++++++-------
 .../datasources/csv/UnivocityParser.scala     |  1 -
 .../execution/datasources/csv/CSVSuite.scala  |  5 +-
 7 files changed, 74 insertions(+), 48 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 8b3bcd90a6010..bd65ac386a8db 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -377,8 +377,9 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                               forcibly applied to datasource files and headers in CSV files will be
                               ignored. If the option is set to ``false``, the schema will be
                               validated against headers in CSV files if the ``header`` option is set
-                              to ``true``. The validation is performed in column ordering aware and
-                              case sensitive manner. If None is set, ``true`` is used by default.
+                              to ``true``.  The validation is performed in column ordering aware
+                              manner by taking into account ``spark.sql.caseSensitive``.
+                              If None is set, ``true`` is used by default.
         :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
                                         uses the default value, ``false``.
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 15f9407389864..30cc8d3baad96 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -564,7 +564,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
             negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
             maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None,
-            columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None):
+            columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None,
+            enforceSchema=None):
         """Loads a CSV file stream and returns the result as a  :class:`DataFrame`.
 
         This function will go through the input once to determine the input schema if
@@ -592,6 +593,13 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                        default value, ``false``.
         :param inferSchema: infers the input schema automatically from data. It requires one extra
                        pass over the data. If None is set, it uses the default value, ``false``.
+        :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be
+                              forcibly applied to datasource files and headers in CSV files will be
+                              ignored. If the option is set to ``false``, the schema will be
+                              validated against headers in CSV files if the ``header`` option is set
+                              to ``true``.  The validation is performed in column ordering aware
+                              manner by taking into account ``spark.sql.caseSensitive``.
+                              If None is set, ``true`` is used by default.
         :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
                                         uses the default value, ``false``.
@@ -664,7 +672,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             maxCharsPerColumn=maxCharsPerColumn,
             maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode,
             columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine,
-            charToEscapeQuoteEscaping=charToEscapeQuoteEscaping)
+            charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema)
         if isinstance(path, basestring):
             return self._df(self._jreader.csv(path))
         else:
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 031239a68b3eb..689b27f475ec0 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3043,18 +3043,20 @@ def test_csv_sampling_ratio(self):
     def test_checking_csv_header(self):
         tmpPath = tempfile.mkdtemp()
         shutil.rmtree(tmpPath)
-        self.spark.createDataFrame([[1, 1000], [2000, 2]]).\
-            toDF('f1', 'f2').write.option("header", "true").csv(tmpPath)
-        schema = StructType([
-            StructField('f2', IntegerType(), nullable=True),
-            StructField('f1', IntegerType(), nullable=True)])
-        df = self.spark.read.option('header', 'true').schema(schema).\
-            csv(tmpPath, enforceSchema=False)
-        self.assertRaisesRegexp(
-            Exception,
-            "CSV file header does not contain the expected fields",
-            lambda: df.collect())
-        shutil.rmtree(tmpPath)
+        try:
+            self.spark.createDataFrame([[1, 1000], [2000, 2]])\
+                .toDF('f1', 'f2').write.option("header", "true").csv(tmpPath)
+            schema = StructType([
+                StructField('f2', IntegerType(), nullable=True),
+                StructField('f1', IntegerType(), nullable=True)])
+            df = self.spark.read.option('header', 'true').schema(schema)\
+                .csv(tmpPath, enforceSchema=False)
+            self.assertRaisesRegexp(
+                Exception,
+                "CSV file header does not contain the expected fields",
+                lambda: df.collect())
+        finally:
+            shutil.rmtree(tmpPath)
 
 
 class HiveSparkSubmitTests(SparkSubmitTests):
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index fb02fe2e92894..b991d2538ae82 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -545,9 +545,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * <li>`header` (default `false`): uses the first line as names of columns.</li>
    * <li>`enforceSchema` (default `true`): If it is set to `true`, the specified or inferred schema
    * will be forcibly applied to datasource files and headers in CSV files will be ignored.
-   * If the option is set to `false`, the schema will be validated against headers in CSV files if
-   * the `header` option is set to `true`. The validation is performed in column ordering aware and
-   * case sensitive manner.</li>
+   * If the option is set to `false`, the schema will be validated against headers in CSV files
+   * in the case when the `header` option is set to `true`. The validation is performed in column
+   * ordering aware manner by taking into account `spark.sql.caseSensitive`.</li>
    * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
    * requires one extra pass over the data.</li>
    * <li>`samplingRatio` (default is 1.0): defines fraction of rows used for schema inferring.</li>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 0c42050bae673..2c28024a791da 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -34,7 +34,6 @@ import org.apache.spark.rdd.{BinaryFileRDD, RDD}
 import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.datasources._
-import org.apache.spark.sql.execution.datasources.csv.CSVDataSource.checkHeaderColumnNames
 import org.apache.spark.sql.execution.datasources.text.TextFileFormat
 import org.apache.spark.sql.types.StructType
 
@@ -51,7 +50,7 @@ abstract class CSVDataSource extends Serializable {
       conf: Configuration,
       file: PartitionedFile,
       parser: UnivocityParser,
-      schema: StructType,
+      requiredSchema: StructType,
       // Actual schema of data in the csv file
       dataSchema: StructType,
       caseSensitive: Boolean): Iterator[InternalRow]
@@ -123,48 +122,58 @@ object CSVDataSource {
     }
   }
 
-  def checkHeaderColumnNames(schema: StructType, columnNames: Array[String], fileName: String,
-      checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = {
+  /**
+   * Checks that column names in a CSV header and field names in the schema are the same
+   * by taking into account case sensitivity.
+   */
+  def checkHeaderColumnNames(
+      schema: StructType,
+      columnNames: Array[String],
+      fileName: String,
+      checkHeaderFlag: Boolean,
+      caseSensitive: Boolean): Unit = {
     if (checkHeaderFlag && columnNames != null) {
       val fieldNames = schema.map(_.name).toIndexedSeq
       val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
-      var error: Option[String] = None
 
       if (headerLen == schemaSize) {
         var i = 0
-        while (error.isEmpty && i < headerLen) {
+        while (i < headerLen) {
           var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i))
-          if (caseSensitive == false) {
+          if (!caseSensitive) {
             nameInSchema = nameInSchema.toLowerCase
             nameInHeader = nameInHeader.toLowerCase
           }
           if (nameInHeader != nameInSchema) {
-            error = Some(
+            throw new IllegalArgumentException(
               s"""|CSV file header does not contain the expected fields.
                   | Header: ${columnNames.mkString(", ")}
                   | Schema: ${fieldNames.mkString(", ")}
                   |Expected: ${columnNames(i)} but found: ${fieldNames(i)}
-                  |CSV file: $fileName""".stripMargin
-            )
+                  |CSV file: $fileName""".stripMargin)
           }
           i += 1
         }
       } else {
-        error = Some(
+        throw new IllegalArgumentException(
           s"""|Number of column in CSV header is not equal to number of fields in the schema:
               | Header length: $headerLen, schema size: $schemaSize
-              |CSV file: $fileName""".stripMargin
-        )
-      }
-
-      error.headOption.foreach { msg =>
-        throw new IllegalArgumentException(msg)
+              |CSV file: $fileName""".stripMargin)
       }
     }
   }
 
-  def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String,
-    checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = {
+  /**
+   * Checks that CSV header contains the same column names as fields names in the given schema
+   * by taking into account case sensitivity.
+   */
+  def checkHeader(
+      header: String,
+      parser: CsvParser,
+      schema: StructType,
+      fileName: String,
+      checkHeaderFlag: Boolean,
+      caseSensitive: Boolean): Unit = {
     if (checkHeaderFlag) {
       checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag,
         caseSensitive)
@@ -175,8 +184,12 @@ object CSVDataSource {
 object TextInputCSVDataSource extends CSVDataSource {
   override val isSplitable: Boolean = true
 
-  override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser,
-      schema: StructType, dataSchema: StructType,
+  override def readFile(
+      conf: Configuration,
+      file: PartitionedFile,
+      parser: UnivocityParser,
+      requiredSchema: StructType,
+      dataSchema: StructType,
       caseSensitive: Boolean): Iterator[InternalRow] = {
     val lines = {
       val linesReader = new HadoopFileLinesReader(file, conf)
@@ -198,7 +211,7 @@ object TextInputCSVDataSource extends CSVDataSource {
       }
     }
 
-    UnivocityParser.parseIterator(lines, parser, schema)
+    UnivocityParser.parseIterator(lines, parser, requiredSchema)
   }
 
   override def infer(
@@ -263,8 +276,12 @@ object TextInputCSVDataSource extends CSVDataSource {
 object MultiLineCSVDataSource extends CSVDataSource {
   override val isSplitable: Boolean = false
 
-  override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser,
-      schema: StructType, dataSchema: StructType,
+  override def readFile(
+      conf: Configuration,
+      file: PartitionedFile,
+      parser: UnivocityParser,
+      requiredSchema: StructType,
+      dataSchema: StructType,
       caseSensitive: Boolean): Iterator[InternalRow] = {
     def checkHeader(header: Array[String]): Unit = {
       CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath,
@@ -273,7 +290,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
 
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
-      parser.options.headerFlag, parser, schema, checkHeader)
+      parser.options.headerFlag, parser, requiredSchema, checkHeader)
   }
 
   override def infer(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index a34b69462ad70..944e02bced65b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -303,7 +303,6 @@ private[csv] object UnivocityParser {
       parser.options.parseMode,
       schema,
       parser.options.columnNameOfCorruptRecord)
-
     filteredLines.flatMap(safeParser.parse)
   }
 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 7704526811fc1..3f8c3cf9a48de 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -23,6 +23,8 @@ import java.sql.{Date, Timestamp}
 import java.text.SimpleDateFormat
 import java.util.Locale
 
+import scala.collection.JavaConverters._
+
 import org.apache.commons.lang3.time.FastDateFormat
 import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.GzipCodec
@@ -1372,7 +1374,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
   def checkHeader(multiLine: Boolean): Unit = {
     withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
       withTempPath { path =>
-        import collection.JavaConverters._
         val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
         val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
         odf.write.option("header", true).csv(path.getCanonicalPath)
@@ -1449,7 +1450,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
 
   test("SPARK-23786: CSV header must not be checked if it doesn't exist") {
     withTempPath { path =>
-      import collection.JavaConverters._
       val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
       val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
       odf.write.option("header", false).csv(path.getCanonicalPath)
@@ -1467,7 +1467,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
   test("SPARK-23786: Ignore column name case if spark.sql.caseSensitive is false") {
     withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
       withTempPath { path =>
-        import collection.JavaConverters._
         val oschema = new StructType().add("A", StringType)
         val odf = spark.createDataFrame(List(Row("0")).asJava, oschema)
         odf.write.option("header", true).csv(path.getCanonicalPath)

From d7047666f32601676a0b1f8d46ec7a0878e7404a Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Thu, 17 May 2018 15:41:36 +0200
Subject: [PATCH 36/48] Removing unnecessary empty checks

---
 .../spark/sql/execution/datasources/csv/CSVDataSource.scala  | 4 ----
 .../sql/execution/datasources/csv/UnivocityParser.scala      | 5 ++---
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 2c28024a791da..c00b32578fb65 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -298,15 +298,12 @@ object MultiLineCSVDataSource extends CSVDataSource {
       inputPaths: Seq[FileStatus],
       parsedOptions: CSVOptions): StructType = {
     val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
-    // The header is not checked because there is no schema against with it could be check
-    def checkHeader(header: Array[String]): Unit = ()
 
     csv.flatMap { lines =>
       val path = new Path(lines.getPath())
       UnivocityParser.tokenizeStream(
         CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path),
         dropFirstRecord = false,
-        checkHeader,
         new CsvParser(parsedOptions.asParserSettings))
     }.take(1).headOption match {
       case Some(firstRow) =>
@@ -318,7 +315,6 @@ object MultiLineCSVDataSource extends CSVDataSource {
               lines.getConfiguration,
               new Path(lines.getPath())),
             parsedOptions.headerFlag,
-            checkHeader,
             new CsvParser(parsedOptions.asParserSettings))
         }
         val sampled = CSVUtils.sample(tokenRDD, parsedOptions)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 944e02bced65b..fc71323e8d6e3 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -235,9 +235,8 @@ private[csv] object UnivocityParser {
   def tokenizeStream(
       inputStream: InputStream,
       dropFirstRecord: Boolean,
-      checkFirstRecord: Array[String] => Unit,
       tokenizer: CsvParser): Iterator[Array[String]] = {
-    convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord)(tokens => tokens)
+    convertStream(inputStream, dropFirstRecord, tokenizer)(tokens => tokens)
   }
 
   /**
@@ -264,7 +263,7 @@ private[csv] object UnivocityParser {
       inputStream: InputStream,
       dropFirstRecord: Boolean,
       tokenizer: CsvParser,
-      checkFirstRecord: Array[String] => Unit)(
+      checkFirstRecord: Array[String] => Unit = _ => ())(
       convert: Array[String] => T) = new Iterator[T] {
     tokenizer.beginParsing(inputStream)
     private var nextRecord = {

From 04199e0fd28ae0f5e21050eb4d08ec60d238cab5 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Thu, 17 May 2018 16:37:18 +0200
Subject: [PATCH 37/48] Addressing review comments

---
 python/pyspark/sql/tests.py                    |  2 +-
 .../org/apache/spark/sql/DataFrameReader.scala |  4 ++--
 .../datasources/csv/CSVDataSource.scala        | 18 +++++++++---------
 .../execution/datasources/csv/CSVSuite.scala   |  6 +++---
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 689b27f475ec0..faa5f72c8b044 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3053,7 +3053,7 @@ def test_checking_csv_header(self):
                 .csv(tmpPath, enforceSchema=False)
             self.assertRaisesRegexp(
                 Exception,
-                "CSV file header does not contain the expected fields",
+                "CSV header is not conform to the schema",
                 lambda: df.collect())
         finally:
             shutil.rmtree(tmpPath)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index b991d2538ae82..8e2e46858db04 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -498,9 +498,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
 
     val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine =>
-      if (parsedOptions.enforceSchema == false) {
+      if (!parsedOptions.enforceSchema) {
         CSVDataSource.checkHeader(firstLine, new CsvParser(parsedOptions.asParserSettings),
-          actualSchema, csvDataset.getClass.getCanonicalName, checkHeaderFlag = true,
+          actualSchema, csvDataset.getClass.getCanonicalName, parsedOptions.enforceSchema,
           sparkSession.sessionState.conf.caseSensitiveAnalysis)
       }
       filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions))
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index c00b32578fb65..fbb5f2eba99cd 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -130,9 +130,9 @@ object CSVDataSource {
       schema: StructType,
       columnNames: Array[String],
       fileName: String,
-      checkHeaderFlag: Boolean,
+      enforceSchema: Boolean,
       caseSensitive: Boolean): Unit = {
-    if (checkHeaderFlag && columnNames != null) {
+    if (!enforceSchema && columnNames != null) {
       val fieldNames = schema.map(_.name).toIndexedSeq
       val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
 
@@ -146,10 +146,10 @@ object CSVDataSource {
           }
           if (nameInHeader != nameInSchema) {
             throw new IllegalArgumentException(
-              s"""|CSV file header does not contain the expected fields.
+              s"""|CSV header is not conform to the schema.
                   | Header: ${columnNames.mkString(", ")}
                   | Schema: ${fieldNames.mkString(", ")}
-                  |Expected: ${columnNames(i)} but found: ${fieldNames(i)}
+                  |Expected: ${fieldNames(i)} but found: ${columnNames(i)}
                   |CSV file: $fileName""".stripMargin)
           }
           i += 1
@@ -172,10 +172,10 @@ object CSVDataSource {
       parser: CsvParser,
       schema: StructType,
       fileName: String,
-      checkHeaderFlag: Boolean,
+      enforceSchema: Boolean,
       caseSensitive: Boolean): Unit = {
-    if (checkHeaderFlag) {
-      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag,
+    if (!enforceSchema) {
+      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, enforceSchema,
         caseSensitive)
     }
   }
@@ -207,7 +207,7 @@ object TextInputCSVDataSource extends CSVDataSource {
       // be not extracted.
       CSVUtils.extractHeader(lines, parser.options).foreach { header =>
         CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
-          checkHeaderFlag = !parser.options.enforceSchema, caseSensitive)
+          parser.options.enforceSchema, caseSensitive)
       }
     }
 
@@ -285,7 +285,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
       caseSensitive: Boolean): Iterator[InternalRow] = {
     def checkHeader(header: Array[String]): Unit = {
       CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath,
-        checkHeaderFlag = !parser.options.enforceSchema, caseSensitive)
+        parser.options.enforceSchema, caseSensitive)
     }
 
     UnivocityParser.parseStream(
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 3f8c3cf9a48de..fcee02dd9be47 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1388,7 +1388,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .collect()
         }
         assert(exception.getMessage.contains(
-          "CSV file header does not contain the expected fields"
+          "CSV header is not conform to the schema"
         ))
 
         val shortSchema = new StructType().add("f1", DoubleType)
@@ -1434,7 +1434,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .collect()
         }
         assert(caseSensitiveException.getMessage.contains(
-          "CSV file header does not contain the expected fields"
+          "CSV header is not conform to the schema"
         ))
       }
     }
@@ -1487,6 +1487,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
       spark.read.schema(ischema).option("header", true).option("enforceSchema", false).csv(ds)
     }
 
-    assert(exception.getMessage.contains("CSV file header does not contain the expected fields"))
+    assert(exception.getMessage.contains("CSV header is not conform to the schema"))
   }
 }

From 795a8781129d5265906d537ad87a3e9b0abc890f Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Thu, 17 May 2018 23:40:59 +0200
Subject: [PATCH 38/48] Addressing Hyukjin Kwon's review comments

---
 python/pyspark/sql/tests.py                        | 10 +++++-----
 .../execution/datasources/csv/CSVDataSource.scala  |  3 +--
 .../datasources/csv/UnivocityParser.scala          |  4 ++--
 .../sql/execution/datasources/csv/CSVSuite.scala   | 14 ++++----------
 4 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index faa5f72c8b044..1edb0dc49afd9 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3041,22 +3041,22 @@ def test_csv_sampling_ratio(self):
         self.assertEquals(schema, StructType([StructField("_c0", IntegerType(), True)]))
 
     def test_checking_csv_header(self):
-        tmpPath = tempfile.mkdtemp()
-        shutil.rmtree(tmpPath)
+        path = tempfile.mkdtemp()
+        shutil.rmtree(path)
         try:
             self.spark.createDataFrame([[1, 1000], [2000, 2]])\
-                .toDF('f1', 'f2').write.option("header", "true").csv(tmpPath)
+                .toDF('f1', 'f2').write.option("header", "true").csv(path)
             schema = StructType([
                 StructField('f2', IntegerType(), nullable=True),
                 StructField('f1', IntegerType(), nullable=True)])
             df = self.spark.read.option('header', 'true').schema(schema)\
-                .csv(tmpPath, enforceSchema=False)
+                .csv(path, enforceSchema=False)
             self.assertRaisesRegexp(
                 Exception,
                 "CSV header is not conform to the schema",
                 lambda: df.collect())
         finally:
-            shutil.rmtree(tmpPath)
+            shutil.rmtree(path)
 
 
 class HiveSparkSubmitTests(SparkSubmitTests):
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index fbb5f2eba99cd..6c5387af650c1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -298,12 +298,11 @@ object MultiLineCSVDataSource extends CSVDataSource {
       inputPaths: Seq[FileStatus],
       parsedOptions: CSVOptions): StructType = {
     val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
-
     csv.flatMap { lines =>
       val path = new Path(lines.getPath())
       UnivocityParser.tokenizeStream(
         CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path),
-        dropFirstRecord = false,
+        shouldDropHeader = false,
         new CsvParser(parsedOptions.asParserSettings))
     }.take(1).headOption match {
       case Some(firstRow) =>
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index fc71323e8d6e3..fd95b063ab830 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -234,9 +234,9 @@ private[csv] object UnivocityParser {
    */
   def tokenizeStream(
       inputStream: InputStream,
-      dropFirstRecord: Boolean,
+      shouldDropHeader: Boolean,
       tokenizer: CsvParser): Iterator[Array[String]] = {
-    convertStream(inputStream, dropFirstRecord, tokenizer)(tokens => tokens)
+    convertStream(inputStream, shouldDropHeader, tokenizer)(tokens => tokens)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index fcee02dd9be47..2d01ad2270946 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1387,9 +1387,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .csv(path.getCanonicalPath)
             .collect()
         }
-        assert(exception.getMessage.contains(
-          "CSV header is not conform to the schema"
-        ))
+        assert(exception.getMessage.contains("CSV header is not conform to the schema"))
 
         val shortSchema = new StructType().add("f1", DoubleType)
         val exceptionForShortSchema = intercept[SparkException] {
@@ -1402,8 +1400,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .collect()
         }
         assert(exceptionForShortSchema.getMessage.contains(
-          "Number of column in CSV header is not equal to number of fields in the schema"
-        ))
+          "Number of column in CSV header is not equal to number of fields in the schema"))
 
         val longSchema = new StructType()
           .add("f1", DoubleType)
@@ -1419,9 +1416,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .csv(path.getCanonicalPath)
             .collect()
         }
-        assert(exceptionForLongSchema.getMessage.contains(
-          "Header length: 2, schema size: 3"
-        ))
+        assert(exceptionForLongSchema.getMessage.contains("Header length: 2, schema size: 3"))
 
         val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType)
         val caseSensitiveException = intercept[SparkException] {
@@ -1434,8 +1429,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .collect()
         }
         assert(caseSensitiveException.getMessage.contains(
-          "CSV header is not conform to the schema"
-        ))
+          "CSV header is not conform to the schema"))
       }
     }
   }

From 05fc7cd8b163b35d388474bd9ea788a7db2c378c Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 18 May 2018 22:51:53 +0200
Subject: [PATCH 39/48] Improving description of the option

---
 python/pyspark/sql/readwriter.py                    |  9 +++++----
 python/pyspark/sql/streaming.py                     |  9 +++++----
 .../org/apache/spark/sql/DataFrameReader.scala      | 13 +++++++++----
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index bd65ac386a8db..eddf3b9611058 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -374,11 +374,12 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param inferSchema: infers the input schema automatically from data. It requires one extra
                        pass over the data. If None is set, it uses the default value, ``false``.
         :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be
-                              forcibly applied to datasource files and headers in CSV files will be
+                              forcibly applied to datasource files, and headers in CSV files will be
                               ignored. If the option is set to ``false``, the schema will be
-                              validated against headers in CSV files if the ``header`` option is set
-                              to ``true``.  The validation is performed in column ordering aware
-                              manner by taking into account ``spark.sql.caseSensitive``.
+                              validated against all headers in CSV files or the first header in RDD
+                              if the ``header`` option is set to ``true``. Field names in the schema
+                              and column names in CSV headers are checked by their positions
+                              taking into account ``spark.sql.caseSensitive``.
                               If None is set, ``true`` is used by default.
         :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 30cc8d3baad96..4c1a6da286f16 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -594,11 +594,12 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
         :param inferSchema: infers the input schema automatically from data. It requires one extra
                        pass over the data. If None is set, it uses the default value, ``false``.
         :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be
-                              forcibly applied to datasource files and headers in CSV files will be
+                              forcibly applied to datasource files, and headers in CSV files will be
                               ignored. If the option is set to ``false``, the schema will be
-                              validated against headers in CSV files if the ``header`` option is set
-                              to ``true``.  The validation is performed in column ordering aware
-                              manner by taking into account ``spark.sql.caseSensitive``.
+                              validated against all headers in CSV files or the first header in RDD
+                              if the ``header`` option is set to ``true``. Field names in the schema
+                              and column names in CSV headers are checked by their positions
+                              taking into account ``spark.sql.caseSensitive``.
                               If None is set, ``true`` is used by default.
         :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 8e2e46858db04..c66b74e446471 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -474,6 +474,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * it determines the columns as string types and it reads only the first line to determine the
    * names and the number of fields.
    *
+   * If the enforceSchema is set to `false`, only the CSV header in the first line is checked
+   * to conform specified or inferred schema.
+   *
    * @param csvDataset input Dataset with one CSV row per record
    * @since 2.2.0
    */
@@ -544,10 +547,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * beginning with this character. By default, it is disabled.</li>
    * <li>`header` (default `false`): uses the first line as names of columns.</li>
    * <li>`enforceSchema` (default `true`): If it is set to `true`, the specified or inferred schema
-   * will be forcibly applied to datasource files and headers in CSV files will be ignored.
-   * If the option is set to `false`, the schema will be validated against headers in CSV files
-   * in the case when the `header` option is set to `true`. The validation is performed in column
-   * ordering aware manner by taking into account `spark.sql.caseSensitive`.</li>
+   * will be forcibly applied to datasource files, and headers in CSV files will be ignored.
+   * If the option is set to `false`, the schema will be validated against all headers in CSV files
+   * in the case when the `header` option is set to `true`. Field names in the schema
+   * and column names in CSV headers are checked by their positions taking into account
+   * `spark.sql.caseSensitive`.</li>
    * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
    * requires one extra pass over the data.</li>
    * <li>`samplingRatio` (default is 1.0): defines fraction of rows used for schema inferring.</li>
@@ -592,6 +596,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`.</li>
    * <li>`multiLine` (default `false`): parse one record, which may span multiple lines.</li>
    * </ul>
+   *
    * @since 2.0.0
    */
   @scala.annotation.varargs

From 11c75913f4afadd0437945e0a41870a141326375 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 18 May 2018 23:13:02 +0200
Subject: [PATCH 40/48] Addressing Wenchen Fan's review comment

---
 python/pyspark/sql/readwriter.py                            | 6 ++++--
 python/pyspark/sql/streaming.py                             | 6 ++++--
 .../main/scala/org/apache/spark/sql/DataFrameReader.scala   | 3 ++-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index eddf3b9611058..a0e20d39c20da 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -379,8 +379,10 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                               validated against all headers in CSV files or the first header in RDD
                               if the ``header`` option is set to ``true``. Field names in the schema
                               and column names in CSV headers are checked by their positions
-                              taking into account ``spark.sql.caseSensitive``.
-                              If None is set, ``true`` is used by default.
+                              taking into account ``spark.sql.caseSensitive``. If None is set,
+                              ``true`` is used by default. Though the default value is ``true``,
+                              it is recommended to disable the ``enforceSchema`` option
+                              to avoid incorrect results.
         :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
                                         uses the default value, ``false``.
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index 4c1a6da286f16..fae50b3d5d532 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -599,8 +599,10 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                               validated against all headers in CSV files or the first header in RDD
                               if the ``header`` option is set to ``true``. Field names in the schema
                               and column names in CSV headers are checked by their positions
-                              taking into account ``spark.sql.caseSensitive``.
-                              If None is set, ``true`` is used by default.
+                              taking into account ``spark.sql.caseSensitive``. If None is set,
+                              ``true`` is used by default. Though the default value is ``true``,
+                              it is recommended to disable the ``enforceSchema`` option
+                              to avoid incorrect results.
         :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from
                                         values being read should be skipped. If None is set, it
                                         uses the default value, ``false``.
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 552c5055bdc69..7eea42147704a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -552,7 +552,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * If the option is set to `false`, the schema will be validated against all headers in CSV files
    * in the case when the `header` option is set to `true`. Field names in the schema
    * and column names in CSV headers are checked by their positions taking into account
-   * `spark.sql.caseSensitive`.</li>
+   * `spark.sql.caseSensitive`. Though the default value is true, it is recommended to disable
+   * the `enforceSchema` option to avoid incorrect results.</li>
    * <li>`inferSchema` (default `false`): infers the input schema automatically from data. It
    * requires one extra pass over the data.</li>
    * <li>`samplingRatio` (default is 1.0): defines fraction of rows used for schema inferring.</li>

From c0083281e702aea17ee7781e089a17d650382c7c Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 25 May 2018 13:19:14 +0200
Subject: [PATCH 41/48] Output warnings when enforceSchema is enabled and the
 schema is not conform to CSV header

---
 .../datasources/csv/CSVDataSource.scala       | 20 ++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 6c5387af650c1..139eed804c6fc 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
 
 import org.apache.spark.TaskContext
 import org.apache.spark.input.{PortableDataStream, StreamInputFormat}
+import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.{BinaryFileRDD, RDD}
 import org.apache.spark.sql.{Dataset, Encoders, SparkSession}
 import org.apache.spark.sql.catalyst.InternalRow
@@ -113,7 +114,7 @@ abstract class CSVDataSource extends Serializable {
   }
 }
 
-object CSVDataSource {
+object CSVDataSource extends Logging {
   def apply(options: CSVOptions): CSVDataSource = {
     if (options.multiLine) {
       MultiLineCSVDataSource
@@ -132,20 +133,21 @@ object CSVDataSource {
       fileName: String,
       enforceSchema: Boolean,
       caseSensitive: Boolean): Unit = {
-    if (!enforceSchema && columnNames != null) {
+    if (columnNames != null) {
       val fieldNames = schema.map(_.name).toIndexedSeq
       val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
+      var errorMessage: Option[String] = None
 
       if (headerLen == schemaSize) {
         var i = 0
-        while (i < headerLen) {
+        while (errorMessage.isEmpty && i < headerLen) {
           var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i))
           if (!caseSensitive) {
             nameInSchema = nameInSchema.toLowerCase
             nameInHeader = nameInHeader.toLowerCase
           }
           if (nameInHeader != nameInSchema) {
-            throw new IllegalArgumentException(
+            errorMessage = Some(
               s"""|CSV header is not conform to the schema.
                   | Header: ${columnNames.mkString(", ")}
                   | Schema: ${fieldNames.mkString(", ")}
@@ -155,11 +157,19 @@ object CSVDataSource {
           i += 1
         }
       } else {
-        throw new IllegalArgumentException(
+        errorMessage = Some(
           s"""|Number of column in CSV header is not equal to number of fields in the schema:
               | Header length: $headerLen, schema size: $schemaSize
               |CSV file: $fileName""".stripMargin)
       }
+
+      errorMessage.foreach { msg =>
+        if (enforceSchema) {
+          logWarning(msg)
+        } else {
+          throw new IllegalArgumentException(msg)
+        }
+      }
     }
   }
 

From 9f7c4405ce4ca240a11d69126855fbc060444d3b Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 25 May 2018 14:44:43 +0200
Subject: [PATCH 42/48] Added tests for inferSchema is true and enforceSchema
 is false

---
 .../execution/datasources/csv/CSVSuite.scala  | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index a9b726a0de5a6..adc0f20b34947 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1512,4 +1512,36 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
 
     assert(exception.getMessage.contains("CSV header is not conform to the schema"))
   }
+
+  test("SPARK-23786: enforce inferred schema") {
+    val expectedSchema = new StructType().add("_c0", DoubleType).add("_c1", StringType)
+    val withHeader = spark.read
+      .option("inferSchema", true)
+      .option("enforceSchema", false)
+      .option("header", true)
+      .csv(Seq("_c0,_c1", "1.0,a").toDS())
+    assert(withHeader.schema == expectedSchema)
+    checkAnswer(withHeader, Seq(Row(1.0, "a")))
+
+    // Ignore the inferSchema flag if an user sets a schema
+    val schema = new StructType().add("colA", DoubleType).add("colB", StringType)
+    val ds = spark.read
+      .option("inferSchema", true)
+      .option("enforceSchema", false)
+      .option("header", true)
+      .schema(schema)
+      .csv(Seq("colA,colB", "1.0,a").toDS())
+    assert(ds.schema == schema)
+    checkAnswer(ds, Seq(Row(1.0, "a")))
+
+    val exception = intercept[IllegalArgumentException] {
+      spark.read
+        .option("inferSchema", true)
+        .option("enforceSchema", false)
+        .option("header", true)
+        .schema(schema)
+        .csv(Seq("col1,col2", "1.0,a").toDS())
+    }
+    assert(exception.getMessage.contains("CSV header is not conform to the schema"))
+  }
 }

From e83ad60da9c9266274fc09698bea382c762ef815 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 25 May 2018 14:53:44 +0200
Subject: [PATCH 43/48] Rename dropFirstRecord to shouldDropHeader

---
 .../datasources/csv/UnivocityParser.scala          | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
index 075d6405cee4b..5f7d5696b71a6 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -248,32 +248,32 @@ private[csv] object UnivocityParser {
    */
   def parseStream(
       inputStream: InputStream,
-      dropFirstRecord: Boolean,
+      shouldDropHeader: Boolean,
       parser: UnivocityParser,
       schema: StructType,
-      checkFirstRecord: Array[String] => Unit): Iterator[InternalRow] = {
+      checkHeader: Array[String] => Unit): Iterator[InternalRow] = {
     val tokenizer = parser.tokenizer
     val safeParser = new FailureSafeParser[Array[String]](
       input => Seq(parser.convert(input)),
       parser.options.parseMode,
       schema,
       parser.options.columnNameOfCorruptRecord)
-    convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord) { tokens =>
+    convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens =>
       safeParser.parse(tokens)
     }.flatten
   }
 
   private def convertStream[T](
       inputStream: InputStream,
-      dropFirstRecord: Boolean,
+      shouldDropHeader: Boolean,
       tokenizer: CsvParser,
-      checkFirstRecord: Array[String] => Unit = _ => ())(
+      checkHeader: Array[String] => Unit = _ => ())(
       convert: Array[String] => T) = new Iterator[T] {
     tokenizer.beginParsing(inputStream)
     private var nextRecord = {
-      if (dropFirstRecord) {
+      if (shouldDropHeader) {
         val firstRecord = tokenizer.parseNext()
-        checkFirstRecord(firstRecord)
+        checkHeader(firstRecord)
       }
       tokenizer.parseNext()
     }

From c5ee2076321e252ece99ebec930ae47b5bba209a Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 1 Jun 2018 18:37:15 +0200
Subject: [PATCH 44/48] Renaming of 'is not conform' to 'does not conform'

---
 .../sql/execution/datasources/csv/CSVDataSource.scala     | 2 +-
 .../spark/sql/execution/datasources/csv/CSVSuite.scala    | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 139eed804c6fc..645b1e27355b1 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -148,7 +148,7 @@ object CSVDataSource extends Logging {
           }
           if (nameInHeader != nameInSchema) {
             errorMessage = Some(
-              s"""|CSV header is not conform to the schema.
+              s"""|CSV header does not conform to the schema.
                   | Header: ${columnNames.mkString(", ")}
                   | Schema: ${fieldNames.mkString(", ")}
                   |Expected: ${fieldNames(i)} but found: ${columnNames(i)}
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index e258d3de56f99..1cb72f985cedd 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -1429,7 +1429,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .csv(path.getCanonicalPath)
             .collect()
         }
-        assert(exception.getMessage.contains("CSV header is not conform to the schema"))
+        assert(exception.getMessage.contains("CSV header does not conform to the schema"))
 
         val shortSchema = new StructType().add("f1", DoubleType)
         val exceptionForShortSchema = intercept[SparkException] {
@@ -1471,7 +1471,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
             .collect()
         }
         assert(caseSensitiveException.getMessage.contains(
-          "CSV header is not conform to the schema"))
+          "CSV header does not conform to the schema"))
       }
     }
   }
@@ -1523,7 +1523,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
       spark.read.schema(ischema).option("header", true).option("enforceSchema", false).csv(ds)
     }
 
-    assert(exception.getMessage.contains("CSV header is not conform to the schema"))
+    assert(exception.getMessage.contains("CSV header does not conform to the schema"))
   }
 
   test("SPARK-23786: enforce inferred schema") {
@@ -1555,6 +1555,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
         .schema(schema)
         .csv(Seq("col1,col2", "1.0,a").toDS())
     }
-    assert(exception.getMessage.contains("CSV header is not conform to the schema"))
+    assert(exception.getMessage.contains("CSV header does not conform to the schema"))
   }
 }

From a2cbb7b7879c065efcb8e5a56fde0e47d80e1c89 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 1 Jun 2018 18:47:46 +0200
Subject: [PATCH 45/48] Fix Scala coding style

---
 .../datasources/csv/CSVDataSource.scala       | 28 +++++++++++++++----
 .../datasources/csv/CSVFileFormat.scala       |  7 ++++-
 .../datasources/csv/CSVOptions.scala          |  2 +-
 3 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 645b1e27355b1..745c6135700bb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -185,7 +185,11 @@ object CSVDataSource extends Logging {
       enforceSchema: Boolean,
       caseSensitive: Boolean): Unit = {
     if (!enforceSchema) {
-      checkHeaderColumnNames(schema, parser.parseLine(header), fileName, enforceSchema,
+      checkHeaderColumnNames(
+        schema,
+        parser.parseLine(header),
+        fileName,
+        enforceSchema,
         caseSensitive)
     }
   }
@@ -216,8 +220,13 @@ object TextInputCSVDataSource extends CSVDataSource {
       // Note: if there are only comments in the first block, the header would probably
       // be not extracted.
       CSVUtils.extractHeader(lines, parser.options).foreach { header =>
-        CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath,
-          parser.options.enforceSchema, caseSensitive)
+        CSVDataSource.checkHeader(
+          header,
+          parser.tokenizer,
+          dataSchema,
+          file.filePath,
+          parser.options.enforceSchema,
+          caseSensitive)
       }
     }
 
@@ -294,13 +303,20 @@ object MultiLineCSVDataSource extends CSVDataSource {
       dataSchema: StructType,
       caseSensitive: Boolean): Iterator[InternalRow] = {
     def checkHeader(header: Array[String]): Unit = {
-      CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath,
-        parser.options.enforceSchema, caseSensitive)
+      CSVDataSource.checkHeaderColumnNames(
+        dataSchema,
+        header,
+        file.filePath,
+        parser.options.enforceSchema,
+        caseSensitive)
     }
 
     UnivocityParser.parseStream(
       CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))),
-      parser.options.headerFlag, parser, requiredSchema, checkHeader)
+      parser.options.headerFlag,
+      parser,
+      requiredSchema,
+      checkHeader)
   }
 
   override def infer(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
index 1497834ab2cb8..b90275de9f40a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala
@@ -138,7 +138,12 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister {
         StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)),
         parsedOptions)
-      CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema,
+      CSVDataSource(parsedOptions).readFile(
+        conf,
+        file,
+        parser,
+        requiredSchema,
+        dataSchema,
         caseSensitive)
     }
   }
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
index 2e040c86a46a8..fab8d62da0c1d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala
@@ -160,7 +160,7 @@ class CSVOptions(
    * Forcibly apply the specified or inferred schema to datasource files.
    * If the option is enabled, headers of CSV files will be ignored.
    */
-  val enforceSchema = getBool("enforceSchema", true)
+  val enforceSchema = getBool("enforceSchema", default = true)
 
   def asWriterSettings: CsvWriterSettings = {
     val writerSettings = new CsvWriterSettings()

From 70e2b75713d588027fc3ae1ac380f354a52ad2ac Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 1 Jun 2018 18:58:58 +0200
Subject: [PATCH 46/48] Added description of checkHeaderColumnNames's arguments

---
 .../sql/execution/datasources/csv/CSVDataSource.scala    | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index 745c6135700bb..f8ed6fd58059d 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -126,6 +126,15 @@ object CSVDataSource extends Logging {
   /**
    * Checks that column names in a CSV header and field names in the schema are the same
    * by taking into account case sensitivity.
+   *
+   * @param schema - provided (or inferred) schema to which CSV must conform.
+   * @param columnNames - names of CSV columns that must be checked against to the schema.
+   * @param fileName - name of CSV file that are currently checked. It is used in error messages.
+   * @param enforceSchema - if it is `true`, column names are ignored otherwise the CSV column
+   *                        names are checked for conformance to the schema. In the case if
+   *                        the column name don't conform to the schema, an exception is thrown.
+   * @param caseSensitive - if it is set to `false`, comparison of column names and schema field
+   *                        names is not case sensitive.
    */
   def checkHeaderColumnNames(
       schema: StructType,

From e7c3acee886c238a58682b70a621a0c4d3293e21 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 1 Jun 2018 20:07:05 +0200
Subject: [PATCH 47/48] Test checks a warning presents in logs

---
 .../apache/spark/sql/DataFrameReader.scala    | 12 ++---
 .../datasources/csv/CSVDataSource.scala       |  4 +-
 .../execution/datasources/csv/CSVSuite.scala  | 45 +++++++++++++++++++
 3 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index 1e889622b7b52..de6be5f76e15a 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -503,11 +503,13 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
       StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
 
     val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine =>
-      if (!parsedOptions.enforceSchema) {
-        CSVDataSource.checkHeader(firstLine, new CsvParser(parsedOptions.asParserSettings),
-          actualSchema, csvDataset.getClass.getCanonicalName, parsedOptions.enforceSchema,
-          sparkSession.sessionState.conf.caseSensitiveAnalysis)
-      }
+      CSVDataSource.checkHeader(
+        firstLine,
+        new CsvParser(parsedOptions.asParserSettings),
+        actualSchema,
+        csvDataset.getClass.getCanonicalName,
+        parsedOptions.enforceSchema,
+        sparkSession.sessionState.conf.caseSensitiveAnalysis)
       filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions))
     }.getOrElse(filteredLines.rdd)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
index f8ed6fd58059d..82322df407521 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala
@@ -193,14 +193,12 @@ object CSVDataSource extends Logging {
       fileName: String,
       enforceSchema: Boolean,
       caseSensitive: Boolean): Unit = {
-    if (!enforceSchema) {
-      checkHeaderColumnNames(
+    checkHeaderColumnNames(
         schema,
         parser.parseLine(header),
         fileName,
         enforceSchema,
         caseSensitive)
-    }
   }
 }
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index 1cb72f985cedd..d2f166c7d1877 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -28,6 +28,8 @@ import scala.collection.JavaConverters._
 import org.apache.commons.lang3.time.FastDateFormat
 import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.GzipCodec
+import org.apache.log4j.{AppenderSkeleton, LogManager}
+import org.apache.log4j.spi.LoggingEvent
 
 import org.apache.spark.SparkException
 import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, UDT}
@@ -1557,4 +1559,47 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te
     }
     assert(exception.getMessage.contains("CSV header does not conform to the schema"))
   }
+
+  test("SPARK-23786: warning should be printed if CSV header doesn't conform to schema") {
+    class TestAppender extends AppenderSkeleton {
+      var events = new java.util.ArrayList[LoggingEvent]
+      override def close(): Unit = {}
+      override def requiresLayout: Boolean = false
+      protected def append(event: LoggingEvent): Unit = events.add(event)
+    }
+
+    val testAppender1 = new TestAppender
+    LogManager.getRootLogger.addAppender(testAppender1)
+    try {
+      val ds = Seq("columnA,columnB", "1.0,1000.0").toDS()
+      val ischema = new StructType().add("columnB", DoubleType).add("columnA", DoubleType)
+
+      spark.read.schema(ischema).option("header", true).option("enforceSchema", true).csv(ds)
+    } finally {
+      LogManager.getRootLogger.removeAppender(testAppender1)
+    }
+    assert(testAppender1.events.asScala
+      .exists(msg => msg.getRenderedMessage.contains("CSV header does not conform to the schema")))
+
+    val testAppender2 = new TestAppender
+    LogManager.getRootLogger.addAppender(testAppender2)
+    try {
+      withTempPath { path =>
+        val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType)
+        val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema)
+        odf.write.option("header", true).csv(path.getCanonicalPath)
+        val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType)
+        spark.read
+          .schema(ischema)
+          .option("header", true)
+          .option("enforceSchema", true)
+          .csv(path.getCanonicalPath)
+          .collect()
+      }
+    } finally {
+      LogManager.getRootLogger.removeAppender(testAppender2)
+    }
+    assert(testAppender2.events.asScala
+      .exists(msg => msg.getRenderedMessage.contains("CSV header does not conform to the schema")))
+  }
 }

From 3b37712ded664aaf716306574f50072e58b9bbd1 Mon Sep 17 00:00:00 2001
From: Maxim Gekk <maxim.gekk@databricks.com>
Date: Fri, 1 Jun 2018 20:29:32 +0200
Subject: [PATCH 48/48] fix python tests

---
 python/pyspark/sql/tests.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index cf7ed69e17b32..ea2dd7605dc57 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -3069,7 +3069,7 @@ def test_checking_csv_header(self):
                 .csv(path, enforceSchema=False)
             self.assertRaisesRegexp(
                 Exception,
-                "CSV header is not conform to the schema",
+                "CSV header does not conform to the schema",
                 lambda: df.collect())
         finally:
             shutil.rmtree(path)