From 112ce2d34d0d039711777351c1ab8e74629fc8e6 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 20 Mar 2018 16:30:44 +0100 Subject: [PATCH 01/48] Checks column names are compatible to provided schema --- .../sql/execution/datasources/csv/CSVSuite.scala | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 4398e547d9217..772e66e0e7ffa 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1279,4 +1279,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { Row("0,2013-111-11 12:13:14") :: Row(null) :: Nil ) } + + test("Check column names during schema validation") { + withTempPath { path => + import collection.JavaConverters._ + val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) + val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) + odf.write.option("header", "true").csv(path.getCanonicalPath) + val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) + val idf = spark.read.schema(ischema).option("header", "true").csv(path.getCanonicalPath) + + checkAnswer(idf.select('f1), odf.select('f1)) + } + } } From a85ccce23c3c5ee69ff321303ad830c71dd05931 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 20 Mar 2018 21:51:03 +0100 Subject: [PATCH 02/48] Checking header is matched to schema in per-line mode --- .../datasources/csv/CSVDataSource.scala | 21 +++++++++++++----- .../datasources/csv/CSVFileFormat.scala | 3 ++- .../datasources/csv/UnivocityParser.scala | 22 +++++++++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 4870d75fc5f08..f7be8a2c764dc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -50,7 +50,9 @@ abstract class CSVDataSource extends Serializable { conf: Configuration, file: PartitionedFile, parser: UnivocityParser, - schema: StructType): Iterator[InternalRow] + schema: StructType, // Schema of projection + dataSchema: StructType // Schema of data in csv files + ): Iterator[InternalRow] /** * Infers the schema from `inputPaths` files. @@ -127,7 +129,8 @@ object TextInputCSVDataSource extends CSVDataSource { conf: Configuration, file: PartitionedFile, parser: UnivocityParser, - schema: StructType): Iterator[InternalRow] = { + schema: StructType, + dataSchema: StructType): Iterator[InternalRow] = { val lines = { val linesReader = new HadoopFileLinesReader(file, conf) Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => linesReader.close())) @@ -136,8 +139,15 @@ object TextInputCSVDataSource extends CSVDataSource { } } - val shouldDropHeader = parser.options.headerFlag && file.start == 0 - UnivocityParser.parseIterator(lines, shouldDropHeader, parser, schema) + val hasHeader = parser.options.headerFlag && file.start == 0 + if (hasHeader) { + lines.take(1).foreach { header => + UnivocityParser.checkHeaderBySchema(parser, dataSchema, header) + } + } + + UnivocityParser.parseIterator( + conf, lines, shouldDropHeader = hasHeader, parser, schema, file.filePath) } override def infer( @@ -204,7 +214,8 @@ object MultiLineCSVDataSource extends CSVDataSource { conf: Configuration, file: PartitionedFile, parser: UnivocityParser, - schema: StructType): Iterator[InternalRow] = { + schema: StructType, + dataSchema: StructType): Iterator[InternalRow] = { UnivocityParser.parseStream( CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), parser.options.headerFlag, diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index e20977a4ec79f..4888d7632ab9c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -129,7 +129,8 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), parsedOptions) - CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema) + CSVDataSource(parsedOptions) + .readFile(conf, file, parser, requiredSchema, dataSchema) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 3d6cc30f2ba83..4842c62a5d852 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -312,4 +312,26 @@ private[csv] object UnivocityParser { parser.options.columnNameOfCorruptRecord) filteredLines.flatMap(safeParser.parse) } + + def checkHeaderBySchema( + parser: UnivocityParser, + schema: StructType, + header: String + ): Unit = { + val columnNames = parser.tokenizer.parseLine(header) + val fieldNames = schema.map(_.name) + val isMatched = fieldNames.zip(columnNames).forall { pair => + val (nameInSchema, nameInHeader) = pair + nameInSchema == nameInHeader + } + if (!isMatched) { + throw new IllegalArgumentException( + s""" + |Fields in the header of csv file are not matched to field names of the schema: + | Header: ${header} + | Schema: ${fieldNames.mkString(",")} + |""".stripMargin + ) + } + } } From 75e15345b6a5a9e807375fdf465dccfce4ea62c7 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 20 Mar 2018 22:36:56 +0100 Subject: [PATCH 03/48] Extract header and check that it is matched to schema --- .../datasources/csv/CSVDataSource.scala | 6 ++++- .../execution/datasources/csv/CSVUtils.scala | 22 +++++++++++-------- .../datasources/csv/UnivocityParser.scala | 11 +--------- .../execution/datasources/csv/CSVSuite.scala | 13 ++++++++--- 4 files changed, 29 insertions(+), 23 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index f7be8a2c764dc..764ee8a118d31 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -141,7 +141,11 @@ object TextInputCSVDataSource extends CSVDataSource { val hasHeader = parser.options.headerFlag && file.start == 0 if (hasHeader) { - lines.take(1).foreach { header => + // Checking header field names are matched to fields in schema. + // The header is removed from lines. + // Note: if there are only comments in the first block, the header would probably + // be not extracted. + CSVUtils.extractHeader(lines, parser.options).foreach { header => UnivocityParser.checkHeaderBySchema(parser, dataSchema, header) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala index 72b053d2092ca..3a2c5e6653615 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVUtils.scala @@ -67,12 +67,8 @@ object CSVUtils { } } - /** - * Drop header line so that only data can remain. - * This is similar with `filterHeaderLine` above and currently being used in CSV reading path. - */ - def dropHeaderLine(iter: Iterator[String], options: CSVOptions): Iterator[String] = { - val nonEmptyLines = if (options.isCommentSet) { + def skipComments(iter: Iterator[String], options: CSVOptions): Iterator[String] = { + if (options.isCommentSet) { val commentPrefix = options.comment.toString iter.dropWhile { line => line.trim.isEmpty || line.trim.startsWith(commentPrefix) @@ -80,11 +76,19 @@ object CSVUtils { } else { iter.dropWhile(_.trim.isEmpty) } - - if (nonEmptyLines.hasNext) nonEmptyLines.drop(1) - iter } + /** + * Extracts header and moves iterator forward so that only data remains in it + */ + def extractHeader(iter: Iterator[String], options: CSVOptions): Option[String] = { + val nonEmptyLines = skipComments(iter, options) + if (nonEmptyLines.hasNext) { + Some(nonEmptyLines.next()) + } else { + None + } + } /** * Helper method that converts string representation of a character to actual character. * It handles some Java escaped strings and throws exception if given string is longer than one diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 4842c62a5d852..9516b0cb72c67 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -289,21 +289,12 @@ private[csv] object UnivocityParser { */ def parseIterator( lines: Iterator[String], - shouldDropHeader: Boolean, parser: UnivocityParser, schema: StructType): Iterator[InternalRow] = { val options = parser.options - val linesWithoutHeader = if (shouldDropHeader) { - // Note that if there are only comments in the first block, the header would probably - // be not dropped. - CSVUtils.dropHeaderLine(lines, options) - } else { - lines - } - val filteredLines: Iterator[String] = - CSVUtils.filterCommentAndEmpty(linesWithoutHeader, options) + CSVUtils.filterCommentAndEmpty(lines, options) val safeParser = new FailureSafeParser[String]( input => Seq(parser.parse(input)), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 772e66e0e7ffa..35245583026c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1287,9 +1287,16 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) odf.write.option("header", "true").csv(path.getCanonicalPath) val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) - val idf = spark.read.schema(ischema).option("header", "true").csv(path.getCanonicalPath) - - checkAnswer(idf.select('f1), odf.select('f1)) + val exception = intercept[SparkException] { + spark.read + .schema(ischema) + .option("header", "true") + .csv(path.getCanonicalPath) + .collect() + } + assert(exception.getMessage.contains( + "Fields in the header of csv file are not matched to field names of the schema" + )) } } } From 8eb45b8b634ba2c9b641de12e09f17c63240ccc4 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 21 Mar 2018 11:57:30 +0100 Subject: [PATCH 04/48] Checking column names in header in multiLine mode --- .../datasources/csv/CSVDataSource.scala | 13 ++-- .../datasources/csv/CSVOptions.scala | 2 +- .../datasources/csv/UnivocityParser.scala | 62 ++++++++++++------- .../execution/datasources/csv/CSVSuite.scala | 37 ++++++----- 4 files changed, 70 insertions(+), 44 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 764ee8a118d31..51255efc86470 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -145,9 +145,8 @@ object TextInputCSVDataSource extends CSVDataSource { // The header is removed from lines. // Note: if there are only comments in the first block, the header would probably // be not extracted. - CSVUtils.extractHeader(lines, parser.options).foreach { header => - UnivocityParser.checkHeaderBySchema(parser, dataSchema, header) - } + val checkHeader = UnivocityParser.checkHeader(parser, dataSchema, _: String) + CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_)) } UnivocityParser.parseIterator( @@ -224,7 +223,10 @@ object MultiLineCSVDataSource extends CSVDataSource { CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), parser.options.headerFlag, parser, - schema) + schema, + file.filePath, + checkHeader = UnivocityParser.checkHeaderColumnNames(parser, dataSchema, _) + ) } override def infer( @@ -232,11 +234,13 @@ object MultiLineCSVDataSource extends CSVDataSource { inputPaths: Seq[FileStatus], parsedOptions: CSVOptions): StructType = { val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions) + val checkHeader = (_: Array[String]) => () csv.flatMap { lines => val path = new Path(lines.getPath()) UnivocityParser.tokenizeStream( CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path), shouldDropHeader = false, + checkHeader, new CsvParser(parsedOptions.asParserSettings)) }.take(1).headOption match { case Some(firstRow) => @@ -248,6 +252,7 @@ object MultiLineCSVDataSource extends CSVDataSource { lines.getConfiguration, new Path(lines.getPath())), parsedOptions.headerFlag, + checkHeader, new CsvParser(parsedOptions.asParserSettings)) } CSVInferSchema.infer(tokenRDD, header, parsedOptions) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index c16790630ce17..d5a5a73c45268 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -149,7 +149,7 @@ class CSVOptions( val inputBufferSize = 128 val isCommentSet = this.comment != '\u0000' - + def asWriterSettings: CsvWriterSettings = { val writerSettings = new CsvWriterSettings() val format = writerSettings.getFormat diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 9516b0cb72c67..75f6b2f25bcce 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -237,8 +237,9 @@ private[csv] object UnivocityParser { def tokenizeStream( inputStream: InputStream, shouldDropHeader: Boolean, + checkHeader: Array[String] => Unit, tokenizer: CsvParser): Iterator[Array[String]] = { - convertStream(inputStream, shouldDropHeader, tokenizer)(tokens => tokens) + convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader)(tokens => tokens) } /** @@ -248,14 +249,16 @@ private[csv] object UnivocityParser { inputStream: InputStream, shouldDropHeader: Boolean, parser: UnivocityParser, - schema: StructType): Iterator[InternalRow] = { + schema: StructType, + filePath: String, + checkHeader: Array[String] => Unit): Iterator[InternalRow] = { val tokenizer = parser.tokenizer val safeParser = new FailureSafeParser[Array[String]]( input => Seq(parser.convert(input)), parser.options.parseMode, schema, parser.options.columnNameOfCorruptRecord) - convertStream(inputStream, shouldDropHeader, tokenizer) { tokens => + convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens => safeParser.parse(tokens) }.flatten } @@ -263,11 +266,14 @@ private[csv] object UnivocityParser { private def convertStream[T]( inputStream: InputStream, shouldDropHeader: Boolean, - tokenizer: CsvParser)(convert: Array[String] => T) = new Iterator[T] { + tokenizer: CsvParser, + checkHeader: Array[String] => Unit + )(convert: Array[String] => T) = new Iterator[T] { tokenizer.beginParsing(inputStream) private var nextRecord = { if (shouldDropHeader) { - tokenizer.parseNext() + val header = tokenizer.parseNext() + checkHeader(header) } tokenizer.parseNext() } @@ -304,25 +310,35 @@ private[csv] object UnivocityParser { filteredLines.flatMap(safeParser.parse) } - def checkHeaderBySchema( - parser: UnivocityParser, - schema: StructType, - header: String + def checkHeaderColumnNames( + parser: UnivocityParser, + schema: StructType, + columnNames: Array[String] ): Unit = { - val columnNames = parser.tokenizer.parseLine(header) - val fieldNames = schema.map(_.name) - val isMatched = fieldNames.zip(columnNames).forall { pair => - val (nameInSchema, nameInHeader) = pair - nameInSchema == nameInHeader - } - if (!isMatched) { - throw new IllegalArgumentException( - s""" - |Fields in the header of csv file are not matched to field names of the schema: - | Header: ${header} - | Schema: ${fieldNames.mkString(",")} - |""".stripMargin - ) + if (columnNames != null) { + val fieldNames = schema.map(_.name) + val isMatched = fieldNames.zip(columnNames).forall { pair => + val (nameInSchema, nameInHeader) = pair + nameInSchema == nameInHeader + } + if (!isMatched) { + throw new IllegalArgumentException( + s""" + |Fields in the header of csv file are not matched to field names of the schema: + | Header: ${columnNames.mkString(",")} + + | Schema: ${fieldNames.mkString(",")} + + + |""". + stripMargin + ) + } } } + + def checkHeader(parser: UnivocityParser, schema: StructType, header: String): Unit = { + val columnNames = parser.tokenizer.parseLine(header) + checkHeaderColumnNames(parser, schema, columnNames) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 35245583026c1..1007a85d3b1a3 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1280,23 +1280,28 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { ) } - test("Check column names during schema validation") { - withTempPath { path => - import collection.JavaConverters._ - val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) - val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) - odf.write.option("header", "true").csv(path.getCanonicalPath) - val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) - val exception = intercept[SparkException] { - spark.read - .schema(ischema) - .option("header", "true") - .csv(path.getCanonicalPath) - .collect() + def checkHeader(multiLine: String): Unit = { + test(s"Check column names during schema validation - multiLine = $multiLine") { + withTempPath { path => + import collection.JavaConverters._ + val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) + val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) + odf.write.option("header", "true").csv(path.getCanonicalPath) + val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) + val exception = intercept[SparkException] { + spark.read + .schema(ischema) + .option("header", "true") + .option("multiLine", multiLine) + .csv(path.getCanonicalPath) + .collect() + } + assert(exception.getMessage.contains( + "Fields in the header of csv file are not matched to field names of the schema" + )) } - assert(exception.getMessage.contains( - "Fields in the header of csv file are not matched to field names of the schema" - )) } } + + List("false", "true").foreach(checkHeader(_)) } From 9b1a9862531b8d3fb3cffce75126413ca9a844b9 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 21 Mar 2018 12:13:17 +0100 Subject: [PATCH 05/48] Adding the checkHeader option with true by default --- .../execution/datasources/csv/CSVOptions.scala | 4 +++- .../datasources/csv/UnivocityParser.scala | 16 +++++----------- .../sql/execution/datasources/csv/CSVSuite.scala | 3 ++- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index d5a5a73c45268..1f7986fb59864 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -149,7 +149,9 @@ class CSVOptions( val inputBufferSize = 128 val isCommentSet = this.comment != '\u0000' - + + val checkHeader = getBool("checkHeader", true) + def asWriterSettings: CsvWriterSettings = { val writerSettings = new CsvWriterSettings() val format = writerSettings.getFormat diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 75f6b2f25bcce..89815bbd56518 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -315,7 +315,7 @@ private[csv] object UnivocityParser { schema: StructType, columnNames: Array[String] ): Unit = { - if (columnNames != null) { + if (parser.options.checkHeader && columnNames != null) { val fieldNames = schema.map(_.name) val isMatched = fieldNames.zip(columnNames).forall { pair => val (nameInSchema, nameInHeader) = pair @@ -323,22 +323,16 @@ private[csv] object UnivocityParser { } if (!isMatched) { throw new IllegalArgumentException( - s""" - |Fields in the header of csv file are not matched to field names of the schema: - | Header: ${columnNames.mkString(",")} - - | Schema: ${fieldNames.mkString(",")} - - - |""". - stripMargin + s"""|Fields in the header of csv file are not matched to field names of the schema: + | Header: ${columnNames.mkString(",")} + | Schema: ${fieldNames.mkString(",")}""".stripMargin ) } } } def checkHeader(parser: UnivocityParser, schema: StructType, header: String): Unit = { - val columnNames = parser.tokenizer.parseLine(header) + lazy val columnNames = parser.tokenizer.parseLine(header) checkHeaderColumnNames(parser, schema, columnNames) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 1007a85d3b1a3..42d2cd99c40ac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1291,8 +1291,9 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { val exception = intercept[SparkException] { spark.read .schema(ischema) - .option("header", "true") .option("multiLine", multiLine) + .option("header", "true") + .option("checkHeader", "true") .csv(path.getCanonicalPath) .collect() } From 64426332b2ab42a1cd9c5a05a77e90332572bbec Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 21 Mar 2018 12:25:31 +0100 Subject: [PATCH 06/48] Fix csv test by changing headers or disabling header checking --- .../sql/execution/datasources/csv/CSVSuite.scala | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 42d2cd99c40ac..1be8e79305d35 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -252,7 +252,11 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { |(yearMade double, makeName string, modelName string, priceTag decimal, | comments string, grp string) |USING csv - |OPTIONS (path "${testFile(carsTsvFile)}", header "true", delimiter "\t") + |OPTIONS ( + | path "${testFile(carsTsvFile)}", + | header "true", checkHeader "false", + | delimiter "\t" + |) """.stripMargin.replaceAll("\n", " ")) assert( @@ -275,7 +279,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { test("test for blank column names on read and select columns") { val cars = spark.read .format("csv") - .options(Map("header" -> "true", "inferSchema" -> "true")) + .options(Map("header" -> "true", "checkHeader" -> "false", "inferSchema" -> "true")) .load(testFile(carsBlankColName)) assert(cars.select("customer").collect().size == 2) @@ -348,7 +352,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { spark.sql( s""" |CREATE TEMPORARY VIEW carsTable - |(yearMade double, makeName string, modelName string, comments string, blank string) + |(year double, make string, model string, comment string, blank string) |USING csv |OPTIONS (path "${testFile(carsFile)}", header "true") """.stripMargin.replaceAll("\n", " ")) @@ -356,7 +360,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { val cars = spark.table("carsTable") verifyCars(cars, withHeader = true, checkHeader = false, checkValues = false) assert( - cars.schema.fieldNames === Array("yearMade", "makeName", "modelName", "comments", "blank")) + cars.schema.fieldNames === Array("year", "make", "model", "comment", "blank")) } } From 9440d8a5c097a1d8e111b397fbda9e54751b7a84 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 21 Mar 2018 12:36:21 +0100 Subject: [PATCH 07/48] Adding comment for the checkHeader option --- .../spark/sql/execution/datasources/csv/CSVOptions.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 1f7986fb59864..2030c05b9ff86 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -150,6 +150,10 @@ class CSVOptions( val isCommentSet = this.comment != '\u0000' + /** + * The option enables checks of headers in csv files. In particular, column names + * are matched to field names of provided schema. + */ val checkHeader = getBool("checkHeader", true) def asWriterSettings: CsvWriterSettings = { From 9f91ce73c5c313a9c51067a81e395e9385016ec5 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 21 Mar 2018 12:42:48 +0100 Subject: [PATCH 08/48] Added comments --- .../spark/sql/execution/datasources/csv/CSVDataSource.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 51255efc86470..3f9cbc0ed0838 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -141,7 +141,7 @@ object TextInputCSVDataSource extends CSVDataSource { val hasHeader = parser.options.headerFlag && file.start == 0 if (hasHeader) { - // Checking header field names are matched to fields in schema. + // Checking that column names in the header are matched to field names of the schema. // The header is removed from lines. // Note: if there are only comments in the first block, the header would probably // be not extracted. @@ -234,6 +234,7 @@ object MultiLineCSVDataSource extends CSVDataSource { inputPaths: Seq[FileStatus], parsedOptions: CSVOptions): StructType = { val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions) + // The header is not checked because there is no schema against with it could be check val checkHeader = (_: Array[String]) => () csv.flatMap { lines => val path = new Path(lines.getPath()) From 0878f7aad3c074e63ac3ab1d6e471ce8b988f278 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 21 Mar 2018 13:09:20 +0100 Subject: [PATCH 09/48] Adding a space between column names --- .../spark/sql/execution/datasources/csv/CSVDataSource.scala | 2 +- .../spark/sql/execution/datasources/csv/UnivocityParser.scala | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 3f9cbc0ed0838..dc1306716fd87 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -142,7 +142,7 @@ object TextInputCSVDataSource extends CSVDataSource { val hasHeader = parser.options.headerFlag && file.start == 0 if (hasHeader) { // Checking that column names in the header are matched to field names of the schema. - // The header is removed from lines. + // The header will be removed from lines. // Note: if there are only comments in the first block, the header would probably // be not extracted. val checkHeader = UnivocityParser.checkHeader(parser, dataSchema, _: String) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 89815bbd56518..1e93ac5b6ba46 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -324,8 +324,8 @@ private[csv] object UnivocityParser { if (!isMatched) { throw new IllegalArgumentException( s"""|Fields in the header of csv file are not matched to field names of the schema: - | Header: ${columnNames.mkString(",")} - | Schema: ${fieldNames.mkString(",")}""".stripMargin + | Header: ${columnNames.mkString(", ")} + | Schema: ${fieldNames.mkString(", ")}""".stripMargin ) } } From a341dd79c976df59fc8bffb272449973a09b86fe Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 21 Mar 2018 16:15:14 +0100 Subject: [PATCH 10/48] Fix a test: checking name duplication in schemas --- .../org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index 14b1feb2adc20..39c05a88fa132 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -764,7 +764,8 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be // If `inferSchema` is true, a CSV format is duplicate-safe (See SPARK-16896) var testDir = Utils.createTempDir(src.getAbsolutePath) Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text(testDir.getAbsolutePath) - val df = spark.read.format("csv").option("inferSchema", true).option("header", true) + val df = spark.read.format("csv").option("inferSchema", true) + .option("header", true).option("checkHeader", "false") .load(testDir.getAbsolutePath) checkAnswer(df, Row(1, 1)) checkReadPartitionColumnDuplication("csv", c0, c1, src) From 98c27eaa80cf3fae11092d78f22122688e4041a4 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 23 Mar 2018 22:04:57 +0100 Subject: [PATCH 11/48] Fixing the test and adding ticket number to test's title --- .../spark/sql/execution/datasources/csv/CSVDataSource.scala | 3 +-- .../spark/sql/execution/datasources/csv/UnivocityParser.scala | 1 + .../apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index dc1306716fd87..3e1ce82f3aab4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -149,8 +149,7 @@ object TextInputCSVDataSource extends CSVDataSource { CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_)) } - UnivocityParser.parseIterator( - conf, lines, shouldDropHeader = hasHeader, parser, schema, file.filePath) + UnivocityParser.parseIterator(lines, parser, schema) } override def infer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 1e93ac5b6ba46..74df38dcc8d5a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -307,6 +307,7 @@ private[csv] object UnivocityParser { parser.options.parseMode, schema, parser.options.columnNameOfCorruptRecord) + filteredLines.flatMap(safeParser.parse) } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 1be8e79305d35..53b9b4b4b830a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1285,7 +1285,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } def checkHeader(multiLine: String): Unit = { - test(s"Check column names during schema validation - multiLine = $multiLine") { + test(s"SPARK-23786: Checking column names against schema ($multiLine)") { withTempPath { path => import collection.JavaConverters._ val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) From 811df6fa7b17ff12bdd70318cf330a0f54815397 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 23 Mar 2018 22:10:20 +0100 Subject: [PATCH 12/48] Refactoring - removing unneeded parameter --- .../spark/sql/execution/datasources/csv/CSVDataSource.scala | 1 - .../spark/sql/execution/datasources/csv/CSVFileFormat.scala | 3 +-- .../spark/sql/execution/datasources/csv/UnivocityParser.scala | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 3e1ce82f3aab4..d2996a27d06bd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -223,7 +223,6 @@ object MultiLineCSVDataSource extends CSVDataSource { parser.options.headerFlag, parser, schema, - file.filePath, checkHeader = UnivocityParser.checkHeaderColumnNames(parser, dataSchema, _) ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 4888d7632ab9c..37ed8d5e017d8 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -129,8 +129,7 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), parsedOptions) - CSVDataSource(parsedOptions) - .readFile(conf, file, parser, requiredSchema, dataSchema) + CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 74df38dcc8d5a..beef78946e73f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -250,7 +250,6 @@ private[csv] object UnivocityParser { shouldDropHeader: Boolean, parser: UnivocityParser, schema: StructType, - filePath: String, checkHeader: Array[String] => Unit): Iterator[InternalRow] = { val tokenizer = parser.tokenizer val safeParser = new FailureSafeParser[Array[String]]( From 691cfbcf1c6c926bb0ed666db59af6070b6a1372 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 25 Mar 2018 15:01:44 +0200 Subject: [PATCH 13/48] Output filename in the exception --- .../datasources/csv/CSVDataSource.scala | 17 ++++++++++++++--- .../datasources/csv/UnivocityParser.scala | 15 +++++++++++---- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index d2996a27d06bd..75af28d06dada 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -145,7 +145,12 @@ object TextInputCSVDataSource extends CSVDataSource { // The header will be removed from lines. // Note: if there are only comments in the first block, the header would probably // be not extracted. - val checkHeader = UnivocityParser.checkHeader(parser, dataSchema, _: String) + val checkHeader = UnivocityParser.checkHeader( + parser, + dataSchema, + _: String, + file.filePath + ) CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_)) } @@ -218,13 +223,19 @@ object MultiLineCSVDataSource extends CSVDataSource { parser: UnivocityParser, schema: StructType, dataSchema: StructType): Iterator[InternalRow] = { + val checkHeader = UnivocityParser.checkHeaderColumnNames( + parser, + dataSchema, + _: Array[String], + file.filePath + ) UnivocityParser.parseStream( CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), parser.options.headerFlag, parser, schema, - checkHeader = UnivocityParser.checkHeaderColumnNames(parser, dataSchema, _) - ) + file.filePath, + checkHeader) } override def infer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index beef78946e73f..a10548f2dfd2f 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -313,7 +313,8 @@ private[csv] object UnivocityParser { def checkHeaderColumnNames( parser: UnivocityParser, schema: StructType, - columnNames: Array[String] + columnNames: Array[String], + fileName: String ): Unit = { if (parser.options.checkHeader && columnNames != null) { val fieldNames = schema.map(_.name) @@ -325,14 +326,20 @@ private[csv] object UnivocityParser { throw new IllegalArgumentException( s"""|Fields in the header of csv file are not matched to field names of the schema: | Header: ${columnNames.mkString(", ")} - | Schema: ${fieldNames.mkString(", ")}""".stripMargin + | Schema: ${fieldNames.mkString(", ")} + |CSV file: $fileName""".stripMargin ) } } } - def checkHeader(parser: UnivocityParser, schema: StructType, header: String): Unit = { + def checkHeader( + parser: UnivocityParser, + schema: StructType, + header: String, + fileName: String + ): Unit = { lazy val columnNames = parser.tokenizer.parseLine(header) - checkHeaderColumnNames(parser, schema, columnNames) + checkHeaderColumnNames(parser, schema, columnNames, fileName) } } From efb01051a59c948e943d211abadff08d3dac6acf Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 25 Mar 2018 16:53:37 +0200 Subject: [PATCH 14/48] PySpark: adding a test and checkHeader parameter --- python/pyspark/sql/readwriter.py | 8 ++++++-- python/pyspark/sql/tests.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index e5288636c596e..9845ce8d45512 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -335,7 +335,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, - columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None): + columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, + checkHeader=None): """Loads a CSV file and returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if @@ -360,6 +361,9 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non character. By default (None), it is disabled. :param header: uses the first line as names of columns. If None is set, it uses the default value, ``false``. + :param checkHeader: compares column names in the header with field names in the schema + and outputs an error if names are not matched. + If None is set, it uses the default value, ``true``. :param inferSchema: infers the input schema automatically from data. It requires one extra pass over the data. If None is set, it uses the default value, ``false``. :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from @@ -436,7 +440,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non maxCharsPerColumn=maxCharsPerColumn, maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, - charToEscapeQuoteEscaping=charToEscapeQuoteEscaping) + charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, checkHeader=checkHeader) if isinstance(path, basestring): path = [path] if type(path) == list: diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 967cc83166f3f..5bf1759690fe7 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2974,6 +2974,21 @@ def test_create_dateframe_from_pandas_with_dst(self): os.environ['TZ'] = orig_env_tz time.tzset() + def test_checking_csv_header(self): + tmpPath = tempfile.mkdtemp() + shutil.rmtree(tmpPath) + self.spark.createDataFrame([[1, 1000],[2000, 2]]).\ + toDF('f1', 'f2').write.option("header", "true").csv(tmpPath) + schema = StructType([ + StructField('f2', IntegerType(), nullable=True), + StructField('f1', IntegerType(), nullable=True)]) + df = self.spark.read.option('header', 'true').schema(schema).csv(tmpPath) + self.assertRaisesRegexp( + Exception, + "Fields in the header of csv file are not matched to field names of the schema", + lambda: df.collect()) + shutil.rmtree(tmpPath) + class HiveSparkSubmitTests(SparkSubmitTests): From c9f5e1491e82455abb1baecec86d4ad91dd6fabf Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 25 Mar 2018 17:27:52 +0200 Subject: [PATCH 15/48] Removing unneeded parameter - fileName --- .../spark/sql/execution/datasources/csv/CSVDataSource.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 75af28d06dada..ae08c97975fb7 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -234,7 +234,6 @@ object MultiLineCSVDataSource extends CSVDataSource { parser.options.headerFlag, parser, schema, - file.filePath, checkHeader) } From e1958388678534a424931cfea08c63ee2c3df102 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 25 Mar 2018 17:28:50 +0200 Subject: [PATCH 16/48] Fix for pycodestyle checks --- python/pyspark/sql/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 5bf1759690fe7..13804078574b9 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2977,7 +2977,7 @@ def test_create_dateframe_from_pandas_with_dst(self): def test_checking_csv_header(self): tmpPath = tempfile.mkdtemp() shutil.rmtree(tmpPath) - self.spark.createDataFrame([[1, 1000],[2000, 2]]).\ + self.spark.createDataFrame([[1, 1000], [2000, 2]]).\ toDF('f1', 'f2').write.option("header", "true").csv(tmpPath) schema = StructType([ StructField('f2', IntegerType(), nullable=True), From d6d370d98f3f12c9c53fd784c0b6e8f9d3d28f54 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 25 Mar 2018 17:32:17 +0200 Subject: [PATCH 17/48] Adding description of the checkHeader option --- .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 1a5e47508c070..4f04bd7ddc0c5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -524,6 +524,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { *
  • `comment` (default empty string): sets a single character used for skipping lines * beginning with this character. By default, it is disabled.
  • *
  • `header` (default `false`): uses the first line as names of columns.
  • + *
  • `checkHeader` (default `true`): compares column names in the header with field names + * in the schema and outputs an error if names are not matched.
  • *
  • `inferSchema` (default `false`): infers the input schema automatically from data. It * requires one extra pass over the data.
  • *
  • `ignoreLeadingWhiteSpace` (default `false`): a flag indicating whether or not leading From acd6d2e1c381936f8982e70aac6c85d9e85de63e Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 26 Mar 2018 22:44:56 +0200 Subject: [PATCH 18/48] Improving error messages and handling the case when header size is not equal to schema size --- .../datasources/csv/UnivocityParser.scala | 38 ++++++++++++++----- .../execution/datasources/csv/CSVSuite.scala | 34 ++++++++++++++++- 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index a10548f2dfd2f..7b43e9a0533ff 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -317,19 +317,37 @@ private[csv] object UnivocityParser { fileName: String ): Unit = { if (parser.options.checkHeader && columnNames != null) { - val fieldNames = schema.map(_.name) - val isMatched = fieldNames.zip(columnNames).forall { pair => - val (nameInSchema, nameInHeader) = pair - nameInSchema == nameInHeader - } - if (!isMatched) { - throw new IllegalArgumentException( - s"""|Fields in the header of csv file are not matched to field names of the schema: - | Header: ${columnNames.mkString(", ")} - | Schema: ${fieldNames.mkString(", ")} + val fieldNames = schema.map(_.name).toIndexedSeq + val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) + var error: Option[String] = None + + if (headerLen == schemaSize) { + var i = 0 + while (error.isEmpty && i < headerLen) { + val nameInSchema = fieldNames(i).toLowerCase + val nameInHeader = columnNames(i).toLowerCase + if (nameInHeader != nameInSchema) { + error = Some( + s"""|CSV file header does not contain the expected fields. + | Header: ${columnNames.mkString(", ")} + | Schema: ${fieldNames.mkString(", ")} + |Expected: $nameInSchema but found: $nameInHeader + |CSV file: $fileName""".stripMargin + ) + } + i += 1 + } + } else { + error = Some( + s"""|Number of column in CSV header is not equal to number of fields in the schema: + | Header length: $headerLen, schema size: $schemaSize |CSV file: $fileName""".stripMargin ) } + + error.headOption.foreach { msg => + throw new IllegalArgumentException(msg) + } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 53b9b4b4b830a..e28ba8089c6a9 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1302,7 +1302,39 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .collect() } assert(exception.getMessage.contains( - "Fields in the header of csv file are not matched to field names of the schema" + "CSV file header does not contain the expected fields" + )) + + val shortSchema = new StructType().add("f1", DoubleType) + val exceptionForShortSchema = intercept[SparkException] { + spark.read + .schema(shortSchema) + .option("multiLine", multiLine) + .option("header", "true") + .option("checkHeader", "true") + .csv(path.getCanonicalPath) + .collect() + } + assert(exceptionForShortSchema.getMessage.contains( + "Number of column in CSV header is not equal to number of fields in the schema" + )) + + val longSchema = new StructType() + .add("f1", DoubleType) + .add("f2", DoubleType) + .add("f3", DoubleType) + + val exceptionForLongSchema = intercept[SparkException] { + spark.read + .schema(longSchema) + .option("multiLine", multiLine) + .option("header", "true") + .option("checkHeader", "true") + .csv(path.getCanonicalPath) + .collect() + } + assert(exceptionForLongSchema.getMessage.contains( + "Header length: 2, schema size: 3" )) } } From 13892fddc7007c95ebfc3d2fcb789401fc78426f Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 26 Mar 2018 22:55:09 +0200 Subject: [PATCH 19/48] Refactoring: check header by calling an uniVocity method --- .../sql/execution/datasources/csv/CSVDataSource.scala | 10 +++------- .../execution/datasources/csv/UnivocityParser.scala | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index ae08c97975fb7..3bf6fc41d0a84 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -145,13 +145,9 @@ object TextInputCSVDataSource extends CSVDataSource { // The header will be removed from lines. // Note: if there are only comments in the first block, the header would probably // be not extracted. - val checkHeader = UnivocityParser.checkHeader( - parser, - dataSchema, - _: String, - file.filePath - ) - CSVUtils.extractHeader(lines, parser.options).foreach(checkHeader(_)) + CSVUtils.extractHeader(lines, parser.options).foreach { header => + UnivocityParser.checkHeader(header, parser, dataSchema, file.filePath) + } } UnivocityParser.parseIterator(lines, parser, schema) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 7b43e9a0533ff..237be35a277ac 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -352,9 +352,9 @@ private[csv] object UnivocityParser { } def checkHeader( + header: String, parser: UnivocityParser, schema: StructType, - header: String, fileName: String ): Unit = { lazy val columnNames = parser.tokenizer.parseLine(header) From 476b5176a489a06eb892f06111fb5a3e7f474324 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 26 Mar 2018 23:40:30 +0200 Subject: [PATCH 20/48] Refactoring: convert val to def --- .../execution/datasources/csv/CSVDataSource.scala | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 3bf6fc41d0a84..6f1610ad85b21 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -219,12 +219,10 @@ object MultiLineCSVDataSource extends CSVDataSource { parser: UnivocityParser, schema: StructType, dataSchema: StructType): Iterator[InternalRow] = { - val checkHeader = UnivocityParser.checkHeaderColumnNames( - parser, - dataSchema, - _: Array[String], - file.filePath - ) + def checkHeader(header: Array[String]): Unit = { + UnivocityParser.checkHeaderColumnNames(parser, dataSchema, header, file.filePath) + } + UnivocityParser.parseStream( CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), parser.options.headerFlag, @@ -239,7 +237,8 @@ object MultiLineCSVDataSource extends CSVDataSource { parsedOptions: CSVOptions): StructType = { val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions) // The header is not checked because there is no schema against with it could be check - val checkHeader = (_: Array[String]) => () + def checkHeader(header: Array[String]): Unit = () + csv.flatMap { lines => val path = new Path(lines.getPath()) UnivocityParser.tokenizeStream( From f8167e45d3c72ca47f67f5567086f0085099f90f Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 26 Mar 2018 23:43:16 +0200 Subject: [PATCH 21/48] Parse header only if the checkHeader option is true --- .../sql/execution/datasources/csv/UnivocityParser.scala | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 237be35a277ac..4f823145d395c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -357,7 +357,10 @@ private[csv] object UnivocityParser { schema: StructType, fileName: String ): Unit = { - lazy val columnNames = parser.tokenizer.parseLine(header) - checkHeaderColumnNames(parser, schema, columnNames, fileName) + if (parser.options.checkHeader) { + val columnNames = parser.tokenizer.parseLine(header) + + checkHeaderColumnNames(parser, schema, columnNames, fileName) + } } } From d068f6c44801ac8c89ef38f41692ac7c5a0d909e Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 26 Mar 2018 23:57:58 +0200 Subject: [PATCH 22/48] Moving header checks to CSVDataSource --- .../datasources/csv/CSVDataSource.scala | 62 ++++++++++++++++++- .../datasources/csv/UnivocityParser.scala | 56 +---------------- 2 files changed, 61 insertions(+), 57 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 6f1610ad85b21..214585afb8d9e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -120,6 +120,61 @@ object CSVDataSource { TextInputCSVDataSource } } + + def checkHeaderColumnNames( + schema: StructType, + columnNames: Array[String], + fileName: String, + checkHeaderFlag: Boolean + ): Unit = { + if (checkHeaderFlag && columnNames != null) { + val fieldNames = schema.map(_.name).toIndexedSeq + val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) + var error: Option[String] = None + + if (headerLen == schemaSize) { + var i = 0 + while (error.isEmpty && i < headerLen) { + val nameInSchema = fieldNames(i).toLowerCase + val nameInHeader = columnNames(i).toLowerCase + if (nameInHeader != nameInSchema) { + error = Some( + s"""|CSV file header does not contain the expected fields. + | Header: ${columnNames.mkString(", ")} + | Schema: ${fieldNames.mkString(", ")} + |Expected: $nameInSchema but found: $nameInHeader + |CSV file: $fileName""".stripMargin + ) + } + i += 1 + } + } else { + error = Some( + s"""|Number of column in CSV header is not equal to number of fields in the schema: + | Header length: $headerLen, schema size: $schemaSize + |CSV file: $fileName""".stripMargin + ) + } + + error.headOption.foreach { msg => + throw new IllegalArgumentException(msg) + } + } + } + + def checkHeader( + header: String, + parser: CsvParser, + schema: StructType, + fileName: String, + checkHeaderFlag: Boolean + ): Unit = { + if (checkHeaderFlag) { + val columnNames = parser.parseLine(header) + + checkHeaderColumnNames(schema, columnNames, fileName, checkHeaderFlag) + } + } } object TextInputCSVDataSource extends CSVDataSource { @@ -146,7 +201,8 @@ object TextInputCSVDataSource extends CSVDataSource { // Note: if there are only comments in the first block, the header would probably // be not extracted. CSVUtils.extractHeader(lines, parser.options).foreach { header => - UnivocityParser.checkHeader(header, parser, dataSchema, file.filePath) + CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, + file.filePath, parser.options.checkHeader) } } @@ -220,7 +276,9 @@ object MultiLineCSVDataSource extends CSVDataSource { schema: StructType, dataSchema: StructType): Iterator[InternalRow] = { def checkHeader(header: Array[String]): Unit = { - UnivocityParser.checkHeaderColumnNames(parser, dataSchema, header, file.filePath) + CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath, + parser.options.checkHeader + ) } UnivocityParser.parseStream( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 4f823145d395c..d5ae6874d4dda 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -47,7 +47,7 @@ class UnivocityParser( // A `ValueConverter` is responsible for converting the given value to a desired type. private type ValueConverter = String => Any - private val tokenizer = new CsvParser(options.asParserSettings) + val tokenizer = new CsvParser(options.asParserSettings) private val row = new GenericInternalRow(requiredSchema.length) @@ -309,58 +309,4 @@ private[csv] object UnivocityParser { filteredLines.flatMap(safeParser.parse) } - - def checkHeaderColumnNames( - parser: UnivocityParser, - schema: StructType, - columnNames: Array[String], - fileName: String - ): Unit = { - if (parser.options.checkHeader && columnNames != null) { - val fieldNames = schema.map(_.name).toIndexedSeq - val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) - var error: Option[String] = None - - if (headerLen == schemaSize) { - var i = 0 - while (error.isEmpty && i < headerLen) { - val nameInSchema = fieldNames(i).toLowerCase - val nameInHeader = columnNames(i).toLowerCase - if (nameInHeader != nameInSchema) { - error = Some( - s"""|CSV file header does not contain the expected fields. - | Header: ${columnNames.mkString(", ")} - | Schema: ${fieldNames.mkString(", ")} - |Expected: $nameInSchema but found: $nameInHeader - |CSV file: $fileName""".stripMargin - ) - } - i += 1 - } - } else { - error = Some( - s"""|Number of column in CSV header is not equal to number of fields in the schema: - | Header length: $headerLen, schema size: $schemaSize - |CSV file: $fileName""".stripMargin - ) - } - - error.headOption.foreach { msg => - throw new IllegalArgumentException(msg) - } - } - } - - def checkHeader( - header: String, - parser: UnivocityParser, - schema: StructType, - fileName: String - ): Unit = { - if (parser.options.checkHeader) { - val columnNames = parser.tokenizer.parseLine(header) - - checkHeaderColumnNames(parser, schema, columnNames, fileName) - } - } } From 08cfcf43f4564195bdd83218d2183a2dae4d4f2c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 27 Mar 2018 00:04:55 +0200 Subject: [PATCH 23/48] Making uniVocity wrapper unaware of header --- .../datasources/csv/CSVDataSource.scala | 2 +- .../datasources/csv/UnivocityParser.scala | 23 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 214585afb8d9e..3c906e46199e5 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -301,7 +301,7 @@ object MultiLineCSVDataSource extends CSVDataSource { val path = new Path(lines.getPath()) UnivocityParser.tokenizeStream( CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path), - shouldDropHeader = false, + dropFirstRecord = false, checkHeader, new CsvParser(parsedOptions.asParserSettings)) }.take(1).headOption match { diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index d5ae6874d4dda..ef0d5688f8464 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -236,10 +236,10 @@ private[csv] object UnivocityParser { */ def tokenizeStream( inputStream: InputStream, - shouldDropHeader: Boolean, - checkHeader: Array[String] => Unit, + dropFirstRecord: Boolean, + checkFirstRecord: Array[String] => Unit, tokenizer: CsvParser): Iterator[Array[String]] = { - convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader)(tokens => tokens) + convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord)(tokens => tokens) } /** @@ -247,32 +247,33 @@ private[csv] object UnivocityParser { */ def parseStream( inputStream: InputStream, - shouldDropHeader: Boolean, + dropFirstRecord: Boolean, parser: UnivocityParser, schema: StructType, - checkHeader: Array[String] => Unit): Iterator[InternalRow] = { + filePath: String, + checkFirstRecord: Array[String] => Unit): Iterator[InternalRow] = { val tokenizer = parser.tokenizer val safeParser = new FailureSafeParser[Array[String]]( input => Seq(parser.convert(input)), parser.options.parseMode, schema, parser.options.columnNameOfCorruptRecord) - convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens => + convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord) { tokens => safeParser.parse(tokens) }.flatten } private def convertStream[T]( inputStream: InputStream, - shouldDropHeader: Boolean, + dropFirstRecord: Boolean, tokenizer: CsvParser, - checkHeader: Array[String] => Unit + checkFirstRecord: Array[String] => Unit )(convert: Array[String] => T) = new Iterator[T] { tokenizer.beginParsing(inputStream) private var nextRecord = { - if (shouldDropHeader) { - val header = tokenizer.parseNext() - checkHeader(header) + if (dropFirstRecord) { + val firstRecord = tokenizer.parseNext() + checkFirstRecord(firstRecord) } tokenizer.parseNext() } From f6a1694608204f0923e2dec1c79b97e5fdda0154 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Tue, 27 Mar 2018 15:44:23 +0200 Subject: [PATCH 24/48] Fix the test: error mesage was changed --- python/pyspark/sql/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 13804078574b9..ace5eafa8820c 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2985,7 +2985,7 @@ def test_checking_csv_header(self): df = self.spark.read.option('header', 'true').schema(schema).csv(tmpPath) self.assertRaisesRegexp( Exception, - "Fields in the header of csv file are not matched to field names of the schema", + "CSV file header does not contain the expected fields", lambda: df.collect()) shutil.rmtree(tmpPath) From adbedf39194f72a873db10a42ac826cf0568513d Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 31 Mar 2018 22:48:37 +0200 Subject: [PATCH 25/48] Revert CSV tests as it was before the option was introduced --- .../datasources/csv/CSVDataSource.scala | 6 +++--- .../execution/datasources/csv/CSVOptions.scala | 6 +++--- .../execution/datasources/csv/CSVSuite.scala | 18 +++++++----------- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 3c906e46199e5..ce508406749f4 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -201,8 +201,8 @@ object TextInputCSVDataSource extends CSVDataSource { // Note: if there are only comments in the first block, the header would probably // be not extracted. CSVUtils.extractHeader(lines, parser.options).foreach { header => - CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, - file.filePath, parser.options.checkHeader) + CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath, + checkHeaderFlag = !parser.options.enforceSchema) } } @@ -277,7 +277,7 @@ object MultiLineCSVDataSource extends CSVDataSource { dataSchema: StructType): Iterator[InternalRow] = { def checkHeader(header: Array[String]): Unit = { CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath, - parser.options.checkHeader + checkHeaderFlag = !parser.options.enforceSchema ) } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 2030c05b9ff86..7927c117d7280 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -151,10 +151,10 @@ class CSVOptions( val isCommentSet = this.comment != '\u0000' /** - * The option enables checks of headers in csv files. In particular, column names - * are matched to field names of provided schema. + * Forcibly apply the specified or inferred schema to datasource files. + * If the option is enabled, headers of CSV files will be ignored. */ - val checkHeader = getBool("checkHeader", true) + val enforceSchema = getBool("enforceSchema", true) def asWriterSettings: CsvWriterSettings = { val writerSettings = new CsvWriterSettings() diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index e28ba8089c6a9..cdbe0f724e8ae 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -252,11 +252,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { |(yearMade double, makeName string, modelName string, priceTag decimal, | comments string, grp string) |USING csv - |OPTIONS ( - | path "${testFile(carsTsvFile)}", - | header "true", checkHeader "false", - | delimiter "\t" - |) + |OPTIONS (path "${testFile(carsTsvFile)}", header "true", delimiter "\t") """.stripMargin.replaceAll("\n", " ")) assert( @@ -279,7 +275,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { test("test for blank column names on read and select columns") { val cars = spark.read .format("csv") - .options(Map("header" -> "true", "checkHeader" -> "false", "inferSchema" -> "true")) + .options(Map("header" -> "true", "inferSchema" -> "true")) .load(testFile(carsBlankColName)) assert(cars.select("customer").collect().size == 2) @@ -352,7 +348,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { spark.sql( s""" |CREATE TEMPORARY VIEW carsTable - |(year double, make string, model string, comment string, blank string) + |(yearMade double, makeName string, modelName string, comments string, blank string) |USING csv |OPTIONS (path "${testFile(carsFile)}", header "true") """.stripMargin.replaceAll("\n", " ")) @@ -360,7 +356,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { val cars = spark.table("carsTable") verifyCars(cars, withHeader = true, checkHeader = false, checkValues = false) assert( - cars.schema.fieldNames === Array("year", "make", "model", "comment", "blank")) + cars.schema.fieldNames === Array("yearMade", "makeName", "modelName", "comments", "blank")) } } @@ -1297,7 +1293,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .schema(ischema) .option("multiLine", multiLine) .option("header", "true") - .option("checkHeader", "true") + .option("enforceSchema", "false") .csv(path.getCanonicalPath) .collect() } @@ -1311,7 +1307,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .schema(shortSchema) .option("multiLine", multiLine) .option("header", "true") - .option("checkHeader", "true") + .option("enforceSchema", "false") .csv(path.getCanonicalPath) .collect() } @@ -1329,7 +1325,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { .schema(longSchema) .option("multiLine", multiLine) .option("header", "true") - .option("checkHeader", "true") + .option("enforceSchema", "false") .csv(path.getCanonicalPath) .collect() } From 0904daf52b1d94467f75bc6703d3d81747208e77 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sat, 31 Mar 2018 23:24:00 +0200 Subject: [PATCH 26/48] Renaming checkHeader to enforceSchema --- python/pyspark/sql/readwriter.py | 10 +++++----- python/pyspark/sql/tests.py | 3 ++- .../scala/org/apache/spark/sql/DataFrameReader.scala | 4 ++-- .../spark/sql/test/DataFrameReaderWriterSuite.scala | 3 +-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 9845ce8d45512..6f437b47053bc 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -336,7 +336,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, - checkHeader=None): + enforceSchema=None): """Loads a CSV file and returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if @@ -361,11 +361,11 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non character. By default (None), it is disabled. :param header: uses the first line as names of columns. If None is set, it uses the default value, ``false``. - :param checkHeader: compares column names in the header with field names in the schema - and outputs an error if names are not matched. - If None is set, it uses the default value, ``true``. :param inferSchema: infers the input schema automatically from data. It requires one extra pass over the data. If None is set, it uses the default value, ``false``. + :param enforceSchema: Forcibly apply the specified or inferred schema to CSV + files. If None is set, it uses the default value, ``true``. + In that case, CSV headers are ignored. :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it uses the default value, ``false``. @@ -440,7 +440,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non maxCharsPerColumn=maxCharsPerColumn, maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, - charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, checkHeader=checkHeader) + charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema) if isinstance(path, basestring): path = [path] if type(path) == list: diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index ace5eafa8820c..db51ac37096e8 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2982,7 +2982,8 @@ def test_checking_csv_header(self): schema = StructType([ StructField('f2', IntegerType(), nullable=True), StructField('f1', IntegerType(), nullable=True)]) - df = self.spark.read.option('header', 'true').schema(schema).csv(tmpPath) + df = self.spark.read.option('header', 'true').schema(schema).\ + csv(tmpPath, enforceSchema=False) self.assertRaisesRegexp( Exception, "CSV file header does not contain the expected fields", diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 4f04bd7ddc0c5..2668a1c756820 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -524,8 +524,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { *
  • `comment` (default empty string): sets a single character used for skipping lines * beginning with this character. By default, it is disabled.
  • *
  • `header` (default `false`): uses the first line as names of columns.
  • - *
  • `checkHeader` (default `true`): compares column names in the header with field names - * in the schema and outputs an error if names are not matched.
  • + *
  • `enforceSchema` (default `true`): Forcibly apply the specified or inferred schema to + * datasource files. If it is set, CSV headers are ignored.
  • *
  • `inferSchema` (default `false`): infers the input schema automatically from data. It * requires one extra pass over the data.
  • *
  • `ignoreLeadingWhiteSpace` (default `false`): a flag indicating whether or not leading diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala index 39c05a88fa132..14b1feb2adc20 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/DataFrameReaderWriterSuite.scala @@ -764,8 +764,7 @@ class DataFrameReaderWriterSuite extends QueryTest with SharedSQLContext with Be // If `inferSchema` is true, a CSV format is duplicate-safe (See SPARK-16896) var testDir = Utils.createTempDir(src.getAbsolutePath) Seq("a,a", "1,1").toDF().coalesce(1).write.mode("overwrite").text(testDir.getAbsolutePath) - val df = spark.read.format("csv").option("inferSchema", true) - .option("header", true).option("checkHeader", "false") + val df = spark.read.format("csv").option("inferSchema", true).option("header", true) .load(testDir.getAbsolutePath) checkAnswer(df, Row(1, 1)) checkReadPartitionColumnDuplication("csv", c0, c1, src) From 191b415e77bbc8fe662c19faae627e40787c5e23 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Sun, 1 Apr 2018 10:07:32 +0200 Subject: [PATCH 27/48] Pass required parameter --- .../spark/sql/execution/datasources/csv/CSVDataSource.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index ce508406749f4..98533a8977a3a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -283,10 +283,7 @@ object MultiLineCSVDataSource extends CSVDataSource { UnivocityParser.parseStream( CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), - parser.options.headerFlag, - parser, - schema, - checkHeader) + parser.options.headerFlag, parser, schema, file.filePath, checkHeader) } override def infer( From ab9c514c2567795ff07aa1e25fac16ca62e994b6 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Apr 2018 12:48:06 +0200 Subject: [PATCH 28/48] Addressing Xiao Li's review comments --- .../apache/spark/sql/DataFrameReader.scala | 7 +- .../datasources/csv/CSVDataSource.scala | 42 +++--- .../datasources/csv/UnivocityParser.scala | 8 +- .../execution/datasources/csv/CSVSuite.scala | 128 ++++++++++-------- 4 files changed, 98 insertions(+), 87 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 3d39d9d809499..6b7a9ab5b7b71 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -526,8 +526,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { *
  • `comment` (default empty string): sets a single character used for skipping lines * beginning with this character. By default, it is disabled.
  • *
  • `header` (default `false`): uses the first line as names of columns.
  • - *
  • `enforceSchema` (default `true`): Forcibly apply the specified or inferred schema to - * datasource files. If it is set, CSV headers are ignored.
  • + *
  • `enforceSchema` (default `true`): If it is set to `true`, the specified or inferred schema + * will be forcibly applied to datasource files and headers in CSV files will be ignored. + * If the option is set to `false`, the schema will be validated against headers in CSV files if + * the `header` option is set to `true`. The validation is performed in column ordering aware and + * case sensitive manner.
  • *
  • `inferSchema` (default `false`): infers the input schema automatically from data. It * requires one extra pass over the data.
  • *
  • `ignoreLeadingWhiteSpace` (default `false`): a flag indicating whether or not leading diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 98533a8977a3a..6f6f3a8d5bfec 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -27,13 +27,14 @@ import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.FileInputFormat - import org.apache.spark.TaskContext + import org.apache.spark.input.{PortableDataStream, StreamInputFormat} import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.execution.datasources.csv.CSVDataSource.checkHeaderColumnNames import org.apache.spark.sql.execution.datasources.text.TextFileFormat import org.apache.spark.sql.types.StructType @@ -50,9 +51,9 @@ abstract class CSVDataSource extends Serializable { conf: Configuration, file: PartitionedFile, parser: UnivocityParser, - schema: StructType, // Schema of projection - dataSchema: StructType // Schema of data in csv files - ): Iterator[InternalRow] + schema: StructType, + // Actual schema of data in the csv file + dataSchema: StructType): Iterator[InternalRow] /** * Infers the schema from `inputPaths` files. @@ -121,12 +122,8 @@ object CSVDataSource { } } - def checkHeaderColumnNames( - schema: StructType, - columnNames: Array[String], - fileName: String, - checkHeaderFlag: Boolean - ): Unit = { + def checkHeaderColumnNames(schema: StructType, columnNames: Array[String], fileName: String, + checkHeaderFlag: Boolean): Unit = { if (checkHeaderFlag && columnNames != null) { val fieldNames = schema.map(_.name).toIndexedSeq val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) @@ -161,20 +158,6 @@ object CSVDataSource { } } } - - def checkHeader( - header: String, - parser: CsvParser, - schema: StructType, - fileName: String, - checkHeaderFlag: Boolean - ): Unit = { - if (checkHeaderFlag) { - val columnNames = parser.parseLine(header) - - checkHeaderColumnNames(schema, columnNames, fileName, checkHeaderFlag) - } - } } object TextInputCSVDataSource extends CSVDataSource { @@ -201,7 +184,7 @@ object TextInputCSVDataSource extends CSVDataSource { // Note: if there are only comments in the first block, the header would probably // be not extracted. CSVUtils.extractHeader(lines, parser.options).foreach { header => - CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath, + checkHeader(header, parser.tokenizer, dataSchema, file.filePath, checkHeaderFlag = !parser.options.enforceSchema) } } @@ -264,6 +247,13 @@ object TextInputCSVDataSource extends CSVDataSource { sparkSession.createDataset(rdd)(Encoders.STRING) } } + + def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String, + checkHeaderFlag: Boolean): Unit = { + if (checkHeaderFlag) { + checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag) + } + } } object MultiLineCSVDataSource extends CSVDataSource { @@ -283,7 +273,7 @@ object MultiLineCSVDataSource extends CSVDataSource { UnivocityParser.parseStream( CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), - parser.options.headerFlag, parser, schema, file.filePath, checkHeader) + parser.options.headerFlag, parser, schema, checkHeader) } override def infer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index ef0d5688f8464..d0e25a4c3349c 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -250,7 +250,6 @@ private[csv] object UnivocityParser { dropFirstRecord: Boolean, parser: UnivocityParser, schema: StructType, - filePath: String, checkFirstRecord: Array[String] => Unit): Iterator[InternalRow] = { val tokenizer = parser.tokenizer val safeParser = new FailureSafeParser[Array[String]]( @@ -267,8 +266,8 @@ private[csv] object UnivocityParser { inputStream: InputStream, dropFirstRecord: Boolean, tokenizer: CsvParser, - checkFirstRecord: Array[String] => Unit - )(convert: Array[String] => T) = new Iterator[T] { + checkFirstRecord: Array[String] => Unit)( + convert: Array[String] => T) = new Iterator[T] { tokenizer.beginParsing(inputStream) private var nextRecord = { if (dropFirstRecord) { @@ -299,8 +298,7 @@ private[csv] object UnivocityParser { schema: StructType): Iterator[InternalRow] = { val options = parser.options - val filteredLines: Iterator[String] = - CSVUtils.filterCommentAndEmpty(lines, options) + val filteredLines: Iterator[String] = CSVUtils.filterCommentAndEmpty(lines, options) val safeParser = new FailureSafeParser[String]( input => Seq(parser.parse(input)), diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index cdbe0f724e8ae..e42774cb45942 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -30,7 +30,6 @@ import org.apache.hadoop.io.compress.GzipCodec import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, UDT} import org.apache.spark.sql.catalyst.util.DateTimeUtils -import org.apache.spark.sql.functions.{col, regexp_replace} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.{SharedSQLContext, SQLTestUtils} import org.apache.spark.sql.types._ @@ -1280,61 +1279,82 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { ) } - def checkHeader(multiLine: String): Unit = { - test(s"SPARK-23786: Checking column names against schema ($multiLine)") { - withTempPath { path => - import collection.JavaConverters._ - val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) - val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) - odf.write.option("header", "true").csv(path.getCanonicalPath) - val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) - val exception = intercept[SparkException] { - spark.read - .schema(ischema) - .option("multiLine", multiLine) - .option("header", "true") - .option("enforceSchema", "false") - .csv(path.getCanonicalPath) - .collect() - } - assert(exception.getMessage.contains( - "CSV file header does not contain the expected fields" - )) - - val shortSchema = new StructType().add("f1", DoubleType) - val exceptionForShortSchema = intercept[SparkException] { - spark.read - .schema(shortSchema) - .option("multiLine", multiLine) - .option("header", "true") - .option("enforceSchema", "false") - .csv(path.getCanonicalPath) - .collect() - } - assert(exceptionForShortSchema.getMessage.contains( - "Number of column in CSV header is not equal to number of fields in the schema" - )) - - val longSchema = new StructType() - .add("f1", DoubleType) - .add("f2", DoubleType) - .add("f3", DoubleType) - - val exceptionForLongSchema = intercept[SparkException] { - spark.read - .schema(longSchema) - .option("multiLine", multiLine) - .option("header", "true") - .option("enforceSchema", "false") - .csv(path.getCanonicalPath) - .collect() - } - assert(exceptionForLongSchema.getMessage.contains( - "Header length: 2, schema size: 3" - )) + def checkHeader(multiLine: Boolean): Unit = { + withTempPath { path => + import collection.JavaConverters._ + val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) + val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) + odf.write.option("header", true).csv(path.getCanonicalPath) + val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) + val exception = intercept[SparkException] { + spark.read + .schema(ischema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() + } + assert(exception.getMessage.contains( + "CSV file header does not contain the expected fields" + )) + + val shortSchema = new StructType().add("f1", DoubleType) + val exceptionForShortSchema = intercept[SparkException] { + spark.read + .schema(shortSchema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() } + assert(exceptionForShortSchema.getMessage.contains( + "Number of column in CSV header is not equal to number of fields in the schema" + )) + + val longSchema = new StructType() + .add("f1", DoubleType) + .add("f2", DoubleType) + .add("f3", DoubleType) + + val exceptionForLongSchema = intercept[SparkException] { + spark.read + .schema(longSchema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() + } + assert(exceptionForLongSchema.getMessage.contains( + "Header length: 2, schema size: 3" + )) } } - List("false", "true").foreach(checkHeader(_)) + test(s"SPARK-23786: Checking column names against schema in the multiline mode") { + checkHeader(multiLine = true) + } + + test(s"SPARK-23786: Checking column names against schema in the per-line mode") { + checkHeader(multiLine = false) + } + + test("SPARK-23786: CSV header must not be checked if it doesn't exist") { + withTempPath { path => + import collection.JavaConverters._ + val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) + val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) + odf.write.option("header", false).csv(path.getCanonicalPath) + val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) + val idf = spark.read + .schema(ischema) + .option("header", false) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + + checkAnswer(idf, odf) + } + } } From 0405863ca209b8bd9cf3950c3d4d122ac718928b Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Apr 2018 12:57:40 +0200 Subject: [PATCH 29/48] Making header validation case sensitive --- .../execution/datasources/csv/CSVDataSource.scala | 3 +-- .../sql/execution/datasources/csv/CSVSuite.scala | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 6f6f3a8d5bfec..e77117618cd55 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -132,8 +132,7 @@ object CSVDataSource { if (headerLen == schemaSize) { var i = 0 while (error.isEmpty && i < headerLen) { - val nameInSchema = fieldNames(i).toLowerCase - val nameInHeader = columnNames(i).toLowerCase + val (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) if (nameInHeader != nameInSchema) { error = Some( s"""|CSV file header does not contain the expected fields. diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index e42774cb45942..cbe5fb9165446 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1330,6 +1330,20 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { assert(exceptionForLongSchema.getMessage.contains( "Header length: 2, schema size: 3" )) + + val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType) + val caseSensitiveException = intercept[SparkException] { + spark.read + .schema(caseSensitiveSchema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() + } + assert(caseSensitiveException.getMessage.contains( + "CSV file header does not contain the expected fields" + )) } } From 714c66d730a7e0e49016734dd3d771dcd991911e Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Apr 2018 13:54:25 +0200 Subject: [PATCH 30/48] Describing enforceSchema in PySpark's csv method --- python/pyspark/sql/readwriter.py | 9 ++++++--- .../sql/execution/datasources/csv/CSVDataSource.scala | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 52c42449de4d9..bd2d2188eb890 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -365,9 +365,12 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non default value, ``false``. :param inferSchema: infers the input schema automatically from data. It requires one extra pass over the data. If None is set, it uses the default value, ``false``. - :param enforceSchema: Forcibly apply the specified or inferred schema to CSV - files. If None is set, it uses the default value, ``true``. - In that case, CSV headers are ignored. + :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be + forcibly applied to datasource files and headers in CSV files will be + ignored. If the option is set to ``false``, the schema will be + validated against headers in CSV files if the ``header`` option is set + to ``true``. The validation is performed in column ordering aware and + case sensitive manner. If None is set, ``true`` is used by default. :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it uses the default value, ``false``. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index e77117618cd55..444df7c1f8878 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -27,8 +27,8 @@ import org.apache.hadoop.io.{LongWritable, Text} import org.apache.hadoop.mapred.TextInputFormat import org.apache.hadoop.mapreduce.Job import org.apache.hadoop.mapreduce.lib.input.FileInputFormat -import org.apache.spark.TaskContext +import org.apache.spark.TaskContext import org.apache.spark.input.{PortableDataStream, StreamInputFormat} import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} From 78d9f66cc575c62fced967820d60b756e82f77fb Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Apr 2018 21:21:48 +0200 Subject: [PATCH 31/48] Respect to caseSensitive parameter --- .../datasources/csv/CSVDataSource.scala | 39 +++-- .../datasources/csv/CSVFileFormat.scala | 4 +- .../execution/datasources/csv/CSVSuite.scala | 145 ++++++++++-------- 3 files changed, 104 insertions(+), 84 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 444df7c1f8878..fc80a7006d1fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -53,7 +53,8 @@ abstract class CSVDataSource extends Serializable { parser: UnivocityParser, schema: StructType, // Actual schema of data in the csv file - dataSchema: StructType): Iterator[InternalRow] + dataSchema: StructType, + caseSensitive: Boolean): Iterator[InternalRow] /** * Infers the schema from `inputPaths` files. @@ -123,7 +124,7 @@ object CSVDataSource { } def checkHeaderColumnNames(schema: StructType, columnNames: Array[String], fileName: String, - checkHeaderFlag: Boolean): Unit = { + checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = { if (checkHeaderFlag && columnNames != null) { val fieldNames = schema.map(_.name).toIndexedSeq val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) @@ -132,7 +133,11 @@ object CSVDataSource { if (headerLen == schemaSize) { var i = 0 while (error.isEmpty && i < headerLen) { - val (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) + var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) + if (caseSensitive == false) { + nameInSchema = nameInSchema.toLowerCase + nameInHeader = nameInHeader.toLowerCase + } if (nameInHeader != nameInSchema) { error = Some( s"""|CSV file header does not contain the expected fields. @@ -162,12 +167,9 @@ object CSVDataSource { object TextInputCSVDataSource extends CSVDataSource { override val isSplitable: Boolean = true - override def readFile( - conf: Configuration, - file: PartitionedFile, - parser: UnivocityParser, - schema: StructType, - dataSchema: StructType): Iterator[InternalRow] = { + override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser, + schema: StructType, dataSchema: StructType, + caseSensitive: Boolean): Iterator[InternalRow] = { val lines = { val linesReader = new HadoopFileLinesReader(file, conf) Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => linesReader.close())) @@ -184,7 +186,7 @@ object TextInputCSVDataSource extends CSVDataSource { // be not extracted. CSVUtils.extractHeader(lines, parser.options).foreach { header => checkHeader(header, parser.tokenizer, dataSchema, file.filePath, - checkHeaderFlag = !parser.options.enforceSchema) + checkHeaderFlag = !parser.options.enforceSchema, caseSensitive) } } @@ -248,9 +250,10 @@ object TextInputCSVDataSource extends CSVDataSource { } def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String, - checkHeaderFlag: Boolean): Unit = { + checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = { if (checkHeaderFlag) { - checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag) + checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag, + caseSensitive) } } } @@ -258,16 +261,12 @@ object TextInputCSVDataSource extends CSVDataSource { object MultiLineCSVDataSource extends CSVDataSource { override val isSplitable: Boolean = false - override def readFile( - conf: Configuration, - file: PartitionedFile, - parser: UnivocityParser, - schema: StructType, - dataSchema: StructType): Iterator[InternalRow] = { + override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser, + schema: StructType, dataSchema: StructType, + caseSensitive: Boolean): Iterator[InternalRow] = { def checkHeader(header: Array[String]): Unit = { CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath, - checkHeaderFlag = !parser.options.enforceSchema - ) + checkHeaderFlag = !parser.options.enforceSchema, caseSensitive) } UnivocityParser.parseStream( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 37ed8d5e017d8..53231843c5145 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -122,6 +122,7 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { "df.filter($\"_corrupt_record\".isNotNull).count()." ) } + val caseSensitive = sparkSession.sessionState.conf.caseSensitiveAnalysis (file: PartitionedFile) => { val conf = broadcastedHadoopConf.value.value @@ -129,7 +130,8 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), parsedOptions) - CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema) + CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema, + caseSensitive) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index cbe5fb9165446..af981a1be04bb 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1280,70 +1280,72 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } def checkHeader(multiLine: Boolean): Unit = { - withTempPath { path => - import collection.JavaConverters._ - val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) - val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) - odf.write.option("header", true).csv(path.getCanonicalPath) - val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) - val exception = intercept[SparkException] { - spark.read - .schema(ischema) - .option("multiLine", multiLine) - .option("header", true) - .option("enforceSchema", false) - .csv(path.getCanonicalPath) - .collect() - } - assert(exception.getMessage.contains( - "CSV file header does not contain the expected fields" - )) - - val shortSchema = new StructType().add("f1", DoubleType) - val exceptionForShortSchema = intercept[SparkException] { - spark.read - .schema(shortSchema) - .option("multiLine", multiLine) - .option("header", true) - .option("enforceSchema", false) - .csv(path.getCanonicalPath) - .collect() - } - assert(exceptionForShortSchema.getMessage.contains( - "Number of column in CSV header is not equal to number of fields in the schema" - )) - - val longSchema = new StructType() - .add("f1", DoubleType) - .add("f2", DoubleType) - .add("f3", DoubleType) - - val exceptionForLongSchema = intercept[SparkException] { - spark.read - .schema(longSchema) - .option("multiLine", multiLine) - .option("header", true) - .option("enforceSchema", false) - .csv(path.getCanonicalPath) - .collect() - } - assert(exceptionForLongSchema.getMessage.contains( - "Header length: 2, schema size: 3" - )) - - val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType) - val caseSensitiveException = intercept[SparkException] { - spark.read - .schema(caseSensitiveSchema) - .option("multiLine", multiLine) - .option("header", true) - .option("enforceSchema", false) - .csv(path.getCanonicalPath) - .collect() + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { + withTempPath { path => + import collection.JavaConverters._ + val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) + val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) + odf.write.option("header", true).csv(path.getCanonicalPath) + val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) + val exception = intercept[SparkException] { + spark.read + .schema(ischema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() + } + assert(exception.getMessage.contains( + "CSV file header does not contain the expected fields" + )) + + val shortSchema = new StructType().add("f1", DoubleType) + val exceptionForShortSchema = intercept[SparkException] { + spark.read + .schema(shortSchema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() + } + assert(exceptionForShortSchema.getMessage.contains( + "Number of column in CSV header is not equal to number of fields in the schema" + )) + + val longSchema = new StructType() + .add("f1", DoubleType) + .add("f2", DoubleType) + .add("f3", DoubleType) + + val exceptionForLongSchema = intercept[SparkException] { + spark.read + .schema(longSchema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() + } + assert(exceptionForLongSchema.getMessage.contains( + "Header length: 2, schema size: 3" + )) + + val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType) + val caseSensitiveException = intercept[SparkException] { + spark.read + .schema(caseSensitiveSchema) + .option("multiLine", multiLine) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + .collect() + } + assert(caseSensitiveException.getMessage.contains( + "CSV file header does not contain the expected fields" + )) } - assert(caseSensitiveException.getMessage.contains( - "CSV file header does not contain the expected fields" - )) } } @@ -1371,4 +1373,21 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { checkAnswer(idf, odf) } } + + test("Ignore column name case if spark.sql.caseSensitive is false") { + withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { + withTempPath { path => + import collection.JavaConverters._ + val oschema = new StructType().add("A", StringType) + val odf = spark.createDataFrame(List(Row("0")).asJava, oschema) + odf.write.option("header", true).csv(path.getCanonicalPath) + val ischema = new StructType().add("a", StringType) + val idf = spark.read.schema(ischema) + .option("header", true) + .option("enforceSchema", false) + .csv(path.getCanonicalPath) + checkAnswer(idf, odf) + } + } + } } From b43a7c7ec50e03aaf4990e9bbb6989cdb2c076ef Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 13 Apr 2018 22:06:36 +0200 Subject: [PATCH 32/48] Check header on csv parsing from dataset of strings --- .../apache/spark/sql/DataFrameReader.scala | 7 +++++++ .../datasources/csv/CSVDataSource.scala | 20 +++++++++---------- .../execution/datasources/csv/CSVSuite.scala | 12 ++++++++++- 3 files changed, 28 insertions(+), 11 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 6b7a9ab5b7b71..bbc2e4d1344d1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -21,6 +21,8 @@ import java.util.{Locale, Properties} import scala.collection.JavaConverters._ +import com.univocity.parsers.csv.CsvParser + import org.apache.spark.Partition import org.apache.spark.annotation.InterfaceStability import org.apache.spark.api.java.JavaRDD @@ -486,6 +488,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine => + if (parsedOptions.enforceSchema == false) { + CSVDataSource.checkHeader(firstLine, new CsvParser(parsedOptions.asParserSettings), + actualSchema, csvDataset.getClass.getCanonicalName, checkHeaderFlag = true, + sparkSession.sessionState.conf.caseSensitiveAnalysis) + } filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions)) }.getOrElse(filteredLines.rdd) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index fc80a7006d1fc..73898fc23679e 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -143,7 +143,7 @@ object CSVDataSource { s"""|CSV file header does not contain the expected fields. | Header: ${columnNames.mkString(", ")} | Schema: ${fieldNames.mkString(", ")} - |Expected: $nameInSchema but found: $nameInHeader + |Expected: ${columnNames(i)} but found: ${fieldNames(i)} |CSV file: $fileName""".stripMargin ) } @@ -162,6 +162,14 @@ object CSVDataSource { } } } + + def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String, + checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = { + if (checkHeaderFlag) { + checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag, + caseSensitive) + } + } } object TextInputCSVDataSource extends CSVDataSource { @@ -185,7 +193,7 @@ object TextInputCSVDataSource extends CSVDataSource { // Note: if there are only comments in the first block, the header would probably // be not extracted. CSVUtils.extractHeader(lines, parser.options).foreach { header => - checkHeader(header, parser.tokenizer, dataSchema, file.filePath, + CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath, checkHeaderFlag = !parser.options.enforceSchema, caseSensitive) } } @@ -248,14 +256,6 @@ object TextInputCSVDataSource extends CSVDataSource { sparkSession.createDataset(rdd)(Encoders.STRING) } } - - def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String, - checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = { - if (checkHeaderFlag) { - checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag, - caseSensitive) - } - } } object MultiLineCSVDataSource extends CSVDataSource { diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index af981a1be04bb..b06480d7ab35a 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1374,7 +1374,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } - test("Ignore column name case if spark.sql.caseSensitive is false") { + test("SPARK-23786: Ignore column name case if spark.sql.caseSensitive is false") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { withTempPath { path => import collection.JavaConverters._ @@ -1390,4 +1390,14 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils { } } } + + test("SPARK-23786: check header on parsing of dataset of strings") { + val ds = Seq("columnA,columnB", "1.0,1000.0").toDS() + val ischema = new StructType().add("columnB", DoubleType).add("columnA", DoubleType) + val exception = intercept[IllegalArgumentException] { + spark.read.schema(ischema).option("header", true).option("enforceSchema", false).csv(ds) + } + + assert(exception.getMessage.contains("CSV file header does not contain the expected fields")) + } } From 9b2d403085f45b0d975cc3e7d5ac559aa81e0c64 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 18 Apr 2018 12:13:33 +0200 Subject: [PATCH 33/48] Make Scala style checker happy --- .../src/main/scala/org/apache/spark/sql/DataFrameReader.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index cd9829750aec8..56826ad12f4b6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -21,8 +21,8 @@ import java.util.{Locale, Properties} import scala.collection.JavaConverters._ -import com.univocity.parsers.csv.CsvParser import com.fasterxml.jackson.databind.ObjectMapper +import com.univocity.parsers.csv.CsvParser import org.apache.spark.Partition import org.apache.spark.annotation.InterfaceStability From 21f8b10dda4b0ef71ba69cc6147d1cf8614812f1 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Mon, 14 May 2018 15:06:02 +0200 Subject: [PATCH 34/48] Removing a space to make Scala style checker happy. --- .../apache/spark/sql/execution/datasources/csv/CSVSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 524c0c81f779d..7704526811fc1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1368,7 +1368,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te checkAnswer(computed, expected) } } - + def checkHeader(multiLine: Boolean): Unit = { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { withTempPath { path => From e3b4275d71d2230b9833d92435157a54cdc0b7e0 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Wed, 16 May 2018 16:37:56 +0200 Subject: [PATCH 35/48] Addressing review comments --- python/pyspark/sql/readwriter.py | 5 +- python/pyspark/sql/streaming.py | 12 +++- python/pyspark/sql/tests.py | 26 +++---- .../apache/spark/sql/DataFrameReader.scala | 6 +- .../datasources/csv/CSVDataSource.scala | 67 ++++++++++++------- .../datasources/csv/UnivocityParser.scala | 1 - .../execution/datasources/csv/CSVSuite.scala | 5 +- 7 files changed, 74 insertions(+), 48 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 8b3bcd90a6010..bd65ac386a8db 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -377,8 +377,9 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non forcibly applied to datasource files and headers in CSV files will be ignored. If the option is set to ``false``, the schema will be validated against headers in CSV files if the ``header`` option is set - to ``true``. The validation is performed in column ordering aware and - case sensitive manner. If None is set, ``true`` is used by default. + to ``true``. The validation is performed in column ordering aware + manner by taking into account ``spark.sql.caseSensitive``. + If None is set, ``true`` is used by default. :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it uses the default value, ``false``. diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 15f9407389864..30cc8d3baad96 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -564,7 +564,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None, negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None, maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, - columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None): + columnNameOfCorruptRecord=None, multiLine=None, charToEscapeQuoteEscaping=None, + enforceSchema=None): """Loads a CSV file stream and returns the result as a :class:`DataFrame`. This function will go through the input once to determine the input schema if @@ -592,6 +593,13 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non default value, ``false``. :param inferSchema: infers the input schema automatically from data. It requires one extra pass over the data. If None is set, it uses the default value, ``false``. + :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be + forcibly applied to datasource files and headers in CSV files will be + ignored. If the option is set to ``false``, the schema will be + validated against headers in CSV files if the ``header`` option is set + to ``true``. The validation is performed in column ordering aware + manner by taking into account ``spark.sql.caseSensitive``. + If None is set, ``true`` is used by default. :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it uses the default value, ``false``. @@ -664,7 +672,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non maxCharsPerColumn=maxCharsPerColumn, maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, multiLine=multiLine, - charToEscapeQuoteEscaping=charToEscapeQuoteEscaping) + charToEscapeQuoteEscaping=charToEscapeQuoteEscaping, enforceSchema=enforceSchema) if isinstance(path, basestring): return self._df(self._jreader.csv(path)) else: diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 031239a68b3eb..689b27f475ec0 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3043,18 +3043,20 @@ def test_csv_sampling_ratio(self): def test_checking_csv_header(self): tmpPath = tempfile.mkdtemp() shutil.rmtree(tmpPath) - self.spark.createDataFrame([[1, 1000], [2000, 2]]).\ - toDF('f1', 'f2').write.option("header", "true").csv(tmpPath) - schema = StructType([ - StructField('f2', IntegerType(), nullable=True), - StructField('f1', IntegerType(), nullable=True)]) - df = self.spark.read.option('header', 'true').schema(schema).\ - csv(tmpPath, enforceSchema=False) - self.assertRaisesRegexp( - Exception, - "CSV file header does not contain the expected fields", - lambda: df.collect()) - shutil.rmtree(tmpPath) + try: + self.spark.createDataFrame([[1, 1000], [2000, 2]])\ + .toDF('f1', 'f2').write.option("header", "true").csv(tmpPath) + schema = StructType([ + StructField('f2', IntegerType(), nullable=True), + StructField('f1', IntegerType(), nullable=True)]) + df = self.spark.read.option('header', 'true').schema(schema)\ + .csv(tmpPath, enforceSchema=False) + self.assertRaisesRegexp( + Exception, + "CSV file header does not contain the expected fields", + lambda: df.collect()) + finally: + shutil.rmtree(tmpPath) class HiveSparkSubmitTests(SparkSubmitTests): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index fb02fe2e92894..b991d2538ae82 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -545,9 +545,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { *
  • `header` (default `false`): uses the first line as names of columns.
  • *
  • `enforceSchema` (default `true`): If it is set to `true`, the specified or inferred schema * will be forcibly applied to datasource files and headers in CSV files will be ignored. - * If the option is set to `false`, the schema will be validated against headers in CSV files if - * the `header` option is set to `true`. The validation is performed in column ordering aware and - * case sensitive manner.
  • + * If the option is set to `false`, the schema will be validated against headers in CSV files + * in the case when the `header` option is set to `true`. The validation is performed in column + * ordering aware manner by taking into account `spark.sql.caseSensitive`. *
  • `inferSchema` (default `false`): infers the input schema automatically from data. It * requires one extra pass over the data.
  • *
  • `samplingRatio` (default is 1.0): defines fraction of rows used for schema inferring.
  • diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 0c42050bae673..2c28024a791da 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -34,7 +34,6 @@ import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.execution.datasources.csv.CSVDataSource.checkHeaderColumnNames import org.apache.spark.sql.execution.datasources.text.TextFileFormat import org.apache.spark.sql.types.StructType @@ -51,7 +50,7 @@ abstract class CSVDataSource extends Serializable { conf: Configuration, file: PartitionedFile, parser: UnivocityParser, - schema: StructType, + requiredSchema: StructType, // Actual schema of data in the csv file dataSchema: StructType, caseSensitive: Boolean): Iterator[InternalRow] @@ -123,48 +122,58 @@ object CSVDataSource { } } - def checkHeaderColumnNames(schema: StructType, columnNames: Array[String], fileName: String, - checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = { + /** + * Checks that column names in a CSV header and field names in the schema are the same + * by taking into account case sensitivity. + */ + def checkHeaderColumnNames( + schema: StructType, + columnNames: Array[String], + fileName: String, + checkHeaderFlag: Boolean, + caseSensitive: Boolean): Unit = { if (checkHeaderFlag && columnNames != null) { val fieldNames = schema.map(_.name).toIndexedSeq val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) - var error: Option[String] = None if (headerLen == schemaSize) { var i = 0 - while (error.isEmpty && i < headerLen) { + while (i < headerLen) { var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) - if (caseSensitive == false) { + if (!caseSensitive) { nameInSchema = nameInSchema.toLowerCase nameInHeader = nameInHeader.toLowerCase } if (nameInHeader != nameInSchema) { - error = Some( + throw new IllegalArgumentException( s"""|CSV file header does not contain the expected fields. | Header: ${columnNames.mkString(", ")} | Schema: ${fieldNames.mkString(", ")} |Expected: ${columnNames(i)} but found: ${fieldNames(i)} - |CSV file: $fileName""".stripMargin - ) + |CSV file: $fileName""".stripMargin) } i += 1 } } else { - error = Some( + throw new IllegalArgumentException( s"""|Number of column in CSV header is not equal to number of fields in the schema: | Header length: $headerLen, schema size: $schemaSize - |CSV file: $fileName""".stripMargin - ) - } - - error.headOption.foreach { msg => - throw new IllegalArgumentException(msg) + |CSV file: $fileName""".stripMargin) } } } - def checkHeader(header: String, parser: CsvParser, schema: StructType, fileName: String, - checkHeaderFlag: Boolean, caseSensitive: Boolean): Unit = { + /** + * Checks that CSV header contains the same column names as fields names in the given schema + * by taking into account case sensitivity. + */ + def checkHeader( + header: String, + parser: CsvParser, + schema: StructType, + fileName: String, + checkHeaderFlag: Boolean, + caseSensitive: Boolean): Unit = { if (checkHeaderFlag) { checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag, caseSensitive) @@ -175,8 +184,12 @@ object CSVDataSource { object TextInputCSVDataSource extends CSVDataSource { override val isSplitable: Boolean = true - override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser, - schema: StructType, dataSchema: StructType, + override def readFile( + conf: Configuration, + file: PartitionedFile, + parser: UnivocityParser, + requiredSchema: StructType, + dataSchema: StructType, caseSensitive: Boolean): Iterator[InternalRow] = { val lines = { val linesReader = new HadoopFileLinesReader(file, conf) @@ -198,7 +211,7 @@ object TextInputCSVDataSource extends CSVDataSource { } } - UnivocityParser.parseIterator(lines, parser, schema) + UnivocityParser.parseIterator(lines, parser, requiredSchema) } override def infer( @@ -263,8 +276,12 @@ object TextInputCSVDataSource extends CSVDataSource { object MultiLineCSVDataSource extends CSVDataSource { override val isSplitable: Boolean = false - override def readFile(conf: Configuration, file: PartitionedFile, parser: UnivocityParser, - schema: StructType, dataSchema: StructType, + override def readFile( + conf: Configuration, + file: PartitionedFile, + parser: UnivocityParser, + requiredSchema: StructType, + dataSchema: StructType, caseSensitive: Boolean): Iterator[InternalRow] = { def checkHeader(header: Array[String]): Unit = { CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath, @@ -273,7 +290,7 @@ object MultiLineCSVDataSource extends CSVDataSource { UnivocityParser.parseStream( CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), - parser.options.headerFlag, parser, schema, checkHeader) + parser.options.headerFlag, parser, requiredSchema, checkHeader) } override def infer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index a34b69462ad70..944e02bced65b 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -303,7 +303,6 @@ private[csv] object UnivocityParser { parser.options.parseMode, schema, parser.options.columnNameOfCorruptRecord) - filteredLines.flatMap(safeParser.parse) } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 7704526811fc1..3f8c3cf9a48de 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -23,6 +23,8 @@ import java.sql.{Date, Timestamp} import java.text.SimpleDateFormat import java.util.Locale +import scala.collection.JavaConverters._ + import org.apache.commons.lang3.time.FastDateFormat import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec @@ -1372,7 +1374,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te def checkHeader(multiLine: Boolean): Unit = { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { withTempPath { path => - import collection.JavaConverters._ val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) odf.write.option("header", true).csv(path.getCanonicalPath) @@ -1449,7 +1450,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te test("SPARK-23786: CSV header must not be checked if it doesn't exist") { withTempPath { path => - import collection.JavaConverters._ val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) odf.write.option("header", false).csv(path.getCanonicalPath) @@ -1467,7 +1467,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te test("SPARK-23786: Ignore column name case if spark.sql.caseSensitive is false") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { withTempPath { path => - import collection.JavaConverters._ val oschema = new StructType().add("A", StringType) val odf = spark.createDataFrame(List(Row("0")).asJava, oschema) odf.write.option("header", true).csv(path.getCanonicalPath) From d7047666f32601676a0b1f8d46ec7a0878e7404a Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 17 May 2018 15:41:36 +0200 Subject: [PATCH 36/48] Removing unnecessary empty checks --- .../spark/sql/execution/datasources/csv/CSVDataSource.scala | 4 ---- .../sql/execution/datasources/csv/UnivocityParser.scala | 5 ++--- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 2c28024a791da..c00b32578fb65 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -298,15 +298,12 @@ object MultiLineCSVDataSource extends CSVDataSource { inputPaths: Seq[FileStatus], parsedOptions: CSVOptions): StructType = { val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions) - // The header is not checked because there is no schema against with it could be check - def checkHeader(header: Array[String]): Unit = () csv.flatMap { lines => val path = new Path(lines.getPath()) UnivocityParser.tokenizeStream( CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path), dropFirstRecord = false, - checkHeader, new CsvParser(parsedOptions.asParserSettings)) }.take(1).headOption match { case Some(firstRow) => @@ -318,7 +315,6 @@ object MultiLineCSVDataSource extends CSVDataSource { lines.getConfiguration, new Path(lines.getPath())), parsedOptions.headerFlag, - checkHeader, new CsvParser(parsedOptions.asParserSettings)) } val sampled = CSVUtils.sample(tokenRDD, parsedOptions) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 944e02bced65b..fc71323e8d6e3 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -235,9 +235,8 @@ private[csv] object UnivocityParser { def tokenizeStream( inputStream: InputStream, dropFirstRecord: Boolean, - checkFirstRecord: Array[String] => Unit, tokenizer: CsvParser): Iterator[Array[String]] = { - convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord)(tokens => tokens) + convertStream(inputStream, dropFirstRecord, tokenizer)(tokens => tokens) } /** @@ -264,7 +263,7 @@ private[csv] object UnivocityParser { inputStream: InputStream, dropFirstRecord: Boolean, tokenizer: CsvParser, - checkFirstRecord: Array[String] => Unit)( + checkFirstRecord: Array[String] => Unit = _ => ())( convert: Array[String] => T) = new Iterator[T] { tokenizer.beginParsing(inputStream) private var nextRecord = { From 04199e0fd28ae0f5e21050eb4d08ec60d238cab5 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 17 May 2018 16:37:18 +0200 Subject: [PATCH 37/48] Addressing review comments --- python/pyspark/sql/tests.py | 2 +- .../org/apache/spark/sql/DataFrameReader.scala | 4 ++-- .../datasources/csv/CSVDataSource.scala | 18 +++++++++--------- .../execution/datasources/csv/CSVSuite.scala | 6 +++--- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 689b27f475ec0..faa5f72c8b044 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3053,7 +3053,7 @@ def test_checking_csv_header(self): .csv(tmpPath, enforceSchema=False) self.assertRaisesRegexp( Exception, - "CSV file header does not contain the expected fields", + "CSV header is not conform to the schema", lambda: df.collect()) finally: shutil.rmtree(tmpPath) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index b991d2538ae82..8e2e46858db04 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -498,9 +498,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine => - if (parsedOptions.enforceSchema == false) { + if (!parsedOptions.enforceSchema) { CSVDataSource.checkHeader(firstLine, new CsvParser(parsedOptions.asParserSettings), - actualSchema, csvDataset.getClass.getCanonicalName, checkHeaderFlag = true, + actualSchema, csvDataset.getClass.getCanonicalName, parsedOptions.enforceSchema, sparkSession.sessionState.conf.caseSensitiveAnalysis) } filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions)) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index c00b32578fb65..fbb5f2eba99cd 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -130,9 +130,9 @@ object CSVDataSource { schema: StructType, columnNames: Array[String], fileName: String, - checkHeaderFlag: Boolean, + enforceSchema: Boolean, caseSensitive: Boolean): Unit = { - if (checkHeaderFlag && columnNames != null) { + if (!enforceSchema && columnNames != null) { val fieldNames = schema.map(_.name).toIndexedSeq val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) @@ -146,10 +146,10 @@ object CSVDataSource { } if (nameInHeader != nameInSchema) { throw new IllegalArgumentException( - s"""|CSV file header does not contain the expected fields. + s"""|CSV header is not conform to the schema. | Header: ${columnNames.mkString(", ")} | Schema: ${fieldNames.mkString(", ")} - |Expected: ${columnNames(i)} but found: ${fieldNames(i)} + |Expected: ${fieldNames(i)} but found: ${columnNames(i)} |CSV file: $fileName""".stripMargin) } i += 1 @@ -172,10 +172,10 @@ object CSVDataSource { parser: CsvParser, schema: StructType, fileName: String, - checkHeaderFlag: Boolean, + enforceSchema: Boolean, caseSensitive: Boolean): Unit = { - if (checkHeaderFlag) { - checkHeaderColumnNames(schema, parser.parseLine(header), fileName, checkHeaderFlag, + if (!enforceSchema) { + checkHeaderColumnNames(schema, parser.parseLine(header), fileName, enforceSchema, caseSensitive) } } @@ -207,7 +207,7 @@ object TextInputCSVDataSource extends CSVDataSource { // be not extracted. CSVUtils.extractHeader(lines, parser.options).foreach { header => CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath, - checkHeaderFlag = !parser.options.enforceSchema, caseSensitive) + parser.options.enforceSchema, caseSensitive) } } @@ -285,7 +285,7 @@ object MultiLineCSVDataSource extends CSVDataSource { caseSensitive: Boolean): Iterator[InternalRow] = { def checkHeader(header: Array[String]): Unit = { CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath, - checkHeaderFlag = !parser.options.enforceSchema, caseSensitive) + parser.options.enforceSchema, caseSensitive) } UnivocityParser.parseStream( diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 3f8c3cf9a48de..fcee02dd9be47 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1388,7 +1388,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .collect() } assert(exception.getMessage.contains( - "CSV file header does not contain the expected fields" + "CSV header is not conform to the schema" )) val shortSchema = new StructType().add("f1", DoubleType) @@ -1434,7 +1434,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .collect() } assert(caseSensitiveException.getMessage.contains( - "CSV file header does not contain the expected fields" + "CSV header is not conform to the schema" )) } } @@ -1487,6 +1487,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te spark.read.schema(ischema).option("header", true).option("enforceSchema", false).csv(ds) } - assert(exception.getMessage.contains("CSV file header does not contain the expected fields")) + assert(exception.getMessage.contains("CSV header is not conform to the schema")) } } From 795a8781129d5265906d537ad87a3e9b0abc890f Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Thu, 17 May 2018 23:40:59 +0200 Subject: [PATCH 38/48] Addressing Hyukjin Kwon's review comments --- python/pyspark/sql/tests.py | 10 +++++----- .../execution/datasources/csv/CSVDataSource.scala | 3 +-- .../datasources/csv/UnivocityParser.scala | 4 ++-- .../sql/execution/datasources/csv/CSVSuite.scala | 14 ++++---------- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index faa5f72c8b044..1edb0dc49afd9 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3041,22 +3041,22 @@ def test_csv_sampling_ratio(self): self.assertEquals(schema, StructType([StructField("_c0", IntegerType(), True)])) def test_checking_csv_header(self): - tmpPath = tempfile.mkdtemp() - shutil.rmtree(tmpPath) + path = tempfile.mkdtemp() + shutil.rmtree(path) try: self.spark.createDataFrame([[1, 1000], [2000, 2]])\ - .toDF('f1', 'f2').write.option("header", "true").csv(tmpPath) + .toDF('f1', 'f2').write.option("header", "true").csv(path) schema = StructType([ StructField('f2', IntegerType(), nullable=True), StructField('f1', IntegerType(), nullable=True)]) df = self.spark.read.option('header', 'true').schema(schema)\ - .csv(tmpPath, enforceSchema=False) + .csv(path, enforceSchema=False) self.assertRaisesRegexp( Exception, "CSV header is not conform to the schema", lambda: df.collect()) finally: - shutil.rmtree(tmpPath) + shutil.rmtree(path) class HiveSparkSubmitTests(SparkSubmitTests): diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index fbb5f2eba99cd..6c5387af650c1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -298,12 +298,11 @@ object MultiLineCSVDataSource extends CSVDataSource { inputPaths: Seq[FileStatus], parsedOptions: CSVOptions): StructType = { val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions) - csv.flatMap { lines => val path = new Path(lines.getPath()) UnivocityParser.tokenizeStream( CodecStreams.createInputStreamWithCloseResource(lines.getConfiguration, path), - dropFirstRecord = false, + shouldDropHeader = false, new CsvParser(parsedOptions.asParserSettings)) }.take(1).headOption match { case Some(firstRow) => diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index fc71323e8d6e3..fd95b063ab830 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -234,9 +234,9 @@ private[csv] object UnivocityParser { */ def tokenizeStream( inputStream: InputStream, - dropFirstRecord: Boolean, + shouldDropHeader: Boolean, tokenizer: CsvParser): Iterator[Array[String]] = { - convertStream(inputStream, dropFirstRecord, tokenizer)(tokens => tokens) + convertStream(inputStream, shouldDropHeader, tokenizer)(tokens => tokens) } /** diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index fcee02dd9be47..2d01ad2270946 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1387,9 +1387,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .csv(path.getCanonicalPath) .collect() } - assert(exception.getMessage.contains( - "CSV header is not conform to the schema" - )) + assert(exception.getMessage.contains("CSV header is not conform to the schema")) val shortSchema = new StructType().add("f1", DoubleType) val exceptionForShortSchema = intercept[SparkException] { @@ -1402,8 +1400,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .collect() } assert(exceptionForShortSchema.getMessage.contains( - "Number of column in CSV header is not equal to number of fields in the schema" - )) + "Number of column in CSV header is not equal to number of fields in the schema")) val longSchema = new StructType() .add("f1", DoubleType) @@ -1419,9 +1416,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .csv(path.getCanonicalPath) .collect() } - assert(exceptionForLongSchema.getMessage.contains( - "Header length: 2, schema size: 3" - )) + assert(exceptionForLongSchema.getMessage.contains("Header length: 2, schema size: 3")) val caseSensitiveSchema = new StructType().add("F1", DoubleType).add("f2", DoubleType) val caseSensitiveException = intercept[SparkException] { @@ -1434,8 +1429,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .collect() } assert(caseSensitiveException.getMessage.contains( - "CSV header is not conform to the schema" - )) + "CSV header is not conform to the schema")) } } } From 05fc7cd8b163b35d388474bd9ea788a7db2c378c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 18 May 2018 22:51:53 +0200 Subject: [PATCH 39/48] Improving description of the option --- python/pyspark/sql/readwriter.py | 9 +++++---- python/pyspark/sql/streaming.py | 9 +++++---- .../org/apache/spark/sql/DataFrameReader.scala | 13 +++++++++---- 3 files changed, 19 insertions(+), 12 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index bd65ac386a8db..eddf3b9611058 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -374,11 +374,12 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non :param inferSchema: infers the input schema automatically from data. It requires one extra pass over the data. If None is set, it uses the default value, ``false``. :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be - forcibly applied to datasource files and headers in CSV files will be + forcibly applied to datasource files, and headers in CSV files will be ignored. If the option is set to ``false``, the schema will be - validated against headers in CSV files if the ``header`` option is set - to ``true``. The validation is performed in column ordering aware - manner by taking into account ``spark.sql.caseSensitive``. + validated against all headers in CSV files or the first header in RDD + if the ``header`` option is set to ``true``. Field names in the schema + and column names in CSV headers are checked by their positions + taking into account ``spark.sql.caseSensitive``. If None is set, ``true`` is used by default. :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 30cc8d3baad96..4c1a6da286f16 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -594,11 +594,12 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non :param inferSchema: infers the input schema automatically from data. It requires one extra pass over the data. If None is set, it uses the default value, ``false``. :param enforceSchema: If it is set to ``true``, the specified or inferred schema will be - forcibly applied to datasource files and headers in CSV files will be + forcibly applied to datasource files, and headers in CSV files will be ignored. If the option is set to ``false``, the schema will be - validated against headers in CSV files if the ``header`` option is set - to ``true``. The validation is performed in column ordering aware - manner by taking into account ``spark.sql.caseSensitive``. + validated against all headers in CSV files or the first header in RDD + if the ``header`` option is set to ``true``. Field names in the schema + and column names in CSV headers are checked by their positions + taking into account ``spark.sql.caseSensitive``. If None is set, ``true`` is used by default. :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 8e2e46858db04..c66b74e446471 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -474,6 +474,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * it determines the columns as string types and it reads only the first line to determine the * names and the number of fields. * + * If the enforceSchema is set to `false`, only the CSV header in the first line is checked + * to conform specified or inferred schema. + * * @param csvDataset input Dataset with one CSV row per record * @since 2.2.0 */ @@ -544,10 +547,11 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * beginning with this character. By default, it is disabled. *
  • `header` (default `false`): uses the first line as names of columns.
  • *
  • `enforceSchema` (default `true`): If it is set to `true`, the specified or inferred schema - * will be forcibly applied to datasource files and headers in CSV files will be ignored. - * If the option is set to `false`, the schema will be validated against headers in CSV files - * in the case when the `header` option is set to `true`. The validation is performed in column - * ordering aware manner by taking into account `spark.sql.caseSensitive`.
  • + * will be forcibly applied to datasource files, and headers in CSV files will be ignored. + * If the option is set to `false`, the schema will be validated against all headers in CSV files + * in the case when the `header` option is set to `true`. Field names in the schema + * and column names in CSV headers are checked by their positions taking into account + * `spark.sql.caseSensitive`. *
  • `inferSchema` (default `false`): infers the input schema automatically from data. It * requires one extra pass over the data.
  • *
  • `samplingRatio` (default is 1.0): defines fraction of rows used for schema inferring.
  • @@ -592,6 +596,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * created by `PERMISSIVE` mode. This overrides `spark.sql.columnNameOfCorruptRecord`. *
  • `multiLine` (default `false`): parse one record, which may span multiple lines.
  • * + * * @since 2.0.0 */ @scala.annotation.varargs From 11c75913f4afadd0437945e0a41870a141326375 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 18 May 2018 23:13:02 +0200 Subject: [PATCH 40/48] Addressing Wenchen Fan's review comment --- python/pyspark/sql/readwriter.py | 6 ++++-- python/pyspark/sql/streaming.py | 6 ++++-- .../main/scala/org/apache/spark/sql/DataFrameReader.scala | 3 ++- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index eddf3b9611058..a0e20d39c20da 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -379,8 +379,10 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non validated against all headers in CSV files or the first header in RDD if the ``header`` option is set to ``true``. Field names in the schema and column names in CSV headers are checked by their positions - taking into account ``spark.sql.caseSensitive``. - If None is set, ``true`` is used by default. + taking into account ``spark.sql.caseSensitive``. If None is set, + ``true`` is used by default. Though the default value is ``true``, + it is recommended to disable the ``enforceSchema`` option + to avoid incorrect results. :param ignoreLeadingWhiteSpace: A flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it uses the default value, ``false``. diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py index 4c1a6da286f16..fae50b3d5d532 100644 --- a/python/pyspark/sql/streaming.py +++ b/python/pyspark/sql/streaming.py @@ -599,8 +599,10 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non validated against all headers in CSV files or the first header in RDD if the ``header`` option is set to ``true``. Field names in the schema and column names in CSV headers are checked by their positions - taking into account ``spark.sql.caseSensitive``. - If None is set, ``true`` is used by default. + taking into account ``spark.sql.caseSensitive``. If None is set, + ``true`` is used by default. Though the default value is ``true``, + it is recommended to disable the ``enforceSchema`` option + to avoid incorrect results. :param ignoreLeadingWhiteSpace: a flag indicating whether or not leading whitespaces from values being read should be skipped. If None is set, it uses the default value, ``false``. diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 552c5055bdc69..7eea42147704a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -552,7 +552,8 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { * If the option is set to `false`, the schema will be validated against all headers in CSV files * in the case when the `header` option is set to `true`. Field names in the schema * and column names in CSV headers are checked by their positions taking into account - * `spark.sql.caseSensitive`. + * `spark.sql.caseSensitive`. Though the default value is true, it is recommended to disable + * the `enforceSchema` option to avoid incorrect results. *
  • `inferSchema` (default `false`): infers the input schema automatically from data. It * requires one extra pass over the data.
  • *
  • `samplingRatio` (default is 1.0): defines fraction of rows used for schema inferring.
  • From c0083281e702aea17ee7781e089a17d650382c7c Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 25 May 2018 13:19:14 +0200 Subject: [PATCH 41/48] Output warnings when enforceSchema is enabled and the schema is not conform to CSV header --- .../datasources/csv/CSVDataSource.scala | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 6c5387af650c1..139eed804c6fc 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -30,6 +30,7 @@ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.spark.TaskContext import org.apache.spark.input.{PortableDataStream, StreamInputFormat} +import org.apache.spark.internal.Logging import org.apache.spark.rdd.{BinaryFileRDD, RDD} import org.apache.spark.sql.{Dataset, Encoders, SparkSession} import org.apache.spark.sql.catalyst.InternalRow @@ -113,7 +114,7 @@ abstract class CSVDataSource extends Serializable { } } -object CSVDataSource { +object CSVDataSource extends Logging { def apply(options: CSVOptions): CSVDataSource = { if (options.multiLine) { MultiLineCSVDataSource @@ -132,20 +133,21 @@ object CSVDataSource { fileName: String, enforceSchema: Boolean, caseSensitive: Boolean): Unit = { - if (!enforceSchema && columnNames != null) { + if (columnNames != null) { val fieldNames = schema.map(_.name).toIndexedSeq val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) + var errorMessage: Option[String] = None if (headerLen == schemaSize) { var i = 0 - while (i < headerLen) { + while (errorMessage.isEmpty && i < headerLen) { var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) if (!caseSensitive) { nameInSchema = nameInSchema.toLowerCase nameInHeader = nameInHeader.toLowerCase } if (nameInHeader != nameInSchema) { - throw new IllegalArgumentException( + errorMessage = Some( s"""|CSV header is not conform to the schema. | Header: ${columnNames.mkString(", ")} | Schema: ${fieldNames.mkString(", ")} @@ -155,11 +157,19 @@ object CSVDataSource { i += 1 } } else { - throw new IllegalArgumentException( + errorMessage = Some( s"""|Number of column in CSV header is not equal to number of fields in the schema: | Header length: $headerLen, schema size: $schemaSize |CSV file: $fileName""".stripMargin) } + + errorMessage.foreach { msg => + if (enforceSchema) { + logWarning(msg) + } else { + throw new IllegalArgumentException(msg) + } + } } } From 9f7c4405ce4ca240a11d69126855fbc060444d3b Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 25 May 2018 14:44:43 +0200 Subject: [PATCH 42/48] Added tests for inferSchema is true and enforceSchema is false --- .../execution/datasources/csv/CSVSuite.scala | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index a9b726a0de5a6..adc0f20b34947 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1512,4 +1512,36 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te assert(exception.getMessage.contains("CSV header is not conform to the schema")) } + + test("SPARK-23786: enforce inferred schema") { + val expectedSchema = new StructType().add("_c0", DoubleType).add("_c1", StringType) + val withHeader = spark.read + .option("inferSchema", true) + .option("enforceSchema", false) + .option("header", true) + .csv(Seq("_c0,_c1", "1.0,a").toDS()) + assert(withHeader.schema == expectedSchema) + checkAnswer(withHeader, Seq(Row(1.0, "a"))) + + // Ignore the inferSchema flag if an user sets a schema + val schema = new StructType().add("colA", DoubleType).add("colB", StringType) + val ds = spark.read + .option("inferSchema", true) + .option("enforceSchema", false) + .option("header", true) + .schema(schema) + .csv(Seq("colA,colB", "1.0,a").toDS()) + assert(ds.schema == schema) + checkAnswer(ds, Seq(Row(1.0, "a"))) + + val exception = intercept[IllegalArgumentException] { + spark.read + .option("inferSchema", true) + .option("enforceSchema", false) + .option("header", true) + .schema(schema) + .csv(Seq("col1,col2", "1.0,a").toDS()) + } + assert(exception.getMessage.contains("CSV header is not conform to the schema")) + } } From e83ad60da9c9266274fc09698bea382c762ef815 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 25 May 2018 14:53:44 +0200 Subject: [PATCH 43/48] Rename dropFirstRecord to shouldDropHeader --- .../datasources/csv/UnivocityParser.scala | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala index 075d6405cee4b..5f7d5696b71a6 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala @@ -248,32 +248,32 @@ private[csv] object UnivocityParser { */ def parseStream( inputStream: InputStream, - dropFirstRecord: Boolean, + shouldDropHeader: Boolean, parser: UnivocityParser, schema: StructType, - checkFirstRecord: Array[String] => Unit): Iterator[InternalRow] = { + checkHeader: Array[String] => Unit): Iterator[InternalRow] = { val tokenizer = parser.tokenizer val safeParser = new FailureSafeParser[Array[String]]( input => Seq(parser.convert(input)), parser.options.parseMode, schema, parser.options.columnNameOfCorruptRecord) - convertStream(inputStream, dropFirstRecord, tokenizer, checkFirstRecord) { tokens => + convertStream(inputStream, shouldDropHeader, tokenizer, checkHeader) { tokens => safeParser.parse(tokens) }.flatten } private def convertStream[T]( inputStream: InputStream, - dropFirstRecord: Boolean, + shouldDropHeader: Boolean, tokenizer: CsvParser, - checkFirstRecord: Array[String] => Unit = _ => ())( + checkHeader: Array[String] => Unit = _ => ())( convert: Array[String] => T) = new Iterator[T] { tokenizer.beginParsing(inputStream) private var nextRecord = { - if (dropFirstRecord) { + if (shouldDropHeader) { val firstRecord = tokenizer.parseNext() - checkFirstRecord(firstRecord) + checkHeader(firstRecord) } tokenizer.parseNext() } From c5ee2076321e252ece99ebec930ae47b5bba209a Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 1 Jun 2018 18:37:15 +0200 Subject: [PATCH 44/48] Renaming of 'is not conform' to 'does not conform' --- .../sql/execution/datasources/csv/CSVDataSource.scala | 2 +- .../spark/sql/execution/datasources/csv/CSVSuite.scala | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 139eed804c6fc..645b1e27355b1 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -148,7 +148,7 @@ object CSVDataSource extends Logging { } if (nameInHeader != nameInSchema) { errorMessage = Some( - s"""|CSV header is not conform to the schema. + s"""|CSV header does not conform to the schema. | Header: ${columnNames.mkString(", ")} | Schema: ${fieldNames.mkString(", ")} |Expected: ${fieldNames(i)} but found: ${columnNames(i)} diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index e258d3de56f99..1cb72f985cedd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -1429,7 +1429,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .csv(path.getCanonicalPath) .collect() } - assert(exception.getMessage.contains("CSV header is not conform to the schema")) + assert(exception.getMessage.contains("CSV header does not conform to the schema")) val shortSchema = new StructType().add("f1", DoubleType) val exceptionForShortSchema = intercept[SparkException] { @@ -1471,7 +1471,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .collect() } assert(caseSensitiveException.getMessage.contains( - "CSV header is not conform to the schema")) + "CSV header does not conform to the schema")) } } } @@ -1523,7 +1523,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te spark.read.schema(ischema).option("header", true).option("enforceSchema", false).csv(ds) } - assert(exception.getMessage.contains("CSV header is not conform to the schema")) + assert(exception.getMessage.contains("CSV header does not conform to the schema")) } test("SPARK-23786: enforce inferred schema") { @@ -1555,6 +1555,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te .schema(schema) .csv(Seq("col1,col2", "1.0,a").toDS()) } - assert(exception.getMessage.contains("CSV header is not conform to the schema")) + assert(exception.getMessage.contains("CSV header does not conform to the schema")) } } From a2cbb7b7879c065efcb8e5a56fde0e47d80e1c89 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 1 Jun 2018 18:47:46 +0200 Subject: [PATCH 45/48] Fix Scala coding style --- .../datasources/csv/CSVDataSource.scala | 28 +++++++++++++++---- .../datasources/csv/CSVFileFormat.scala | 7 ++++- .../datasources/csv/CSVOptions.scala | 2 +- 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 645b1e27355b1..745c6135700bb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -185,7 +185,11 @@ object CSVDataSource extends Logging { enforceSchema: Boolean, caseSensitive: Boolean): Unit = { if (!enforceSchema) { - checkHeaderColumnNames(schema, parser.parseLine(header), fileName, enforceSchema, + checkHeaderColumnNames( + schema, + parser.parseLine(header), + fileName, + enforceSchema, caseSensitive) } } @@ -216,8 +220,13 @@ object TextInputCSVDataSource extends CSVDataSource { // Note: if there are only comments in the first block, the header would probably // be not extracted. CSVUtils.extractHeader(lines, parser.options).foreach { header => - CSVDataSource.checkHeader(header, parser.tokenizer, dataSchema, file.filePath, - parser.options.enforceSchema, caseSensitive) + CSVDataSource.checkHeader( + header, + parser.tokenizer, + dataSchema, + file.filePath, + parser.options.enforceSchema, + caseSensitive) } } @@ -294,13 +303,20 @@ object MultiLineCSVDataSource extends CSVDataSource { dataSchema: StructType, caseSensitive: Boolean): Iterator[InternalRow] = { def checkHeader(header: Array[String]): Unit = { - CSVDataSource.checkHeaderColumnNames(dataSchema, header, file.filePath, - parser.options.enforceSchema, caseSensitive) + CSVDataSource.checkHeaderColumnNames( + dataSchema, + header, + file.filePath, + parser.options.enforceSchema, + caseSensitive) } UnivocityParser.parseStream( CodecStreams.createInputStreamWithCloseResource(conf, new Path(new URI(file.filePath))), - parser.options.headerFlag, parser, requiredSchema, checkHeader) + parser.options.headerFlag, + parser, + requiredSchema, + checkHeader) } override def infer( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala index 1497834ab2cb8..b90275de9f40a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVFileFormat.scala @@ -138,7 +138,12 @@ class CSVFileFormat extends TextBasedFileFormat with DataSourceRegister { StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), StructType(requiredSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)), parsedOptions) - CSVDataSource(parsedOptions).readFile(conf, file, parser, requiredSchema, dataSchema, + CSVDataSource(parsedOptions).readFile( + conf, + file, + parser, + requiredSchema, + dataSchema, caseSensitive) } } diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 2e040c86a46a8..fab8d62da0c1d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -160,7 +160,7 @@ class CSVOptions( * Forcibly apply the specified or inferred schema to datasource files. * If the option is enabled, headers of CSV files will be ignored. */ - val enforceSchema = getBool("enforceSchema", true) + val enforceSchema = getBool("enforceSchema", default = true) def asWriterSettings: CsvWriterSettings = { val writerSettings = new CsvWriterSettings() From 70e2b75713d588027fc3ae1ac380f354a52ad2ac Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 1 Jun 2018 18:58:58 +0200 Subject: [PATCH 46/48] Added description of checkHeaderColumnNames's arguments --- .../sql/execution/datasources/csv/CSVDataSource.scala | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index 745c6135700bb..f8ed6fd58059d 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -126,6 +126,15 @@ object CSVDataSource extends Logging { /** * Checks that column names in a CSV header and field names in the schema are the same * by taking into account case sensitivity. + * + * @param schema - provided (or inferred) schema to which CSV must conform. + * @param columnNames - names of CSV columns that must be checked against to the schema. + * @param fileName - name of CSV file that are currently checked. It is used in error messages. + * @param enforceSchema - if it is `true`, column names are ignored otherwise the CSV column + * names are checked for conformance to the schema. In the case if + * the column name don't conform to the schema, an exception is thrown. + * @param caseSensitive - if it is set to `false`, comparison of column names and schema field + * names is not case sensitive. */ def checkHeaderColumnNames( schema: StructType, From e7c3acee886c238a58682b70a621a0c4d3293e21 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 1 Jun 2018 20:07:05 +0200 Subject: [PATCH 47/48] Test checks a warning presents in logs --- .../apache/spark/sql/DataFrameReader.scala | 12 ++--- .../datasources/csv/CSVDataSource.scala | 4 +- .../execution/datasources/csv/CSVSuite.scala | 45 +++++++++++++++++++ 3 files changed, 53 insertions(+), 8 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala index 1e889622b7b52..de6be5f76e15a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -503,11 +503,13 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging { StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) val linesWithoutHeader: RDD[String] = maybeFirstLine.map { firstLine => - if (!parsedOptions.enforceSchema) { - CSVDataSource.checkHeader(firstLine, new CsvParser(parsedOptions.asParserSettings), - actualSchema, csvDataset.getClass.getCanonicalName, parsedOptions.enforceSchema, - sparkSession.sessionState.conf.caseSensitiveAnalysis) - } + CSVDataSource.checkHeader( + firstLine, + new CsvParser(parsedOptions.asParserSettings), + actualSchema, + csvDataset.getClass.getCanonicalName, + parsedOptions.enforceSchema, + sparkSession.sessionState.conf.caseSensitiveAnalysis) filteredLines.rdd.mapPartitions(CSVUtils.filterHeaderLine(_, firstLine, parsedOptions)) }.getOrElse(filteredLines.rdd) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala index f8ed6fd58059d..82322df407521 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVDataSource.scala @@ -193,14 +193,12 @@ object CSVDataSource extends Logging { fileName: String, enforceSchema: Boolean, caseSensitive: Boolean): Unit = { - if (!enforceSchema) { - checkHeaderColumnNames( + checkHeaderColumnNames( schema, parser.parseLine(header), fileName, enforceSchema, caseSensitive) - } } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index 1cb72f985cedd..d2f166c7d1877 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -28,6 +28,8 @@ import scala.collection.JavaConverters._ import org.apache.commons.lang3.time.FastDateFormat import org.apache.hadoop.io.SequenceFile.CompressionType import org.apache.hadoop.io.compress.GzipCodec +import org.apache.log4j.{AppenderSkeleton, LogManager} +import org.apache.log4j.spi.LoggingEvent import org.apache.spark.SparkException import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row, UDT} @@ -1557,4 +1559,47 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te } assert(exception.getMessage.contains("CSV header does not conform to the schema")) } + + test("SPARK-23786: warning should be printed if CSV header doesn't conform to schema") { + class TestAppender extends AppenderSkeleton { + var events = new java.util.ArrayList[LoggingEvent] + override def close(): Unit = {} + override def requiresLayout: Boolean = false + protected def append(event: LoggingEvent): Unit = events.add(event) + } + + val testAppender1 = new TestAppender + LogManager.getRootLogger.addAppender(testAppender1) + try { + val ds = Seq("columnA,columnB", "1.0,1000.0").toDS() + val ischema = new StructType().add("columnB", DoubleType).add("columnA", DoubleType) + + spark.read.schema(ischema).option("header", true).option("enforceSchema", true).csv(ds) + } finally { + LogManager.getRootLogger.removeAppender(testAppender1) + } + assert(testAppender1.events.asScala + .exists(msg => msg.getRenderedMessage.contains("CSV header does not conform to the schema"))) + + val testAppender2 = new TestAppender + LogManager.getRootLogger.addAppender(testAppender2) + try { + withTempPath { path => + val oschema = new StructType().add("f1", DoubleType).add("f2", DoubleType) + val odf = spark.createDataFrame(List(Row(1.0, 1234.5)).asJava, oschema) + odf.write.option("header", true).csv(path.getCanonicalPath) + val ischema = new StructType().add("f2", DoubleType).add("f1", DoubleType) + spark.read + .schema(ischema) + .option("header", true) + .option("enforceSchema", true) + .csv(path.getCanonicalPath) + .collect() + } + } finally { + LogManager.getRootLogger.removeAppender(testAppender2) + } + assert(testAppender2.events.asScala + .exists(msg => msg.getRenderedMessage.contains("CSV header does not conform to the schema"))) + } } From 3b37712ded664aaf716306574f50072e58b9bbd1 Mon Sep 17 00:00:00 2001 From: Maxim Gekk Date: Fri, 1 Jun 2018 20:29:32 +0200 Subject: [PATCH 48/48] fix python tests --- python/pyspark/sql/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index cf7ed69e17b32..ea2dd7605dc57 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -3069,7 +3069,7 @@ def test_checking_csv_header(self): .csv(path, enforceSchema=False) self.assertRaisesRegexp( Exception, - "CSV header is not conform to the schema", + "CSV header does not conform to the schema", lambda: df.collect()) finally: shutil.rmtree(path)