From 33eea6f9f5294a7041825daf2789d2f82f5a34d3 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Tue, 11 Jun 2024 18:53:57 +0800 Subject: [PATCH 1/3] [SPARK-48584][SQL]Perf improvement for unescapePathName --- .../EscapePathBenchmark-results.txt | 20 +++++-- .../catalog/ExternalCatalogUtils.scala | 52 +++++++++++-------- .../sql/catalyst/EscapePathBenchmark.scala | 52 ++++++++++++++++++- .../catalog/ExternalCatalogUtilsSuite.scala | 26 +++++++++- 4 files changed, 123 insertions(+), 27 deletions(-) diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt index 32e44f6e19eff..cfaad5d705532 100644 --- a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt +++ b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt @@ -2,11 +2,23 @@ Escape ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure -AMD EPYC 7763 64-Core Processor +OpenJDK 64-Bit Server VM 17.0.10+0 on Mac OS X 14.5 +Apple M2 Max Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 6719 6726 6 0.1 6719.3 1.0X -New 735 744 21 1.4 735.3 9.1X +Legacy 5323 5821 248 0.2 5323.2 1.0X +New 476 494 20 2.1 475.7 11.2X + + +================================================================================================ +Unescape +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.10+0 on Mac OS X 14.5 +Apple M2 Max +Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Legacy 1981 2003 24 0.5 1981.1 1.0X +New 325 328 2 3.1 325.0 6.1X diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala index 1dc2a760a8af6..350077c4bc693 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala @@ -26,6 +26,7 @@ import org.apache.hadoop.util.Shell import org.apache.spark.sql.catalyst.analysis.Resolver import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{And, AttributeReference, BasePredicate, BoundReference, Expression, Predicate} +import org.apache.spark.sql.catalyst.expressions.Hex.unhexDigits import org.apache.spark.sql.catalyst.util.CharVarcharUtils import org.apache.spark.sql.errors.QueryCompilationErrors import org.apache.spark.sql.internal.SQLConf @@ -97,31 +98,40 @@ object ExternalCatalogUtils { } def unescapePathName(path: String): String = { - val sb = new StringBuilder - var i = 0 - - while (i < path.length) { - val c = path.charAt(i) - if (c == '%' && i + 2 < path.length) { - val code: Int = try { - Integer.parseInt(path.substring(i + 1, i + 3), 16) - } catch { - case _: Exception => -1 - } - if (code >= 0) { - sb.append(code.asInstanceOf[Char]) - i += 3 + if (path == null || path.isEmpty) { + return path + } + var plaintextEndIdx = path.indexOf('%') + val length = path.length + if (plaintextEndIdx == -1 || plaintextEndIdx + 2 > path.length) { + // fast path, no %xx encoding found then return the string identity + path + } else { + val sb = new java.lang.StringBuilder(length) + var plaintextStartIdx = 0 + while(plaintextEndIdx != -1 && plaintextEndIdx + 2 < length) { + if (plaintextEndIdx > plaintextStartIdx) sb.append(path, plaintextStartIdx, plaintextEndIdx) + val high = path.charAt(plaintextEndIdx + 1) + if ((high >>> 8) == 0 && unhexDigits(high) != -1) { + val low = path.charAt(plaintextEndIdx + 2) + if ((low >>> 8) == 0 && unhexDigits(low) != -1) { + sb.append((unhexDigits(high) << 4 | unhexDigits(low)).asInstanceOf[Char]) + plaintextStartIdx = plaintextEndIdx + 3 + } else { + sb.append('%') + plaintextStartIdx = plaintextEndIdx + 1 + } } else { - sb.append(c) - i += 1 + sb.append('%') + plaintextStartIdx = plaintextEndIdx + 1 } - } else { - sb.append(c) - i += 1 + plaintextEndIdx = path.indexOf('%', plaintextStartIdx) + } + if (plaintextStartIdx < length) { + sb.append(path, plaintextStartIdx, length) } + sb.toString } - - sb.toString() } def generatePartitionPath( diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala index ce277456e0be9..4cbffff184cd9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/EscapePathBenchmark.scala @@ -20,7 +20,7 @@ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils /** - * Benchmark for path escaping + * Benchmark for path escaping/unescaping * To run this benchmark: * {{{ * 1. without sbt: @@ -53,6 +53,30 @@ object EscapePathBenchmark extends BenchmarkBase { } benchmark.run() } + + runBenchmark("Unescape") { + val benchmark = new Benchmark("Unescape Tests", N, 10, output = output) + val paths = Seq( + "https%3A%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https:%2F%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https:/%2Fissues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org%2Fjira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org/jira%2Fbrowse%2FSPARK-48551", + "https://issues.apache.org/jira/browse%2FSPARK-48551", + "https://issues.apache.org/jira/browse%2SPARK-48551", + "https://issues.apache.org/jira/browse/SPARK-48551") + benchmark.addCase("Legacy") { _ => + (1 to N).foreach(_ => paths.foreach(unescapePathNameLegacy)) + } + + benchmark.addCase("New") { _ => + (1 to N).foreach(_ => { + paths.foreach(ExternalCatalogUtils.unescapePathName) + }) + } + benchmark.run() + } } /** @@ -71,4 +95,30 @@ object EscapePathBenchmark extends BenchmarkBase { builder.toString() } + + def unescapePathNameLegacy(path: String): String = { + val sb = new StringBuilder + var i = 0 + while (i < path.length) { + val c = path.charAt(i) + if (c == '%' && i + 2 < path.length) { + val code: Int = try { + Integer.parseInt(path.substring(i + 1, i + 3), 16) + } catch { + case _: Exception => -1 + } + if (code >= 0) { + sb.append(code.asInstanceOf[Char]) + i += 3 + } else { + sb.append(c) + i += 1 + } + } else { + sb.append(c) + i += 1 + } + } + sb.toString() + } } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala index 843ffc037d6cc..f0aee94d2a61b 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtilsSuite.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.catalyst.catalog import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.escapePathName +import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils.{escapePathName, unescapePathName} class ExternalCatalogUtilsSuite extends SparkFunSuite { @@ -39,4 +39,28 @@ class ExternalCatalogUtilsSuite extends SparkFunSuite { assert(escapePathName("a,b") === "a,b") assert(escapePathName("a/b") === "a%2Fb") } + + test("SPARK-48551: unescapePathName") { + ExternalCatalogUtils.charToEscape.stream().toArray.map(_.asInstanceOf[Char]).foreach { c => + // Check parity with old conversion technique: + assert(unescapePathName("%" + f"$c%02X") === c.toString, + s"wrong unescaping for $c") + } + assert(unescapePathName(null) === null) + assert(unescapePathName("") === "") + assert(unescapePathName(" ") === " ") + assert(unescapePathName("%0A") === "\n") + assert(unescapePathName("a b") === "a b") + assert(unescapePathName("a%3Ab") === "a:b") + assert(unescapePathName("%3Aab") === ":ab") + assert(unescapePathName("ab%3A") === "ab:") + assert(unescapePathName("a%25b") === "a%b") + assert(unescapePathName("a,b") === "a,b") + assert(unescapePathName("a%2Fb") === "a/b") + assert(unescapePathName("a%2") === "a%2") + assert(unescapePathName("a%F ") === "a%F ") + // scalastyle:off nonascii + assert(unescapePathName("a\u00FF") === "a\u00FF") + // scalastyle:on nonascii + } } From afc6f28f9c40d89d3fca5ee5fe9686be60154757 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Tue, 11 Jun 2024 19:52:09 +0800 Subject: [PATCH 2/3] [SPARK-48584][SQL]Perf improvement for unescapePathName --- .../EscapePathBenchmark-jdk21-results.txt | 16 ++++++++++++++-- .../benchmarks/EscapePathBenchmark-results.txt | 16 ++++++++-------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt index 4fffb9bfd49a1..3d16c874e8c9b 100644 --- a/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt +++ b/sql/catalyst/benchmarks/EscapePathBenchmark-jdk21-results.txt @@ -6,7 +6,19 @@ OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure AMD EPYC 7763 64-Core Processor Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 7128 7146 8 0.1 7127.9 1.0X -New 790 795 5 1.3 789.7 9.0X +Legacy 6996 7009 9 0.1 6996.5 1.0X +New 771 776 3 1.3 770.7 9.1X + + +================================================================================================ +Unescape +================================================================================================ + +OpenJDK 64-Bit Server VM 21.0.3+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor +Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Legacy 5127 5137 6 0.2 5127.3 1.0X +New 579 583 4 1.7 579.3 8.9X diff --git a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt index cfaad5d705532..7cfa134652c27 100644 --- a/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt +++ b/sql/catalyst/benchmarks/EscapePathBenchmark-results.txt @@ -2,23 +2,23 @@ Escape ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+0 on Mac OS X 14.5 -Apple M2 Max +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor Escape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 5323 5821 248 0.2 5323.2 1.0X -New 476 494 20 2.1 475.7 11.2X +Legacy 6966 6978 12 0.1 6965.9 1.0X +New 725 730 4 1.4 725.4 9.6X ================================================================================================ Unescape ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.10+0 on Mac OS X 14.5 -Apple M2 Max +OpenJDK 64-Bit Server VM 17.0.11+9-LTS on Linux 6.5.0-1021-azure +AMD EPYC 7763 64-Core Processor Unescape Tests: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Legacy 1981 2003 24 0.5 1981.1 1.0X -New 325 328 2 3.1 325.0 6.1X +Legacy 6665 6677 11 0.2 6664.6 1.0X +New 602 606 2 1.7 602.1 11.1X From 70a5e4fdc1db0881f664a0460d4447e716fb5557 Mon Sep 17 00:00:00 2001 From: Kent Yao Date: Wed, 12 Jun 2024 16:38:09 +0800 Subject: [PATCH 3/3] nit --- .../spark/sql/catalyst/catalog/ExternalCatalogUtils.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala index 350077c4bc693..e7d91e7f41cb3 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/ExternalCatalogUtils.scala @@ -103,7 +103,7 @@ object ExternalCatalogUtils { } var plaintextEndIdx = path.indexOf('%') val length = path.length - if (plaintextEndIdx == -1 || plaintextEndIdx + 2 > path.length) { + if (plaintextEndIdx == -1 || plaintextEndIdx + 2 > length) { // fast path, no %xx encoding found then return the string identity path } else {