From b0225b18fbd3bf6c21245d2706af5b1b4769a2f9 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 21 May 2024 11:35:53 +0200 Subject: [PATCH 01/10] Initial commit --- .../util/CollationAwareUTF8String.java | 177 +++++++++++++++--- 1 file changed, 155 insertions(+), 22 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index ee0d611d7e652..9a97d368750d9 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -34,6 +34,155 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + /** + * The constant value to indicate that the match is not found when searching for a pattern + * string in a target string. + */ + private static final int MATCH_NOT_FOUND = -1; + + /** + * Returns whether the target string starts with the specified prefix, starting from the + * specified position (0-based index referring to character position in UTF8String), with respect + * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased + * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the + * same prefix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE + */ + public static boolean lowercaseMatchFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that starts with the specified + * prefix, starting from the specified position (0-based index referring to character position + * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * prefix is already lowercased. The method only considers the part of target string that + * starts from the specified (inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). If the prefix is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return length of the target substring that ends with the specified prefix in lowercase + */ + public static int lowercaseMatchLengthFrom( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; + for (int len = 0; len <= target.numChars() - startPos; ++len) { + if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the first occurrence of the pattern string in the target string, + * starting from the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the pattern is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param startPos the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target + */ + public static int lowercaseFind( + final UTF8String target, + final UTF8String lowercasePattern, + int startPos) { + assert startPos >= 0; + for (int i = startPos; i <= target.numChars(); ++i) { + if (lowercaseMatchFrom(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns whether the target string ends with the specified suffix, ending at the specified + * position (0-based index referring to character position in UTF8String), with respect to the + * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior + * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same + * suffix string. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return whether the target string ends with the specified suffix in lowercase + */ + public static boolean lowercaseMatchUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND; + } + + /** + * Returns the length of the substring of the target string that ends with the specified + * suffix, ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * suffix is already lowercased. The method only considers the part of target string that ends + * at the specified (non-inclusive) position (that is, the method does not look at UTF8 + * characters of the target string at or after position `endPos`). If the suffix is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return length of the target substring that ends with the specified suffix in lowercase + */ + public static int lowercaseMatchLengthUntil( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); + for (int len = 0; len <= endPos; ++len) { + if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) { + return len; + } + } + return MATCH_NOT_FOUND; + } + + /** + * Returns the position of the last occurrence of the pattern string in the target string, + * ending at the specified position (0-based index referring to character position in + * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the + * pattern string is already lowercased prior to call. If the pattern is not found, + * MATCH_NOT_FOUND is returned. + * + * @param target the string to be searched in + * @param lowercasePattern the string to be searched for + * @param endPos the end position for searching (in the target string) + * @return the position of the last occurrence of pattern in target + */ + public static int lowercaseRFind( + final UTF8String target, + final UTF8String lowercasePattern, + int endPos) { + assert endPos <= target.numChars(); + for (int i = endPos; i >= 0; --i) { + if (lowercaseMatchUntil(target, lowercasePattern, i)) { + return i; + } + } + return MATCH_NOT_FOUND; + } + public static UTF8String replace(final UTF8String src, final UTF8String search, final UTF8String replace, final int collationId) { // This collation aware implementation is based on existing implementation on UTF8String @@ -94,44 +243,28 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String if (src.numBytes() == 0 || search.numBytes() == 0) { return src; } - UTF8String lowercaseString = src.toLowerCase(); + UTF8String lowercaseSearch = search.toLowerCase(); int start = 0; - int end = lowercaseString.indexOf(lowercaseSearch, 0); + int end = lowercaseFind(src, lowercaseSearch, start); if (end == -1) { // Search string was not found, so string is unchanged. return src; } - // Initialize byte positions - int c = 0; - int byteStart = 0; // position in byte - int byteEnd = 0; // position in byte - while (byteEnd < src.numBytes() && c < end) { - byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd)); - c += 1; - } - // At least one match was found. Estimate space needed for result. // The 16x multiplier here is chosen to match commons-lang3's implementation. int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16; final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase); while (end != -1) { - buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart); + buf.append(src.substring(start, end)); buf.append(replace); // Update character positions - start = end + lowercaseSearch.numChars(); - end = lowercaseString.indexOf(lowercaseSearch, start); - // Update byte positions - byteStart = byteEnd + search.numBytes(); - while (byteEnd < src.numBytes() && c < end) { - byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd)); - c += 1; - } + start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end); + end = lowercaseFind(src, lowercaseSearch, start); } - buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, - src.numBytes() - byteStart); + buf.append(src.substring(start, src.numChars())); return buf.build(); } From 665742c28e7207a0b78fb64144d977c714147274 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 21 May 2024 11:36:03 +0200 Subject: [PATCH 02/10] Tests --- .../unsafe/types/CollationSupportSuite.java | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 7fc3c4e349c3b..70151be893976 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -610,8 +610,42 @@ public void testFindInSet() throws SparkException { assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4); assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5); assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5); + assertFindInSet("i̇", "İ", "UNICODE_CI", 1); + assertFindInSet("i", "İ", "UNICODE_CI", 0); + assertFindInSet("i̇", "i̇", "UNICODE_CI", 1); + assertFindInSet("i", "i̇", "UNICODE_CI", 0); + assertFindInSet("i̇", "İ,", "UNICODE_CI", 1); + assertFindInSet("i", "İ,", "UNICODE_CI", 0); + assertFindInSet("i̇", "i̇,", "UNICODE_CI", 1); + assertFindInSet("i", "i̇,", "UNICODE_CI", 0); + assertFindInSet("i̇", "ab,İ", "UNICODE_CI", 2); + assertFindInSet("i", "ab,İ", "UNICODE_CI", 0); + assertFindInSet("i̇", "ab,i̇", "UNICODE_CI", 2); + assertFindInSet("i", "ab,i̇", "UNICODE_CI", 0); + assertFindInSet("i̇", "ab,İ,12", "UNICODE_CI", 2); + assertFindInSet("i", "ab,İ,12", "UNICODE_CI", 0); + assertFindInSet("i̇", "ab,i̇,12", "UNICODE_CI", 2); + assertFindInSet("i", "ab,i̇,12", "UNICODE_CI", 0); assertFindInSet("i̇o", "ab,İo,12", "UNICODE_CI", 2); assertFindInSet("İo", "ab,i̇o,12", "UNICODE_CI", 2); + assertFindInSet("i̇", "İ", "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", "İ", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", "i̇", "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", "i̇", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", "İ,", "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", "İ,", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", "i̇,", "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", "i̇,", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", "ab,İ", "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", "ab,İ", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", "ab,i̇", "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", "ab,i̇", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", "ab,İ,12", "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", "ab,İ,12", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", "ab,i̇,12", "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2); + assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2); } private void assertReplace(String source, String search, String replace, String collationName, @@ -648,8 +682,22 @@ public void testReplace() throws SparkException { assertReplace("replace", "", "123", "UNICODE_CI", "replace"); assertReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c"); assertReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad"); + assertReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12"); + assertReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12"); + assertReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12"); + assertReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12"); + assertReplace("İi̇İi̇İi̇", "i̇", "x", "UNICODE_CI", "xxxxxx"); + assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇"); assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx"); assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy"); + assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abẊ12"); // != UNICODE_CI + assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI + assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12"); + assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12"); + assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx"); + assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", "İẋİẋİẋ"); // != UNICODE_CI + assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx"); + assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy"); } private void assertLocate(String substring, String string, Integer start, String collationName, From 8d05451b3a3782c167d5bd48dcb89229606b86f2 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Tue, 21 May 2024 14:19:41 +0200 Subject: [PATCH 03/10] checkstyle --- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 70151be893976..e72e7b907e0e5 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -23,7 +23,7 @@ import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { /** @@ -1056,3 +1056,4 @@ public void testStringTrim() throws SparkException { // TODO: Test other collation-aware expressions. } +// checkstyle.on: AvoidEscapedUnicodeCharacters From d5a811af09a31fc73f59bf82c914b964944067d3 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 12:03:43 +0200 Subject: [PATCH 04/10] Add tests --- .../apache/spark/unsafe/types/CollationSupportSuite.java | 6 ++++-- .../apache/spark/sql/CollationStringExpressionsSuite.scala | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index e72e7b907e0e5..f818ee9cbf618 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -646,6 +646,7 @@ public void testFindInSet() throws SparkException { assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0); assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2); assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2); + assertFindInSet("c", "A�,B,C,V", "UTF8_BINARY_LCASE", 3); } private void assertReplace(String source, String search, String replace, String collationName, @@ -690,12 +691,13 @@ public void testReplace() throws SparkException { assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇"); assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx"); assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy"); - assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abẊ12"); // != UNICODE_CI + assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abX\u030712"); // != UNICODE_CI assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12"); assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12"); assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx"); - assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", "İẋİẋİẋ"); // != UNICODE_CI + assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", + "İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx"); assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy"); } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 9cc123b708aff..82615aac438d4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -325,6 +325,12 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } + test("Menelaos") { + val q = "select find_in_set('c' collate utf8_binary_lcase, " + + "cast(unhex('41C22C422C432C56') as string) collate utf8_binary_lcase)" + checkAnswer(sql(q), Row()) + } + test("Support Replace string expression with collation") { case class ReplaceTestCase[R](source: String, search: String, replace: String, c: String, result: R) From f7d22da33947af0e1760d93e409765ce48694a07 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Fri, 24 May 2024 15:13:47 +0200 Subject: [PATCH 05/10] Update new method access --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 9a97d368750d9..5f39494641313 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -74,7 +74,7 @@ public static boolean lowercaseMatchFrom( * @param startPos the start position for searching (in the target string) * @return length of the target substring that ends with the specified prefix in lowercase */ - public static int lowercaseMatchLengthFrom( + private static int lowercaseMatchLengthFrom( final UTF8String target, final UTF8String lowercasePattern, int startPos) { @@ -99,7 +99,7 @@ public static int lowercaseMatchLengthFrom( * @param startPos the start position for searching (in the target string) * @return the position of the first occurrence of pattern in target */ - public static int lowercaseFind( + private static int lowercaseFind( final UTF8String target, final UTF8String lowercasePattern, int startPos) { @@ -145,7 +145,7 @@ public static boolean lowercaseMatchUntil( * @param endPos the end position for searching (in the target string) * @return length of the target substring that ends with the specified suffix in lowercase */ - public static int lowercaseMatchLengthUntil( + private static int lowercaseMatchLengthUntil( final UTF8String target, final UTF8String lowercasePattern, int endPos) { @@ -170,7 +170,7 @@ public static int lowercaseMatchLengthUntil( * @param endPos the end position for searching (in the target string) * @return the position of the last occurrence of pattern in target */ - public static int lowercaseRFind( + private static int lowercaseRFind( final UTF8String target, final UTF8String lowercasePattern, int endPos) { From 7ff48e439c715a47adcb0d134e31489d9bd44af4 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 27 May 2024 09:42:40 +0200 Subject: [PATCH 06/10] Fix FindInSet --- .../util/CollationAwareUTF8String.java | 36 ++--- .../sql/catalyst/util/CollationSupport.java | 16 +- .../unsafe/types/CollationSupportSuite.java | 151 ++++++++++-------- .../sql/CollationStringExpressionsSuite.scala | 6 - 4 files changed, 102 insertions(+), 107 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 5f39494641313..718550097bc13 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -287,32 +287,28 @@ public static String toTitleCase(final String target, final int collationId) { } public static int findInSet(final UTF8String match, final UTF8String set, int collationId) { + // If the "word" string contains a comma, FindInSet should return 0. if (match.contains(UTF8String.fromString(","))) { return 0; } - - String setString = set.toString(); - StringSearch stringSearch = CollationFactory.getStringSearch(setString, match.toString(), - collationId); - - int wordStart = 0; - while ((wordStart = stringSearch.next()) != StringSearch.DONE) { - boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ','; - boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length() - || setString.charAt(wordStart + stringSearch.getMatchLength()) == ','; - - if (isValidStart && isValidEnd) { - int pos = 0; - for (int i = 0; i < setString.length() && i < wordStart; i++) { - if (setString.charAt(i) == ',') { - pos++; - } + // Otherwise, search for commas in "set" and compare each substring with "word". + int byteIndex = 0, charIndex = 0, wordCount = 1, lastComma = -1; + while (byteIndex < set.numBytes()) { + byte nextByte = set.getByte(byteIndex); + if (nextByte == (byte) ',') { + if (set.substring(lastComma + 1, charIndex).semanticEquals(match, collationId)) { + return wordCount; } - - return pos + 1; + lastComma = charIndex; + ++wordCount; } + byteIndex += UTF8String.numBytesForFirstByte(nextByte); + ++charIndex; } - + if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) { + return wordCount; + } + // If no match is found, return 0. return 0; } diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index bea3dc08b4489..3561e978ac46d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -18,6 +18,7 @@ import com.ibm.icu.text.StringSearch; +import org.apache.spark.SparkException; import org.apache.spark.unsafe.types.UTF8String; import java.util.ArrayList; @@ -297,10 +298,8 @@ public static int exec(final UTF8String word, final UTF8String set, final int co CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId); if (collation.supportsBinaryEquality) { return execBinary(word, set); - } else if (collation.supportsLowercaseEquality) { - return execLowercase(word, set); } else { - return execICU(word, set, collationId); + return execCollationAware(word, set, collationId); } } public static String genCode(final String word, final String set, final int collationId) { @@ -308,20 +307,15 @@ public static String genCode(final String word, final String set, final int coll String expr = "CollationSupport.FindInSet.exec"; if (collation.supportsBinaryEquality) { return String.format(expr + "Binary(%s, %s)", word, set); - } else if (collation.supportsLowercaseEquality) { - return String.format(expr + "Lowercase(%s, %s)", word, set); } else { - return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId); + return String.format(expr + "execCollationAware(%s, %s, %d)", word, set, collationId); } } public static int execBinary(final UTF8String word, final UTF8String set) { return set.findInSet(word); } - public static int execLowercase(final UTF8String word, final UTF8String set) { - return set.toLowerCase().findInSet(word.toLowerCase()); - } - public static int execICU(final UTF8String word, final UTF8String set, - final int collationId) { + public static int execCollationAware(final UTF8String word, final UTF8String set, + final int collationId) { return CollationAwareUTF8String.findInSet(word, set, collationId); } } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index f818ee9cbf618..be89b8f871489 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -571,82 +571,93 @@ public void testStringInstr() throws SparkException { assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3); } - private void assertFindInSet(String word, String set, String collationName, - Integer expected) throws SparkException { + private void assertFindInSet(String word, UTF8String set, String collationName, + Integer expected) throws SparkException { UTF8String w = UTF8String.fromString(word); - UTF8String s = UTF8String.fromString(set); int collationId = CollationFactory.collationNameToId(collationName); - assertEquals(expected, CollationSupport.FindInSet.exec(w, s, collationId)); + assertEquals(expected, CollationSupport.FindInSet.exec(w, set, collationId)); } @Test public void testFindInSet() throws SparkException { - assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY", 0); - assertFindInSet("abc", "abc,b,ab,c,def", "UTF8_BINARY", 1); - assertFindInSet("def", "abc,b,ab,c,def", "UTF8_BINARY", 5); - assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY", 0); - assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY", 0); - assertFindInSet("a", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); - assertFindInSet("c", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 4); - assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 3); - assertFindInSet("AbC", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 1); - assertFindInSet("abcd", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); - assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); - assertFindInSet("XX", "xx", "UTF8_BINARY_LCASE", 1); - assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0); - assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UTF8_BINARY_LCASE", 4); - assertFindInSet("a", "abc,b,ab,c,def", "UNICODE", 0); - assertFindInSet("ab", "abc,b,ab,c,def", "UNICODE", 3); - assertFindInSet("Ab", "abc,b,ab,c,def", "UNICODE", 0); - assertFindInSet("d,ef", "abc,b,ab,c,def", "UNICODE", 0); - assertFindInSet("xx", "xx", "UNICODE", 1); - assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE", 0); - assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE", 5); - assertFindInSet("a", "abc,b,ab,c,def", "UNICODE_CI", 0); - assertFindInSet("C", "abc,b,ab,c,def", "UNICODE_CI", 4); - assertFindInSet("DeF", "abc,b,ab,c,dEf", "UNICODE_CI", 5); - assertFindInSet("DEFG", "abc,b,ab,c,def", "UNICODE_CI", 0); - assertFindInSet("XX", "xx", "UNICODE_CI", 1); - assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4); - assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5); - assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5); - assertFindInSet("i̇", "İ", "UNICODE_CI", 1); - assertFindInSet("i", "İ", "UNICODE_CI", 0); - assertFindInSet("i̇", "i̇", "UNICODE_CI", 1); - assertFindInSet("i", "i̇", "UNICODE_CI", 0); - assertFindInSet("i̇", "İ,", "UNICODE_CI", 1); - assertFindInSet("i", "İ,", "UNICODE_CI", 0); - assertFindInSet("i̇", "i̇,", "UNICODE_CI", 1); - assertFindInSet("i", "i̇,", "UNICODE_CI", 0); - assertFindInSet("i̇", "ab,İ", "UNICODE_CI", 2); - assertFindInSet("i", "ab,İ", "UNICODE_CI", 0); - assertFindInSet("i̇", "ab,i̇", "UNICODE_CI", 2); - assertFindInSet("i", "ab,i̇", "UNICODE_CI", 0); - assertFindInSet("i̇", "ab,İ,12", "UNICODE_CI", 2); - assertFindInSet("i", "ab,İ,12", "UNICODE_CI", 0); - assertFindInSet("i̇", "ab,i̇,12", "UNICODE_CI", 2); - assertFindInSet("i", "ab,i̇,12", "UNICODE_CI", 0); - assertFindInSet("i̇o", "ab,İo,12", "UNICODE_CI", 2); - assertFindInSet("İo", "ab,i̇o,12", "UNICODE_CI", 2); - assertFindInSet("i̇", "İ", "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", "İ", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", "i̇", "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", "i̇", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", "İ,", "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", "İ,", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", "i̇,", "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", "i̇,", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", "ab,İ", "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", "ab,İ", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", "ab,i̇", "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", "ab,i̇", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", "ab,İ,12", "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", "ab,İ,12", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", "ab,i̇,12", "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2); - assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2); - assertFindInSet("c", "A�,B,C,V", "UTF8_BINARY_LCASE", 3); + assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 4); + assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 3); + assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_BINARY_LCASE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY_LCASE", 4); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4); + assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5); + assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4); + assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5); + assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 5); + assertFindInSet("i̇", UTF8String.fromString("İ"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("i̇"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("İ,"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("i̇,"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 0); + assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2); + assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UNICODE_CI", 2); + assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_BINARY_LCASE", 2); + assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_BINARY_LCASE", 2); + // Invalid UTF8 strings + assertFindInSet("C", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UTF8_BINARY", 3); + assertFindInSet("c", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UTF8_BINARY_LCASE", 2); + assertFindInSet("C", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UNICODE", 3); + assertFindInSet("c", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UNICODE_CI", 2); } private void assertReplace(String source, String search, String replace, String collationName, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala index 82615aac438d4..9cc123b708aff 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala @@ -325,12 +325,6 @@ class CollationStringExpressionsSuite assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT") } - test("Menelaos") { - val q = "select find_in_set('c' collate utf8_binary_lcase, " + - "cast(unhex('41C22C422C432C56') as string) collate utf8_binary_lcase)" - checkAnswer(sql(q), Row()) - } - test("Support Replace string expression with collation") { case class ReplaceTestCase[R](source: String, search: String, replace: String, c: String, result: R) From d6bc73a80c9e4893733b6aae817bbfa25dea94f9 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Mon, 27 May 2024 13:14:25 +0200 Subject: [PATCH 07/10] Fix scalastyle --- .../org/apache/spark/sql/catalyst/util/CollationSupport.java | 1 - 1 file changed, 1 deletion(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java index 3561e978ac46d..a8e62cb3aa00d 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java @@ -18,7 +18,6 @@ import com.ibm.icu.text.StringSearch; -import org.apache.spark.SparkException; import org.apache.spark.unsafe.types.UTF8String; import java.util.ArrayList; From 9444cfbe05d43e538bf8868cfad678492b8b2349 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 29 May 2024 07:41:50 +0200 Subject: [PATCH 08/10] More tests --- .../spark/unsafe/types/CollationSupportSuite.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 4597541f656f4..1c76d69966ed3 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -653,6 +653,9 @@ public void testFindInSet() throws SparkException { assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5); assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0); assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 4); assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 3); @@ -661,11 +664,17 @@ public void testFindInSet() throws SparkException { assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_BINARY_LCASE", 1); assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY_LCASE", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY_LCASE", 0); assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY_LCASE", 4); assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3); assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UNICODE", 0); assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1); assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0); assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5); @@ -673,6 +682,9 @@ public void testFindInSet() throws SparkException { assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4); assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5); assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0); assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1); assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4); assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5); From 1e53c46dd66ec51f3649b3de57c160cb97d42314 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Wed, 26 Jun 2024 15:03:36 +0200 Subject: [PATCH 09/10] Fixes --- .../util/CollationAwareUTF8String.java | 39 ++++++++- .../unsafe/types/CollationSupportSuite.java | 80 +++++++++---------- 2 files changed, 78 insertions(+), 41 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 62c2b4fe02847..32c2f3b86227c 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -231,6 +231,19 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right)); } + /* + * Performs string replacement for ICU collations by searching for instances of the search + * string in the src string, with respect to the specified collation, and then replacing + * them with the replace string. The method returns a new UTF8String with all instances of the + * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour + * used for UTF8_BINARY collation, the method returns src string if the search string is empty. + * + * @param src the string to be searched in + * @param search the string to be searched for + * @param replace the string to be used as replacement + * @param collationId the collation ID to use for string search + * @return the position of the first occurrence of `match` in `set` + */ public static UTF8String replace(final UTF8String src, final UTF8String search, final UTF8String replace, final int collationId) { // This collation aware implementation is based on existing implementation on UTF8String @@ -286,12 +299,26 @@ public static UTF8String replace(final UTF8String src, final UTF8String search, return buf.build(); } + /* + * Performs string replacement for UTF8_LCASE collation by searching for instances of the search + * string in the src string, with respect to lowercased string versions, and then replacing + * them with the replace string. The method returns a new UTF8String with all instances of the + * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour + * used for UTF8_BINARY collation, the method returns src string if the search string is empty. + * + * @param src the string to be searched in + * @param search the string to be searched for + * @param replace the string to be used as replacement + * @param collationId the collation ID to use for string search + * @return the position of the first occurrence of `match` in `set` + */ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search, final UTF8String replace) { if (src.numBytes() == 0 || search.numBytes() == 0) { return src; } + // TODO(SPARK-48725): Use lowerCaseCodePoints instead of UTF8String.toLowerCase. UTF8String lowercaseSearch = search.toLowerCase(); int start = 0; @@ -463,6 +490,17 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio BreakIterator.getWordInstance(locale))); } + /* + * Returns the position of the first occurrence of the match string in the set string, + * counting ASCII commas as delimiters. The match string is compared in a collation-aware manner, + * with respect to the specified collation ID. Similar to UTF8String.findInSet behaviour used + * for UTF8_BINARY collation, the method returns 0 if the match string contains no commas. + * + * @param match the string to be searched for + * @param set the string to be searched in + * @param collationId the collation ID to use for string comparison + * @return the position of the first occurrence of `match` in `set` + */ public static int findInSet(final UTF8String match, final UTF8String set, int collationId) { // If the "word" string contains a comma, FindInSet should return 0. if (match.contains(UTF8String.fromString(","))) { @@ -482,7 +520,6 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co byteIndex += UTF8String.numBytesForFirstByte(nextByte); ++charIndex; } - // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) { return wordCount; } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 9630276304e35..bd4ba6254d055 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -892,18 +892,18 @@ public void testFindInSet() throws SparkException { assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1); assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6); assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0); - assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 4); - assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 3); - assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1); - assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_BINARY_LCASE", 1); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY_LCASE", 6); - assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY_LCASE", 4); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4); + assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3); + assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_LCASE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_LCASE", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0); + assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 4); assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3); assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); @@ -943,34 +943,34 @@ public void testFindInSet() throws SparkException { assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 0); assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2); assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UNICODE_CI", 2); - assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 1); - assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 2); - assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 0); - assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_BINARY_LCASE", 2); - assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_BINARY_LCASE", 2); + assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 0); + assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 0); + assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_LCASE", 2); + assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_LCASE", 2); // Invalid UTF8 strings assertFindInSet("C", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), "UTF8_BINARY", 3); assertFindInSet("c", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UTF8_BINARY_LCASE", 2); + "UTF8_LCASE", 2); assertFindInSet("C", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UNICODE", 3); + "UNICODE", 2); assertFindInSet("c", UTF8String.fromBytes( new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), "UNICODE_CI", 2); @@ -1018,15 +1018,15 @@ public void testReplace() throws SparkException { assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇"); assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx"); assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy"); - assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abX\u030712"); // != UNICODE_CI - assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI - assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12"); - assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12"); - assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx"); - assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", + assertReplace("abi̇12", "i", "X", "UTF8_LCASE", "abX\u030712"); // != UNICODE_CI + assertReplace("abi̇12", "\u0307", "X", "UTF8_LCASE", "abiX12"); // != UNICODE_CI + assertReplace("abi̇12", "İ", "X", "UTF8_LCASE", "abX12"); + assertReplace("abİ12", "i", "X", "UTF8_LCASE", "abİ12"); + assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_LCASE", "xxxxxx"); + assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_LCASE", "İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI - assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx"); - assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy"); + assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_LCASE", "abxx12xx"); + assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_LCASE", "abyy12yy"); } private void assertLocate(String substring, String string, Integer start, String collationName, From b83511c6492ea66bb3d268a1f42b84143b22e543 Mon Sep 17 00:00:00 2001 From: Uros Bojanic <157381213+uros-db@users.noreply.github.com> Date: Thu, 27 Jun 2024 17:49:28 +0200 Subject: [PATCH 10/10] Update comments --- .../util/CollationAwareUTF8String.java | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 32c2f3b86227c..272a8aa128141 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -233,10 +233,10 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String /* * Performs string replacement for ICU collations by searching for instances of the search - * string in the src string, with respect to the specified collation, and then replacing + * string in the `src` string, with respect to the specified collation, and then replacing * them with the replace string. The method returns a new UTF8String with all instances of the - * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour - * used for UTF8_BINARY collation, the method returns src string if the search string is empty. + * search string replaced using the replace string. Similar to UTF8String.findInSet behavior + * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty. * * @param src the string to be searched in * @param search the string to be searched for @@ -303,8 +303,8 @@ public static UTF8String replace(final UTF8String src, final UTF8String search, * Performs string replacement for UTF8_LCASE collation by searching for instances of the search * string in the src string, with respect to lowercased string versions, and then replacing * them with the replace string. The method returns a new UTF8String with all instances of the - * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour - * used for UTF8_BINARY collation, the method returns src string if the search string is empty. + * search string replaced using the replace string. Similar to UTF8String.findInSet behavior + * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty. * * @param src the string to be searched in * @param search the string to be searched for @@ -355,7 +355,7 @@ public static UTF8String toUpperCase(final UTF8String target) { } private static UTF8String toUpperCaseSlow(final UTF8String target) { - // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to // convert the string to uppercase, which only accepts a Java strings as input. // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` return UTF8String.fromString(UCharacter.toUpperCase(target.toString())); @@ -373,7 +373,7 @@ public static UTF8String toUpperCase(final UTF8String target, final int collatio } private static UTF8String toUpperCaseSlow(final UTF8String target, final int collationId) { - // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to // convert the string to uppercase, which only accepts a Java strings as input. ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); @@ -393,7 +393,7 @@ public static UTF8String toLowerCase(final UTF8String target) { } private static UTF8String toLowerCaseSlow(final UTF8String target) { - // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to // convert the string to lowercase, which only accepts a Java strings as input. // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` return UTF8String.fromString(UCharacter.toLowerCase(target.toString())); @@ -411,7 +411,7 @@ public static UTF8String toLowerCase(final UTF8String target, final int collatio } private static UTF8String toLowerCaseSlow(final UTF8String target, final int collationId) { - // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to // convert the string to lowercase, which only accepts a Java strings as input. ULocale locale = CollationFactory.fetchCollation(collationId) .collator.getLocale(ULocale.ACTUAL_LOCALE); @@ -472,7 +472,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) { * Convert the input string to titlecase using the ICU root locale rules. */ public static UTF8String toTitleCase(final UTF8String target) { - // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to + // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to // convert the string to titlecase, which only accepts a Java strings as input. // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid` return UTF8String.fromString(UCharacter.toTitleCase(target.toString(), @@ -493,7 +493,7 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio /* * Returns the position of the first occurrence of the match string in the set string, * counting ASCII commas as delimiters. The match string is compared in a collation-aware manner, - * with respect to the specified collation ID. Similar to UTF8String.findInSet behaviour used + * with respect to the specified collation ID. Similar to UTF8String.findInSet behavior used * for UTF8_BINARY collation, the method returns 0 if the match string contains no commas. * * @param match the string to be searched for