From b0225b18fbd3bf6c21245d2706af5b1b4769a2f9 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 21 May 2024 11:35:53 +0200
Subject: [PATCH 01/10] Initial commit

---
 .../util/CollationAwareUTF8String.java        | 177 +++++++++++++++---
 1 file changed, 155 insertions(+), 22 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index ee0d611d7e652..9a97d368750d9 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -34,6 +34,155 @@
  * Utility class for collation-aware UTF8String operations.
  */
 public class CollationAwareUTF8String {
+
+  /**
+   * The constant value to indicate that the match is not found when searching for a pattern
+   * string in a target string.
+   */
+  private static final int MATCH_NOT_FOUND = -1;
+
+  /**
+   * Returns whether the target string starts with the specified prefix, starting from the
+   * specified position (0-based index referring to character position in UTF8String), with respect
+   * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is already lowercased
+   * prior to method call to avoid the overhead of calling .toLowerCase() multiple times on the
+   * same prefix string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return whether the target string starts with the specified prefix in UTF8_BINARY_LCASE
+   */
+  public static boolean lowercaseMatchFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length of the substring of the target string that starts with the specified
+   * prefix, starting from the specified position (0-based index referring to character position
+   * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * prefix is already lowercased. The method only considers the part of target string that
+   * starts from the specified (inclusive) position (that is, the method does not look at UTF8
+   * characters of the target string at or after position `endPos`). If the prefix is not found,
+   * MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return length of the target substring that ends with the specified prefix in lowercase
+   */
+  public static int lowercaseMatchLengthFrom(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    assert startPos >= 0;
+    for (int len = 0; len <= target.numChars() - startPos; ++len) {
+      if (target.substring(startPos, startPos + len).toLowerCase().equals(lowercasePattern)) {
+        return len;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the first occurrence of the pattern string in the target string,
+   * starting from the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * pattern string is already lowercased prior to call. If the pattern is not found,
+   * MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param startPos the start position for searching (in the target string)
+   * @return the position of the first occurrence of pattern in target
+   */
+  public static int lowercaseFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int startPos) {
+    assert startPos >= 0;
+    for (int i = startPos; i <= target.numChars(); ++i) {
+      if (lowercaseMatchFrom(target, lowercasePattern, i)) {
+        return i;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns whether the target string ends with the specified suffix, ending at the specified
+   * position (0-based index referring to character position in UTF8String), with respect to the
+   * UTF8_BINARY_LCASE collation. The method assumes that the suffix is already lowercased prior
+   * to method call to avoid the overhead of calling .toLowerCase() multiple times on the same
+   * suffix string.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return whether the target string ends with the specified suffix in lowercase
+   */
+  public static boolean lowercaseMatchUntil(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) != MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the length of the substring of the target string that ends with the specified
+   * suffix, ending at the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * suffix is already lowercased. The method only considers the part of target string that ends
+   * at the specified (non-inclusive) position (that is, the method does not look at UTF8
+   * characters of the target string at or after position `endPos`). If the suffix is not found,
+   * MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return length of the target substring that ends with the specified suffix in lowercase
+   */
+  public static int lowercaseMatchLengthUntil(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    assert endPos <= target.numChars();
+    for (int len = 0; len <= endPos; ++len) {
+      if (target.substring(endPos - len, endPos).toLowerCase().equals(lowercasePattern)) {
+        return len;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
+  /**
+   * Returns the position of the last occurrence of the pattern string in the target string,
+   * ending at the specified position (0-based index referring to character position in
+   * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method assumes that the
+   * pattern string is already lowercased prior to call. If the pattern is not found,
+   * MATCH_NOT_FOUND is returned.
+   *
+   * @param target the string to be searched in
+   * @param lowercasePattern the string to be searched for
+   * @param endPos the end position for searching (in the target string)
+   * @return the position of the last occurrence of pattern in target
+   */
+  public static int lowercaseRFind(
+      final UTF8String target,
+      final UTF8String lowercasePattern,
+      int endPos) {
+    assert endPos <= target.numChars();
+    for (int i = endPos; i >= 0; --i) {
+      if (lowercaseMatchUntil(target, lowercasePattern, i)) {
+        return i;
+      }
+    }
+    return MATCH_NOT_FOUND;
+  }
+
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -94,44 +243,28 @@ public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String
     if (src.numBytes() == 0 || search.numBytes() == 0) {
       return src;
     }
-    UTF8String lowercaseString = src.toLowerCase();
+
     UTF8String lowercaseSearch = search.toLowerCase();
 
     int start = 0;
-    int end = lowercaseString.indexOf(lowercaseSearch, 0);
+    int end = lowercaseFind(src, lowercaseSearch, start);
     if (end == -1) {
       // Search string was not found, so string is unchanged.
       return src;
     }
 
-    // Initialize byte positions
-    int c = 0;
-    int byteStart = 0; // position in byte
-    int byteEnd = 0; // position in byte
-    while (byteEnd < src.numBytes() && c < end) {
-      byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
-      c += 1;
-    }
-
     // At least one match was found. Estimate space needed for result.
     // The 16x multiplier here is chosen to match commons-lang3's implementation.
     int increase = Math.max(0, replace.numBytes() - search.numBytes()) * 16;
     final UTF8StringBuilder buf = new UTF8StringBuilder(src.numBytes() + increase);
     while (end != -1) {
-      buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart, byteEnd - byteStart);
+      buf.append(src.substring(start, end));
       buf.append(replace);
       // Update character positions
-      start = end + lowercaseSearch.numChars();
-      end = lowercaseString.indexOf(lowercaseSearch, start);
-      // Update byte positions
-      byteStart = byteEnd + search.numBytes();
-      while (byteEnd < src.numBytes() && c < end) {
-        byteEnd += UTF8String.numBytesForFirstByte(src.getByte(byteEnd));
-        c += 1;
-      }
+      start = end + lowercaseMatchLengthFrom(src, lowercaseSearch, end);
+      end = lowercaseFind(src, lowercaseSearch, start);
     }
-    buf.appendBytes(src.getBaseObject(), src.getBaseOffset() + byteStart,
-      src.numBytes() - byteStart);
+    buf.append(src.substring(start, src.numChars()));
     return buf.build();
   }
 

From 665742c28e7207a0b78fb64144d977c714147274 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 21 May 2024 11:36:03 +0200
Subject: [PATCH 02/10] Tests

---
 .../unsafe/types/CollationSupportSuite.java   | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 7fc3c4e349c3b..70151be893976 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -610,8 +610,42 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4);
     assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5);
     assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5);
+    assertFindInSet("i̇", "İ", "UNICODE_CI", 1);
+    assertFindInSet("i", "İ", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "i̇", "UNICODE_CI", 1);
+    assertFindInSet("i", "i̇", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "İ,", "UNICODE_CI", 1);
+    assertFindInSet("i", "İ,", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "i̇,", "UNICODE_CI", 1);
+    assertFindInSet("i", "i̇,", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,İ", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,İ", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,i̇", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,i̇", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,İ,12", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,İ,12", "UNICODE_CI", 0);
+    assertFindInSet("i̇", "ab,i̇,12", "UNICODE_CI", 2);
+    assertFindInSet("i", "ab,i̇,12", "UNICODE_CI", 0);
     assertFindInSet("i̇o", "ab,İo,12", "UNICODE_CI", 2);
     assertFindInSet("İo", "ab,i̇o,12", "UNICODE_CI", 2);
+    assertFindInSet("i̇", "İ", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "İ", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "i̇", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "i̇", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "İ,", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "İ,", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "i̇,", "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", "i̇,", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,İ", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,İ", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,i̇", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,i̇", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,İ,12", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,İ,12", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", "ab,i̇,12", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2);
   }
 
   private void assertReplace(String source, String search, String replace, String collationName,
@@ -648,8 +682,22 @@ public void testReplace() throws SparkException {
     assertReplace("replace", "", "123", "UNICODE_CI", "replace");
     assertReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c");
     assertReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad");
+    assertReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12");
+    assertReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12");
+    assertReplace("abi̇12", "İ", "X", "UNICODE_CI", "abX12");
+    assertReplace("abİ12", "i", "X", "UNICODE_CI", "abİ12");
+    assertReplace("İi̇İi̇İi̇", "i̇", "x", "UNICODE_CI", "xxxxxx");
+    assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
     assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx");
     assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
+    assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abẊ12"); // != UNICODE_CI
+    assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI
+    assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12");
+    assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12");
+    assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx");
+    assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", "İẋİẋİẋ"); // != UNICODE_CI
+    assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx");
+    assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy");
   }
 
   private void assertLocate(String substring, String string, Integer start, String collationName,

From 8d05451b3a3782c167d5bd48dcb89229606b86f2 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Tue, 21 May 2024 14:19:41 +0200
Subject: [PATCH 03/10] checkstyle

---
 .../org/apache/spark/unsafe/types/CollationSupportSuite.java   | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 70151be893976..e72e7b907e0e5 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -23,7 +23,7 @@
 
 import static org.junit.jupiter.api.Assertions.*;
 
-
+// checkstyle.off: AvoidEscapedUnicodeCharacters
 public class CollationSupportSuite {
 
   /**
@@ -1056,3 +1056,4 @@ public void testStringTrim() throws SparkException {
   // TODO: Test other collation-aware expressions.
 
 }
+// checkstyle.on: AvoidEscapedUnicodeCharacters

From d5a811af09a31fc73f59bf82c914b964944067d3 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 24 May 2024 12:03:43 +0200
Subject: [PATCH 04/10] Add tests

---
 .../apache/spark/unsafe/types/CollationSupportSuite.java    | 6 ++++--
 .../apache/spark/sql/CollationStringExpressionsSuite.scala  | 6 ++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index e72e7b907e0e5..f818ee9cbf618 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -646,6 +646,7 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0);
     assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2);
     assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("c", "A�,B,C,V", "UTF8_BINARY_LCASE", 3);
   }
 
   private void assertReplace(String source, String search, String replace, String collationName,
@@ -690,12 +691,13 @@ public void testReplace() throws SparkException {
     assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
     assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx");
     assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
-    assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abẊ12"); // != UNICODE_CI
+    assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abX\u030712"); // != UNICODE_CI
     assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI
     assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12");
     assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12");
     assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx");
-    assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE", "İẋİẋİẋ"); // != UNICODE_CI
+    assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE",
+      "İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI
     assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx");
     assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy");
   }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index 9cc123b708aff..82615aac438d4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -325,6 +325,12 @@ class CollationStringExpressionsSuite
     assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
+  test("Menelaos") {
+    val q = "select find_in_set('c' collate utf8_binary_lcase, " +
+      "cast(unhex('41C22C422C432C56') as string) collate utf8_binary_lcase)"
+    checkAnswer(sql(q), Row())
+  }
+
   test("Support Replace string expression with collation") {
     case class ReplaceTestCase[R](source: String, search: String, replace: String,
         c: String, result: R)

From f7d22da33947af0e1760d93e409765ce48694a07 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Fri, 24 May 2024 15:13:47 +0200
Subject: [PATCH 05/10] Update new method access

---
 .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 9a97d368750d9..5f39494641313 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -74,7 +74,7 @@ public static boolean lowercaseMatchFrom(
    * @param startPos the start position for searching (in the target string)
    * @return length of the target substring that ends with the specified prefix in lowercase
    */
-  public static int lowercaseMatchLengthFrom(
+  private static int lowercaseMatchLengthFrom(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
@@ -99,7 +99,7 @@ public static int lowercaseMatchLengthFrom(
    * @param startPos the start position for searching (in the target string)
    * @return the position of the first occurrence of pattern in target
    */
-  public static int lowercaseFind(
+  private static int lowercaseFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int startPos) {
@@ -145,7 +145,7 @@ public static boolean lowercaseMatchUntil(
    * @param endPos the end position for searching (in the target string)
    * @return length of the target substring that ends with the specified suffix in lowercase
    */
-  public static int lowercaseMatchLengthUntil(
+  private static int lowercaseMatchLengthUntil(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {
@@ -170,7 +170,7 @@ public static int lowercaseMatchLengthUntil(
    * @param endPos the end position for searching (in the target string)
    * @return the position of the last occurrence of pattern in target
    */
-  public static int lowercaseRFind(
+  private static int lowercaseRFind(
       final UTF8String target,
       final UTF8String lowercasePattern,
       int endPos) {

From 7ff48e439c715a47adcb0d134e31489d9bd44af4 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 27 May 2024 09:42:40 +0200
Subject: [PATCH 06/10] Fix FindInSet

---
 .../util/CollationAwareUTF8String.java        |  36 ++---
 .../sql/catalyst/util/CollationSupport.java   |  16 +-
 .../unsafe/types/CollationSupportSuite.java   | 151 ++++++++++--------
 .../sql/CollationStringExpressionsSuite.scala |   6 -
 4 files changed, 102 insertions(+), 107 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 5f39494641313..718550097bc13 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -287,32 +287,28 @@ public static String toTitleCase(final String target, final int collationId) {
   }
 
   public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
+    // If the "word" string contains a comma, FindInSet should return 0.
     if (match.contains(UTF8String.fromString(","))) {
       return 0;
     }
-
-    String setString = set.toString();
-    StringSearch stringSearch = CollationFactory.getStringSearch(setString, match.toString(),
-      collationId);
-
-    int wordStart = 0;
-    while ((wordStart = stringSearch.next()) != StringSearch.DONE) {
-      boolean isValidStart = wordStart == 0 || setString.charAt(wordStart - 1) == ',';
-      boolean isValidEnd = wordStart + stringSearch.getMatchLength() == setString.length()
-        || setString.charAt(wordStart + stringSearch.getMatchLength()) == ',';
-
-      if (isValidStart && isValidEnd) {
-        int pos = 0;
-        for (int i = 0; i < setString.length() && i < wordStart; i++) {
-          if (setString.charAt(i) == ',') {
-            pos++;
-          }
+    // Otherwise, search for commas in "set" and compare each substring with "word".
+    int byteIndex = 0, charIndex = 0, wordCount = 1, lastComma = -1;
+    while (byteIndex < set.numBytes()) {
+      byte nextByte = set.getByte(byteIndex);
+      if (nextByte == (byte) ',') {
+        if (set.substring(lastComma + 1, charIndex).semanticEquals(match, collationId)) {
+          return wordCount;
         }
-
-        return pos + 1;
+        lastComma = charIndex;
+        ++wordCount;
       }
+      byteIndex += UTF8String.numBytesForFirstByte(nextByte);
+      ++charIndex;
     }
-
+    if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) {
+      return wordCount;
+    }
+    // If no match is found, return 0.
     return 0;
   }
 
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index bea3dc08b4489..3561e978ac46d 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -18,6 +18,7 @@
 
 import com.ibm.icu.text.StringSearch;
 
+import org.apache.spark.SparkException;
 import org.apache.spark.unsafe.types.UTF8String;
 
 import java.util.ArrayList;
@@ -297,10 +298,8 @@ public static int exec(final UTF8String word, final UTF8String set, final int co
       CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
       if (collation.supportsBinaryEquality) {
         return execBinary(word, set);
-      } else if (collation.supportsLowercaseEquality) {
-        return execLowercase(word, set);
       } else {
-        return execICU(word, set, collationId);
+        return execCollationAware(word, set, collationId);
       }
     }
     public static String genCode(final String word, final String set, final int collationId) {
@@ -308,20 +307,15 @@ public static String genCode(final String word, final String set, final int coll
       String expr = "CollationSupport.FindInSet.exec";
       if (collation.supportsBinaryEquality) {
         return String.format(expr + "Binary(%s, %s)", word, set);
-      } else if (collation.supportsLowercaseEquality) {
-        return String.format(expr + "Lowercase(%s, %s)", word, set);
       } else {
-        return String.format(expr + "ICU(%s, %s, %d)", word, set, collationId);
+        return String.format(expr + "execCollationAware(%s, %s, %d)", word, set, collationId);
       }
     }
     public static int execBinary(final UTF8String word, final UTF8String set) {
       return set.findInSet(word);
     }
-    public static int execLowercase(final UTF8String word, final UTF8String set) {
-      return set.toLowerCase().findInSet(word.toLowerCase());
-    }
-    public static int execICU(final UTF8String word, final UTF8String set,
-                                  final int collationId) {
+    public static int execCollationAware(final UTF8String word, final UTF8String set,
+        final int collationId) {
       return CollationAwareUTF8String.findInSet(word, set, collationId);
     }
   }
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index f818ee9cbf618..be89b8f871489 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -571,82 +571,93 @@ public void testStringInstr() throws SparkException {
     assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3);
   }
 
-  private void assertFindInSet(String word, String set, String collationName,
-        Integer expected) throws SparkException {
+  private void assertFindInSet(String word, UTF8String set, String collationName,
+      Integer expected) throws SparkException {
     UTF8String w = UTF8String.fromString(word);
-    UTF8String s = UTF8String.fromString(set);
     int collationId = CollationFactory.collationNameToId(collationName);
-    assertEquals(expected, CollationSupport.FindInSet.exec(w, s, collationId));
+    assertEquals(expected, CollationSupport.FindInSet.exec(w, set, collationId));
   }
 
   @Test
   public void testFindInSet() throws SparkException {
-    assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY", 0);
-    assertFindInSet("abc", "abc,b,ab,c,def", "UTF8_BINARY", 1);
-    assertFindInSet("def", "abc,b,ab,c,def", "UTF8_BINARY", 5);
-    assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY", 0);
-    assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY", 0);
-    assertFindInSet("a", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("c", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 4);
-    assertFindInSet("AB", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 3);
-    assertFindInSet("AbC", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("abcd", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("d,ef", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("XX", "xx", "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("", "abc,b,ab,c,def", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UTF8_BINARY_LCASE", 4);
-    assertFindInSet("a", "abc,b,ab,c,def", "UNICODE", 0);
-    assertFindInSet("ab", "abc,b,ab,c,def", "UNICODE", 3);
-    assertFindInSet("Ab", "abc,b,ab,c,def", "UNICODE", 0);
-    assertFindInSet("d,ef", "abc,b,ab,c,def", "UNICODE", 0);
-    assertFindInSet("xx", "xx", "UNICODE", 1);
-    assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE", 0);
-    assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE", 5);
-    assertFindInSet("a", "abc,b,ab,c,def", "UNICODE_CI", 0);
-    assertFindInSet("C", "abc,b,ab,c,def", "UNICODE_CI", 4);
-    assertFindInSet("DeF", "abc,b,ab,c,dEf", "UNICODE_CI", 5);
-    assertFindInSet("DEFG", "abc,b,ab,c,def", "UNICODE_CI", 0);
-    assertFindInSet("XX", "xx", "UNICODE_CI", 1);
-    assertFindInSet("界x", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 4);
-    assertFindInSet("界x", "test,大千,界Xx,世,界X,大,千,世界", "UNICODE_CI", 5);
-    assertFindInSet("大", "test,大千,世,界X,大,千,世界", "UNICODE_CI", 5);
-    assertFindInSet("i̇", "İ", "UNICODE_CI", 1);
-    assertFindInSet("i", "İ", "UNICODE_CI", 0);
-    assertFindInSet("i̇", "i̇", "UNICODE_CI", 1);
-    assertFindInSet("i", "i̇", "UNICODE_CI", 0);
-    assertFindInSet("i̇", "İ,", "UNICODE_CI", 1);
-    assertFindInSet("i", "İ,", "UNICODE_CI", 0);
-    assertFindInSet("i̇", "i̇,", "UNICODE_CI", 1);
-    assertFindInSet("i", "i̇,", "UNICODE_CI", 0);
-    assertFindInSet("i̇", "ab,İ", "UNICODE_CI", 2);
-    assertFindInSet("i", "ab,İ", "UNICODE_CI", 0);
-    assertFindInSet("i̇", "ab,i̇", "UNICODE_CI", 2);
-    assertFindInSet("i", "ab,i̇", "UNICODE_CI", 0);
-    assertFindInSet("i̇", "ab,İ,12", "UNICODE_CI", 2);
-    assertFindInSet("i", "ab,İ,12", "UNICODE_CI", 0);
-    assertFindInSet("i̇", "ab,i̇,12", "UNICODE_CI", 2);
-    assertFindInSet("i", "ab,i̇,12", "UNICODE_CI", 0);
-    assertFindInSet("i̇o", "ab,İo,12", "UNICODE_CI", 2);
-    assertFindInSet("İo", "ab,i̇o,12", "UNICODE_CI", 2);
-    assertFindInSet("i̇", "İ", "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", "İ", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", "i̇", "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", "i̇", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", "İ,", "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", "İ,", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", "i̇,", "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", "i̇,", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", "ab,İ", "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", "ab,İ", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", "ab,i̇", "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", "ab,i̇", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", "ab,İ,12", "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", "ab,İ,12", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", "ab,i̇,12", "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", "ab,i̇,12", "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇o", "ab,İo,12", "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("İo", "ab,i̇o,12", "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("c", "A�,B,C,V", "UTF8_BINARY_LCASE", 3);
+    assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+    assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1);
+    assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5);
+    assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+    assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 4);
+    assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 3);
+    assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY_LCASE", 4);
+    assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+    assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3);
+    assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+    assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+    assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1);
+    assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0);
+    assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5);
+    assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+    assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4);
+    assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5);
+    assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+    assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1);
+    assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4);
+    assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5);
+    assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 5);
+    assertFindInSet("i̇", UTF8String.fromString("İ"), "UNICODE_CI", 1);
+    assertFindInSet("i", UTF8String.fromString("İ"), "UNICODE_CI", 0);
+    assertFindInSet("i̇", UTF8String.fromString("i̇"), "UNICODE_CI", 1);
+    assertFindInSet("i", UTF8String.fromString("i̇"), "UNICODE_CI", 0);
+    assertFindInSet("i̇", UTF8String.fromString("İ,"), "UNICODE_CI", 1);
+    assertFindInSet("i", UTF8String.fromString("İ,"), "UNICODE_CI", 0);
+    assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UNICODE_CI", 1);
+    assertFindInSet("i", UTF8String.fromString("i̇,"), "UNICODE_CI", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UNICODE_CI", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,İ"), "UNICODE_CI", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UNICODE_CI", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UNICODE_CI", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 0);
+    assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2);
+    assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UNICODE_CI", 2);
+    assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_BINARY_LCASE", 2);
+    // Invalid UTF8 strings
+    assertFindInSet("C", UTF8String.fromBytes(
+      new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
+      "UTF8_BINARY", 3);
+    assertFindInSet("c", UTF8String.fromBytes(
+      new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
+      "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("C", UTF8String.fromBytes(
+      new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
+      "UNICODE", 3);
+    assertFindInSet("c", UTF8String.fromBytes(
+      new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
+      "UNICODE_CI", 2);
   }
 
   private void assertReplace(String source, String search, String replace, String collationName,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index 82615aac438d4..9cc123b708aff 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -325,12 +325,6 @@ class CollationStringExpressionsSuite
     assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
   }
 
-  test("Menelaos") {
-    val q = "select find_in_set('c' collate utf8_binary_lcase, " +
-      "cast(unhex('41C22C422C432C56') as string) collate utf8_binary_lcase)"
-    checkAnswer(sql(q), Row())
-  }
-
   test("Support Replace string expression with collation") {
     case class ReplaceTestCase[R](source: String, search: String, replace: String,
         c: String, result: R)

From d6bc73a80c9e4893733b6aae817bbfa25dea94f9 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Mon, 27 May 2024 13:14:25 +0200
Subject: [PATCH 07/10] Fix scalastyle

---
 .../org/apache/spark/sql/catalyst/util/CollationSupport.java     | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index 3561e978ac46d..a8e62cb3aa00d 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -18,7 +18,6 @@
 
 import com.ibm.icu.text.StringSearch;
 
-import org.apache.spark.SparkException;
 import org.apache.spark.unsafe.types.UTF8String;
 
 import java.util.ArrayList;

From 9444cfbe05d43e538bf8868cfad678492b8b2349 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 29 May 2024 07:41:50 +0200
Subject: [PATCH 08/10] More tests

---
 .../spark/unsafe/types/CollationSupportSuite.java    | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 4597541f656f4..1c76d69966ed3 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -653,6 +653,9 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5);
     assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
     assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0);
+    assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6);
+    assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0);
     assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
     assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 4);
     assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 3);
@@ -661,11 +664,17 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
     assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_BINARY_LCASE", 1);
     assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
+    assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY_LCASE", 6);
+    assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY_LCASE", 0);
     assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY_LCASE", 4);
     assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
     assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3);
     assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
     assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
+    assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE", 1);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE", 6);
+    assertFindInSet("", UTF8String.fromString("abc"), "UNICODE", 0);
     assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1);
     assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 0);
     assertFindInSet("大", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE", 5);
@@ -673,6 +682,9 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4);
     assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5);
     assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0);
+    assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6);
+    assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0);
     assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1);
     assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UNICODE_CI", 4);
     assertFindInSet("界x", UTF8String.fromString("test,大千,界Xx,世,界X,大,千,世界"), "UNICODE_CI", 5);

From 1e53c46dd66ec51f3649b3de57c160cb97d42314 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Wed, 26 Jun 2024 15:03:36 +0200
Subject: [PATCH 09/10] Fixes

---
 .../util/CollationAwareUTF8String.java        | 39 ++++++++-
 .../unsafe/types/CollationSupportSuite.java   | 80 +++++++++----------
 2 files changed, 78 insertions(+), 41 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 62c2b4fe02847..32c2f3b86227c 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -231,6 +231,19 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
     return lowerCaseCodePoints(left).binaryCompare(lowerCaseCodePoints(right));
   }
 
+  /*
+   * Performs string replacement for ICU collations by searching for instances of the search
+   * string in the src string, with respect to the specified collation, and then replacing
+   * them with the replace string. The method returns a new UTF8String with all instances of the
+   * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour
+   * used for UTF8_BINARY collation, the method returns src string if the search string is empty.
+   *
+   * @param src the string to be searched in
+   * @param search the string to be searched for
+   * @param replace the string to be used as replacement
+   * @param collationId the collation ID to use for string search
+   * @return the position of the first occurrence of `match` in `set`
+   */
   public static UTF8String replace(final UTF8String src, final UTF8String search,
       final UTF8String replace, final int collationId) {
     // This collation aware implementation is based on existing implementation on UTF8String
@@ -286,12 +299,26 @@ public static UTF8String replace(final UTF8String src, final UTF8String search,
     return buf.build();
   }
 
+  /*
+   * Performs string replacement for UTF8_LCASE collation by searching for instances of the search
+   * string in the src string, with respect to lowercased string versions, and then replacing
+   * them with the replace string. The method returns a new UTF8String with all instances of the
+   * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour
+   * used for UTF8_BINARY collation, the method returns src string if the search string is empty.
+   *
+   * @param src the string to be searched in
+   * @param search the string to be searched for
+   * @param replace the string to be used as replacement
+   * @param collationId the collation ID to use for string search
+   * @return the position of the first occurrence of `match` in `set`
+   */
   public static UTF8String lowercaseReplace(final UTF8String src, final UTF8String search,
       final UTF8String replace) {
     if (src.numBytes() == 0 || search.numBytes() == 0) {
       return src;
     }
 
+    // TODO(SPARK-48725): Use lowerCaseCodePoints instead of UTF8String.toLowerCase.
     UTF8String lowercaseSearch = search.toLowerCase();
 
     int start = 0;
@@ -463,6 +490,17 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio
       BreakIterator.getWordInstance(locale)));
   }
 
+  /*
+   * Returns the position of the first occurrence of the match string in the set string,
+   * counting ASCII commas as delimiters. The match string is compared in a collation-aware manner,
+   * with respect to the specified collation ID. Similar to UTF8String.findInSet behaviour used
+   * for UTF8_BINARY collation, the method returns 0 if the match string contains no commas.
+   *
+   * @param match the string to be searched for
+   * @param set the string to be searched in
+   * @param collationId the collation ID to use for string comparison
+   * @return the position of the first occurrence of `match` in `set`
+   */
   public static int findInSet(final UTF8String match, final UTF8String set, int collationId) {
     // If the "word" string contains a comma, FindInSet should return 0.
     if (match.contains(UTF8String.fromString(","))) {
@@ -482,7 +520,6 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co
       byteIndex += UTF8String.numBytesForFirstByte(nextByte);
       ++charIndex;
     }
-    // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
     if (set.substring(lastComma + 1, set.numBytes()).semanticEquals(match, collationId)) {
       return wordCount;
     }
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index 9630276304e35..bd4ba6254d055 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -892,18 +892,18 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1);
     assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6);
     assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0);
-    assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 4);
-    assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 3);
-    assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY_LCASE", 6);
-    assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_BINARY_LCASE", 4);
+    assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+    assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4);
+    assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3);
+    assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1);
+    assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+    assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+    assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_LCASE", 1);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0);
+    assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_LCASE", 1);
+    assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6);
+    assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0);
+    assertFindInSet("界x", UTF8String.fromString("test,大千,世,界X,大,千,世界"), "UTF8_LCASE", 4);
     assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
     assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3);
     assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0);
@@ -943,34 +943,34 @@ public void testFindInSet() throws SparkException {
     assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UNICODE_CI", 0);
     assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UNICODE_CI", 2);
     assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UNICODE_CI", 2);
-    assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 1);
-    assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_BINARY_LCASE", 0);
-    assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_BINARY_LCASE", 2);
-    assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_BINARY_LCASE", 2);
+    assertFindInSet("i̇", UTF8String.fromString("İ"), "UTF8_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("İ"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("i̇"), "UTF8_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("i̇"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("İ,"), "UTF8_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("İ,"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("i̇,"), "UTF8_LCASE", 1);
+    assertFindInSet("i", UTF8String.fromString("i̇,"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,İ"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,i̇"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,İ,12"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 2);
+    assertFindInSet("i", UTF8String.fromString("ab,i̇,12"), "UTF8_LCASE", 0);
+    assertFindInSet("i̇o", UTF8String.fromString("ab,İo,12"), "UTF8_LCASE", 2);
+    assertFindInSet("İo", UTF8String.fromString("ab,i̇o,12"), "UTF8_LCASE", 2);
     // Invalid UTF8 strings
     assertFindInSet("C", UTF8String.fromBytes(
       new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
       "UTF8_BINARY", 3);
     assertFindInSet("c", UTF8String.fromBytes(
       new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
-      "UTF8_BINARY_LCASE", 2);
+      "UTF8_LCASE", 2);
     assertFindInSet("C", UTF8String.fromBytes(
       new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
-      "UNICODE", 3);
+      "UNICODE", 2);
     assertFindInSet("c", UTF8String.fromBytes(
       new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }),
       "UNICODE_CI", 2);
@@ -1018,15 +1018,15 @@ public void testReplace() throws SparkException {
     assertReplace("İi̇İi̇İi̇", "i", "x", "UNICODE_CI", "İi̇İi̇İi̇");
     assertReplace("abİo12i̇o", "i̇o", "xx", "UNICODE_CI", "abxx12xx");
     assertReplace("abi̇o12i̇o", "İo", "yy", "UNICODE_CI", "abyy12yy");
-    assertReplace("abi̇12", "i", "X", "UTF8_BINARY_LCASE", "abX\u030712"); // != UNICODE_CI
-    assertReplace("abi̇12", "\u0307", "X", "UTF8_BINARY_LCASE", "abiX12"); // != UNICODE_CI
-    assertReplace("abi̇12", "İ", "X", "UTF8_BINARY_LCASE", "abX12");
-    assertReplace("abİ12", "i", "X", "UTF8_BINARY_LCASE", "abİ12");
-    assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_BINARY_LCASE", "xxxxxx");
-    assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_BINARY_LCASE",
+    assertReplace("abi̇12", "i", "X", "UTF8_LCASE", "abX\u030712"); // != UNICODE_CI
+    assertReplace("abi̇12", "\u0307", "X", "UTF8_LCASE", "abiX12"); // != UNICODE_CI
+    assertReplace("abi̇12", "İ", "X", "UTF8_LCASE", "abX12");
+    assertReplace("abİ12", "i", "X", "UTF8_LCASE", "abİ12");
+    assertReplace("İi̇İi̇İi̇", "i̇", "x", "UTF8_LCASE", "xxxxxx");
+    assertReplace("İi̇İi̇İi̇", "i", "x", "UTF8_LCASE",
       "İx\u0307İx\u0307İx\u0307"); // != UNICODE_CI
-    assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_BINARY_LCASE", "abxx12xx");
-    assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_BINARY_LCASE", "abyy12yy");
+    assertReplace("abİo12i̇o", "i̇o", "xx", "UTF8_LCASE", "abxx12xx");
+    assertReplace("abi̇o12i̇o", "İo", "yy", "UTF8_LCASE", "abyy12yy");
   }
 
   private void assertLocate(String substring, String string, Integer start, String collationName,

From b83511c6492ea66bb3d268a1f42b84143b22e543 Mon Sep 17 00:00:00 2001
From: Uros Bojanic <157381213+uros-db@users.noreply.github.com>
Date: Thu, 27 Jun 2024 17:49:28 +0200
Subject: [PATCH 10/10] Update comments

---
 .../util/CollationAwareUTF8String.java        | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index 32c2f3b86227c..272a8aa128141 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -233,10 +233,10 @@ private static int compareLowerCaseSlow(final UTF8String left, final UTF8String
 
   /*
    * Performs string replacement for ICU collations by searching for instances of the search
-   * string in the src string, with respect to the specified collation, and then replacing
+   * string in the `src` string, with respect to the specified collation, and then replacing
    * them with the replace string. The method returns a new UTF8String with all instances of the
-   * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour
-   * used for UTF8_BINARY collation, the method returns src string if the search string is empty.
+   * search string replaced using the replace string. Similar to UTF8String.findInSet behavior
+   * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
    *
    * @param src the string to be searched in
    * @param search the string to be searched for
@@ -303,8 +303,8 @@ public static UTF8String replace(final UTF8String src, final UTF8String search,
    * Performs string replacement for UTF8_LCASE collation by searching for instances of the search
    * string in the src string, with respect to lowercased string versions, and then replacing
    * them with the replace string. The method returns a new UTF8String with all instances of the
-   * search string replaced using the replace string. Similar to UTF8String.findInSet behaviour
-   * used for UTF8_BINARY collation, the method returns src string if the search string is empty.
+   * search string replaced using the replace string. Similar to UTF8String.findInSet behavior
+   * used for UTF8_BINARY, the method returns the `src` string if the `search` string is empty.
    *
    * @param src the string to be searched in
    * @param search the string to be searched for
@@ -355,7 +355,7 @@ public static UTF8String toUpperCase(final UTF8String target) {
   }
 
   private static UTF8String toUpperCaseSlow(final UTF8String target) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to uppercase, which only accepts a Java strings as input.
     // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
     return UTF8String.fromString(UCharacter.toUpperCase(target.toString()));
@@ -373,7 +373,7 @@ public static UTF8String toUpperCase(final UTF8String target, final int collatio
   }
 
   private static UTF8String toUpperCaseSlow(final UTF8String target, final int collationId) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to uppercase, which only accepts a Java strings as input.
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
@@ -393,7 +393,7 @@ public static UTF8String toLowerCase(final UTF8String target) {
   }
 
   private static UTF8String toLowerCaseSlow(final UTF8String target) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to lowercase, which only accepts a Java strings as input.
     // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
     return UTF8String.fromString(UCharacter.toLowerCase(target.toString()));
@@ -411,7 +411,7 @@ public static UTF8String toLowerCase(final UTF8String target, final int collatio
   }
 
   private static UTF8String toLowerCaseSlow(final UTF8String target, final int collationId) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to lowercase, which only accepts a Java strings as input.
     ULocale locale = CollationFactory.fetchCollation(collationId)
       .collator.getLocale(ULocale.ACTUAL_LOCALE);
@@ -472,7 +472,7 @@ private static UTF8String lowerCaseCodePointsSlow(final UTF8String target) {
    * Convert the input string to titlecase using the ICU root locale rules.
    */
   public static UTF8String toTitleCase(final UTF8String target) {
-    // Note: In order to achieve the desired behaviour, we use the ICU UCharacter class to
+    // Note: In order to achieve the desired behavior, we use the ICU UCharacter class to
     // convert the string to titlecase, which only accepts a Java strings as input.
     // TODO(SPARK-48715): All UTF8String -> String conversions should use `makeValid`
     return UTF8String.fromString(UCharacter.toTitleCase(target.toString(),
@@ -493,7 +493,7 @@ public static UTF8String toTitleCase(final UTF8String target, final int collatio
   /*
    * Returns the position of the first occurrence of the match string in the set string,
    * counting ASCII commas as delimiters. The match string is compared in a collation-aware manner,
-   * with respect to the specified collation ID. Similar to UTF8String.findInSet behaviour used
+   * with respect to the specified collation ID. Similar to UTF8String.findInSet behavior used
    * for UTF8_BINARY collation, the method returns 0 if the match string contains no commas.
    *
    * @param match the string to be searched for