apache · davidm-db · Mar 27, 2024 · Mar 27, 2024 · Mar 28, 2024 · Mar 28, 2024
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java
@@ -175,12 +175,19 @@ public Collation(
    * Auxiliary methods for collation aware string operations.
    */
 
+  /**
+   * Creates an ICU StringSearch for the pattern over the target, using the collation's collator.
+   * @param targetUTF8String UTF8String representation of the string to be searched.
+   * @param patternUTF8String UTF8String representation of the string to search for.
+   * @param collationId ID of the collation; its collator is cast to RuleBasedCollator.
+   * @return Newly created instance of StringSearch.
+   */
   public static StringSearch getStringSearch(
-      final UTF8String left,
-      final UTF8String right,
+      final UTF8String targetUTF8String,
+      final UTF8String patternUTF8String,
       final int collationId) {
-    String pattern = right.toString();
-    CharacterIterator target = new StringCharacterIterator(left.toString());
+    String pattern = patternUTF8String.toString();
+    CharacterIterator target = new StringCharacterIterator(targetUTF8String.toString());
     Collator collator = CollationFactory.fetchCollation(collationId).collator;
     return new StringSearch(pattern, target, (RuleBasedCollator) collator);
   }

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -585,6 +585,23 @@ public UTF8String trim() {
     return copyUTF8String(s, e);
   }
 
+  /**
+   * Collation-aware counterpart of {@link UTF8String#trim()}: removes space characters
+   * from both ends of this string, comparing characters under the provided collation.
+   * @param collationId Id of the collation to use.
+   * @return this string with no spaces at the start or end.
+   */
+  public UTF8String trim(int collationId) {
+    // Binary and lowercase collations delegate to the collation-unaware trim
+    boolean plainSpaceTrim =
+        CollationFactory.fetchCollation(collationId).supportsBinaryEquality
+            || collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
+    if (!plainSpaceTrim) {
+      return trim(UTF8String.fromString(" "), collationId);
+    }
+    return trim();
+  }
+
   /**
    * Trims whitespace ASCII characters from both ends of this string.
    *
@@ -628,6 +645,27 @@ public UTF8String trim(UTF8String trimString) {
     }
   }
 
+  /**
+   * Trims characters of the given trim string from both ends of this string, applying the
+   * provided collation to this string and to the trim characters before searching.
+   * @param trimString The trim characters string; may be null.
+   * @param collationId Id of the collation to use.
+   * @return this string trimmed at both ends, or null if trimString is null.
+   */
+  public UTF8String trim(UTF8String trimString, int collationId) {
+    if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      return trim(trimString);
+    }
+    if (trimString == null) {
+      // Left-trim helpers return null here; chaining trimRight on that would throw NPE.
+      return null;
+    }
+    if (CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID == collationId) {
+      return lowercaseTrimLeft(trimString).lowercaseTrimRight(trimString);
+    }
+    return trimLeft(trimString, collationId).trimRight(trimString, collationId);
+  }
+
   /**
    * Trims space characters (ASCII 32) from the start of this string.
    *
@@ -648,6 +686,23 @@ public UTF8String trimLeft() {
     return copyUTF8String(s, this.numBytes - 1);
   }
 
+  /**
+   * Collation-aware counterpart of {@link UTF8String#trimLeft()}: removes space characters
+   * from the start of this string, comparing characters under the provided collation.
+   * @param collationId Id of the collation to use.
+   * @return this string with no spaces at the start.
+   */
+  public UTF8String trimLeft(int collationId) {
+    // Binary and lowercase collations delegate to the collation-unaware trimLeft
+    boolean plainSpaceTrim =
+        CollationFactory.fetchCollation(collationId).supportsBinaryEquality
+            || collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
+    if (!plainSpaceTrim) {
+      return trimLeft(UTF8String.fromString(" "), collationId);
+    }
+    return trimLeft();
+  }
+
   /**
    * Trims instances of the given trim string from the start of this string.
    *
@@ -686,6 +741,109 @@ public UTF8String trimLeft(UTF8String trimString) {
     return copyUTF8String(trimIdx, numBytes - 1);
   }
 
+  /**
+   * Removes leading characters of this string for as long as they occur in the trim string.
+   * Characters are compared under the provided collation: binary, lowercase, or via
+   * collation-aware search, depending on the collation.
+   *
+   * @param trimString The trim characters string.
+   * @param collationId Id of the collation to use.
+   * @return this string with no occurrences of the trim characters at the start.
+   */
+  public UTF8String trimLeft(UTF8String trimString, int collationId) {
+    final boolean binaryEquality =
+        CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
+    if (binaryEquality) {
+      return trimLeft(trimString);
+    }
+    // Lowercase collation compares lowercased characters; the rest go through ICU search
+    return collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID
+        ? lowercaseTrimLeft(trimString)
+        : collationAwareTrimLeft(trimString, collationId);
+  }
+
+  /**
+   * Left-trim used for the lowercase collation: strips leading characters whose lowercase
+   * form occurs in the lowercased trim string. Returns null if the trim string is null.
+   */
+  private UTF8String lowercaseTrimLeft(UTF8String trimString) {
+    if (trimString == null) {
+      return null;
+    }
+
+    // Lowercase the trim characters once up front, so that each source character only
+    // has to be lowercased itself before it is looked up
+    final UTF8String lowercaseTrimChars = trimString.toLowerCase();
+
+    // Byte offset of the character currently examined; all bytes before it are trimmed
+    int offset = 0;
+
+    while (offset < numBytes) {
+      // Cut out the single character starting at the current offset
+      int lastByteOfChar = offset + numBytesForFirstByte(getByte(offset)) - 1;
+      UTF8String currentChar = copyUTF8String(offset, lastByteOfChar);
+
+      // Stop at the first character whose lowercase form is not among the trim characters
+      if (lowercaseTrimChars.find(currentChar.toLowerCase(), 0) < 0) {
+        break;
+      }
+      offset += currentChar.numBytes;
+    }
+
+    if (offset == 0) {
+      // No leading character matched - hand back this very instance, case untouched
+      return this;
+    }
+    if (offset >= numBytes) {
+      // Every character matched - nothing is left
+      return EMPTY_UTF8;
+    }
+    // Keep the bytes from the first non-matching character to the end
+    return copyUTF8String(offset, numBytes - 1);
+  }
+
+  /**
+   * Left-trim for collations that need a collation-aware search: strips leading characters
+   * that can be found in the trim string by a StringSearch under the given collation.
+   * Returns null if the trim string is null.
+   */
+  private UTF8String collationAwareTrimLeft(UTF8String trimString, int collationId) {
+    if (trimString == null) {
+      return null;
+    }
+
+    // Byte offset of the character currently examined; all bytes before it are trimmed
+    int offset = 0;
+
+    while (offset < numBytes) {
+      // Cut out the single character starting at the current offset
+      int lastByteOfChar = offset + numBytesForFirstByte(getByte(offset)) - 1;
+      UTF8String currentChar = copyUTF8String(offset, lastByteOfChar);
+
+      // Search the trim string (target) for the current character (pattern)
+      StringSearch search =
+        CollationFactory.getStringSearch(trimString, currentChar, collationId);
+      boolean found = search.next() != StringSearch.DONE;
+
+      // Only a match covering the whole pattern counts; otherwise trimming stops here
+      if (!found || search.getMatchLength() != search.getPattern().length()) {
+        break;
+      }
+      offset += currentChar.numBytes;
+    }
+
+    if (offset == 0) {
+      // No leading character matched - hand back this very instance
+      return this;
+    }
+    if (offset >= numBytes) {
+      // Every character matched - nothing is left
+      return EMPTY_UTF8;
+    }
+    // Keep the bytes from the first non-matching character to the end
+    return copyUTF8String(offset, numBytes - 1);
+  }
+
   /**
    * Trims space characters (ASCII 32) from the end of this string.
    *
@@ -706,6 +864,23 @@ public UTF8String trimRight() {
     return copyUTF8String(0, e);
   }
 
+  /**
+   * Collation-aware counterpart of {@link UTF8String#trimRight()}: removes space characters
+   * from the end of this string, comparing characters under the provided collation.
+   * @param collationId Id of the collation to use.
+   * @return this string with no spaces at the end.
+   */
+  public UTF8String trimRight(int collationId) {
+    // Binary and lowercase collations delegate to the collation-unaware trimRight
+    boolean plainSpaceTrim =
+        CollationFactory.fetchCollation(collationId).supportsBinaryEquality
+            || collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
+    if (!plainSpaceTrim) {
+      return trimRight(UTF8String.fromString(" "), collationId);
+    }
+    return trimRight();
+  }
+
   /**
    * Trims at most `numSpaces` space characters (ASCII 32) from the end of this string.
    */
@@ -767,6 +942,137 @@ public UTF8String trimRight(UTF8String trimString) {
     return copyUTF8String(0, trimEnd);
   }
 
+  /**
+   * Removes trailing characters of this string for as long as they occur in the trim string.
+   * Characters are compared under the provided collation: binary, lowercase, or via
+   * collation-aware search, depending on the collation.
+   *
+   * @param trimString The trim characters string.
+   * @param collationId Id of the collation to use.
+   * @return this string with no occurrences of the trim characters at the end.
+   */
+  public UTF8String trimRight(UTF8String trimString, int collationId) {
+    final boolean binaryEquality =
+        CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
+    if (binaryEquality) {
+      return trimRight(trimString);
+    }
+    // Lowercase collation compares lowercased characters; the rest go through ICU search
+    return collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID
+        ? lowercaseTrimRight(trimString)
+        : collationAwareTrimRight(trimString, collationId);
+  }
+
+  /**
+   * Right-trim used for the lowercase collation: strips trailing characters whose lowercase
+   * form occurs in the lowercased trim string. Returns null if the trim string is null.
+   */
+  private UTF8String lowercaseTrimRight(UTF8String trimString) {
+    if (trimString == null) {
+      return null;
+    }
+
+    // Lowercase the trim characters once up front, so that each source character only
+    // has to be lowercased itself before it is looked up
+    final UTF8String lowercaseTrimChars = trimString.toLowerCase();
+
+    // First byte position and byte length of every character of this string; both
+    // arrays are allocated with numBytes slots
+    int[] charStart = new int[numBytes];
+    int[] charWidth = new int[numBytes];
+    // Number of characters recorded in the two arrays so far
+    int charCount = 0;
+
+    // Walk forward once, recording where each character starts and how long it is
+    int pos = 0;
+    while (pos < numBytes) {
+      int width = numBytesForFirstByte(getByte(pos));
+      charStart[charCount] = pos;
+      charWidth[charCount] = width;
+      charCount++;
+      pos += width;
+    }
+
+    // Byte index of the last byte that is kept; moves left as characters are trimmed
+    int lastKeptByte = numBytes - 1;
+
+    for (int i = charCount - 1; i >= 0; i--) {
+      UTF8String currentChar =
+        copyUTF8String(charStart[i], charStart[i] + charWidth[i] - 1);
+      // Stop at the first character whose lowercase form is not among the trim characters
+      if (lowercaseTrimChars.find(currentChar.toLowerCase(), 0) < 0) {
+        break;
+      }
+      lastKeptByte -= charWidth[i];
+    }
+
+    if (lastKeptByte == numBytes - 1) {
+      // No trailing character matched - hand back this very instance
+      return this;
+    }
+    if (lastKeptByte < 0) {
+      // Every character matched - nothing is left
+      return EMPTY_UTF8;
+    }
+    return copyUTF8String(0, lastKeptByte);
+  }
+
+  /**
+   * Right-trim for collations that need a collation-aware search: strips trailing characters
+   * that can be found in the trim string. Returns null if the trim string is null.
+   */
+  private UTF8String collationAwareTrimRight(UTF8String trimString, int collationId) {
+    if (trimString == null) {
+      return null;
+    }
+
+    // First byte position and byte length of every character of this string; both
+    // arrays are allocated with numBytes slots
+    int[] charStart = new int[numBytes];
+    int[] charWidth = new int[numBytes];
+    // Number of characters recorded in the two arrays so far
+    int charCount = 0;
+
+    // Walk forward once, recording where each character starts and how long it is
+    int pos = 0;
+    while (pos < numBytes) {
+      int width = numBytesForFirstByte(getByte(pos));
+      charStart[charCount] = pos;
+      charWidth[charCount] = width;
+      charCount++;
+      pos += width;
+    }
+
+    // Byte index of the last byte that is kept; moves left as characters are trimmed
+    int lastKeptByte = numBytes - 1;
+
+    for (int i = charCount - 1; i >= 0; i--) {
+      UTF8String currentChar =
+        copyUTF8String(charStart[i], charStart[i] + charWidth[i] - 1);
+
+      // Search the trim string (target) for the current character (pattern)
+      StringSearch search =
+        CollationFactory.getStringSearch(trimString, currentChar, collationId);
+      boolean found = search.next() != StringSearch.DONE;
+
+      // Only a match covering the whole pattern counts; otherwise trimming stops here
+      if (!found || search.getMatchLength() != search.getPattern().length()) {
+        break;
+      }
+      lastKeptByte -= charWidth[i];
+    }
+
+    if (lastKeptByte == numBytes - 1) {
+      // No trailing character matched - hand back this very instance
+      return this;
+    }
+    if (lastKeptByte < 0) {
+      // Every character matched - nothing is left
+      return EMPTY_UTF8;
+    }
+    return copyUTF8String(0, lastKeptByte);
+  }
+
   public UTF8String reverse() {
     byte[] result = new byte[this.numBytes];