Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,19 @@ public Collation(
* Auxiliary methods for collation aware string operations.
*/

/**
 * Creates an instance of ICU's StringSearch that looks for a pattern inside a target string
 * using the collator of the given collation.
 *
 * @param targetUTF8String UTF8String representation of the string to be searched.
 * @param patternUTF8String UTF8String representation of the string to search for.
 * @param collationId ID of the collation to use.
 * @return Created instance of StringSearch.
 */
public static StringSearch getStringSearch(
    final UTF8String targetUTF8String,
    final UTF8String patternUTF8String,
    final int collationId) {
  String pattern = patternUTF8String.toString();
  CharacterIterator target = new StringCharacterIterator(targetUTF8String.toString());
  Collator collator = CollationFactory.fetchCollation(collationId).collator;
  // This StringSearch constructor takes a RuleBasedCollator, hence the cast.
  return new StringSearch(pattern, target, (RuleBasedCollator) collator);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -585,6 +585,23 @@ public UTF8String trim() {
return copyUTF8String(s, e);
}

/**
 * Collation-aware variant of {@link UTF8String#trim()}: removes space characters from both
 * ends of this string.
 * Collations that support binary equality, as well as UTF8_BINARY_LCASE, are served by the
 * plain {@link UTF8String#trim()}; every other collation trims a single-space trim string
 * through the collation-aware overload.
 *
 * @param collationId Id of the collation to use.
 * @return this string with no spaces at the start or end.
 */
public UTF8String trim(int collationId) {
  boolean binaryEquality = CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
  boolean binaryLowercase = collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
  if (!binaryEquality && !binaryLowercase) {
    return trim(UTF8String.fromString(" "), collationId);
  }
  return trim();
}

/**
* Trims whitespace ASCII characters from both ends of this string.
*
Expand Down Expand Up @@ -628,6 +645,27 @@ public UTF8String trim(UTF8String trimString) {
}
}

/**
 * Trims characters of the given trim string from both ends of this string.
 * This variant of the method additionally applies provided collation to this string
 * and trim characters before searching.
 *
 * @param trimString The trim characters string.
 * @param collationId Id of the collation to use.
 * @return this string with no occurrences of the characters from trim string at either end.
 *         For collations without binary equality, {@code null} is returned when
 *         {@code trimString} is {@code null}; collations with binary equality defer
 *         entirely to {@link UTF8String#trim(UTF8String)}.
 */
public UTF8String trim(UTF8String trimString, int collationId) {
  if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
    return trim(trimString);
  }

  // The one-sided helpers return null for a null trim string, so chaining them below
  // would dereference null. Return null up front instead.
  if (trimString == null) {
    return null;
  }

  if (CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID == collationId) {
    return lowercaseTrimLeft(trimString).lowercaseTrimRight(trimString);
  }

  // The collation kind is already known at this point, so call the collation-aware helpers
  // directly rather than going through the public overloads that would look it up again.
  return collationAwareTrimLeft(trimString, collationId)
    .collationAwareTrimRight(trimString, collationId);
}

/**
* Trims space characters (ASCII 32) from the start of this string.
*
Expand All @@ -648,6 +686,23 @@ public UTF8String trimLeft() {
return copyUTF8String(s, this.numBytes - 1);
}

/**
 * Collation-aware variant of {@link UTF8String#trimLeft()}: removes space characters from
 * the start of this string.
 * Collations that support binary equality, as well as UTF8_BINARY_LCASE, are served by the
 * plain {@link UTF8String#trimLeft()}; every other collation trims a single-space trim
 * string through the collation-aware overload.
 *
 * @param collationId Id of the collation to use.
 * @return this string with no spaces at the start.
 */
public UTF8String trimLeft(int collationId) {
  boolean binaryEquality = CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
  boolean binaryLowercase = collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
  if (!binaryEquality && !binaryLowercase) {
    return trimLeft(UTF8String.fromString(" "), collationId);
  }
  return trimLeft();
}

/**
* Trims instances of the given trim string from the start of this string.
*
Expand Down Expand Up @@ -686,6 +741,109 @@ public UTF8String trimLeft(UTF8String trimString) {
return copyUTF8String(trimIdx, numBytes - 1);
}

/**
 * Removes leading characters that appear in the given trim string, comparing characters
 * under the provided collation.
 * Dispatches to the binary implementation for collations that support binary equality,
 * to the lowercase implementation for UTF8_BINARY_LCASE, and to the collation-aware
 * implementation otherwise.
 *
 * @param trimString The trim characters string.
 * @param collationId Id of the collation to use.
 * @return this string with no occurrences of the trim characters at the start.
 */
public UTF8String trimLeft(UTF8String trimString, int collationId) {
  boolean binaryEquality = CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
  if (binaryEquality) {
    return trimLeft(trimString);
  }
  return collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID
    ? lowercaseTrimLeft(trimString)
    : collationAwareTrimLeft(trimString, collationId);
}

private UTF8String lowercaseTrimLeft(UTF8String trimString) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of implementing lowercaseTrimLeft and collatedTrimLeft separately (these functions look very similar to me), I think we could make use of new StringSearch(pattern, target) (with .toLowerCase() for both params, and no collationId for UTF8_BINARY_LCASE)

For more context, please take a look at: https://github.com/apache/spark/pull/45704/files#r1538624688

if (trimString == null) {
return null;
}

// The searching byte position in the lowercase source string
int searchIdx = 0;
// The byte position of a first non-matching character in the lowercase source string
int trimByteIdx = 0;

// Convert trimString to lowercase so it can be searched properly
trimString = trimString.toLowerCase();
Comment thread
davidm-db marked this conversation as resolved.

while (searchIdx < numBytes) {
UTF8String searchChar = copyUTF8String(
searchIdx,
searchIdx + numBytesForFirstByte(getByte(searchIdx)) - 1);
int searchCharBytes = searchChar.numBytes;

// Try to find the matching for the lowercase searchChar in the trimString
if (trimString.find(searchChar.toLowerCase(), 0) >= 0) {
trimByteIdx += searchCharBytes;
searchIdx += searchCharBytes;
} else {
// No matching, exit the search
break;
}
}

if (searchIdx == 0) {
// Nothing trimmed - return original string (not converted to lowercase)
return this;
}
if (trimByteIdx >= numBytes) {
// Everything trimmed
return EMPTY_UTF8;
}
return copyUTF8String(trimByteIdx, numBytes - 1);
}

/**
 * Trims characters of the given trim string from the start of this string, using an ICU
 * StringSearch under the provided collation to decide whether a character belongs to the
 * trim characters.
 *
 * @param trimString The trim characters string.
 * @param collationId Id of the collation to use.
 * @return {@code null} if {@code trimString} is {@code null}; this same instance if nothing
 *         was trimmed; otherwise the remainder of this string.
 */
private UTF8String collationAwareTrimLeft(UTF8String trimString, int collationId) {
  if (trimString == null) {
    return null;
  }

  // Number of leading bytes that belong to trimmed characters
  int prefixBytes = 0;
  boolean inTrimSet = true;

  while (inTrimSet && prefixBytes < numBytes) {
    int lastByteOfChar = prefixBytes + numBytesForFirstByte(getByte(prefixBytes)) - 1;
    UTF8String candidate = copyUTF8String(prefixBytes, lastByteOfChar);

    // Search for the current character (pattern) inside the trim string (target).
    StringSearch lookup = CollationFactory.getStringSearch(trimString, candidate, collationId);
    int matchIdx = lookup.next();

    // Accept only a hit whose matched length equals the length of the searched pattern.
    inTrimSet = matchIdx != StringSearch.DONE
      && lookup.getMatchLength() == lookup.getPattern().length();
    if (inTrimSet) {
      prefixBytes += candidate.numBytes;
    }
  }

  if (prefixBytes == 0) {
    // Nothing trimmed - hand back this very instance
    return this;
  }
  if (prefixBytes >= numBytes) {
    // Everything trimmed
    return EMPTY_UTF8;
  }
  return copyUTF8String(prefixBytes, numBytes - 1);
}

/**
* Trims space characters (ASCII 32) from the end of this string.
*
Expand All @@ -706,6 +864,23 @@ public UTF8String trimRight() {
return copyUTF8String(0, e);
}

/**
 * Collation-aware variant of {@link UTF8String#trimRight()}: removes space characters from
 * the end of this string.
 * Collations that support binary equality, as well as UTF8_BINARY_LCASE, are served by the
 * plain {@link UTF8String#trimRight()}; every other collation trims a single-space trim
 * string through the collation-aware overload.
 *
 * @param collationId Id of the collation to use.
 * @return this string with no spaces at the end.
 */
public UTF8String trimRight(int collationId) {
  boolean binaryEquality = CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
  boolean binaryLowercase = collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID;
  if (!binaryEquality && !binaryLowercase) {
    return trimRight(UTF8String.fromString(" "), collationId);
  }
  return trimRight();
}

/**
* Trims at most `numSpaces` space characters (ASCII 32) from the end of this string.
*/
Expand Down Expand Up @@ -767,6 +942,137 @@ public UTF8String trimRight(UTF8String trimString) {
return copyUTF8String(0, trimEnd);
}

/**
* Trims characters of the given trim string from the end of this string.
* This variant of the method additionally applies provided collation to this string
* and trim characters before searching.
*
* @param trimString The trim characters string.
* @param collationId Id of the collation to use.
* @return this string with no occurrences of the trim characters at the end.
*/
public UTF8String trimRight(UTF8String trimString, int collationId) {
Comment thread
davidm-db marked this conversation as resolved.
if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
return trimRight(trimString);
}

if (CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID == collationId) {
return lowercaseTrimRight(trimString);
}

return collationAwareTrimRight(trimString, collationId);
}

/**
 * Trims characters of the given trim string from the end of this string, comparing
 * characters after converting both the trim string and each source character to lowercase.
 *
 * @param trimString The trim characters string.
 * @return {@code null} if {@code trimString} is {@code null}; this same instance if nothing
 *         was trimmed; otherwise the leading part of this string in its original casing.
 */
private UTF8String lowercaseTrimRight(UTF8String trimString) {
  if (trimString == null) {
    return null;
  }

  // Lowercase the trim characters once; each source character is lowercased when looked up
  UTF8String lowercaseTrimChars = trimString.toLowerCase();

  // charBounds[i] is the first byte of character i and charBounds[i + 1] is one past its
  // last byte, so a single array carries both the position and the length of each character.
  int[] charBounds = new int[numBytes + 1];
  int charCount = 0;
  int cursor = 0;
  while (cursor < numBytes) {
    charBounds[charCount++] = cursor;
    cursor += numBytesForFirstByte(getByte(cursor));
  }
  charBounds[charCount] = cursor;

  // Index of the last byte that survives trimming; moves left as characters are trimmed
  int lastKeptByte = numBytes - 1;

  // Walk the characters from the right until one is not among the trim characters
  for (int i = charCount - 1; i >= 0; i--) {
    UTF8String candidate = copyUTF8String(charBounds[i], charBounds[i + 1] - 1);
    if (lowercaseTrimChars.find(candidate.toLowerCase(), 0) < 0) {
      break;
    }
    lastKeptByte -= charBounds[i + 1] - charBounds[i];
  }

  if (lastKeptByte == numBytes - 1) {
    // Nothing trimmed
    return this;
  }
  if (lastKeptByte < 0) {
    // Everything trimmed
    return EMPTY_UTF8;
  }
  return copyUTF8String(0, lastKeptByte);
}

/**
 * Trims characters of the given trim string from the end of this string, using an ICU
 * StringSearch under the provided collation to decide whether a character belongs to the
 * trim characters.
 *
 * @param trimString The trim characters string.
 * @param collationId Id of the collation to use.
 * @return {@code null} if {@code trimString} is {@code null}; this same instance if nothing
 *         was trimmed; otherwise the leading part of this string.
 */
private UTF8String collationAwareTrimRight(UTF8String trimString, int collationId) {
  if (trimString == null) {
    return null;
  }

  // charBounds[i] is the first byte of character i and charBounds[i + 1] is one past its
  // last byte, so a single array carries both the position and the length of each character.
  int[] charBounds = new int[numBytes + 1];
  int charCount = 0;
  int cursor = 0;
  while (cursor < numBytes) {
    charBounds[charCount++] = cursor;
    cursor += numBytesForFirstByte(getByte(cursor));
  }
  charBounds[charCount] = cursor;

  // Index of the last byte that survives trimming; moves left as characters are trimmed
  int lastKeptByte = numBytes - 1;

  // Walk the characters from the right until one is not among the trim characters
  for (int i = charCount - 1; i >= 0; i--) {
    UTF8String candidate = copyUTF8String(charBounds[i], charBounds[i + 1] - 1);

    // Search for the current character (pattern) inside the trim string (target).
    StringSearch lookup = CollationFactory.getStringSearch(trimString, candidate, collationId);
    int matchIdx = lookup.next();

    // Accept only a hit whose matched length equals the length of the searched pattern.
    boolean matched = matchIdx != StringSearch.DONE
      && lookup.getMatchLength() == lookup.getPattern().length();
    if (!matched) {
      break;
    }
    lastKeptByte -= charBounds[i + 1] - charBounds[i];
  }

  if (lastKeptByte == numBytes - 1) {
    // Nothing trimmed
    return this;
  }
  if (lastKeptByte < 0) {
    // Everything trimmed
    return EMPTY_UTF8;
  }
  return copyUTF8String(0, lastKeptByte);
}

public UTF8String reverse() {
byte[] result = new byte[this.numBytes];

Expand Down
Loading