Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -78,24 +78,36 @@ public static class Collation {
*/
public final boolean supportsBinaryOrdering;

/**
* Support for Lowercase Equality implies that it is possible to check equality on
* byte by byte level, but only after calling "UTF8String.toLowerCase" on both arguments.
* This allows custom collation support for UTF8_BINARY_LCASE collation in various Spark
* expressions, as this particular collation is not supported by the external ICU library.
*/
public final boolean supportsLowercaseEquality;

public Collation(
String collationName,
Collator collator,
Comparator<UTF8String> comparator,
String version,
ToLongFunction<UTF8String> hashFunction,
boolean supportsBinaryEquality,
boolean supportsBinaryOrdering) {
boolean supportsBinaryOrdering,
boolean supportsLowercaseEquality) {
this.collationName = collationName;
this.collator = collator;
this.comparator = comparator;
this.version = version;
this.hashFunction = hashFunction;
this.supportsBinaryEquality = supportsBinaryEquality;
this.supportsBinaryOrdering = supportsBinaryOrdering;
this.supportsLowercaseEquality = supportsLowercaseEquality;

// De Morgan's Law to check supportsBinaryOrdering => supportsBinaryEquality
assert(!supportsBinaryOrdering || supportsBinaryEquality);
// No Collation can simultaneously support binary equality and lowercase equality
assert(!supportsBinaryEquality || !supportsLowercaseEquality);

if (supportsBinaryEquality) {
this.equalsFunction = UTF8String::equals;
Expand All @@ -112,15 +124,17 @@ public Collation(
Collator collator,
String version,
boolean supportsBinaryEquality,
boolean supportsBinaryOrdering) {
boolean supportsBinaryOrdering,
boolean supportsLowercaseEquality) {
this(
collationName,
collator,
(s1, s2) -> collator.compare(s1.toString(), s2.toString()),
version,
s -> (long)collator.getCollationKey(s.toString()).hashCode(),
supportsBinaryEquality,
supportsBinaryOrdering);
supportsBinaryOrdering,
supportsLowercaseEquality);
}
}

Expand All @@ -141,7 +155,8 @@ public Collation(
"1.0",
s -> (long)s.hashCode(),
true,
true);
true,
false);

// Case-insensitive UTF8 binary collation.
// TODO: Do in place comparisons instead of creating new strings.
Expand All @@ -152,17 +167,18 @@ public Collation(
"1.0",
(s) -> (long)s.toLowerCase().hashCode(),
false,
false);
false,
true);

// UNICODE case sensitive comparison (ROOT locale, in ICU).
collationTable[2] = new Collation(
"UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true, false);
"UNICODE", Collator.getInstance(ULocale.ROOT), "153.120.0.0", true, false, false);
collationTable[2].collator.setStrength(Collator.TERTIARY);
collationTable[2].collator.freeze();

// UNICODE case-insensitive comparison (ROOT locale, in ICU + Secondary strength).
collationTable[3] = new Collation(
"UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false, false);
"UNICODE_CI", Collator.getInstance(ULocale.ROOT), "153.120.0.0", false, false, false);
collationTable[3].collator.setStrength(Collator.SECONDARY);
collationTable[3].collator.freeze();

Expand All @@ -172,19 +188,31 @@ public Collation(
}

/**
 * Returns a StringSearch object for the given pattern and target strings, under collation
 * rules corresponding to the given collationId. The external ICU library StringSearch object can
 * be used to find occurrences of the pattern in the target string, while respecting collation.
 *
 * @param targetUTF8String the text that is searched
 * @param patternUTF8String the text that is searched for
 * @param collationId id used to fetch the collation whose collator drives the search
 * @return a StringSearch for the pattern over the target, using the fetched collator
 */
public static StringSearch getStringSearch(
    final UTF8String targetUTF8String,
    final UTF8String patternUTF8String,
    final int collationId) {
  String pattern = patternUTF8String.toString();
  CharacterIterator target = new StringCharacterIterator(targetUTF8String.toString());
  Collator collator = CollationFactory.fetchCollation(collationId).collator;
  // NOTE(review): the cast presumes the fetched collator is a RuleBasedCollator, i.e. that
  // callers only pass ICU-backed collation ids here -- confirm against the call sites.
  return new StringSearch(pattern, target, (RuleBasedCollator) collator);
}

/**
* Returns a collation-unaware StringSearch object for the given pattern and target strings.
* While this object does not respect collation, it can be used to find occurrences of the pattern
* in the target string for UTF8_BINARY or UTF8_BINARY_LCASE (if arguments are lowercased).
*/
public static StringSearch getStringSearch(
Comment thread
uros-db marked this conversation as resolved.
final UTF8String targetUTF8String,
final UTF8String patternUTF8String) {
return new StringSearch(patternUTF8String.toString(), targetUTF8String.toString());
}

/**
* Returns the collation id for the given collation name.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.catalyst.util;

import com.ibm.icu.text.StringSearch;

import org.apache.spark.unsafe.types.UTF8String;

/**
* Static entry point for collation-aware expressions (StringExpressions, RegexpExpressions, and
* other expressions that require custom collation support), as well as private utility methods for
* collation-aware UTF8String operations needed to implement them.
*/
public final class CollationSupport {
Comment thread
uros-db marked this conversation as resolved.

/**
* Collation-aware string expressions.
*/

public static class Contains {
public static boolean exec(final UTF8String l, final UTF8String r, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(l, r);
} else if (collation.supportsLowercaseEquality) {
return execLowercase(l, r);
} else {
return execICU(l, r, collationId);
}
}
public static String genCode(final String l, final String r, final int collationId) {
CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
String expr = "CollationSupport.Contains.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s, %s)", l, r);
} else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s, %s)", l, r);
} else {
return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
}
}
public static boolean execBinary(final UTF8String l, final UTF8String r) {
return l.contains(r);
}
public static boolean execLowercase(final UTF8String l, final UTF8String r) {
return l.toLowerCase().contains(r.toLowerCase());
Comment thread
uros-db marked this conversation as resolved.
}
public static boolean execICU(final UTF8String l, final UTF8String r,
final int collationId) {
if (r.numBytes() == 0) return true;
if (l.numBytes() == 0) return false;
StringSearch stringSearch = CollationFactory.getStringSearch(l, r, collationId);
return stringSearch.first() != StringSearch.DONE;
}
}

/**
 * Collation-aware "starts with" check. {@code exec} picks a variant at runtime from the
 * collation's flags; {@code genCode} emits a call to that same variant.
 */
public static class StartsWith {
  /**
   * Checks whether {@code l} starts with {@code r} under the collation with the given id.
   */
  public static boolean exec(final UTF8String l, final UTF8String r,
      final int collationId) {
    final CollationFactory.Collation fetched = CollationFactory.fetchCollation(collationId);
    if (fetched.supportsBinaryEquality) {
      return execBinary(l, r);
    }
    if (fetched.supportsLowercaseEquality) {
      return execLowercase(l, r);
    }
    return execICU(l, r, collationId);
  }

  /**
   * Builds the source text of a call to the variant that {@code exec} would pick; the
   * collation id is only embedded in the ICU variant's call.
   */
  public static String genCode(final String l, final String r, final int collationId) {
    final CollationFactory.Collation fetched = CollationFactory.fetchCollation(collationId);
    final String callPrefix = "CollationSupport.StartsWith.exec";
    if (fetched.supportsBinaryEquality) {
      return String.format(callPrefix + "Binary(%s, %s)", l, r);
    }
    if (fetched.supportsLowercaseEquality) {
      return String.format(callPrefix + "Lowercase(%s, %s)", l, r);
    }
    return String.format(callPrefix + "ICU(%s, %s, %d)", l, r, collationId);
  }

  /** Plain prefix check, delegating to {@code UTF8String.startsWith}. */
  public static boolean execBinary(final UTF8String l, final UTF8String r) {
    return l.startsWith(r);
  }

  /** Prefix check performed after lowercasing both arguments. */
  public static boolean execLowercase(final UTF8String l, final UTF8String r) {
    final UTF8String loweredTarget = l.toLowerCase();
    final UTF8String loweredPrefix = r.toLowerCase();
    return loweredTarget.startsWith(loweredPrefix);
  }

  /** Prefix check under the given collation: matches {@code r} at position 0 of {@code l}. */
  public static boolean execICU(final UTF8String l, final UTF8String r,
      final int collationId) {
    final int startPosition = 0;
    return CollationAwareUTF8String.matchAt(l, r, startPosition, collationId);
  }
}

/**
 * Collation-aware "ends with" check. {@code exec} dispatches at runtime on the properties of
 * the collation, while {@code genCode} emits a call to the matching specialised variant.
 */
public static class EndsWith {
  /**
   * Checks whether {@code l} ends with {@code r} under the collation with the given id.
   *
   * @param l the string whose end is inspected
   * @param r the candidate suffix
   * @param collationId id of the collation to fetch from CollationFactory
   * @return result of the binary, lowercase or ICU variant, chosen by the collation's flags
   */
  public static boolean exec(final UTF8String l, final UTF8String r, final int collationId) {
    CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
    if (collation.supportsBinaryEquality) {
      return execBinary(l, r);
    } else if (collation.supportsLowercaseEquality) {
      return execLowercase(l, r);
    } else {
      return execICU(l, r, collationId);
    }
  }

  /**
   * Builds the source text of a call to the variant that {@code exec} would pick.
   *
   * @param l source text of the expression for the inspected string
   * @param r source text of the expression for the suffix string
   * @param collationId id of the collation; only embedded in the ICU variant's call
   * @return a call expression of the form {@code CollationSupport.EndsWith.exec<Variant>(...)}
   */
  public static String genCode(final String l, final String r, final int collationId) {
    CollationFactory.Collation collation = CollationFactory.fetchCollation(collationId);
    String expr = "CollationSupport.EndsWith.exec";
    if (collation.supportsBinaryEquality) {
      return String.format(expr + "Binary(%s, %s)", l, r);
    } else if (collation.supportsLowercaseEquality) {
      return String.format(expr + "Lowercase(%s, %s)", l, r);
    } else {
      return String.format(expr + "ICU(%s, %s, %d)", l, r, collationId);
    }
  }

  /** Plain suffix check, delegating to {@code UTF8String.endsWith}. */
  public static boolean execBinary(final UTF8String l, final UTF8String r) {
    return l.endsWith(r);
  }

  /** Suffix check performed after lowercasing both arguments. */
  public static boolean execLowercase(final UTF8String l, final UTF8String r) {
    return l.toLowerCase().endsWith(r.toLowerCase());
  }

  /**
   * Suffix check under the given collation: matches {@code r} against the trailing
   * characters of {@code l}.
   */
  public static boolean execICU(final UTF8String l, final UTF8String r,
      final int collationId) {
    // matchAt treats its position as a character index (it is compared against numChars()
    // and passed to substring), so the start of the suffix window must be computed in
    // characters. A byte difference overshoots whenever l holds multi-byte characters.
    return CollationAwareUTF8String.matchAt(l, r, l.numChars() - r.numChars(), collationId);
  }
}

// TODO: Add more collation-aware string expressions.

/**
* Collation-aware regexp expressions.
*/

// TODO: Add more collation-aware regexp expressions.

/**
* Other collation-aware expressions.
*/

// TODO: Add other collation-aware expressions.

/**
 * Utility class for collation-aware UTF8String operations.
 */

private static class CollationAwareUTF8String {

  /**
   * Tells whether {@code pattern} matches {@code target} at character position {@code pos}
   * under the collation with the given id.
   *
   * Returns false when {@code pos} is negative or the pattern would run past the end of the
   * target. An empty pattern matches at any valid position. Otherwise the window of
   * {@code pattern.numChars()} characters starting at {@code pos} is searched, and the result
   * is true when the last match reported for that window starts at index 0.
   */
  private static boolean matchAt(final UTF8String target, final UTF8String pattern,
      final int pos, final int collationId) {
    final int patternChars = pattern.numChars();
    if (pos < 0 || patternChars + pos > target.numChars()) {
      return false;
    }
    if (pattern.numBytes() == 0) {
      return true;
    }
    if (target.numBytes() == 0) {
      return false;
    }
    final UTF8String window = target.substring(pos, pos + patternChars);
    final int lastMatchStart =
      CollationFactory.getStringSearch(window, pattern, collationId).last();
    return lastMatchStart == 0;
  }

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
import com.esotericsoftware.kryo.io.Input;
import com.esotericsoftware.kryo.io.Output;

import com.ibm.icu.text.StringSearch;
import org.apache.spark.sql.catalyst.util.CollationFactory;
import org.apache.spark.unsafe.Platform;
import org.apache.spark.unsafe.UTF8StringBuilder;
Expand Down Expand Up @@ -342,28 +341,6 @@ public boolean contains(final UTF8String substring) {
return false;
}

/**
 * Collation-aware containment check: uses the plain check when the collation supports
 * binary equality, compares lowercased copies for UTF8_BINARY_LCASE, and otherwise falls
 * back to the collated search.
 */
public boolean contains(final UTF8String substring, int collationId) {
  final boolean binaryEquality =
    CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
  if (binaryEquality) {
    return this.contains(substring);
  } else if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
    final UTF8String loweredSelf = this.toLowerCase();
    return loweredSelf.contains(substring.toLowerCase());
  } else {
    return collatedContains(substring, collationId);
  }
}

/**
 * Containment check through a StringSearch built for the given collation. An empty
 * substring is always contained; a non-empty one is never contained in an empty string.
 * A match only counts when its length equals the length of the search pattern.
 */
private boolean collatedContains(final UTF8String substring, int collationId) {
  if (substring.numBytes == 0) {
    return true;
  }
  if (this.numBytes == 0) {
    return false;
  }
  final StringSearch search = CollationFactory.getStringSearch(this, substring, collationId);
  for (int hit = search.next(); hit != StringSearch.DONE; hit = search.next()) {
    final int patternLength = search.getPattern().length();
    if (search.getMatchLength() == patternLength) {
      return true;
    }
  }
  return false;
}

/**
* Returns the byte at position `i`.
*/
Expand All @@ -378,45 +355,14 @@ public boolean matchAt(final UTF8String s, int pos) {
return ByteArrayMethods.arrayEquals(base, offset + pos, s.base, s.offset, s.numBytes);
}

/**
 * Tells whether {@code s} matches this string at character position {@code pos} under the
 * collation with the given id. Returns false for a negative position or when {@code s}
 * would run past the end of this string; an empty {@code s} matches at any valid position.
 */
private boolean matchAt(final UTF8String s, int pos, int collationId) {
  final int patternChars = s.numChars();
  if (pos < 0 || patternChars + pos > this.numChars()) {
    return false;
  }
  if (s.numBytes == 0) {
    return true;
  }
  if (this.numBytes == 0) {
    return false;
  }
  final UTF8String window = this.substring(pos, pos + patternChars);
  final int lastMatchStart = CollationFactory.getStringSearch(window, s, collationId).last();
  return lastMatchStart == 0;
}

/** Tells whether this string begins with {@code prefix}, by matching it at offset 0. */
public boolean startsWith(final UTF8String prefix) {
  final int startOffset = 0;
  return this.matchAt(prefix, startOffset);
}

/**
 * Collation-aware prefix check: uses the plain check when the collation supports binary
 * equality, compares lowercased copies for UTF8_BINARY_LCASE, and otherwise matches the
 * prefix at position 0 under the collation.
 */
public boolean startsWith(final UTF8String prefix, int collationId) {
  final boolean binaryEquality =
    CollationFactory.fetchCollation(collationId).supportsBinaryEquality;
  if (binaryEquality) {
    return this.startsWith(prefix);
  } else if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
    final UTF8String loweredSelf = this.toLowerCase();
    return loweredSelf.startsWith(prefix.toLowerCase());
  } else {
    return matchAt(prefix, 0, collationId);
  }
}

/**
 * Tells whether this string ends with {@code suffix}, by matching it at the byte offset
 * where a suffix of that byte length would have to start.
 */
public boolean endsWith(final UTF8String suffix) {
  final int suffixStart = this.numBytes - suffix.numBytes;
  return matchAt(suffix, suffixStart);
}

/**
 * Collation-aware suffix check: uses the plain check when the collation supports binary
 * equality, compares lowercased copies for UTF8_BINARY_LCASE, and otherwise matches the
 * suffix against the trailing characters of this string under the collation.
 *
 * @param suffix the candidate suffix
 * @param collationId id of the collation to fetch from CollationFactory
 * @return true when this string ends with {@code suffix} under the given collation
 */
public boolean endsWith(final UTF8String suffix, int collationId) {
  if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
    return this.endsWith(suffix);
  }
  if (collationId == CollationFactory.UTF8_BINARY_LCASE_COLLATION_ID) {
    return this.toLowerCase().endsWith(suffix.toLowerCase());
  }
  // The collation-aware matchAt treats its position as a character index (it is compared
  // against numChars() and passed to substring), so the start of the suffix window must be
  // computed in characters; a byte difference overshoots for multi-byte content.
  return matchAt(suffix, numChars() - suffix.numChars(), collationId);
}

/**
* Returns the upper case of this string
*/
Expand Down
Loading