diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index f77b534455540..37837bdb52d4a 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -1623,106 +1623,169 @@ public void testStringInstr() throws SparkException { assertStringInstr("aπŸ™ƒxπŸ™ƒb", "b", "UNICODE_CI", 5); } + /** + * Verify the behaviour of the `FindInSet` collation support class. + */ + private void assertFindInSet(String word, UTF8String set, String collationName, - Integer expected) throws SparkException { + int expected) throws SparkException { UTF8String w = UTF8String.fromString(word); int collationId = CollationFactory.collationNameToId(collationName); - assertEquals(expected, CollationSupport.FindInSet.exec(w, set, collationId)); + int result = CollationSupport.FindInSet.exec(w, set, collationId); + assertEquals(expected, result); } @Test public void testFindInSet() throws SparkException { - assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1); - assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + // Empty strings. + assertFindInSet("", UTF8String.fromString(""), "UTF8_BINARY", 1); + assertFindInSet("", UTF8String.fromString(""), "UTF8_LCASE", 1); + assertFindInSet("", UTF8String.fromString(""), "UNICODE", 1); + assertFindInSet("", UTF8String.fromString(""), "UNICODE_CI", 1); assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); - assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6); - assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0); - assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("c", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4); - assertFindInSet("AB", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3); - assertFindInSet("AbC", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1); - assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); - assertFindInSet("XX", UTF8String.fromString("xx"), "UTF8_LCASE", 1); assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_BINARY", 1); assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UTF8_LCASE", 1); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6); - assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0); - assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UTF8_LCASE", 4); - assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 3); - assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); - assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE", 1); + assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_BINARY", 6); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UTF8_LCASE", 6); assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE", 6); + assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6); + assertFindInSet("", UTF8String.fromString("abc"), "UTF8_BINARY", 0); + assertFindInSet("", UTF8String.fromString("abc"), "UTF8_LCASE", 0); assertFindInSet("", UTF8String.fromString("abc"), "UNICODE", 0); + assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0); + // Basic tests. + assertFindInSet("xx", UTF8String.fromString("xx"), "UTF8_BINARY", 1); + assertFindInSet("xx", UTF8String.fromString("xx"), "UTF8_LCASE", 1); assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE", 1); - assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE", 0); - assertFindInSet("倧", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE", 5); + assertFindInSet("xx", UTF8String.fromString("xx"), "UNICODE_CI", 1); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); assertFindInSet("a", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 1); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 1); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 1); + assertFindInSet("abc", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 1); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("abcd", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 5); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 5); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 5); + assertFindInSet("def", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 5); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("xyz", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 3); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("Ab", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 3); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); + assertFindInSet("d,ef", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_BINARY", 0); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UTF8_LCASE", 4); + assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE", 0); assertFindInSet("C", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 4); - assertFindInSet("DeF", UTF8String.fromString("abc,b,ab,c,dEf"), "UNICODE_CI", 5); - assertFindInSet("DEFG", UTF8String.fromString("abc,b,ab,c,def"), "UNICODE_CI", 0); - assertFindInSet("", UTF8String.fromString(",abc,b,ab,c,def"), "UNICODE_CI", 1); - assertFindInSet("", UTF8String.fromString("abc,b,ab,c,def,"), "UNICODE_CI", 6); - assertFindInSet("", UTF8String.fromString("abc"), "UNICODE_CI", 0); - assertFindInSet("XX", UTF8String.fromString("xx"), "UNICODE_CI", 1); + // Advanced tests. + assertFindInSet("倧", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UTF8_BINARY", 5); + assertFindInSet("倧", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UTF8_LCASE", 5); + assertFindInSet("倧", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE", 5); + assertFindInSet("倧", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE_CI", 5); + assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UTF8_BINARY", 0); + assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UTF8_LCASE", 4); + assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE", 0); assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE_CI", 4); + assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,η•ŒXx,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UTF8_BINARY", 0); + assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,η•ŒXx,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UTF8_LCASE", 5); + assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,η•ŒXx,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE", 0); assertFindInSet("η•Œx", UTF8String.fromString("test,倧千,η•ŒXx,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE_CI", 5); - assertFindInSet("倧", UTF8String.fromString("test,倧千,δΈ–,η•ŒX,倧,千,δΈ–η•Œ"), "UNICODE_CI", 5); - assertFindInSet("iΜ‡", UTF8String.fromString("Δ°"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("Δ°"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("iΜ‡"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("iΜ‡"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("Δ°,"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("Δ°,"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("iΜ‡,"), "UNICODE_CI", 1); - assertFindInSet("i", UTF8String.fromString("iΜ‡,"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,Δ°"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,Δ°"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,iΜ‡"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,iΜ‡"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,Δ°,12"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,Δ°,12"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,iΜ‡,12"), "UNICODE_CI", 2); - assertFindInSet("i", UTF8String.fromString("ab,iΜ‡,12"), "UNICODE_CI", 0); - assertFindInSet("iΜ‡o", UTF8String.fromString("ab,Δ°o,12"), "UNICODE_CI", 2); - assertFindInSet("Δ°o", UTF8String.fromString("ab,iΜ‡o,12"), "UNICODE_CI", 2); - assertFindInSet("iΜ‡", UTF8String.fromString("Δ°"), "UTF8_LCASE", 1); + // One-to-many case mapping (e.g. Turkish dotted I). + assertFindInSet("i\u0307", UTF8String.fromString("Δ°"), "UTF8_BINARY", 0); + assertFindInSet("i\u0307", UTF8String.fromString("Δ°"), "UTF8_LCASE", 1); + assertFindInSet("i\u0307", UTF8String.fromString("Δ°"), "UNICODE", 0); + assertFindInSet("i\u0307", UTF8String.fromString("Δ°"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("Δ°"), "UTF8_BINARY", 0); assertFindInSet("i", UTF8String.fromString("Δ°"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("iΜ‡"), "UTF8_LCASE", 1); - assertFindInSet("i", UTF8String.fromString("iΜ‡"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("Δ°,"), "UTF8_LCASE", 1); + assertFindInSet("i", UTF8String.fromString("Δ°"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("Δ°"), "UNICODE_CI", 0); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UTF8_BINARY", 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UTF8_LCASE", 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UNICODE", 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("i\u0307"), "UTF8_BINARY", 0); + assertFindInSet("i", UTF8String.fromString("i\u0307"), "UTF8_LCASE", 0); + assertFindInSet("i", UTF8String.fromString("i\u0307"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("i\u0307"), "UNICODE_CI", 0); + assertFindInSet("i\u0307", UTF8String.fromString("Δ°,"), "UTF8_BINARY", 0); + assertFindInSet("i\u0307", UTF8String.fromString("Δ°,"), "UTF8_LCASE", 1); + assertFindInSet("i\u0307", UTF8String.fromString("Δ°,"), "UNICODE", 0); + assertFindInSet("i\u0307", UTF8String.fromString("Δ°,"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("Δ°,"), "UTF8_BINARY", 0); assertFindInSet("i", UTF8String.fromString("Δ°,"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("iΜ‡,"), "UTF8_LCASE", 1); - assertFindInSet("i", UTF8String.fromString("iΜ‡,"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,Δ°"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("Δ°,"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("Δ°,"), "UNICODE_CI", 0); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UTF8_BINARY", 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UTF8_LCASE", 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UNICODE", 1); + assertFindInSet("i\u0307", UTF8String.fromString("i\u0307,"), "UNICODE_CI", 1); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UTF8_BINARY", 0); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UTF8_LCASE", 0); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("i\u0307,"), "UNICODE_CI", 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°"), "UTF8_BINARY", 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°"), "UTF8_LCASE", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°"), "UNICODE", 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,Δ°"), "UTF8_BINARY", 0); assertFindInSet("i", UTF8String.fromString("ab,Δ°"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,iΜ‡"), "UTF8_LCASE", 2); - assertFindInSet("i", UTF8String.fromString("ab,iΜ‡"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,Δ°,12"), "UTF8_LCASE", 2); + assertFindInSet("i", UTF8String.fromString("ab,Δ°"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("ab,Δ°"), "UNICODE_CI", 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UNICODE", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 0); + assertFindInSet("Δ°", UTF8String.fromString("ab,i\u0307"), "UTF8_BINARY", 0); + assertFindInSet("Δ°", UTF8String.fromString("ab,i\u0307"), "UTF8_LCASE", 2); + assertFindInSet("Δ°", UTF8String.fromString("ab,i\u0307"), "UNICODE", 0); + assertFindInSet("Δ°", UTF8String.fromString("ab,i\u0307"), "UNICODE_CI", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°,12"), "UTF8_BINARY", 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°,12"), "UTF8_LCASE", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°,12"), "UNICODE", 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,Δ°,12"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,Δ°,12"), "UTF8_BINARY", 0); assertFindInSet("i", UTF8String.fromString("ab,Δ°,12"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡", UTF8String.fromString("ab,iΜ‡,12"), "UTF8_LCASE", 2); - assertFindInSet("i", UTF8String.fromString("ab,iΜ‡,12"), "UTF8_LCASE", 0); - assertFindInSet("iΜ‡o", UTF8String.fromString("ab,Δ°o,12"), "UTF8_LCASE", 2); - assertFindInSet("Δ°o", UTF8String.fromString("ab,iΜ‡o,12"), "UTF8_LCASE", 2); - // Invalid UTF8 strings - assertFindInSet("C", UTF8String.fromBytes( - new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UTF8_BINARY", 3); - assertFindInSet("c", UTF8String.fromBytes( - new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UTF8_LCASE", 2); - assertFindInSet("C", UTF8String.fromBytes( - new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UNICODE", 2); - assertFindInSet("c", UTF8String.fromBytes( - new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), - "UNICODE_CI", 2); - // Greek sigmas. + assertFindInSet("i", UTF8String.fromString("ab,Δ°,12"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("ab,Δ°,12"), "UNICODE_CI", 0); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UTF8_BINARY", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UTF8_LCASE", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UNICODE", 2); + assertFindInSet("i\u0307", UTF8String.fromString("ab,i\u0307,12"), "UNICODE_CI", 2); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UTF8_BINARY", 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UTF8_LCASE", 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UNICODE", 0); + assertFindInSet("i", UTF8String.fromString("ab,i\u0307,12"), "UNICODE_CI", 0); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,Δ°o,12"), "UTF8_BINARY", 0); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,Δ°o,12"), "UTF8_LCASE", 2); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,Δ°o,12"), "UNICODE", 0); + assertFindInSet("i\u0307o", UTF8String.fromString("ab,Δ°o,12"), "UNICODE_CI", 2); + assertFindInSet("Δ°o", UTF8String.fromString("ab,i\u0307o,12"), "UTF8_BINARY", 0); + assertFindInSet("Δ°o", UTF8String.fromString("ab,i\u0307o,12"), "UTF8_LCASE", 2); + assertFindInSet("Δ°o", UTF8String.fromString("ab,i\u0307o,12"), "UNICODE", 0); + assertFindInSet("Δ°o", UTF8String.fromString("ab,i\u0307o,12"), "UNICODE_CI", 2); + // Conditional case mapping (e.g. Greek sigmas). assertFindInSet("Οƒ", UTF8String.fromString("Οƒ"), "UTF8_BINARY", 1); assertFindInSet("Οƒ", UTF8String.fromString("Ο‚"), "UTF8_BINARY", 0); assertFindInSet("Οƒ", UTF8String.fromString("Ξ£"), "UTF8_BINARY", 0); @@ -1759,6 +1822,72 @@ public void testFindInSet() throws SparkException { assertFindInSet("Ξ£", UTF8String.fromString("Οƒ"), "UNICODE_CI", 1); assertFindInSet("Ξ£", UTF8String.fromString("Ο‚"), "UNICODE_CI", 1); assertFindInSet("Ξ£", UTF8String.fromString("Ξ£"), "UNICODE_CI", 1); + // Surrogate pairs. + assertFindInSet("a", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_BINARY", 0); + assertFindInSet("a", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_LCASE", 0); + assertFindInSet("a", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE", 0); + assertFindInSet("a", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE_CI", 0); + assertFindInSet("aπŸ™ƒ", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_BINARY", 1); + assertFindInSet("aπŸ™ƒ", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_LCASE", 1); + assertFindInSet("aπŸ™ƒ", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE", 1); + assertFindInSet("aπŸ™ƒ", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE_CI", 1); + assertFindInSet("b", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_BINARY", 2); + assertFindInSet("b", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_LCASE", 2); + assertFindInSet("b", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE", 2); + assertFindInSet("b", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE_CI", 2); + assertFindInSet("πŸ™ƒc", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_BINARY", 3); + assertFindInSet("πŸ™ƒc", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UTF8_LCASE", 3); + assertFindInSet("πŸ™ƒc", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE", 3); + assertFindInSet("πŸ™ƒc", UTF8String.fromString("aπŸ™ƒ,b,πŸ™ƒc"), "UNICODE_CI", 3); + assertFindInSet("πŸ˜„πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UTF8_BINARY", 0); + assertFindInSet("πŸ˜„πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UTF8_LCASE", 0); + assertFindInSet("πŸ˜„πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UNICODE", 0); + assertFindInSet("πŸ˜„πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UNICODE_CI", 0); + assertFindInSet("πŸ˜€πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UTF8_BINARY", 1); + assertFindInSet("πŸ˜€πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UTF8_LCASE", 1); + assertFindInSet("πŸ˜€πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UNICODE", 1); + assertFindInSet("πŸ˜€πŸ˜†", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UNICODE_CI", 1); + assertFindInSet("πŸ˜ƒπŸ˜„", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UTF8_BINARY", 2); + assertFindInSet("πŸ˜ƒπŸ˜„", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UTF8_LCASE", 2); + assertFindInSet("πŸ˜ƒπŸ˜„", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UNICODE", 2); + assertFindInSet("πŸ˜ƒπŸ˜„", UTF8String.fromString("πŸ˜€πŸ˜†,πŸ˜ƒπŸ˜„"), "UNICODE_CI", 2); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 0); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0); + assertFindInSet("x", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 0); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 1); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 1); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 1); + assertFindInSet("a", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 1); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0); + assertFindInSet("A", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 3); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 3); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 3); + assertFindInSet("𝔸", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 1); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 2); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 2); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 2); + assertFindInSet("𐐅", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 2); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_BINARY", 0); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UTF8_LCASE", 2); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE", 0); + assertFindInSet("𐐭", UTF8String.fromString("a,𐐅,𝔸"), "UNICODE_CI", 2); + // Invalid UTF8 strings + assertFindInSet("C", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UTF8_BINARY", 3); + assertFindInSet("c", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UTF8_LCASE", 2); + assertFindInSet("C", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UNICODE", 2); + assertFindInSet("c", UTF8String.fromBytes( + new byte[] { 0x41, (byte) 0xC2, 0x2C, 0x42, 0x2C, 0x43, 0x2C, 0x43, 0x2C, 0x56 }), + "UNICODE_CI", 2); } /**