
Commit 36dcebc

[spark] Support varchar/char type (apache#3361)
1 parent 7579c83 commit 36dcebc

10 files changed: +121 -13 lines changed


docs/content/spark/quick-start.md

Lines changed: 11 additions & 1 deletion
@@ -290,7 +290,17 @@ All Spark's data types are available in package `org.apache.spark.sql.types`.
 </tr>
 <tr>
 <td><code>StringType</code></td>
-<td><code>VarCharType</code>, <code>CharType</code></td>
+<td><code>VarCharType(Integer.MAX_VALUE)</code></td>
+<td>true</td>
+</tr>
+<tr>
+<td><code>VarCharType(length)</code></td>
+<td><code>VarCharType(length)</code></td>
+<td>true</td>
+</tr>
+<tr>
+<td><code>CharType(length)</code></td>
+<td><code>CharType(length)</code></td>
 <td>true</td>
 </tr>
 <tr>
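
The new rows in this mapping table read as: Spark StringType maps to Paimon's unbounded VarCharType, while bounded VARCHAR(length)/CHAR(length) now keep their length in both directions. A minimal sketch of what that looks like from Spark SQL (illustrative only; it assumes a SparkSession `spark` already configured with a Paimon catalog, and the table name is made up):

spark.sql("CREATE TABLE char_demo (c1 CHAR(5), c2 VARCHAR(5), c3 STRING) USING paimon")
// Per the table above, the Paimon schema keeps CharType(5), VarCharType(5) and
// VarCharType(Integer.MAX_VALUE); DESC reports char(5), varchar(5) and string back.
spark.sql("DESC char_demo").show()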

paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkInternalRow.java

Lines changed: 5 additions & 1 deletion
@@ -40,6 +40,7 @@
 import org.apache.spark.sql.types.BooleanType;
 import org.apache.spark.sql.types.ByteType;
 import org.apache.spark.sql.types.CalendarIntervalType;
+import org.apache.spark.sql.types.CharType;
 import org.apache.spark.sql.types.DateType;
 import org.apache.spark.sql.types.Decimal;
 import org.apache.spark.sql.types.DecimalType;
@@ -53,6 +54,7 @@
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.types.TimestampType;
 import org.apache.spark.sql.types.UserDefinedType;
+import org.apache.spark.sql.types.VarcharType;
 import org.apache.spark.unsafe.types.CalendarInterval;
 import org.apache.spark.unsafe.types.UTF8String;
 
@@ -205,7 +207,9 @@ public Object get(int ordinal, org.apache.spark.sql.types.DataType dataType) {
         if (dataType instanceof DoubleType) {
             return getDouble(ordinal);
         }
-        if (dataType instanceof StringType) {
+        if (dataType instanceof StringType
+                || dataType instanceof CharType
+                || dataType instanceof VarcharType) {
             return getUTF8String(ordinal);
         }
         if (dataType instanceof DecimalType) {

paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkTypeUtils.java

Lines changed: 6 additions & 2 deletions
@@ -75,12 +75,16 @@ private static class PaimonToSparkTypeVisitor extends DataTypeDefaultVisitor<DataType> {
 
         @Override
         public DataType visit(CharType charType) {
-            return DataTypes.StringType;
+            return new org.apache.spark.sql.types.CharType(charType.getLength());
         }
 
         @Override
         public DataType visit(VarCharType varCharType) {
-            return DataTypes.StringType;
+            if (varCharType.getLength() == VarCharType.MAX_LENGTH) {
+                return DataTypes.StringType;
+            } else {
+                return new org.apache.spark.sql.types.VarcharType(varCharType.getLength());
+            }
         }
 
         @Override
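
The visitor change above is the core of the mapping: only an unbounded Paimon VARCHAR falls back to Spark's plain StringType. A sketch of the resulting conversions (not part of the patch; it assumes `SparkTypeUtils.fromPaimonType` accepts a single Paimon DataType, as the visitor suggests, and that Paimon's `DataTypes.STRING()` is a VARCHAR of maximum length):

import org.apache.paimon.spark.SparkTypeUtils
import org.apache.paimon.types.DataTypes

// Bounded lengths are now preserved; only the unbounded case becomes StringType.
SparkTypeUtils.fromPaimonType(DataTypes.CHAR(10))    // CharType(10)
SparkTypeUtils.fromPaimonType(DataTypes.VARCHAR(10)) // VarcharType(10)
SparkTypeUtils.fromPaimonType(DataTypes.STRING())    // StringType (unbounded varchar)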

paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/PaimonPartitionManagement.scala

Lines changed: 3 additions & 2 deletions
@@ -23,10 +23,11 @@ import org.apache.paimon.operation.FileStoreCommit
 import org.apache.paimon.table.FileStoreTable
 import org.apache.paimon.table.sink.BatchWriteBuilder
 import org.apache.paimon.types.RowType
-import org.apache.paimon.utils.{FileStorePathFactory, RowDataPartitionComputer}
+import org.apache.paimon.utils.RowDataPartitionComputer
 
 import org.apache.spark.sql.Row
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
+import org.apache.spark.sql.catalyst.util.CharVarcharUtils
 import org.apache.spark.sql.connector.catalog.SupportsPartitionManagement
 import org.apache.spark.sql.types.StructType
 
@@ -51,7 +52,7 @@ trait PaimonPartitionManagement extends SupportsPartitionManagement {
   override def dropPartition(internalRow: InternalRow): Boolean = {
     // convert internalRow to row
     val row: Row = CatalystTypeConverters
-      .createToScalaConverter(partitionSchema())
+      .createToScalaConverter(CharVarcharUtils.replaceCharVarcharWithString(partitionSchema()))
       .apply(internalRow)
       .asInstanceOf[Row]
     val rowDataPartitionComputer = new RowDataPartitionComputer(
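
The extra `replaceCharVarcharWithString` call is needed because CatalystTypeConverters has no converter for CharType/VarcharType; the same workaround appears in SparkInternalRowTest below. A sketch of the pattern, using a hypothetical partition schema:

import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.util.CharVarcharUtils
import org.apache.spark.sql.types.{CharType, StructField, StructType}

// Hypothetical partition schema containing a CHAR(2) column.
val partitionSchema = StructType(Seq(StructField("pt", CharType(2))))

// Rewrite char/varchar fields to StringType before asking Catalyst for a converter.
val converter = CatalystTypeConverters.createToScalaConverter(
  CharVarcharUtils.replaceCharVarcharWithString(partitionSchema))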

paimon-spark/paimon-spark-common/src/test/java/org/apache/paimon/spark/SparkInternalRowTest.java

Lines changed: 9 additions & 1 deletion
@@ -28,6 +28,7 @@
 import org.apache.paimon.utils.DateTimeUtils;
 
 import org.apache.spark.sql.catalyst.CatalystTypeConverters;
+import org.apache.spark.sql.catalyst.util.CharVarcharUtils;
 import org.junit.jupiter.api.Test;
 
 import java.math.BigDecimal;
@@ -54,6 +55,8 @@ public void test() {
                 GenericRow.of(
                         1,
                         fromString("jingsong"),
+                        fromString("apache"),
+                        fromString("paimon"),
                         22.2,
                         new GenericMap(
                                 Stream.of(
@@ -79,9 +82,12 @@ public void test() {
                         Decimal.fromBigDecimal(BigDecimal.valueOf(65782123123.01), 38, 2),
                         Decimal.fromBigDecimal(BigDecimal.valueOf(62123123.5), 10, 1));
 
+        // CatalystTypeConverters does not support char and varchar, we need to replace char and
+        // varchar with string
         Function1<Object, Object> sparkConverter =
                 CatalystTypeConverters.createToScalaConverter(
-                        SparkTypeUtils.fromPaimonType(ALL_TYPES));
+                        CharVarcharUtils.replaceCharVarcharWithString(
+                                SparkTypeUtils.fromPaimonType(ALL_TYPES)));
         org.apache.spark.sql.Row sparkRow =
                 (org.apache.spark.sql.Row)
                         sparkConverter.apply(new SparkInternalRow(ALL_TYPES).replace(rowData));
@@ -90,6 +96,8 @@ public void test() {
                 "{"
                         + "\"id\":1,"
                         + "\"name\":\"jingsong\","
+                        + "\"char\":\"apache\","
+                        + "\"varchar\":\"paimon\","
                         + "\"salary\":22.2,"
                         + "\"locations\":{\"key1\":{\"posX\":1.2,\"posY\":2.3},\"key2\":{\"posX\":2.4,\"posY\":3.5}},"
                         + "\"strArray\":[\"v1\",\"v5\"],"

paimon-spark/paimon-spark-common/src/test/java/org/apache/paimon/spark/SparkReadITCase.java

Lines changed: 6 additions & 3 deletions
@@ -177,7 +177,8 @@ public void testCreateTableAs() {
         spark.sql("CREATE TABLE testCreateTableAs AS SELECT * FROM testCreateTable");
         List<Row> result = spark.sql("SELECT * FROM testCreateTableAs").collectAsList();
 
-        assertThat(result.stream().map(Row::toString)).containsExactlyInAnyOrder("[1,a,b]");
+        assertThat(result.stream().map(Row::toString))
+                .containsExactlyInAnyOrder("[1,a,b ]");
 
         // partitioned table
         spark.sql(
@@ -224,11 +225,13 @@ public void testCreateTableAs() {
                         + " 'file.format' = 'parquet',\n"
                         + " 'path' = '%s')\n"
                         + "]]",
-                        showCreateString("testTableAs", "a BIGINT", "b STRING", "c STRING"),
+                        showCreateString(
+                                "testTableAs", "a BIGINT", "b VARCHAR(10)", "c CHAR(10)"),
                         new Path(warehousePath, "default.db/testTableAs")));
         List<Row> resultProp = spark.sql("SELECT * FROM testTableAs").collectAsList();
 
-        assertThat(resultProp.stream().map(Row::toString)).containsExactlyInAnyOrder("[1,a,b]");
+        assertThat(resultProp.stream().map(Row::toString))
+                .containsExactlyInAnyOrder("[1,a,b ]");
 
         // primary key
         spark.sql(

paimon-spark/paimon-spark-common/src/test/java/org/apache/paimon/spark/SparkTypeTest.java

Lines changed: 4 additions & 0 deletions
@@ -39,6 +39,8 @@ public class SparkTypeTest {
                                     1)) // posX and posY have field id 0 and 1, here we start from 2
                     .field("id", DataTypes.INT().notNull())
                     .field("name", DataTypes.STRING()) /* optional by default */
+                    .field("char", DataTypes.CHAR(10))
+                    .field("varchar", DataTypes.VARCHAR(10))
                     .field("salary", DataTypes.DOUBLE().notNull())
                     .field(
                             "locations",
@@ -79,6 +81,8 @@ public void testAllTypes() {
                 "StructType("
                         + "StructField(id,IntegerType,true),"
                         + "StructField(name,StringType,true),"
+                        + "StructField(char,CharType(10),true),"
+                        + "StructField(varchar,VarcharType(10),true),"
                         + "StructField(salary,DoubleType,true),"
                         + nestedRowMapType
                         + ","

paimon-spark/paimon-spark-common/src/test/scala/org/apache/paimon/spark/sql/DDLTestBase.scala

Lines changed: 67 additions & 0 deletions
@@ -20,6 +20,7 @@ package org.apache.paimon.spark.sql
 
 import org.apache.paimon.spark.PaimonSparkTestBase
 
+import org.apache.spark.sql.Row
 import org.junit.jupiter.api.Assertions
 
 abstract class DDLTestBase extends PaimonSparkTestBase {
@@ -84,4 +85,70 @@ abstract class DDLTestBase extends PaimonSparkTestBase {
             "SparkCatalog can only create paimon table, but current provider is parquet"))
       }
   }
+
+  test("Paimon DDL: create table with char/varchar/string") {
+    Seq("orc", "avro").foreach(
+      format => {
+        withTable("paimon_tbl") {
+          spark.sql(
+            s"""
+               |CREATE TABLE paimon_tbl (id int, col_s1 char(9), col_s2 varchar(10), col_s3 string)
+               |USING PAIMON
+               |TBLPROPERTIES ('file.format' = '$format')
+               |""".stripMargin)
+
+          spark.sql(s"""
+                       |insert into paimon_tbl values
+                       |(1, 'Wednesday', 'Wednesday', 'Wednesday'),
+                       |(2, 'Friday', 'Friday', 'Friday')
+                       |""".stripMargin)
+
+          // check description
+          checkAnswer(
+            spark
+              .sql(s"DESC paimon_tbl")
+              .select("col_name", "data_type")
+              .where("col_name LIKE 'col_%'")
+              .orderBy("col_name"),
+            Row("col_s1", "char(9)") :: Row("col_s2", "varchar(10)") :: Row(
+              "col_s3",
+              "string") :: Nil
+          )
+
+          // check select
+          if (format == "orc" && !gteqSpark3_4) {
+            // Orc reader will right trim the char type, e.g. "Friday " => "Friday" (see orc's `CharTreeReader`)
+            // and Spark has a conf `spark.sql.readSideCharPadding` to auto padding char only since 3.4 (default true)
+            // So when using orc with Spark3.4-, here will return "Friday"
+            checkAnswer(
+              spark.sql(s"select col_s1 from paimon_tbl where id = 2"),
+              Row("Friday") :: Nil
+            )
+            // Spark will auto create the filter like Filter(isnotnull(col_s1#124) AND (col_s1#124 = Friday ))
+            // for char type, so here will not return any rows
+            checkAnswer(
+              spark.sql(s"select col_s1 from paimon_tbl where col_s1 = 'Friday'"),
+              Nil
+            )
+          } else {
+            checkAnswer(
+              spark.sql(s"select col_s1 from paimon_tbl where id = 2"),
+              Row("Friday ") :: Nil
+            )
+            checkAnswer(
+              spark.sql(s"select col_s1 from paimon_tbl where col_s1 = 'Friday'"),
+              Row("Friday ") :: Nil
            )
+          }
+          checkAnswer(
+            spark.sql(s"select col_s2 from paimon_tbl where col_s2 = 'Friday'"),
+            Row("Friday") :: Nil
+          )
+          checkAnswer(
+            spark.sql(s"select col_s3 from paimon_tbl where col_s3 = 'Friday'"),
+            Row("Friday") :: Nil
+          )
+        }
+      })
+  }
 }
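
The ORC branch in the test above hinges on `spark.sql.readSideCharPadding`, which only exists since Spark 3.4 (default true). As a sketch, not part of the patch: on Spark 3.4+ the right-trimmed ORC behaviour the test documents for older Spark can be reproduced by disabling that conf before reading (assumes the ORC-backed `paimon_tbl` created above):

// Without read-side padding, the char(9) value comes back right-trimmed, as on older Spark.
spark.conf.set("spark.sql.readSideCharPadding", "false")
spark.sql("select col_s1 from paimon_tbl where id = 2").show() // Friday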

paimon-spark/paimon-spark-common/src/test/scala/org/apache/paimon/spark/sql/PaimonPartitionManagementTest.scala

Lines changed: 8 additions & 3 deletions
@@ -161,12 +161,17 @@ class PaimonPartitionManagementTest extends PaimonSparkTestBase {
 
         checkAnswer(
           spark.sql("select * from T"),
-          Row("a", "b", 1L, 20230816L, "1132") :: Row("a", "b", 1L, 20230816L, "1133") :: Row(
+          Row("a", "b ", 1L, 20230816L, "1132") :: Row(
             "a",
-            "b",
+            "b ",
+            1L,
+            20230816L,
+            "1133") :: Row("a", "b ", 2L, 20230817L, "1132") :: Row(
+            "a",
+            "b ",
             2L,
             20230817L,
-            "1132") :: Row("a", "b", 2L, 20230817L, "1134") :: Nil
+            "1134") :: Nil
         )
       }
   }

paimon-spark/paimon-spark-common/src/test/scala/org/apache/paimon/spark/sql/SparkVersionSupport.scala

Lines changed: 2 additions & 0 deletions
@@ -24,4 +24,6 @@ trait SparkVersionSupport {
   lazy val sparkVersion: String = SPARK_VERSION
 
   lazy val gteqSpark3_3: Boolean = sparkVersion >= "3.3"
+
+  lazy val gteqSpark3_4: Boolean = sparkVersion >= "3.4"
 }
