-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Kernel] Default Parquet reader implementation
This PR is part of #1783. It implements a Parquet reader based on `parquet-mr` that produces its output as columnar batches, using implementations of the `ColumnVector` and `ColumnarBatch` interfaces. Includes unit tests (UTs). Closes #1846
- Loading branch information
1 parent
cb89436
commit 04a29a4
Showing
28 changed files
with
3,407 additions
and
26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
164 changes: 164 additions & 0 deletions
164
kernel/kernel-default/src/main/java/io/delta/kernel/DefaultKernelUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,164 @@ | ||
/* | ||
* Copyright (2023) The Delta Lake Project Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.delta.kernel; | ||
|
||
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.stream.Collectors;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

import io.delta.kernel.types.DataType;
import io.delta.kernel.types.StructField;
import io.delta.kernel.types.StructType;
|
||
public class DefaultKernelUtils | ||
{ | ||
private DefaultKernelUtils() {} | ||
|
||
/** | ||
* Given the file schema in Parquet file and selected columns by Delta, return | ||
* a subschema of the file schema. | ||
* | ||
* @param fileSchema | ||
* @param deltaType | ||
* @return | ||
*/ | ||
public static final MessageType pruneSchema( | ||
GroupType fileSchema, // parquet | ||
StructType deltaType) // delta-core | ||
{ | ||
return deltaType.fields().stream() | ||
.map(column -> { | ||
Type type = findSubFieldType(fileSchema, column); | ||
if (type == null) { | ||
return null; | ||
} | ||
Type prunedSubfields = pruneSubfields(type, column.getDataType()); | ||
return new MessageType(column.getName(), prunedSubfields); | ||
}) | ||
.filter(Objects::nonNull) | ||
.reduce(MessageType::union) | ||
.get(); | ||
} | ||
|
||
/** | ||
* Search for the Parquet type in {@code groupType} of subfield which is equivalent to | ||
* given {@code field}. | ||
* | ||
* @param groupType Parquet group type coming from the file schema. | ||
* @param field Sub field given as Delta Kernel's {@link StructField} | ||
* @return {@link Type} of the Parquet field. Returns {@code null}, if not found. | ||
*/ | ||
public static Type findSubFieldType(GroupType groupType, StructField field) | ||
{ | ||
// TODO: Need a way to search by id once we start supporting column mapping `id` mode. | ||
final String columnName = field.getName(); | ||
if (groupType.containsField(columnName)) { | ||
return groupType.getType(columnName); | ||
} | ||
// Parquet is case-sensitive, but the engine that generated the parquet file may not be. | ||
// Check for direct match above but if no match found, try case-insensitive match. | ||
for (org.apache.parquet.schema.Type type : groupType.getFields()) { | ||
if (type.getName().equalsIgnoreCase(columnName)) { | ||
return type; | ||
} | ||
} | ||
|
||
return null; | ||
} | ||
|
||
// Note this only prunes top-level fields | ||
private static Type pruneSubfields(Type type, DataType deltaDatatype) | ||
{ | ||
if (!(deltaDatatype instanceof StructType)) { | ||
// there is no pruning for non-struct types | ||
return type; | ||
} | ||
|
||
GroupType groupType = (GroupType) type; | ||
List<Type> newParquetSubFields = | ||
((StructType) deltaDatatype).fields().stream() | ||
.map(structField -> findSubFieldType(groupType, structField)) | ||
.filter(Objects::nonNull) | ||
.collect(Collectors.toList()); | ||
|
||
return groupType.withNewFields(newParquetSubFields); | ||
} | ||
|
||
/** | ||
* Precondition-style validation that throws {@link IllegalArgumentException}. | ||
* | ||
* @param isValid {@code true} if valid, {@code false} if an exception should be thrown | ||
* @throws IllegalArgumentException if {@code isValid} is false | ||
*/ | ||
public static void checkArgument(boolean isValid) | ||
throws IllegalArgumentException | ||
{ | ||
if (!isValid) { | ||
throw new IllegalArgumentException(); | ||
} | ||
} | ||
|
||
/** | ||
* Precondition-style validation that throws {@link IllegalArgumentException}. | ||
* | ||
* @param isValid {@code true} if valid, {@code false} if an exception should be thrown | ||
* @param message A String message for the exception. | ||
* @throws IllegalArgumentException if {@code isValid} is false | ||
*/ | ||
public static void checkArgument(boolean isValid, String message) | ||
throws IllegalArgumentException | ||
{ | ||
if (!isValid) { | ||
throw new IllegalArgumentException(message); | ||
} | ||
} | ||
|
||
/** | ||
* Precondition-style validation that throws {@link IllegalArgumentException}. | ||
* | ||
* @param isValid {@code true} if valid, {@code false} if an exception should be thrown | ||
* @param message A String message for the exception. | ||
* @param args Objects used to fill in {@code %s} placeholders in the message | ||
* @throws IllegalArgumentException if {@code isValid} is false | ||
*/ | ||
public static void checkArgument(boolean isValid, String message, Object... args) | ||
throws IllegalArgumentException | ||
{ | ||
if (!isValid) { | ||
throw new IllegalArgumentException( | ||
String.format(String.valueOf(message), args)); | ||
} | ||
} | ||
|
||
/** | ||
* Precondition-style validation that throws {@link IllegalStateException}. | ||
* | ||
* @param isValid {@code true} if valid, {@code false} if an exception should be thrown | ||
* @param message A String message for the exception. | ||
* @throws IllegalStateException if {@code isValid} is false | ||
*/ | ||
public static void checkState(boolean isValid, String message) | ||
throws IllegalStateException | ||
{ | ||
if (!isValid) { | ||
throw new IllegalStateException(message); | ||
} | ||
} | ||
} |
Oops, something went wrong.