Skip to content

Commit c76692b

Browse files
committed
Add base representation of storage files
1 parent dc20996 commit c76692b

26 files changed

+232
-121
lines changed

xtable-api/src/main/java/org/apache/xtable/model/TableChange.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import lombok.NonNull;
2323
import lombok.Value;
2424

25-
import org.apache.xtable.model.storage.DataFilesDiff;
25+
import org.apache.xtable.model.storage.InternalFilesDiff;
2626

2727
/**
2828
* Captures the changes in a single commit/instant from the source table.
@@ -33,7 +33,7 @@
3333
@Builder(toBuilder = true)
3434
public class TableChange {
3535
// Change in files at the specified instant
36-
DataFilesDiff filesDiff;
36+
InternalFilesDiff filesDiff;
3737

3838
/** The {@link InternalTable} at the commit time to which this table change belongs. */
3939
InternalTable tableAsOfChange;

xtable-api/src/main/java/org/apache/xtable/model/storage/FilesDiff.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,12 +92,12 @@ public static <L, P> FilesDiff<L, P> findNewAndRemovedFiles(
9292
* @param <P> the type of the previous files
9393
* @return the set of files that are added
9494
*/
95-
public static <P> FilesDiff<InternalDataFile, P> findNewAndRemovedFiles(
95+
public static <P> FilesDiff<InternalFile, P> findNewAndRemovedFiles(
9696
List<PartitionFileGroup> latestFileGroups, Map<String, P> previousFiles) {
97-
Map<String, InternalDataFile> latestFiles =
97+
Map<String, InternalFile> latestFiles =
9898
latestFileGroups.stream()
9999
.flatMap(group -> group.getFiles().stream())
100-
.collect(Collectors.toMap(InternalDataFile::getPhysicalPath, Function.identity()));
100+
.collect(Collectors.toMap(InternalFile::getPhysicalPath, Function.identity()));
101101
return findNewAndRemovedFiles(latestFiles, previousFiles);
102102
}
103103
}

xtable-api/src/main/java/org/apache/xtable/model/storage/InternalDataFile.java

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,12 @@
2222
import java.util.List;
2323

2424
import lombok.Builder;
25+
import lombok.EqualsAndHashCode;
26+
import lombok.Getter;
2527
import lombok.NonNull;
26-
import lombok.Value;
28+
import lombok.ToString;
29+
import lombok.experimental.FieldDefaults;
30+
import lombok.experimental.SuperBuilder;
2731

2832
import org.apache.xtable.model.stat.ColumnStat;
2933
import org.apache.xtable.model.stat.PartitionValue;
@@ -33,18 +37,17 @@
3337
*
3438
* @since 0.1
3539
*/
36-
@Builder(toBuilder = true)
37-
@Value
38-
public class InternalDataFile {
39-
// physical path of the file (absolute with scheme)
40-
@NonNull String physicalPath;
40+
@SuperBuilder(toBuilder = true)
41+
@FieldDefaults(makeFinal = true, level = lombok.AccessLevel.PRIVATE)
42+
@ToString(callSuper = true)
43+
@EqualsAndHashCode(callSuper = true)
44+
@Getter
45+
public class InternalDataFile extends InternalFile {
4146
// file format
4247
@Builder.Default @NonNull FileFormat fileFormat = FileFormat.APACHE_PARQUET;
4348
// partition ranges for the data file
4449
@Builder.Default @NonNull List<PartitionValue> partitionValues = Collections.emptyList();
4550

46-
long fileSizeBytes;
47-
long recordCount;
4851
// column stats for each column in the data file
4952
@Builder.Default @NonNull List<ColumnStat> columnStats = Collections.emptyList();
5053
// last modified time in millis since epoch
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
package org.apache.xtable.model.storage;
20+
21+
import lombok.AllArgsConstructor;
22+
import lombok.EqualsAndHashCode;
23+
import lombok.Getter;
24+
import lombok.NonNull;
25+
import lombok.ToString;
26+
import lombok.experimental.FieldDefaults;
27+
import lombok.experimental.SuperBuilder;
28+
29+
/**
30+
* This class is an internal representation of a logical storage file of a table and is the base
31+
* class of all types of storage files. The most common type of storage file is a data-file, which
32+
* contains the actual records of a table. Other examples of a storage file are positional delete
33+
* files (containing ordinals of the records to be deleted from a data file), stat files, and index
34+
* files. For completeness of the conversion process, XTable needs to recognize current storage
35+
* files of a table, and also the storage files that are added and removed over time as the state of
36+
* the table changes. Different table formats support different storage file types. This base file
37+
* representation is generic and extensible and is needed to design stable interfaces.
38+
*/
39+
@SuperBuilder(toBuilder = true)
40+
@FieldDefaults(makeFinal = true, level = lombok.AccessLevel.PRIVATE)
41+
@ToString(callSuper = true)
42+
@EqualsAndHashCode
43+
@AllArgsConstructor
44+
@Getter
45+
public abstract class InternalFile {
46+
// Absolute path of the storage file, with the scheme, that contains this logical file. Typically,
47+
// one physical storage file contains only one base file, for e.g. a parquet data file. However,
48+
// in some cases, one storage file can contain multiple logical storage files for optimizations.
49+
@NonNull String physicalPath;
50+
51+
// The size of the logical file in the physical storage file.
52+
long fileSizeBytes;
53+
54+
// The number of records in the storage file.
55+
long recordCount;
56+
}

xtable-api/src/main/java/org/apache/xtable/model/storage/DataFilesDiff.java renamed to xtable-api/src/main/java/org/apache/xtable/model/storage/InternalFilesDiff.java

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import java.util.List;
2222
import java.util.Map;
23+
import java.util.Set;
2324
import java.util.function.Function;
2425
import java.util.stream.Collectors;
2526

@@ -31,17 +32,18 @@
3132
@Value
3233
@EqualsAndHashCode(callSuper = true)
3334
@SuperBuilder
34-
public class DataFilesDiff extends FilesDiff<InternalDataFile, InternalDataFile> {
35+
public class InternalFilesDiff extends FilesDiff<InternalFile, InternalFile> {
3536

3637
/**
37-
* Creates a DataFilesDiff from the list of files in the target table and the list of files in the
38-
* source table.
38+
* Creates a InternalFilesDiff from the list of files in the target table and the list of files in
39+
* the source table.
3940
*
4041
* @param source list of files currently in the source table
4142
* @param target list of files currently in the target table
4243
* @return files that need to be added and removed for the target table match the source table
4344
*/
44-
public static DataFilesDiff from(List<InternalDataFile> source, List<InternalDataFile> target) {
45+
public static InternalFilesDiff from(
46+
List<InternalDataFile> source, List<InternalDataFile> target) {
4547
Map<String, InternalDataFile> targetPaths =
4648
target.stream()
4749
.collect(Collectors.toMap(InternalDataFile::getPhysicalPath, Function.identity()));
@@ -51,9 +53,35 @@ public static DataFilesDiff from(List<InternalDataFile> source, List<InternalDat
5153

5254
FilesDiff<InternalDataFile, InternalDataFile> diff =
5355
findNewAndRemovedFiles(sourcePaths, targetPaths);
54-
return DataFilesDiff.builder()
56+
return InternalFilesDiff.builder()
5557
.filesAdded(diff.getFilesAdded())
5658
.filesRemoved(diff.getFilesRemoved())
5759
.build();
5860
}
61+
62+
/**
63+
* Filters files of type {@link InternalDataFile} from the list of files added to the source table
64+
* and returns the list.
65+
*/
66+
public Set<InternalDataFile> dataFilesAdded() {
67+
Set<InternalDataFile> result =
68+
getFilesAdded().stream()
69+
.filter(InternalDataFile.class::isInstance)
70+
.map(file -> (InternalDataFile) file)
71+
.collect(Collectors.toSet());
72+
return result;
73+
}
74+
75+
/**
76+
* Filters files of type {@link InternalDataFile} from the list of files removed to the source
77+
* table and returns the list.
78+
*/
79+
public Set<InternalDataFile> dataFilesRemoved() {
80+
Set<InternalDataFile> result =
81+
getFilesRemoved().stream()
82+
.filter(InternalDataFile.class::isInstance)
83+
.map(file -> (InternalDataFile) file)
84+
.collect(Collectors.toSet());
85+
return result;
86+
}
5987
}

xtable-api/src/main/java/org/apache/xtable/model/storage/PartitionFileGroup.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,12 @@
2828

2929
import org.apache.xtable.model.stat.PartitionValue;
3030

31-
/** Represents a grouping of {@link InternalDataFile} with the same partition values. */
31+
/** Represents a grouping of {@link InternalFile} with the same partition values. */
3232
@Value
3333
@Builder
3434
public class PartitionFileGroup {
3535
List<PartitionValue> partitionValues;
36-
List<InternalDataFile> files;
36+
List<? extends InternalFile> files;
3737

3838
public static List<PartitionFileGroup> fromFiles(List<InternalDataFile> files) {
3939
return fromFiles(files.stream());
@@ -51,4 +51,12 @@ public static List<PartitionFileGroup> fromFiles(Stream<InternalDataFile> files)
5151
.build())
5252
.collect(Collectors.toList());
5353
}
54+
55+
/** Filters storage files of type {@link InternalDataFile} and returns them. */
56+
public List<InternalDataFile> getDataFiles() {
57+
return files.stream()
58+
.filter(InternalDataFile.class::isInstance)
59+
.map(file -> (InternalDataFile) file)
60+
.collect(Collectors.toList());
61+
}
5462
}

xtable-api/src/main/java/org/apache/xtable/spi/sync/ConversionTarget.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import org.apache.xtable.model.metadata.TableSyncMetadata;
2929
import org.apache.xtable.model.schema.InternalPartitionField;
3030
import org.apache.xtable.model.schema.InternalSchema;
31-
import org.apache.xtable.model.storage.DataFilesDiff;
31+
import org.apache.xtable.model.storage.InternalFilesDiff;
3232
import org.apache.xtable.model.storage.PartitionFileGroup;
3333

3434
/** A client that provides the major functionality for syncing changes to a target system. */
@@ -68,9 +68,9 @@ public interface ConversionTarget {
6868
* Syncs the changes in files to the target system. This method is required to both add and remove
6969
* files.
7070
*
71-
* @param dataFilesDiff the diff that needs to be synced
71+
* @param internalFilesDiff the diff that needs to be synced
7272
*/
73-
void syncFilesForDiff(DataFilesDiff dataFilesDiff);
73+
void syncFilesForDiff(InternalFilesDiff internalFilesDiff);
7474

7575
/**
7676
* Starts the sync and performs any initialization required

xtable-api/src/test/java/org/apache/xtable/model/storage/TestDataFilesDiff.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,13 @@ void testFrom() {
3737
InternalDataFile.builder().physicalPath("file://already_in_target2.parquet").build();
3838
InternalDataFile sourceFileInTargetAlready =
3939
InternalDataFile.builder().physicalPath("file://already_in_target3.parquet").build();
40-
DataFilesDiff actual =
41-
DataFilesDiff.from(
40+
InternalFilesDiff actual =
41+
InternalFilesDiff.from(
4242
Arrays.asList(sourceFile1, sourceFile2, sourceFileInTargetAlready),
4343
Arrays.asList(targetFile1, targetFile2, sourceFileInTargetAlready));
4444

45-
DataFilesDiff expected =
46-
DataFilesDiff.builder()
45+
InternalFilesDiff expected =
46+
InternalFilesDiff.builder()
4747
.filesAdded(Arrays.asList(sourceFile1, sourceFile2))
4848
.filesRemoved(Arrays.asList(targetFile1, targetFile2))
4949
.build();

xtable-api/src/test/java/org/apache/xtable/model/storage/TestFilesDiff.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ void findDiffFromFileGroups() {
5050
previousFiles.put("file2NoGroup", file2);
5151
previousFiles.put("file2Group2", file3);
5252

53-
FilesDiff<InternalDataFile, File> diff =
53+
FilesDiff<InternalFile, File> diff =
5454
FilesDiff.findNewAndRemovedFiles(latestFileGroups, previousFiles);
5555
assertEquals(2, diff.getFilesAdded().size());
5656
assertTrue(diff.getFilesAdded().contains(file1Group2));

xtable-api/src/test/java/org/apache/xtable/spi/extractor/TestExtractFromSource.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@
4141
import org.apache.xtable.model.InternalSnapshot;
4242
import org.apache.xtable.model.InternalTable;
4343
import org.apache.xtable.model.TableChange;
44-
import org.apache.xtable.model.storage.DataFilesDiff;
4544
import org.apache.xtable.model.storage.InternalDataFile;
45+
import org.apache.xtable.model.storage.InternalFilesDiff;
4646
import org.apache.xtable.model.storage.PartitionFileGroup;
4747

4848
public class TestExtractFromSource {
@@ -89,7 +89,7 @@ public void extractTableChanges() {
8989
TableChange.builder()
9090
.tableAsOfChange(tableAtFirstInstant)
9191
.filesDiff(
92-
DataFilesDiff.builder().fileAdded(newFile1).fileRemoved(initialFile2).build())
92+
InternalFilesDiff.builder().fileAdded(newFile1).fileRemoved(initialFile2).build())
9393
.sourceIdentifier("0")
9494
.build();
9595
when(mockConversionSource.getTableChangeForCommit(firstCommitToSync))
@@ -98,7 +98,7 @@ public void extractTableChanges() {
9898
TableChange.builder()
9999
.tableAsOfChange(tableAtFirstInstant)
100100
.filesDiff(
101-
DataFilesDiff.builder().fileAdded(newFile1).fileRemoved(initialFile2).build())
101+
InternalFilesDiff.builder().fileAdded(newFile1).fileRemoved(initialFile2).build())
102102
.sourceIdentifier("0")
103103
.build();
104104

@@ -112,7 +112,7 @@ public void extractTableChanges() {
112112
TableChange.builder()
113113
.tableAsOfChange(tableAtSecondInstant)
114114
.filesDiff(
115-
DataFilesDiff.builder()
115+
InternalFilesDiff.builder()
116116
.filesAdded(Arrays.asList(newFile2, newFile3))
117117
.filesRemoved(Arrays.asList(initialFile3, newFile1))
118118
.build())
@@ -124,7 +124,7 @@ public void extractTableChanges() {
124124
TableChange.builder()
125125
.tableAsOfChange(tableAtSecondInstant)
126126
.filesDiff(
127-
DataFilesDiff.builder()
127+
InternalFilesDiff.builder()
128128
.filesAdded(Arrays.asList(newFile2, newFile3))
129129
.filesRemoved(Arrays.asList(initialFile3, newFile1))
130130
.build())

0 commit comments

Comments
 (0)