@@ -273,8 +273,8 @@ pub enum ReadError {
273
273
#[ error( "parquet value type mismatch: got {0:?} expected {1:?}" ) ]
274
274
WrongParquetType ( ParquetValue , Type ) ,
275
275
276
- #[ error( "only append-only delta tables are supported" ) ]
277
- DeltaLakeForbiddenRemoval ,
276
+ #[ error( "deletion vectors in delta tables are not supported" ) ]
277
+ DeltaDeletionVectorsNotSupported ,
278
278
}
279
279
280
280
#[ derive( Debug , thiserror:: Error , Clone , Eq , PartialEq ) ]
@@ -1909,6 +1909,18 @@ impl ObjectDownloader {
1909
1909
}
1910
1910
}
1911
1911
1912
+ #[ derive( Debug ) ]
1913
+ pub struct DeltaReaderAction {
1914
+ action_type : DataEventType ,
1915
+ path : String ,
1916
+ }
1917
+
1918
+ impl DeltaReaderAction {
1919
+ pub fn new ( action_type : DataEventType , path : String ) -> Self {
1920
+ Self { action_type, path }
1921
+ }
1922
+ }
1923
+
1912
1924
pub struct DeltaTableReader {
1913
1925
table : DeltaTable ,
1914
1926
streaming_mode : ConnectorMode ,
@@ -1921,7 +1933,8 @@ pub struct DeltaTableReader {
1921
1933
current_version : i64 ,
1922
1934
last_fully_read_version : Option < i64 > ,
1923
1935
rows_read_within_version : i64 ,
1924
- parquet_files_queue : VecDeque < String > ,
1936
+ parquet_files_queue : VecDeque < DeltaReaderAction > ,
1937
+ current_event_type : DataEventType ,
1925
1938
}
1926
1939
1927
1940
const DELTA_LAKE_INITIAL_POLL_DURATION : Duration = Duration :: from_millis ( 5 ) ;
@@ -1940,7 +1953,7 @@ impl DeltaTableReader {
1940
1953
let runtime = create_async_tokio_runtime ( ) ?;
1941
1954
let table = runtime. block_on ( async { open_delta_table ( path, storage_options) . await } ) ?;
1942
1955
let current_version = table. version ( ) ;
1943
- let parquet_files_queue = Self :: get_file_uris ( & table) ?;
1956
+ let parquet_files_queue = Self :: get_reader_actions ( & table, path ) ?;
1944
1957
1945
1958
Ok ( Self {
1946
1959
table,
@@ -1955,21 +1968,39 @@ impl DeltaTableReader {
1955
1968
reader : None ,
1956
1969
parquet_files_queue,
1957
1970
rows_read_within_version : 0 ,
1971
+ current_event_type : DataEventType :: Insert ,
1958
1972
} )
1959
1973
}
1960
1974
1961
- fn get_file_uris ( table : & DeltaTable ) -> Result < VecDeque < String > , ReadError > {
1962
- Ok ( table. get_file_uris ( ) ?. collect ( ) )
1975
+ fn get_reader_actions (
1976
+ table : & DeltaTable ,
1977
+ base_path : & str ,
1978
+ ) -> Result < VecDeque < DeltaReaderAction > , ReadError > {
1979
+ Ok ( table
1980
+ . snapshot ( ) ?
1981
+ . file_actions ( ) ?
1982
+ . into_iter ( )
1983
+ . map ( |action| {
1984
+ DeltaReaderAction :: new (
1985
+ DataEventType :: Insert ,
1986
+ Self :: ensure_absolute_path_with_base ( & action. path , base_path) ,
1987
+ )
1988
+ } )
1989
+ . collect ( ) )
1963
1990
}
1964
1991
1965
1992
fn ensure_absolute_path ( & self , path : & str ) -> String {
1966
- if path. starts_with ( & self . base_path ) {
1993
+ Self :: ensure_absolute_path_with_base ( path, & self . base_path )
1994
+ }
1995
+
1996
/// Returns `path` prefixed with `base_path` unless it already starts with it.
///
/// When a prefix has to be added, exactly one `/` separates the two parts:
/// the separator is inserted only if `base_path` does not already end with it.
///
/// NOTE(review): the "already prefixed" test is a plain textual prefix match,
/// so a relative name that merely begins with the same characters as
/// `base_path` is returned unchanged — confirm this is acceptable for the
/// paths callers pass in.
fn ensure_absolute_path_with_base(path: &str, base_path: &str) -> String {
    if path.starts_with(base_path) {
        return path.to_owned();
    }

    let separator = if base_path.ends_with('/') { "" } else { "/" };
    format!("{base_path}{separator}{path}")
}
1975
2006
@@ -1998,18 +2029,23 @@ impl DeltaTableReader {
1998
2029
for action in txn_actions {
1999
2030
// Protocol description for Delta Lake actions:
2000
2031
// https://github.com/delta-io/delta/blob/master/PROTOCOL.md#actions
2001
- match action {
2032
+ let action = match action {
2002
2033
DeltaLakeAction :: Remove ( action) => {
2003
- if action. data_change {
2004
- return Err ( ReadError :: DeltaLakeForbiddenRemoval ) ;
2034
+ if action. deletion_vector . is_some ( ) {
2035
+ return Err ( ReadError :: DeltaDeletionVectorsNotSupported ) ;
2005
2036
}
2037
+ data_changed |= action. data_change ;
2038
+ let action_path = self . ensure_absolute_path ( & action. path ) ;
2039
+ DeltaReaderAction :: new ( DataEventType :: Delete , action_path)
2006
2040
}
2007
2041
DeltaLakeAction :: Add ( action) => {
2008
2042
data_changed |= action. data_change ;
2009
- added_blocks. push_back ( self . ensure_absolute_path ( & action. path ) ) ;
2043
+ let action_path = self . ensure_absolute_path ( & action. path ) ;
2044
+ DeltaReaderAction :: new ( DataEventType :: Insert , action_path)
2010
2045
}
2011
2046
_ => continue ,
2012
2047
} ;
2048
+ added_blocks. push_back ( action) ;
2013
2049
}
2014
2050
2015
2051
self . last_fully_read_version = Some ( self . current_version ) ;
@@ -2040,9 +2076,9 @@ impl DeltaTableReader {
2040
2076
return Err ( ReadError :: NoObjectsToRead ) ;
2041
2077
}
2042
2078
}
2043
- let next_parquet_file = self . parquet_files_queue . pop_front ( ) . unwrap ( ) ;
2044
- let local_object =
2045
- self . object_downloader . download_object ( & next_parquet_file ) ? ;
2079
+ let next_action = self . parquet_files_queue . pop_front ( ) . unwrap ( ) ;
2080
+ let local_object = self . object_downloader . download_object ( & next_action . path ) ? ;
2081
+ self . current_event_type = next_action . action_type ;
2046
2082
self . reader = Some ( DeltaLakeParquetReader :: try_from ( local_object) ?. into_iter ( ) ) ;
2047
2083
}
2048
2084
}
@@ -2116,7 +2152,7 @@ impl Reader for DeltaTableReader {
2116
2152
2117
2153
self . rows_read_within_version += 1 ;
2118
2154
Ok ( ReadResult :: Data (
2119
- ReaderContext :: from_diff ( DataEventType :: Insert , None , row_map. into ( ) ) ,
2155
+ ReaderContext :: from_diff ( self . current_event_type , None , row_map. into ( ) ) ,
2120
2156
(
2121
2157
OffsetKey :: Empty ,
2122
2158
OffsetValue :: DeltaTablePosition {
@@ -2156,15 +2192,17 @@ impl Reader for DeltaTableReader {
2156
2192
// The offset is based on the full set of files present for `version`
2157
2193
self . current_version = * version;
2158
2194
runtime. block_on ( async { self . table . load_version ( self . current_version ) . await } ) ?;
2159
- self . parquet_files_queue = Self :: get_file_uris ( & self . table ) ?;
2195
+ self . parquet_files_queue = Self :: get_reader_actions ( & self . table , & self . base_path ) ?;
2160
2196
}
2161
2197
2162
2198
self . rows_read_within_version = 0 ;
2163
2199
while !self . parquet_files_queue . is_empty ( ) {
2164
2200
let next_block = self . parquet_files_queue . front ( ) . unwrap ( ) ;
2165
- let block_size = Self :: rows_in_file_count ( next_block) ?;
2201
+ let block_size = Self :: rows_in_file_count ( & next_block. path ) ?;
2166
2202
if self . rows_read_within_version + block_size <= * n_rows_to_rewind {
2167
- info ! ( "Skipping parquet block with the size of {block_size} entries: {next_block}" ) ;
2203
+ info ! (
2204
+ "Skipping parquet block with the size of {block_size} entries: {next_block:?}"
2205
+ ) ;
2168
2206
self . rows_read_within_version += block_size;
2169
2207
self . parquet_files_queue . pop_front ( ) ;
2170
2208
} else {
0 commit comments