@@ -73,7 +73,7 @@
 //! assert_eq!(all_items, all_items_sorted);
 //!
 //! // If you want to iterate through the items in unsorted order.
-//! let unsorted_items: Vec<_> = UnsortedShardReader::<DataStruct>::open(filename)?.collect();
+//! let unsorted_items: Vec<_> = UnsortedShardReader::<DataStruct>::open(filename).collect();
 //! // You will get the items in the order they are written to disk.
 //! assert_eq!(unsorted_items.len(), all_items.len());
 //!
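With this change, `open` no longer returns a `Result`; failures surface from the iterator itself, which yields `Result<T, Error>` items. Below is a minimal standalone sketch of the resulting calling pattern. It is not part of this commit; `DataStruct` and `read_unsorted` are placeholders mirroring the doc example above.

```rust
use anyhow::Error;
use serde::{Deserialize, Serialize};
use shardio::UnsortedShardReader;

// Placeholder item type, standing in for the `DataStruct` of the doc example.
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
struct DataStruct {
    a: u64,
    b: u32,
}

fn read_unsorted(filename: &str) -> Result<Vec<DataStruct>, Error> {
    // open() itself cannot fail any more; I/O and decode errors are reported
    // per item, so collect into a Result to stop at the first error.
    UnsortedShardReader::<DataStruct>::open(filename).collect()
}
```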
@@ -84,6 +84,7 @@
 
 #![deny(warnings)]
 #![deny(missing_docs)]
+use std::any::type_name;
 use std::borrow::Cow;
 use std::collections::BTreeSet;
 use std::fs::File;
@@ -94,7 +95,6 @@ use std::os::unix::fs::FileExt;
 use std::path::Path;
 use std::sync::{atomic::AtomicBool, Arc, Mutex};
 use std::thread;
-use std::{any::type_name, path::PathBuf};
 
 use anyhow::{format_err, Error};
 use bincode::{deserialize_from, serialize_into};
@@ -112,6 +112,9 @@ pub mod helper;
 pub use crate::range::Range;
 use range::Rorder;
 
+mod unsorted;
+pub use unsorted::*;
+
 /// The size (in bytes) of a ShardIter object (mostly buffers)
 // ? sizeof(T)
 // + 8 usize items_remaining
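The new `unsorted` module is private, but the blanket re-export keeps the reader available at the crate root, so downstream imports are unaffected. A small sketch of the unchanged public path (assuming the published `shardio` crate name), mirroring the empty-set test further down:

```rust
use std::path::PathBuf;

// The public path is still `shardio::UnsortedShardReader`, courtesy of
// `pub use unsorted::*;` above.
use shardio::UnsortedShardReader;

fn main() {
    // An empty shard-file set simply yields an empty iterator.
    let shard_files = Vec::<PathBuf>::new();
    let reader = UnsortedShardReader::<u8>::open_set(&shard_files);
    assert_eq!(reader.count(), 0);
}
```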
@@ -1392,157 +1395,6 @@ where
     }
 }
 
-#[derive(Clone, Copy, Serialize, Deserialize, Debug, PartialEq, Eq, PartialOrd, Ord)]
-/// A group of `len_items` items, from shard `shard`, stored at position `offset`, using `len_bytes` bytes on-disk.
-/// Similar to ShardRecord, just that we don't store the key. Used for `UnsortedShardReader`
-struct KeylessShardRecord {
-    offset: usize,
-    len_bytes: usize,
-    len_items: usize,
-}
-
-/// Read from a collection of shardio files in the order in which items are written without
-/// considering the sort order.
-///
-/// Useful if you just want to iterate over all the items irrespective of the ordering.
-///
-#[allow(dead_code)]
-pub struct UnsortedShardReader<T, S = DefaultSort>
-where
-    S: SortKey<T>,
-{
-    shard_files: Vec<PathBuf>,
-    // Which file among the shard_files are we reading from
-    active_file_num: usize,
-    // The index of the shard file we are reading from
-    active_file_index: Vec<KeylessShardRecord>,
-    // Which KeylessShardRecord among the active_file_index are we reading now
-    active_index_num: usize,
-    // How many items within a compressed block have we read so far
-    active_index_items_read: usize,
-    decoder: Option<lz4::Decoder<BufReader<ReadAdapter<File, File>>>>,
-    phantom: PhantomData<(T, S)>,
-}
-
-impl<T, S> UnsortedShardReader<T, S>
-where
-    T: DeserializeOwned,
-    <S as SortKey<T>>::Key: Clone + Ord + DeserializeOwned,
-    S: SortKey<T>,
-{
-    /// Open a single shard file
-    pub fn open<P: AsRef<Path>>(shard_file: P) -> Result<Self, Error> {
-        UnsortedShardReader::open_set(&[shard_file])
-    }
-
-    /// Open a set of shard files
-    pub fn open_set<P: AsRef<Path>>(shard_files: &[P]) -> Result<Self, Error> {
-        let shard_files: Vec<_> = shard_files.iter().map(|f| f.as_ref().into()).collect();
-
-        Ok(UnsortedShardReader {
-            shard_files,
-            active_file_num: 0,
-            active_file_index: Vec::new(),
-            active_index_num: 0,
-            active_index_items_read: 0,
-            decoder: None,
-            phantom: PhantomData,
-        })
-    }
-}
-
-impl<T, S> Iterator for UnsortedShardReader<T, S>
-where
-    T: DeserializeOwned,
-    <S as SortKey<T>>::Key: Clone + Ord + DeserializeOwned,
-    S: SortKey<T>,
-{
-    type Item = Result<T, Error>;
-    fn next(&mut self) -> Option<Self::Item> {
-        loop {
-            if self.active_file_num >= self.shard_files.len() {
-                // We are done going through all the files
-                return None;
-            }
-            if self.decoder.is_none() {
-                // Open the next file
-                self.active_index_num = 0;
-                self.active_index_items_read = 0;
-
-                let reader = match ShardReaderSingle::<T, S>::open(
-                    &self.shard_files[self.active_file_num],
-                ) {
-                    Ok(r) => r,
-                    Err(e) => return Some(Err(e)),
-                };
-                self.active_file_index = reader
-                    .index
-                    .into_iter()
-                    .map(|r| KeylessShardRecord {
-                        offset: r.offset,
-                        len_bytes: r.len_bytes,
-                        len_items: r.len_items,
-                    })
-                    .collect();
-
-                let decoder = match self.active_file_index.first() {
-                    Some(rec) => lz4::Decoder::new(BufReader::new(ReadAdapter::new(
-                        reader.file,
-                        rec.offset,
-                        rec.len_bytes,
-                    ))),
-                    None => {
-                        // There are no chunks in this file
-                        self.active_file_num += 1;
-                        continue;
-                    }
-                };
-                self.decoder = match decoder {
-                    Ok(d) => Some(d),
-                    Err(e) => return Some(Err(e.into())),
-                };
-            }
-            if self.active_index_items_read
-                >= self.active_file_index[self.active_index_num].len_items
-            {
-                // We are done with this chunk
-                self.active_index_num += 1;
-                self.active_index_items_read = 0;
-
-                if self.active_index_num >= self.active_file_index.len() {
-                    // We are done with this file
-                    self.decoder = None;
-                    self.active_file_num += 1;
-                    self.active_index_num = 0;
-                } else {
-                    // Load up the decoder for the next chunk
-                    let decoder = self.decoder.take().unwrap();
-                    let (buf, _) = decoder.finish();
-                    let file = buf.into_inner().file;
-                    let rec = self.active_file_index[self.active_index_num];
-                    let decoder = lz4::Decoder::new(BufReader::new(ReadAdapter::new(
-                        file,
-                        rec.offset,
-                        rec.len_bytes,
-                    )));
-                    self.decoder = match decoder {
-                        Ok(d) => Some(d),
-                        Err(e) => return Some(Err(e.into())),
-                    };
-                }
-                continue;
-            } else {
-                // Read the next item
-                self.active_index_items_read += 1;
-                match deserialize_from(self.decoder.as_mut().unwrap()) {
-                    Ok(item) => return Some(Ok(item)),
-                    Err(e) => return Some(Err(e.into())),
-                }
-            }
-        }
-    }
-}
-
 #[cfg(test)]
 mod shard_tests {
     use super::*;
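The removed reader walks each file's chunk index (offset, compressed length, item count) and decompresses one lz4 block at a time. Those per-chunk `len_items` counts are also what makes the `skip_lazy` exercised by the tests below possible: whole chunks can be passed over without decompressing them. The sketch below only illustrates that bookkeeping with a hypothetical `ChunkIndexEntry` type; it is not the implementation that moved into the `unsorted` module.

```rust
/// Hypothetical stand-in for a per-chunk index entry (offset and compressed
/// length omitted; only the item count matters for planning a skip).
struct ChunkIndexEntry {
    len_items: usize,
}

/// Work out how many whole chunks can be skipped without decompression and
/// how many items still need to be discarded inside the next chunk.
fn plan_skip(index: &[ChunkIndexEntry], to_skip: usize) -> (usize, usize) {
    let mut remaining = to_skip;
    let mut chunks_skipped = 0;
    for entry in index {
        if remaining < entry.len_items {
            break;
        }
        remaining -= entry.len_items;
        chunks_skipped += 1;
    }
    (chunks_skipped, remaining)
}

fn main() {
    let index = [
        ChunkIndexEntry { len_items: 4 },
        ChunkIndexEntry { len_items: 4 },
        ChunkIndexEntry { len_items: 2 },
    ];
    // Skipping 9 items passes over the first two chunks entirely and leaves
    // 1 item to discard inside the third chunk.
    assert_eq!(plan_skip(&index, 9), (2, 1));
}
```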
@@ -1553,6 +1405,7 @@ mod shard_tests {
     use std::fmt::Debug;
     use std::hash::Hash;
     use std::iter::{repeat, FromIterator};
+    use std::path::PathBuf;
 
     #[derive(Copy, Clone, Eq, PartialEq, Serialize, Deserialize, Debug, PartialOrd, Ord, Hash)]
     struct T1 {
@@ -1835,7 +1688,7 @@ mod shard_tests {
         }
 
         let unsorted_items =
-            UnsortedShardReader::<T, S>::open_set(&files)?.collect::<Result<Vec<_>, _>>()?;
+            UnsortedShardReader::<T, S>::open_set(&files).collect::<Result<Vec<_>, _>>()?;
         assert!(set_compare(&out_items, &unsorted_items));
 
         Ok(out_items)
@@ -1938,10 +1791,12 @@ mod shard_tests {
             disk_chunk_size, producer_chunk_size, n_items
         );
 
-        let tmp = tempfile::NamedTempFile::new()?;
+        // Write two files to check file set reading logic.
 
-        // Write and close file
-        let true_items = {
+        let create = || -> Result<_, Error> {
+            let tmp = tempfile::NamedTempFile::new()?;
+
+            // Write and close file
             let mut writer: ShardWriter<T1> = ShardWriter::new(
                 tmp.path(),
                 producer_chunk_size,
@@ -1955,12 +1810,20 @@ mod shard_tests {
 
             writer.finish()?;
             true_items.sort();
-            true_items
+            Ok((tmp, true_items))
         };
 
+        let (tmp0, true_items0) = create()?;
+        let (tmp1, true_items1) = create()?;
+
+        let mut true_items = Vec::from_iter(true_items0.into_iter().chain(true_items1));
+        true_items.sort();
+
+        let file_set = [tmp0.path(), tmp1.path()];
+
         if do_read {
-            // Open finished file
-            let reader = ShardReader::<T1>::open(tmp.path())?;
+            // Open finished files
+            let reader = ShardReader::<T1>::open_set(&file_set)?;
             let iter = reader.iter_range(&Range::all())?;
 
             let all_items_res: Result<Vec<_>, Error> = iter.collect();
@@ -1974,8 +1837,8 @@ mod shard_tests {
             }
 
            for rc in [1, 3, 8, 15, 27].iter() {
-                // Open finished file & test chunked reads
-                let set_reader = ShardReader::<T1>::open(tmp.path())?;
+                // Open finished files & test chunked reads
+                let set_reader = ShardReader::<T1>::open_set(&file_set)?;
                 let mut all_items_chunks = Vec::new();
 
                 // Read in chunks
@@ -2000,10 +1863,29 @@ mod shard_tests {
             }
 
             // Check the unsorted read
-            let unsorted_reader = UnsortedShardReader::<T1>::open(tmp.path())?;
+            assert_eq!(2 * n_items, UnsortedShardReader::<T1>::len(&file_set)?);
+            let unsorted_reader = UnsortedShardReader::<T1>::open_set(&file_set);
             let all_items_res: Result<Vec<_>, Error> = unsorted_reader.collect();
             let all_items = all_items_res?;
             assert!(set_compare(&true_items, &all_items));
+
+            let check_unsorted_skip = |to_skip: usize| -> Result<(), Error> {
+                let mut unsorted_reader_skip = UnsortedShardReader::<T1>::open_set(&file_set);
+                let skipped = unsorted_reader_skip.skip_lazy(to_skip)?;
+                assert_eq!(to_skip, skipped);
+                let all_items_res_skip: Result<Vec<_>, Error> = unsorted_reader_skip.collect();
+                let all_items_skip = all_items_res_skip?;
+                assert_eq!(&all_items[to_skip..], &all_items_skip);
+                Ok(())
+            };
+
+            check_unsorted_skip(0)?;
+            check_unsorted_skip(1)?;
+            check_unsorted_skip(disk_chunk_size)?;
+            check_unsorted_skip((disk_chunk_size * 3) + 1)?;
+            check_unsorted_skip(n_items)?; // skip entire first file
+            check_unsorted_skip(n_items + 1)?; // skip entire first file plus next item
+            check_unsorted_skip(n_items * 2)?; // skip everything
         }
         Ok(())
     }
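The test above pins down the `skip_lazy` contract: it advances the reader by up to the requested number of items, returns how many were actually skipped, and leaves the iterator positioned at the next on-disk item. A hedged usage sketch of resuming a partially processed read follows; the `resume` helper and the `u8` item type are illustrative, not part of the crate.

```rust
use anyhow::Error;
use shardio::UnsortedShardReader;

/// Resume reading a shard file after `already_processed` items, assuming the
/// file has not changed since the earlier pass.
fn resume<P: AsRef<std::path::Path>>(
    shard_file: P,
    already_processed: usize,
) -> Result<Vec<u8>, Error> {
    let mut reader = UnsortedShardReader::<u8>::open(shard_file);
    // skip_lazy reports how many items were actually skipped; it should not
    // exceed the request (it is smaller if the file has fewer items).
    let skipped = reader.skip_lazy(already_processed)?;
    assert!(skipped <= already_processed);
    // The remaining items come back in on-disk order.
    reader.collect()
}
```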
@@ -2084,7 +1966,7 @@ mod shard_tests {
         assert!(set_compare(&true_items, &all_items_chunks));
 
         // Check the unsorted read
-        let unsorted_reader = UnsortedShardReader::<T1, FieldDSort>::open(tmp.path())?;
+        let unsorted_reader = UnsortedShardReader::<T1, FieldDSort>::open(tmp.path());
         let all_items_res: Result<Vec<_>, Error> = unsorted_reader.collect();
         let all_items = all_items_res?;
         assert!(set_compare(&true_items, &all_items));
@@ -2262,7 +2144,13 @@ mod shard_tests {
     #[test]
     fn test_empty_open_set() {
         let shard_files = Vec::<PathBuf>::new();
-        let reader = UnsortedShardReader::<u8>::open_set(&shard_files).unwrap();
+        let reader = UnsortedShardReader::<u8>::open_set(&shard_files);
+        assert_eq!(reader.count(), 0);
+
+        // Test that skipping an empty set works correctly.
+        let mut reader = UnsortedShardReader::<u8>::open_set(&shard_files);
+        let skipped = reader.skip_lazy(10).unwrap();
+        assert_eq!(0, skipped);
         assert_eq!(reader.count(), 0);
     }
 }