
Commit 54f2453

feat: allow lazily chunking unsorted iteration (#55)
* Move unsorted code to a new module, refactor, implement skipping.
* Improve reading tests by using two files to test set reading.
1 parent: e75937d
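
This change moves `UnsortedShardReader` into a new `unsorted` module, makes `open`/`open_set` infallible (errors now surface through the iterator items), and adds lazy skipping plus a `len` helper so an unsorted pass over a file set can be chunked. A rough usage sketch of the new surface follows; the `Rec` type and the file names are hypothetical, and the exact signature of `UnsortedShardReader::len` is inferred from the updated tests below.

use anyhow::Error;
use serde::{Deserialize, Serialize};
use shardio::UnsortedShardReader;

#[derive(Clone, Copy, Serialize, Deserialize, Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Rec {
    key: u64,
    value: u32,
}

fn main() -> Result<(), Error> {
    // Hypothetical shard files, assumed to have been written with ShardWriter::<Rec>.
    let file_set = ["chunk0.shardio", "chunk1.shardio"];

    // Total item count across the whole file set.
    let total = UnsortedShardReader::<Rec>::len(&file_set)?;

    // open_set no longer returns a Result; open errors are reported by the iterator.
    let mut reader = UnsortedShardReader::<Rec>::open_set(&file_set);

    // Lazily skip the first half of the items before iterating.
    let skipped = reader.skip_lazy(total / 2)?;
    assert_eq!(skipped, total / 2);

    // The remaining items come back in on-disk (unsorted) order.
    for item in reader {
        let _rec: Rec = item?;
    }
    Ok(())
}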

File tree

3 files changed: +404 -166 lines changed

benches/my_bench.rs

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ fn main() {

     // Open finished file
     let all_items = if unsorted_read {
-        UnsortedShardReader::<T1>::open(tmp.path())?.collect::<Result<_, _>>()?
+        UnsortedShardReader::<T1>::open(tmp.path()).collect::<Result<_, _>>()?
     } else {
         let reader = ShardReader::<T1>::open(tmp.path())?;


src/lib.rs

Lines changed: 53 additions & 165 deletions
@@ -73,7 +73,7 @@
 //! assert_eq!(all_items, all_items_sorted);
 //!
 //! // If you want to iterate through the items in unsorted order.
-//! let unsorted_items: Vec<_> = UnsortedShardReader::<DataStruct>::open(filename)?.collect();
+//! let unsorted_items: Vec<_> = UnsortedShardReader::<DataStruct>::open(filename).collect();
 //! // You will get the items in the order they are written to disk.
 //! assert_eq!(unsorted_items.len(), all_items.len());
 //!
@@ -84,6 +84,7 @@

 #![deny(warnings)]
 #![deny(missing_docs)]
+use std::any::type_name;
 use std::borrow::Cow;
 use std::collections::BTreeSet;
 use std::fs::File;
@@ -94,7 +95,6 @@ use std::os::unix::fs::FileExt;
 use std::path::Path;
 use std::sync::{atomic::AtomicBool, Arc, Mutex};
 use std::thread;
-use std::{any::type_name, path::PathBuf};

 use anyhow::{format_err, Error};
 use bincode::{deserialize_from, serialize_into};
@@ -112,6 +112,9 @@ pub mod helper;
 pub use crate::range::Range;
 use range::Rorder;

+mod unsorted;
+pub use unsorted::*;
+
 /// The size (in bytes) of a ShardIter object (mostly buffers)
 // ? sizeof(T)
 // + 8 usize items_remaining
@@ -1392,157 +1395,6 @@ where
     }
 }

-#[derive(Clone, Copy, Serialize, Deserialize, Debug, PartialEq, Eq, PartialOrd, Ord)]
-/// A group of `len_items` items, from shard `shard`, stored at position `offset`, using `len_bytes` bytes on-disk.
-/// Similar to ShardRecord, just that we don't store the key. Used for `UnsortedShardReader`
-struct KeylessShardRecord {
-    offset: usize,
-    len_bytes: usize,
-    len_items: usize,
-}
-
-/// Read from a collection of shardio files in the order in which items are written without
-/// considering the sort order.
-///
-/// Useful if you just want to iterate over all the items irrespective of the ordering.
-///
-#[allow(dead_code)]
-pub struct UnsortedShardReader<T, S = DefaultSort>
-where
-    S: SortKey<T>,
-{
-    shard_files: Vec<PathBuf>,
-    // Which file among the shard_files are we reading from
-    active_file_num: usize,
-    // The index of the shard file we are reading from
-    active_file_index: Vec<KeylessShardRecord>,
-    // Which KeylessShardRecord among the active_file_index are we reading now
-    active_index_num: usize,
-    // How many items within a compressed block have we read so far
-    active_index_items_read: usize,
-    decoder: Option<lz4::Decoder<BufReader<ReadAdapter<File, File>>>>,
-    phantom: PhantomData<(T, S)>,
-}
-
-impl<T, S> UnsortedShardReader<T, S>
-where
-    T: DeserializeOwned,
-    <S as SortKey<T>>::Key: Clone + Ord + DeserializeOwned,
-    S: SortKey<T>,
-{
-    /// Open a single shard file
-    pub fn open<P: AsRef<Path>>(shard_file: P) -> Result<Self, Error> {
-        UnsortedShardReader::open_set(&[shard_file])
-    }
-
-    /// Open a set of shard files
-    pub fn open_set<P: AsRef<Path>>(shard_files: &[P]) -> Result<Self, Error> {
-        let shard_files: Vec<_> = shard_files.iter().map(|f| f.as_ref().into()).collect();
-
-        Ok(UnsortedShardReader {
-            shard_files,
-            active_file_num: 0,
-            active_file_index: Vec::new(),
-            active_index_num: 0,
-            active_index_items_read: 0,
-            decoder: None,
-            phantom: PhantomData,
-        })
-    }
-}
-
-impl<T, S> Iterator for UnsortedShardReader<T, S>
-where
-    T: DeserializeOwned,
-    <S as SortKey<T>>::Key: Clone + Ord + DeserializeOwned,
-    S: SortKey<T>,
-{
-    type Item = Result<T, Error>;
-    fn next(&mut self) -> Option<Self::Item> {
-        loop {
-            if self.active_file_num >= self.shard_files.len() {
-                // We are done going through all the files
-                return None;
-            }
-            if self.decoder.is_none() {
-                // Open the next file
-                self.active_index_num = 0;
-                self.active_index_items_read = 0;
-
-                let reader = match ShardReaderSingle::<T, S>::open(
-                    &self.shard_files[self.active_file_num],
-                ) {
-                    Ok(r) => r,
-                    Err(e) => return Some(Err(e)),
-                };
-                self.active_file_index = reader
-                    .index
-                    .into_iter()
-                    .map(|r| KeylessShardRecord {
-                        offset: r.offset,
-                        len_bytes: r.len_bytes,
-                        len_items: r.len_items,
-                    })
-                    .collect();
-
-                let decoder = match self.active_file_index.first() {
-                    Some(rec) => lz4::Decoder::new(BufReader::new(ReadAdapter::new(
-                        reader.file,
-                        rec.offset,
-                        rec.len_bytes,
-                    ))),
-                    None => {
-                        // There are no chunks in this file
-                        self.active_file_num += 1;
-                        continue;
-                    }
-                };
-                self.decoder = match decoder {
-                    Ok(d) => Some(d),
-                    Err(e) => return Some(Err(e.into())),
-                };
-            }
-            if self.active_index_items_read
-                >= self.active_file_index[self.active_index_num].len_items
-            {
-                // We are done with this chunk
-                self.active_index_num += 1;
-                self.active_index_items_read = 0;
-
-                if self.active_index_num >= self.active_file_index.len() {
-                    // We are done with this file
-                    self.decoder = None;
-                    self.active_file_num += 1;
-                    self.active_index_num = 0;
-                } else {
-                    // Load up the decoder for the next chunk
-                    let decoder = self.decoder.take().unwrap();
-                    let (buf, _) = decoder.finish();
-                    let file = buf.into_inner().file;
-                    let rec = self.active_file_index[self.active_index_num];
-                    let decoder = lz4::Decoder::new(BufReader::new(ReadAdapter::new(
-                        file,
-                        rec.offset,
-                        rec.len_bytes,
-                    )));
-                    self.decoder = match decoder {
-                        Ok(d) => Some(d),
-                        Err(e) => return Some(Err(e.into())),
-                    };
-                }
-                continue;
-            } else {
-                // Read the next item
-                self.active_index_items_read += 1;
-                match deserialize_from(self.decoder.as_mut().unwrap()) {
-                    Ok(item) => return Some(Ok(item)),
-                    Err(e) => return Some(Err(e.into())),
-                }
-            }
-        }
-    }
-}
-
 #[cfg(test)]
 mod shard_tests {
     use super::*;
@@ -1553,6 +1405,7 @@ mod shard_tests {
     use std::fmt::Debug;
     use std::hash::Hash;
     use std::iter::{repeat, FromIterator};
+    use std::path::PathBuf;

     #[derive(Copy, Clone, Eq, PartialEq, Serialize, Deserialize, Debug, PartialOrd, Ord, Hash)]
     struct T1 {
@@ -1835,7 +1688,7 @@ mod shard_tests {
         }

         let unsorted_items =
-            UnsortedShardReader::<T, S>::open_set(&files)?.collect::<Result<Vec<_>, _>>()?;
+            UnsortedShardReader::<T, S>::open_set(&files).collect::<Result<Vec<_>, _>>()?;
         assert!(set_compare(&out_items, &unsorted_items));

         Ok(out_items)
@@ -1938,10 +1791,12 @@ mod shard_tests {
             disk_chunk_size, producer_chunk_size, n_items
         );

-        let tmp = tempfile::NamedTempFile::new()?;
+        // Write two files to check file set reading logic.

-        // Write and close file
-        let true_items = {
+        let create = || -> Result<_, Error> {
+            let tmp = tempfile::NamedTempFile::new()?;
+
+            // Write and close file
             let mut writer: ShardWriter<T1> = ShardWriter::new(
                 tmp.path(),
                 producer_chunk_size,
@@ -1955,12 +1810,20 @@

             writer.finish()?;
             true_items.sort();
-            true_items
+            Ok((tmp, true_items))
         };

+        let (tmp0, true_items0) = create()?;
+        let (tmp1, true_items1) = create()?;
+
+        let mut true_items = Vec::from_iter(true_items0.into_iter().chain(true_items1));
+        true_items.sort();
+
+        let file_set = [tmp0.path(), tmp1.path()];
+
         if do_read {
-            // Open finished file
-            let reader = ShardReader::<T1>::open(tmp.path())?;
+            // Open finished files
+            let reader = ShardReader::<T1>::open_set(&file_set)?;
             let iter = reader.iter_range(&Range::all())?;

             let all_items_res: Result<Vec<_>, Error> = iter.collect();
@@ -1974,8 +1837,8 @@
             }

             for rc in [1, 3, 8, 15, 27].iter() {
-                // Open finished file & test chunked reads
-                let set_reader = ShardReader::<T1>::open(tmp.path())?;
+                // Open finished files & test chunked reads
+                let set_reader = ShardReader::<T1>::open_set(&file_set)?;
                 let mut all_items_chunks = Vec::new();

                 // Read in chunks
@@ -2000,10 +1863,29 @@
             }

             // Check the unsorted read
-            let unsorted_reader = UnsortedShardReader::<T1>::open(tmp.path())?;
+            assert_eq!(2 * n_items, UnsortedShardReader::<T1>::len(&file_set)?);
+            let unsorted_reader = UnsortedShardReader::<T1>::open_set(&file_set);
             let all_items_res: Result<Vec<_>, Error> = unsorted_reader.collect();
             let all_items = all_items_res?;
             assert!(set_compare(&true_items, &all_items));
+
+            let check_unsorted_skip = |to_skip: usize| -> Result<(), Error> {
+                let mut unsorted_reader_skip = UnsortedShardReader::<T1>::open_set(&file_set);
+                let skipped = unsorted_reader_skip.skip_lazy(to_skip)?;
+                assert_eq!(to_skip, skipped);
+                let all_items_res_skip: Result<Vec<_>, Error> = unsorted_reader_skip.collect();
+                let all_items_skip = all_items_res_skip?;
+                assert_eq!(&all_items[to_skip..], &all_items_skip);
+                Ok(())
+            };
+
+            check_unsorted_skip(0)?;
+            check_unsorted_skip(1)?;
+            check_unsorted_skip(disk_chunk_size)?;
+            check_unsorted_skip((disk_chunk_size * 3) + 1)?;
+            check_unsorted_skip(n_items)?; // skip entire first file
+            check_unsorted_skip(n_items + 1)?; // skip entire first file plus next item
+            check_unsorted_skip(n_items * 2)?; // skip everything
         }
         Ok(())
     }
@@ -2084,7 +1966,7 @@
         assert!(set_compare(&true_items, &all_items_chunks));

         // Check the unsorted read
-        let unsorted_reader = UnsortedShardReader::<T1, FieldDSort>::open(tmp.path())?;
+        let unsorted_reader = UnsortedShardReader::<T1, FieldDSort>::open(tmp.path());
         let all_items_res: Result<Vec<_>, Error> = unsorted_reader.collect();
         let all_items = all_items_res?;
         assert!(set_compare(&true_items, &all_items));
@@ -2262,7 +2144,13 @@
     #[test]
     fn test_empty_open_set() {
         let shard_files = Vec::<PathBuf>::new();
-        let reader = UnsortedShardReader::<u8>::open_set(&shard_files).unwrap();
+        let reader = UnsortedShardReader::<u8>::open_set(&shard_files);
+        assert_eq!(reader.count(), 0);
+
+        // Test that skipping an empty set works correctly.
+        let mut reader = UnsortedShardReader::<u8>::open_set(&shard_files);
+        let skipped = reader.skip_lazy(10).unwrap();
+        assert_eq!(0, skipped);
         assert_eq!(reader.count(), 0);
     }
 }
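
The tests above exercise `skip_lazy` across chunk and file boundaries, including skipping an entire file and skipping past the end of an empty set. One way this could be used, sketched below under the assumption of a `u64` item type and an illustrative worker-partitioning scheme that is not part of this commit, is to split an unsorted pass over a file set across several workers:

use anyhow::Error;
use shardio::UnsortedShardReader;

/// Hypothetical helper: process the `worker_id`-th of `n_workers` roughly equal
/// slices of the file set in on-disk order, using skip_lazy to jump to the slice.
fn process_slice(file_set: &[&str], worker_id: usize, n_workers: usize) -> Result<usize, Error> {
    let total = UnsortedShardReader::<u64>::len(file_set)?;
    let per_worker = (total + n_workers - 1) / n_workers;
    let start = worker_id * per_worker;

    let mut reader = UnsortedShardReader::<u64>::open_set(file_set);
    // skip_lazy returns how many items were actually skipped, capped at the
    // number available (as the empty-set test above demonstrates).
    reader.skip_lazy(start)?;

    let mut processed = 0;
    for item in reader.take(per_worker) {
        let _value: u64 = item?;
        processed += 1;
    }
    Ok(processed)
}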
