Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 16 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion kiru-core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# kiru-core/Cargo.toml
[package]
name = "kiru"
version = "0.1.10"
version = "0.1.11"
edition = "2021"
description = "Fast text chunking for Rust"
license = "MIT"
Expand Down
52 changes: 11 additions & 41 deletions kiru-core/benches/par_chunking.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use kiru::{ChunkerBuilder, ChunkerEnum, Source};
use kiru::{ChunkerBuilder, Source};
use std::fs;
use std::hint::black_box;
use std::time::Duration;
Expand Down Expand Up @@ -33,10 +33,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) {
group.bench_function("serial_single", |b| {
let source = Source::File(LARGE_FILE_PATH.to_string());
b.iter(|| {
let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker.on_source(source.clone()).unwrap();
let chunks: Vec<_> = iter.collect();
black_box(chunks);
Expand All @@ -54,10 +51,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker.on_sources(sources.clone()).unwrap();
let chunks: Vec<_> = iter.collect();
black_box(chunks);
Expand All @@ -71,10 +65,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap();
let chunks: Vec<_> = chunker.on_sources_par(sources.clone()).unwrap();
black_box(chunks);
});
Expand All @@ -87,10 +78,7 @@ fn benchmark_bytes_chunking(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker
.on_sources_par_stream(sources.clone(), CHANNEL_SIZE)
.unwrap();
Expand Down Expand Up @@ -122,10 +110,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) {
group.bench_function("serial_single", |b| {
let source = Source::File(LARGE_FILE_PATH.to_string());
b.iter(|| {
let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_characters(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker.on_source(source.clone()).unwrap();
let chunks: Vec<_> = iter.collect();
black_box(chunks);
Expand All @@ -143,10 +128,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_characters(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker.on_sources(sources.clone()).unwrap();
let chunks: Vec<_> = iter.collect();
black_box(chunks);
Expand All @@ -160,10 +142,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_characters(CHUNK_SIZE, OVERLAP).unwrap();
let chunks: Vec<_> = chunker.on_sources_par(sources.clone()).unwrap();
black_box(chunks);
});
Expand All @@ -176,10 +155,7 @@ fn benchmark_characters_chunking(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_characters(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker
.on_sources_par_stream(sources.clone(), CHANNEL_SIZE)
.unwrap();
Expand Down Expand Up @@ -219,10 +195,7 @@ fn benchmark_channel_size(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_bytes(ChunkerEnum::Bytes {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_bytes(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker
.on_sources_par_stream(sources.clone(), channel_size)
.unwrap();
Expand All @@ -238,10 +211,7 @@ fn benchmark_channel_size(c: &mut Criterion) {
&sources,
|b, sources| {
b.iter(|| {
let chunker = ChunkerBuilder::by_characters(ChunkerEnum::Characters {
chunk_size: CHUNK_SIZE,
overlap: OVERLAP,
});
let chunker = ChunkerBuilder::by_characters(CHUNK_SIZE, OVERLAP).unwrap();
let iter = chunker
.on_sources_par_stream(sources.clone(), channel_size)
.unwrap();
Expand Down
50 changes: 28 additions & 22 deletions kiru-core/src/bin/benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// kiru-core/src/bin/benchmark.rs

use kiru::{ChunkerBuilder, ChunkerEnum, Source};
use kiru::{ChunkerBuilder, Source};
use serde::Serialize;
use std::env;
use std::time::Instant;
Expand Down Expand Up @@ -77,21 +77,6 @@ fn run_benchmark(
chunk_size: usize,
overlap: usize,
) -> Result<BenchmarkResult, Box<dyn std::error::Error>> {
// Create the chunker using ChunkerBuilder
let chunker = match strategy {
"bytes" => ChunkerBuilder::by_bytes(ChunkerEnum::Bytes {
chunk_size,
overlap,
}),
"chars" => ChunkerBuilder::by_characters(ChunkerEnum::Characters {
chunk_size,
overlap,
}),
_ => {
return Err(format!("Invalid strategy '{}'. Use 'bytes' or 'chars'", strategy).into());
}
};

// Parse the source based on source_type
let source = match source_type {
"file" => Source::File(path.to_string()),
Expand All @@ -106,21 +91,42 @@ fn run_benchmark(
}
};

// Run the benchmark
// Create the chunker using ChunkerBuilder
match strategy {
"bytes" => {
let chunker = ChunkerBuilder::by_bytes(chunk_size, overlap)?;
bench_with(chunker, source)
}
"chars" => {
let chunker = ChunkerBuilder::by_characters(chunk_size, overlap)?;
bench_with(chunker, source)
}
_ => {
Err(format!("Invalid strategy '{}'. Use 'bytes' or 'chars'", strategy).into())
}
}
}

// Generic benchmarking body specialized for the concrete chunker type.
fn bench_with<C>(
chunker: kiru::ChunkerWithStrategy<C>,
source: Source,
) -> Result<BenchmarkResult, Box<dyn std::error::Error>>
where
C: kiru::Chunker,
{
let start = Instant::now();
let mut num_chunks = 0;
let mut total_bytes = 0;
let mut num_chunks = 0usize;
let mut total_bytes = 0usize;

let iterator = chunker.on_source(source)?;

for chunk in iterator {
num_chunks += 1;
total_bytes += chunk.len();
std::hint::black_box(chunk.len());
}

let elapsed = start.elapsed();
let elapsed_secs = elapsed.as_secs_f64();
let elapsed_secs = start.elapsed().as_secs_f64();
let throughput_mb_s = (total_bytes as f64) / (1024.0 * 1024.0) / elapsed_secs;

Ok(BenchmarkResult {
Expand Down
1 change: 1 addition & 0 deletions kiru-core/src/bytes_chunker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ struct BytesChunkIndices {
new_position: usize,
}

#[derive(Clone)]
pub struct BytesChunker {
chunk_size: usize,
overlap: usize,
Expand Down
1 change: 1 addition & 0 deletions kiru-core/src/characters_chunker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ struct CharactersChunkIndices {
new_char_position: usize,
}

#[derive(Clone)]
pub struct CharactersChunker {
chunk_size: usize,
overlap: usize,
Expand Down
Loading
Loading