quickwit-oss · PSeitz · Oct 23, 2024 · Oct 23, 2024 · Oct 23, 2024
diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock
diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml
@@ -86,6 +86,7 @@ async-compression = { version = "0.4", features = ["tokio", "gzip"] }
 async-speed-limit = "0.4"
 async-trait = "0.1"
 base64 = "0.22"
+binggan = { version = "0.14" }
 bytes = { version = "1", features = ["serde"] }
 bytesize = { version = "1.3.0", features = ["serde"] }
 bytestring = "1.3.0"

diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml
@@ -36,7 +36,7 @@ quickwit-proto = { workspace = true }
 quickwit-query = { workspace = true }
 
 [dev-dependencies]
-criterion = { workspace = true }
+binggan = { workspace = true }
 matches = { workspace = true }
 serde_yaml = { workspace = true }
 time = { workspace = true }

diff --git a/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs b/quickwit/quickwit-doc-mapper/benches/doc_to_json_bench.rs
@@ -17,13 +17,15 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program. If not, see <http://www.gnu.org/licenses/>.
 
-use criterion::{criterion_group, criterion_main, Criterion, Throughput};
+use binggan::plugins::*;
+use binggan::{black_box, BenchRunner, PeakMemAlloc, INSTRUMENTED_SYSTEM};
 use quickwit_doc_mapper::DocMapper;
 use tantivy::TantivyDocument;
 
-const JSON_TEST_DATA: &str = include_str!("data/simple-parse-bench.json");
+const SIMPLE_JSON_TEST_DATA: &str = include_str!("data/simple-parse-bench.json");
+const ROUTING_TEST_DATA: &str = include_str!("data/simple-routing-expression-bench.json");
 
-const DOC_MAPPER_CONF: &str = r#"{
+const DOC_MAPPER_CONF_SIMPLE_JSON: &str = r#"{
     "type": "default",
     "default_search_fields": [],
     "tag_fields": [],
@@ -35,28 +37,92 @@ const DOC_MAPPER_CONF: &str = r#"{
     ]
 }"#;
 
-pub fn simple_json_to_doc_benchmark(c: &mut Criterion) {
-    let doc_mapper: Box<DocMapper> = serde_json::from_str(DOC_MAPPER_CONF).unwrap();
-    let lines: Vec<&str> = JSON_TEST_DATA.lines().map(|line| line.trim()).collect();
+/// Routing bench doc mapper. {"name": "date", "type": "datetime", "input_formats": ["%Y-%m-%d"],
+/// "output_format": "%Y-%m-%d"} is omitted because tantivy parsing only supports RFC 3339.
+const ROUTING_DOC_MAPPER_CONF: &str = r#"{
+    "type": "default",
+    "default_search_fields": [],
+    "tag_fields": [],
+    "field_mappings": [
+        {"name": "timestamp", "type": "datetime", "input_formats": ["unix_timestamp"], "output_format": "%Y-%m-%d %H:%M:%S", "fast": true },
+        {"name": "source", "type": "text" },
+        {"name": "vin", "type": "text" },
+        {"name": "vid", "type": "text" },
+        {"name": "domain", "type": "text" },
+        {"name": "seller", "type": "object", "field_mappings": [
+            {"name": "id", "type": "text" },
+            {"name": "name", "type": "text" },
+            {"name": "address", "type": "text" },
+            {"name": "zip", "type": "text" }
+        ]}
+    ],
+    "partition_key": "seller.id"
+}"#;
+
+#[global_allocator] // binggan's instrumented allocator; passed to PeakMemAllocPlugin in run_bench
+pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
+
+/// Bundles one benchmark input as `(name, raw.len(), trimmed lines of raw, parsed doc mapper)`.
+///
+/// Panics if `doc_mapper` does not deserialize into a `DocMapper`.
+fn get_test_data(
+    name: &'static str,
+    raw: &'static str,
+    doc_mapper: &'static str,
+) -> (&'static str, usize, Vec<&'static str>, Box<DocMapper>) {
+    let input_size = raw.len();
+    let trimmed_lines: Vec<&'static str> = raw.lines().map(str::trim).collect();
+    let mapper: Box<DocMapper> = serde_json::from_str(doc_mapper).unwrap();
+    (name, input_size, trimmed_lines, mapper)
+}
 
-    let mut group = c.benchmark_group("simple-json-to-doc");
-    group.throughput(Throughput::Bytes(JSON_TEST_DATA.len() as u64));
-    group.bench_function("simple-json-to-doc", |b| {
-        b.iter(|| {
-            for line in &lines {
-                doc_mapper.doc_from_json_str(line).unwrap();
+/// Benchmarks JSON-to-document conversion on every bundled data set.
+/// Each group compares the configured doc mapper, a dynamic-mode doc mapper and tantivy's
+/// `TantivyDocument::parse_json`.
+fn run_bench() {
+    let inputs: Vec<(&str, usize, Vec<&str>, Box<DocMapper>)> = vec![
+        get_test_data("flat_json", SIMPLE_JSON_TEST_DATA, DOC_MAPPER_CONF_SIMPLE_JSON),
+        get_test_data("routing_json", ROUTING_TEST_DATA, ROUTING_DOC_MAPPER_CONF),
+    ];
+    // Dynamic-mode doc mapper; built once because it does not depend on the benchmark input.
+    let dynamic_doc_mapper: DocMapper = serde_json::from_str(r#"{ "mode": "dynamic" }"#).unwrap();
+
+    let mut runner: BenchRunner = BenchRunner::new();
+
+    runner.config().set_num_iter_for_bench(1);
+    runner.config().set_num_iter_for_group(100);
+    runner
+        .add_plugin(CacheTrasher::default())
+        .add_plugin(BPUTrasher::default())
+        .add_plugin(PeakMemAllocPlugin::new(GLOBAL));
+
+    for (input_name, size, data, doc_mapper) in inputs.iter() {
+        let mut group = runner.new_group();
+        group.set_name(input_name);
+        // `size` is the raw byte length of the whole data set.
+        group.set_input_size(*size);
+        group.register_with_input("doc_mapper", data, |lines| {
+            for line in lines {
+                black_box(doc_mapper.doc_from_json_str(line).unwrap());
             }
-        })
-    });
-    group.bench_function("simple-json-to-doc-tantivy", |b| {
-        b.iter(|| {
+        });
+
+        group.register_with_input("doc_mapper_dynamic", data, |lines| {
+            for line in lines {
+                black_box(dynamic_doc_mapper.doc_from_json_str(line).unwrap());
+            }
+        });
+
+        group.register_with_input("tantivy parse json", data, |lines| {
             let schema = doc_mapper.schema();
-            for line in &lines {
-                let _doc = TantivyDocument::parse_json(&schema, line).unwrap();
+            for line in lines {
+                let _doc = black_box(TantivyDocument::parse_json(&schema, line).unwrap());
             }
-        })
-    });
+        });
+        group.run();
+    }
 }
 
-criterion_group!(benches, simple_json_to_doc_benchmark);
-criterion_main!(benches);
+fn main() {
+    run_bench(); // all benchmark groups are registered and executed inside `run_bench`
+}