From fd89a54a16bdc9a08eeb33ae650c579b313d7bec Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Fri, 12 Jul 2024 18:58:56 +0200 Subject: [PATCH 1/2] add doc mapping uid to splits --- .../quickwit-indexing/src/actors/indexer.rs | 5 +- .../src/actors/merge_executor.rs | 20 +- .../quickwit-indexing/src/actors/packager.rs | 3 +- .../quickwit-indexing/src/actors/uploader.rs | 7 +- .../quickwit-indexing/src/merge_policy/mod.rs | 2 +- .../src/models/indexed_split.rs | 4 +- .../src/models/split_attrs.rs | 6 +- .../quickwit-metastore/src/split_metadata.rs | 8 +- .../src/split_metadata_version.rs | 9 +- .../file-backed-index/v0.7.expected.json | 221 +++++++++--------- .../file-backed-index/v0.8.expected.json | 221 +++++++++--------- .../file-backed-index/v0.9.expected.json | 221 +++++++++--------- .../test-data/file-backed-index/v0.9.json | 221 +++++++++--------- .../split-metadata/v0.7.expected.json | 39 ++-- .../split-metadata/v0.8.expected.json | 39 ++-- .../split-metadata/v0.9.expected.json | 39 ++-- .../test-data/split-metadata/v0.9.json | 39 ++-- 17 files changed, 576 insertions(+), 528 deletions(-) diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index 44e7e74486e..16fc397ecd4 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -42,7 +42,7 @@ use quickwit_proto::indexing::{IndexingPipelineId, PipelineMetrics}; use quickwit_proto::metastore::{ LastDeleteOpstampRequest, MetastoreService, MetastoreServiceClient, }; -use quickwit_proto::types::PublishToken; +use quickwit_proto::types::{DocMappingUid, PublishToken}; use quickwit_query::get_quickwit_fastfield_normalizer_manager; use serde::Serialize; use tantivy::schema::Schema; @@ -98,6 +98,7 @@ struct IndexerState { publish_lock: PublishLock, publish_token_opt: Option, schema: Schema, + doc_mapping_uid: DocMappingUid, tokenizer_manager: TokenizerManager, max_num_partitions: NonZeroU32, index_settings: IndexSettings, @@ -130,6 +131,7 @@ impl IndexerState { self.pipeline_id.clone(), partition_id, last_delete_opstamp, + self.doc_mapping_uid, self.indexing_directory.clone(), index_builder, io_controls, @@ -572,6 +574,7 @@ impl Indexer { publish_lock: PublishLock::default(), publish_token_opt: None, schema, + doc_mapping_uid: doc_mapper.doc_mapping_uid(), tokenizer_manager: tokenizer_manager.tantivy_manager().clone(), index_settings, max_num_partitions: doc_mapper.max_num_partitions(), diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 3324bd90754..fd58195aff4 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -236,7 +236,7 @@ pub fn merge_split_attrs( pipeline_id: MergePipelineId, merge_split_id: SplitId, splits: &[SplitMetadata], -) -> SplitAttrs { +) -> anyhow::Result { let partition_id = combine_partition_ids_aux(splits.iter().map(|split| split.partition_id)); let time_range: Option> = merge_time_range(splits); let uncompressed_docs_size_in_bytes = sum_doc_sizes_in_bytes(splits); @@ -250,10 +250,21 @@ pub fn merge_split_attrs( .map(|split| split.delete_opstamp) .min() .unwrap_or(0); - SplitAttrs { + let doc_mapping_uid = splits + .first() + .ok_or_else(|| anyhow::anyhow!("attempted to merge zero splits"))? + .doc_mapping_uid; + if splits + .iter() + .any(|split| split.doc_mapping_uid != doc_mapping_uid) + { + anyhow::bail!("attempted to merge splits with different doc mapping uid"); + } + Ok(SplitAttrs { node_id: pipeline_id.node_id.clone(), index_uid: pipeline_id.index_uid.clone(), source_id: pipeline_id.source_id.clone(), + doc_mapping_uid, split_id: merge_split_id, partition_id, replaced_split_ids, @@ -262,7 +273,7 @@ pub fn merge_split_attrs( uncompressed_docs_size_in_bytes, delete_opstamp, num_merge_ops: max_merge_ops(splits) + 1, - } + }) } fn max_merge_ops(splits: &[SplitMetadata]) -> usize { @@ -324,7 +335,7 @@ impl MergeExecutor { )?; ctx.record_progress(); - let split_attrs = merge_split_attrs(self.pipeline_id.clone(), merge_split_id, &splits); + let split_attrs = merge_split_attrs(self.pipeline_id.clone(), merge_split_id, &splits)?; Ok(IndexedSplit { split_attrs, index: merged_index, @@ -436,6 +447,7 @@ impl MergeExecutor { node_id: NodeId::new(split.node_id), index_uid: split.index_uid, source_id: split.source_id, + doc_mapping_uid: split.doc_mapping_uid, split_id: merge_split_id, partition_id: split.partition_id, replaced_split_ids: vec![split.split_id.clone()], diff --git a/quickwit/quickwit-indexing/src/actors/packager.rs b/quickwit/quickwit-indexing/src/actors/packager.rs index 06577544a10..ac026e410cd 100644 --- a/quickwit/quickwit-indexing/src/actors/packager.rs +++ b/quickwit/quickwit-indexing/src/actors/packager.rs @@ -388,7 +388,7 @@ mod tests { use quickwit_actors::{ObservationType, Universe}; use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_proto::search::{deserialize_split_fields, ListFieldsEntryResponse}; - use quickwit_proto::types::{IndexUid, NodeId}; + use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId}; use tantivy::directory::MmapDirectory; use tantivy::schema::{NumericOptions, Schema, Type, FAST, STRING, TEXT}; use tantivy::{doc, DateTime, IndexBuilder, IndexSettings}; @@ -519,6 +519,7 @@ mod tests { node_id, index_uid, source_id, + doc_mapping_uid: DocMappingUid::default(), split_id: "test-split".to_string(), partition_id: 17u64, num_docs, diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs index f12128bf32e..7933ad8d201 100644 --- a/quickwit/quickwit-indexing/src/actors/uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/uploader.rs @@ -492,7 +492,7 @@ mod tests { use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService}; - use quickwit_proto::types::NodeId; + use quickwit_proto::types::{DocMappingUid, NodeId}; use quickwit_storage::RamStorage; use tantivy::DateTime; use tokio::sync::oneshot; @@ -552,6 +552,7 @@ mod tests { node_id, index_uid, source_id, + doc_mapping_uid: DocMappingUid::default(), partition_id: 3u64, time_range: Some( DateTime::from_timestamp_secs(1_628_203_589) @@ -661,6 +662,7 @@ mod tests { node_id: node_id.clone(), index_uid: index_uid.clone(), source_id: source_id.clone(), + doc_mapping_uid: DocMappingUid::default(), split_id: "test-split-1".to_string(), partition_id: 3u64, num_docs: 10, @@ -687,6 +689,7 @@ mod tests { node_id, index_uid, source_id, + doc_mapping_uid: DocMappingUid::default(), split_id: "test-split-2".to_string(), partition_id: 3u64, num_docs: 10, @@ -811,6 +814,7 @@ mod tests { node_id, index_uid, source_id, + doc_mapping_uid: DocMappingUid::default(), split_id: "test-split".to_string(), partition_id: 3u64, time_range: None, @@ -987,6 +991,7 @@ mod tests { node_id, index_uid, source_id, + doc_mapping_uid: DocMappingUid::default(), partition_id: 3u64, time_range: Some( DateTime::from_timestamp_secs(1_628_203_589) diff --git a/quickwit/quickwit-indexing/src/merge_policy/mod.rs b/quickwit/quickwit-indexing/src/merge_policy/mod.rs index 1c82903cba9..69fccbbede0 100644 --- a/quickwit/quickwit-indexing/src/merge_policy/mod.rs +++ b/quickwit/quickwit-indexing/src/merge_policy/mod.rs @@ -379,7 +379,7 @@ pub mod tests { index_uid: IndexUid::new_with_random_ulid("test_index"), source_id: "test_source".to_string(), }; - let split_attrs = merge_split_attrs(pipeline_id, merged_split_id, splits); + let split_attrs = merge_split_attrs(pipeline_id, merged_split_id, splits).unwrap(); create_split_metadata(merge_policy, &split_attrs, tags, 0..0) } diff --git a/quickwit/quickwit-indexing/src/models/indexed_split.rs b/quickwit/quickwit-indexing/src/models/indexed_split.rs index 5cc5a25e76c..b5b37fe8070 100644 --- a/quickwit/quickwit-indexing/src/models/indexed_split.rs +++ b/quickwit/quickwit-indexing/src/models/indexed_split.rs @@ -25,7 +25,7 @@ use quickwit_common::metrics::GaugeGuard; use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_proto::indexing::IndexingPipelineId; -use quickwit_proto::types::{IndexUid, PublishToken}; +use quickwit_proto::types::{DocMappingUid, IndexUid, PublishToken}; use tantivy::directory::MmapDirectory; use tantivy::IndexBuilder; use tracing::{instrument, Span}; @@ -82,6 +82,7 @@ impl IndexedSplitBuilder { pipeline_id: IndexingPipelineId, partition_id: u64, last_delete_opstamp: u64, + doc_mapping_uid: DocMappingUid, scratch_directory: TempDirectory, index_builder: IndexBuilder, io_controls: IoControls, @@ -105,6 +106,7 @@ impl IndexedSplitBuilder { node_id: pipeline_id.node_id, index_uid: pipeline_id.index_uid, source_id: pipeline_id.source_id, + doc_mapping_uid, partition_id, split_id, num_docs: 0, diff --git a/quickwit/quickwit-indexing/src/models/split_attrs.rs b/quickwit/quickwit-indexing/src/models/split_attrs.rs index 010dba8153d..5ac0de40ff3 100644 --- a/quickwit/quickwit-indexing/src/models/split_attrs.rs +++ b/quickwit/quickwit-indexing/src/models/split_attrs.rs @@ -23,7 +23,7 @@ use std::ops::{Range, RangeInclusive}; use std::sync::Arc; use quickwit_metastore::SplitMetadata; -use quickwit_proto::types::{IndexUid, NodeId, SourceId, SplitId}; +use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId, SourceId, SplitId}; use tantivy::DateTime; use time::OffsetDateTime; @@ -37,6 +37,9 @@ pub struct SplitAttrs { /// Source ID to which the split belongs. pub source_id: SourceId, + /// Doc mapping UID used to produce this split. + pub doc_mapping_uid: DocMappingUid, + /// Split ID. Joined with the index URI (/), this ID /// should be enough to uniquely identify a split. /// In reality, some information may be implicitly configured @@ -100,6 +103,7 @@ pub fn create_split_metadata( node_id: split_attrs.node_id.to_string(), index_uid: split_attrs.index_uid.clone(), source_id: split_attrs.source_id.clone(), + doc_mapping_uid: split_attrs.doc_mapping_uid, split_id: split_attrs.split_id.clone(), partition_id: split_attrs.partition_id, num_docs: split_attrs.num_docs as usize, diff --git a/quickwit/quickwit-metastore/src/split_metadata.rs b/quickwit/quickwit-metastore/src/split_metadata.rs index c40ff2256b6..53608f307d8 100644 --- a/quickwit/quickwit-metastore/src/split_metadata.rs +++ b/quickwit/quickwit-metastore/src/split_metadata.rs @@ -25,7 +25,7 @@ use std::str::FromStr; use std::time::Duration; use bytesize::ByteSize; -use quickwit_proto::types::{IndexUid, SourceId, SplitId}; +use quickwit_proto::types::{DocMappingUid, IndexUid, SourceId, SplitId}; use serde::{Deserialize, Serialize}; use serde_with::{serde_as, DurationMilliSeconds}; use time::OffsetDateTime; @@ -133,6 +133,10 @@ pub struct SplitMetadata { /// Number of merge operations that was involved to create /// this split. pub num_merge_ops: usize, + + /// Doc mapping UID used when creating this split. This split may only be merged with other + /// splits using the same doc mapping UID. + pub doc_mapping_uid: DocMappingUid, } impl fmt::Debug for SplitMetadata { @@ -281,6 +285,7 @@ impl quickwit_config::TestableForRegression for SplitMetadata { tags: ["234".to_string(), "aaa".to_string()].into_iter().collect(), footer_offsets: 1000..2000, num_merge_ops: 3, + doc_mapping_uid: DocMappingUid::default(), } } @@ -420,6 +425,7 @@ mod tests { footer_offsets: 0..1024, delete_opstamp: 0, num_merge_ops: 0, + doc_mapping_uid: DocMappingUid::default(), }; let expected_output = "SplitMetadata { split_id: \"split-1\", index_uid: IndexUid { \ diff --git a/quickwit/quickwit-metastore/src/split_metadata_version.rs b/quickwit/quickwit-metastore/src/split_metadata_version.rs index ad777636251..ac65be327e6 100644 --- a/quickwit/quickwit-metastore/src/split_metadata_version.rs +++ b/quickwit/quickwit-metastore/src/split_metadata_version.rs @@ -20,7 +20,7 @@ use std::collections::BTreeSet; use std::ops::{Range, RangeInclusive}; -use quickwit_proto::types::{IndexUid, SplitId}; +use quickwit_proto::types::{DocMappingUid, IndexUid, SplitId}; use serde::{Deserialize, Serialize}; use crate::split_metadata::{utc_now_timestamp, SplitMaturity}; @@ -92,6 +92,11 @@ pub(crate) struct SplitMetadataV0_8 { #[serde(default)] num_merge_ops: usize, + + // we default fill with zero: we don't know the right uid, and it's correct to assume all + // splits before when updates first appeared are compatible with each other. + #[serde(default)] + doc_mapping_uid: DocMappingUid, } impl From for SplitMetadata { @@ -127,6 +132,7 @@ impl From for SplitMetadata { tags: v8.tags, footer_offsets: v8.footer_offsets, num_merge_ops: v8.num_merge_ops, + doc_mapping_uid: v8.doc_mapping_uid, } } } @@ -148,6 +154,7 @@ impl From for SplitMetadataV0_8 { tags: split.tags, footer_offsets: split.footer_offsets, num_merge_ops: split.num_merge_ops, + doc_mapping_uid: split.doc_mapping_uid, } } } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json index 7ea56fa7af8..6a45c2858ea 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json @@ -1,195 +1,196 @@ { - "delete_tasks": [ - { - "create_timestamp": 0, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - }, - "opstamp": 10 - } - ], + "version": "0.9", "index": { - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } - }, - "create_timestamp": 1789, + "version": "0.9", + "index_uid": "my-index:00000000000000000000000000", "index_config": { + "version": "0.9", + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", + "mode": "dynamic", "dynamic_mapping": { - "expand_dots": true, - "fast": { - "normalizer": "raw" - }, "indexed": true, + "tokenizer": "raw", "record": "basic", "stored": true, - "tokenizer": "raw" + "expand_dots": true, + "fast": { + "normalizer": "raw" + } }, "field_mappings": [ { - "coerce": true, - "fast": true, - "indexed": true, "name": "tenant_id", - "output_format": "number", + "type": "u64", "stored": true, - "type": "u64" + "indexed": true, + "fast": true, + "coerce": true, + "output_format": "number" }, { - "fast": true, - "fast_precision": "seconds", - "indexed": true, + "name": "timestamp", + "type": "datetime", "input_formats": [ "rfc3339", "unix_timestamp" ], - "name": "timestamp", "output_format": "rfc3339", + "fast_precision": "seconds", + "indexed": true, "stored": true, - "type": "datetime" + "fast": true }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "log_level", + "type": "text", + "indexed": true, + "tokenizer": "raw", "record": "basic", + "fieldnorms": false, "stored": true, - "tokenizer": "raw", - "type": "text" + "fast": false }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "message", + "type": "text", + "indexed": true, + "tokenizer": "default", "record": "position", + "fieldnorms": false, "stored": true, - "tokenizer": "default", - "type": "text" + "fast": false } ], - "index_field_presence": true, - "max_num_partitions": 100, - "mode": "dynamic", - "partition_key": "tenant_id", - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tag_fields": [ "log_level", "tenant_id" ], - "timestamp_field": "timestamp", + "partition_key": "tenant_id", + "max_num_partitions": 100, + "index_field_presence": true, + "store_document_size": false, + "store_source": true, "tokenizers": [ { - "filters": [], "name": "custom_tokenizer", + "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "type": "regex" + "filters": [] } ] }, - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_blocksize": 1000000, "docstore_compression_level": 8, + "docstore_blocksize": 1000000, + "split_num_docs_target": 10000001, "merge_policy": { - "maturation_period": "2days", - "max_merge_factor": 11, - "merge_factor": 9, + "type": "stable_log", "min_level_num_docs": 100000, - "type": "stable_log" + "merge_factor": 9, + "max_merge_factor": 11, + "maturation_period": "2days" }, "resources": { "heap_size": "50.0 MB" - }, - "split_num_docs_target": 10000001 - }, - "retention": { - "period": "90 days", - "schedule": "daily" + } }, "search_settings": { "default_search_fields": [ "message" ] }, - "version": "0.9" + "retention": { + "period": "90 days", + "schedule": "daily" + } }, - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "sources": [ { - "enabled": true, - "input_format": "json", + "version": "0.9", + "source_id": "kafka-source", "num_pipelines": 2, + "enabled": true, + "source_type": "kafka", "params": { - "client_params": {}, - "topic": "kafka-topic" + "topic": "kafka-topic", + "client_params": {} }, - "source_id": "kafka-source", - "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "version": "0.9" - } - ], - "version": "0.9" - }, - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "shard_id": "00000000000000000001", - "source_id": "_ingest-source", - "shard_state": 1, - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "doc_mapping_uid": "00000000000000000000000000", - "publish_position_inclusive": "" + "input_format": "json" } ] }, "splits": [ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "split_state": "Published", + "update_timestamp": 1789, + "publish_timestamp": 1789, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000000", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, - "publish_timestamp": 1789, "source_id": "source", - "split_id": "split", - "split_state": "Published", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "update_timestamp": 1789, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } ], - "version": "0.9" + "shards": { + "_ingest-source": [ + { + "index_uid": "my-index:00000000000000000000000000", + "source_id": "_ingest-source", + "shard_id": "00000000000000000001", + "leader_id": "leader-ingester", + "follower_id": "follower-ingester", + "shard_state": 1, + "publish_position_inclusive": "", + "doc_mapping_uid": "00000000000000000000000000" + } + ] + }, + "delete_tasks": [ + { + "create_timestamp": 0, + "opstamp": 10, + "delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + } + } + ] } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json index 7ea56fa7af8..6a45c2858ea 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json @@ -1,195 +1,196 @@ { - "delete_tasks": [ - { - "create_timestamp": 0, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - }, - "opstamp": 10 - } - ], + "version": "0.9", "index": { - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } - }, - "create_timestamp": 1789, + "version": "0.9", + "index_uid": "my-index:00000000000000000000000000", "index_config": { + "version": "0.9", + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", + "mode": "dynamic", "dynamic_mapping": { - "expand_dots": true, - "fast": { - "normalizer": "raw" - }, "indexed": true, + "tokenizer": "raw", "record": "basic", "stored": true, - "tokenizer": "raw" + "expand_dots": true, + "fast": { + "normalizer": "raw" + } }, "field_mappings": [ { - "coerce": true, - "fast": true, - "indexed": true, "name": "tenant_id", - "output_format": "number", + "type": "u64", "stored": true, - "type": "u64" + "indexed": true, + "fast": true, + "coerce": true, + "output_format": "number" }, { - "fast": true, - "fast_precision": "seconds", - "indexed": true, + "name": "timestamp", + "type": "datetime", "input_formats": [ "rfc3339", "unix_timestamp" ], - "name": "timestamp", "output_format": "rfc3339", + "fast_precision": "seconds", + "indexed": true, "stored": true, - "type": "datetime" + "fast": true }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "log_level", + "type": "text", + "indexed": true, + "tokenizer": "raw", "record": "basic", + "fieldnorms": false, "stored": true, - "tokenizer": "raw", - "type": "text" + "fast": false }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "message", + "type": "text", + "indexed": true, + "tokenizer": "default", "record": "position", + "fieldnorms": false, "stored": true, - "tokenizer": "default", - "type": "text" + "fast": false } ], - "index_field_presence": true, - "max_num_partitions": 100, - "mode": "dynamic", - "partition_key": "tenant_id", - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tag_fields": [ "log_level", "tenant_id" ], - "timestamp_field": "timestamp", + "partition_key": "tenant_id", + "max_num_partitions": 100, + "index_field_presence": true, + "store_document_size": false, + "store_source": true, "tokenizers": [ { - "filters": [], "name": "custom_tokenizer", + "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "type": "regex" + "filters": [] } ] }, - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_blocksize": 1000000, "docstore_compression_level": 8, + "docstore_blocksize": 1000000, + "split_num_docs_target": 10000001, "merge_policy": { - "maturation_period": "2days", - "max_merge_factor": 11, - "merge_factor": 9, + "type": "stable_log", "min_level_num_docs": 100000, - "type": "stable_log" + "merge_factor": 9, + "max_merge_factor": 11, + "maturation_period": "2days" }, "resources": { "heap_size": "50.0 MB" - }, - "split_num_docs_target": 10000001 - }, - "retention": { - "period": "90 days", - "schedule": "daily" + } }, "search_settings": { "default_search_fields": [ "message" ] }, - "version": "0.9" + "retention": { + "period": "90 days", + "schedule": "daily" + } }, - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "sources": [ { - "enabled": true, - "input_format": "json", + "version": "0.9", + "source_id": "kafka-source", "num_pipelines": 2, + "enabled": true, + "source_type": "kafka", "params": { - "client_params": {}, - "topic": "kafka-topic" + "topic": "kafka-topic", + "client_params": {} }, - "source_id": "kafka-source", - "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "version": "0.9" - } - ], - "version": "0.9" - }, - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "shard_id": "00000000000000000001", - "source_id": "_ingest-source", - "shard_state": 1, - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "doc_mapping_uid": "00000000000000000000000000", - "publish_position_inclusive": "" + "input_format": "json" } ] }, "splits": [ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "split_state": "Published", + "update_timestamp": 1789, + "publish_timestamp": 1789, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000000", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, - "publish_timestamp": 1789, "source_id": "source", - "split_id": "split", - "split_state": "Published", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "update_timestamp": 1789, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } ], - "version": "0.9" + "shards": { + "_ingest-source": [ + { + "index_uid": "my-index:00000000000000000000000000", + "source_id": "_ingest-source", + "shard_id": "00000000000000000001", + "leader_id": "leader-ingester", + "follower_id": "follower-ingester", + "shard_state": 1, + "publish_position_inclusive": "", + "doc_mapping_uid": "00000000000000000000000000" + } + ] + }, + "delete_tasks": [ + { + "create_timestamp": 0, + "opstamp": 10, + "delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + } + } + ] } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json index fbd61156d9f..8694ae89469 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json @@ -1,195 +1,196 @@ { - "delete_tasks": [ - { - "create_timestamp": 0, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - }, - "opstamp": 10 - } - ], + "version": "0.9", "index": { - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } - }, - "create_timestamp": 1789, + "version": "0.9", + "index_uid": "my-index:00000000000000000000000001", "index_config": { + "version": "0.9", + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", + "mode": "dynamic", "dynamic_mapping": { - "expand_dots": true, - "fast": { - "normalizer": "raw" - }, "indexed": true, + "tokenizer": "raw", "record": "basic", "stored": true, - "tokenizer": "raw" + "expand_dots": true, + "fast": { + "normalizer": "raw" + } }, "field_mappings": [ { - "coerce": true, - "fast": true, - "indexed": true, "name": "tenant_id", - "output_format": "number", + "type": "u64", "stored": true, - "type": "u64" + "indexed": true, + "fast": true, + "coerce": true, + "output_format": "number" }, { - "fast": true, - "fast_precision": "seconds", - "indexed": true, + "name": "timestamp", + "type": "datetime", "input_formats": [ "rfc3339", "unix_timestamp" ], - "name": "timestamp", "output_format": "rfc3339", + "fast_precision": "seconds", + "indexed": true, "stored": true, - "type": "datetime" + "fast": true }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "log_level", + "type": "text", + "indexed": true, + "tokenizer": "raw", "record": "basic", + "fieldnorms": false, "stored": true, - "tokenizer": "raw", - "type": "text" + "fast": false }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "message", + "type": "text", + "indexed": true, + "tokenizer": "default", "record": "position", + "fieldnorms": false, "stored": true, - "tokenizer": "default", - "type": "text" + "fast": false } ], - "index_field_presence": true, - "max_num_partitions": 100, - "mode": "dynamic", - "partition_key": "tenant_id", - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tag_fields": [ "log_level", "tenant_id" ], - "timestamp_field": "timestamp", + "partition_key": "tenant_id", + "max_num_partitions": 100, + "index_field_presence": true, + "store_document_size": false, + "store_source": true, "tokenizers": [ { - "filters": [], "name": "custom_tokenizer", + "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "type": "regex" + "filters": [] } ] }, - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_blocksize": 1000000, "docstore_compression_level": 8, + "docstore_blocksize": 1000000, + "split_num_docs_target": 10000001, "merge_policy": { - "maturation_period": "2days", - "max_merge_factor": 11, - "merge_factor": 9, + "type": "stable_log", "min_level_num_docs": 100000, - "type": "stable_log" + "merge_factor": 9, + "max_merge_factor": 11, + "maturation_period": "2days" }, "resources": { "heap_size": "50.0 MB" - }, - "split_num_docs_target": 10000001 - }, - "retention": { - "period": "90 days", - "schedule": "daily" + } }, "search_settings": { "default_search_fields": [ "message" ] }, - "version": "0.9" + "retention": { + "period": "90 days", + "schedule": "daily" + } }, - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "sources": [ { - "enabled": true, - "input_format": "json", + "version": "0.9", + "source_id": "kafka-source", "num_pipelines": 2, + "enabled": true, + "source_type": "kafka", "params": { - "client_params": {}, - "topic": "kafka-topic" + "topic": "kafka-topic", + "client_params": {} }, - "source_id": "kafka-source", - "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "version": "0.9" - } - ], - "version": "0.9" - }, - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "shard_id": "00000000000000000001", - "source_id": "_ingest-source", - "shard_state": 1, - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "doc_mapping_uid": "00000000000000000000000001", - "publish_position_inclusive": "" + "input_format": "json" } ] }, "splits": [ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "split_state": "Published", + "update_timestamp": 1789, + "publish_timestamp": 1789, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000001", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, - "publish_timestamp": 1789, "source_id": "source", - "split_id": "split", - "split_state": "Published", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "update_timestamp": 1789, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } ], - "version": "0.9" + "shards": { + "_ingest-source": [ + { + "index_uid": "my-index:00000000000000000000000001", + "source_id": "_ingest-source", + "shard_id": "00000000000000000001", + "leader_id": "leader-ingester", + "follower_id": "follower-ingester", + "shard_state": 1, + "publish_position_inclusive": "", + "doc_mapping_uid": "00000000000000000000000001" + } + ] + }, + "delete_tasks": [ + { + "create_timestamp": 0, + "opstamp": 10, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + } + } + ] } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json index fbd61156d9f..8694ae89469 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json @@ -1,195 +1,196 @@ { - "delete_tasks": [ - { - "create_timestamp": 0, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - }, - "opstamp": 10 - } - ], + "version": "0.9", "index": { - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } - }, - "create_timestamp": 1789, + "version": "0.9", + "index_uid": "my-index:00000000000000000000000001", "index_config": { + "version": "0.9", + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", + "mode": "dynamic", "dynamic_mapping": { - "expand_dots": true, - "fast": { - "normalizer": "raw" - }, "indexed": true, + "tokenizer": "raw", "record": "basic", "stored": true, - "tokenizer": "raw" + "expand_dots": true, + "fast": { + "normalizer": "raw" + } }, "field_mappings": [ { - "coerce": true, - "fast": true, - "indexed": true, "name": "tenant_id", - "output_format": "number", + "type": "u64", "stored": true, - "type": "u64" + "indexed": true, + "fast": true, + "coerce": true, + "output_format": "number" }, { - "fast": true, - "fast_precision": "seconds", - "indexed": true, + "name": "timestamp", + "type": "datetime", "input_formats": [ "rfc3339", "unix_timestamp" ], - "name": "timestamp", "output_format": "rfc3339", + "fast_precision": "seconds", + "indexed": true, "stored": true, - "type": "datetime" + "fast": true }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "log_level", + "type": "text", + "indexed": true, + "tokenizer": "raw", "record": "basic", + "fieldnorms": false, "stored": true, - "tokenizer": "raw", - "type": "text" + "fast": false }, { - "fast": false, - "fieldnorms": false, - "indexed": true, "name": "message", + "type": "text", + "indexed": true, + "tokenizer": "default", "record": "position", + "fieldnorms": false, "stored": true, - "tokenizer": "default", - "type": "text" + "fast": false } ], - "index_field_presence": true, - "max_num_partitions": 100, - "mode": "dynamic", - "partition_key": "tenant_id", - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tag_fields": [ "log_level", "tenant_id" ], - "timestamp_field": "timestamp", + "partition_key": "tenant_id", + "max_num_partitions": 100, + "index_field_presence": true, + "store_document_size": false, + "store_source": true, "tokenizers": [ { - "filters": [], "name": "custom_tokenizer", + "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "type": "regex" + "filters": [] } ] }, - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_blocksize": 1000000, "docstore_compression_level": 8, + "docstore_blocksize": 1000000, + "split_num_docs_target": 10000001, "merge_policy": { - "maturation_period": "2days", - "max_merge_factor": 11, - "merge_factor": 9, + "type": "stable_log", "min_level_num_docs": 100000, - "type": "stable_log" + "merge_factor": 9, + "max_merge_factor": 11, + "maturation_period": "2days" }, "resources": { "heap_size": "50.0 MB" - }, - "split_num_docs_target": 10000001 - }, - "retention": { - "period": "90 days", - "schedule": "daily" + } }, "search_settings": { "default_search_fields": [ "message" ] }, - "version": "0.9" + "retention": { + "period": "90 days", + "schedule": "daily" + } }, - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "sources": [ { - "enabled": true, - "input_format": "json", + "version": "0.9", + "source_id": "kafka-source", "num_pipelines": 2, + "enabled": true, + "source_type": "kafka", "params": { - "client_params": {}, - "topic": "kafka-topic" + "topic": "kafka-topic", + "client_params": {} }, - "source_id": "kafka-source", - "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "version": "0.9" - } - ], - "version": "0.9" - }, - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "shard_id": "00000000000000000001", - "source_id": "_ingest-source", - "shard_state": 1, - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "doc_mapping_uid": "00000000000000000000000001", - "publish_position_inclusive": "" + "input_format": "json" } ] }, "splits": [ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "split_state": "Published", + "update_timestamp": 1789, + "publish_timestamp": 1789, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000001", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, - "publish_timestamp": 1789, "source_id": "source", - "split_id": "split", - "split_state": "Published", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "update_timestamp": 1789, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } ], - "version": "0.9" + "shards": { + "_ingest-source": [ + { + "index_uid": "my-index:00000000000000000000000001", + "source_id": "_ingest-source", + "shard_id": "00000000000000000001", + "leader_id": "leader-ingester", + "follower_id": "follower-ingester", + "shard_state": 1, + "publish_position_inclusive": "", + "doc_mapping_uid": "00000000000000000000000001" + } + ] + }, + "delete_tasks": [ + { + "create_timestamp": 0, + "opstamp": 10, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + } + } + ] } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json index b7a7ffef08b..248baebc68e 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json @@ -1,29 +1,30 @@ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000000", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, "source_id": "source", - "split_id": "split", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json index b7a7ffef08b..248baebc68e 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json @@ -1,29 +1,30 @@ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000000", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, "source_id": "source", - "split_id": "split", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json index 9fe4c60aae1..85bdfca81e0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json @@ -1,29 +1,30 @@ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000001", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, "source_id": "source", - "split_id": "split", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json index 9fe4c60aae1..85bdfca81e0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json @@ -1,29 +1,30 @@ { - "create_timestamp": 3, - "delete_opstamp": 10, - "footer_offsets": { - "end": 2000, - "start": 1000 - }, + "version": "0.9", + "split_id": "split", "index_uid": "my-index:00000000000000000000000001", - "maturity": { - "maturation_period_millis": 4000, - "type": "immature" - }, - "node_id": "node", - "num_docs": 12303, - "num_merge_ops": 3, "partition_id": 7, "source_id": "source", - "split_id": "split", + "node_id": "node", + "num_docs": 12303, + "uncompressed_docs_size_in_bytes": 234234, + "time_range": { + "start": 121000, + "end": 130198 + }, + "create_timestamp": 3, + "maturity": { + "type": "immature", + "maturation_period_millis": 4000 + }, "tags": [ "234", "aaa" ], - "time_range": { - "end": 130198, - "start": 121000 + "footer_offsets": { + "start": 1000, + "end": 2000 }, - "uncompressed_docs_size_in_bytes": 234234, - "version": "0.9" + "delete_opstamp": 10, + "num_merge_ops": 3, + "doc_mapping_uid": "00000000000000000000000000" } From 069d7a247587fe55000eae9bebe836bd2cd61e44 Mon Sep 17 00:00:00 2001 From: trinity-1686a Date: Tue, 30 Apr 2024 17:01:41 +0200 Subject: [PATCH 2/2] partition merges by doc mapper version --- .../src/actors/merge_planner.rs | 107 ++++++++++++++---- 1 file changed, 82 insertions(+), 25 deletions(-) diff --git a/quickwit/quickwit-indexing/src/actors/merge_planner.rs b/quickwit/quickwit-indexing/src/actors/merge_planner.rs index 77fb64ad652..dd0db3eb876 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_planner.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_planner.rs @@ -25,6 +25,7 @@ use async_trait::async_trait; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; use quickwit_metastore::SplitMetadata; use quickwit_proto::indexing::MergePipelineId; +use quickwit_proto::types::DocMappingUid; use serde::Serialize; use tantivy::Inventory; use time::OffsetDateTime; @@ -37,11 +38,26 @@ use crate::merge_policy::MergeOperation; use crate::models::NewSplits; use crate::MergePolicy; +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct MergePartition { + partition_id: u64, + doc_mapping_uid: DocMappingUid, +} + +impl MergePartition { + fn from_split_meta(split_meta: &SplitMetadata) -> MergePartition { + MergePartition { + partition_id: split_meta.partition_id, + doc_mapping_uid: split_meta.doc_mapping_uid, + } + } +} + /// The merge planner decides when to start a merge task. pub struct MergePlanner { /// A young split is a split that has not reached maturity /// yet and can be candidate to merge operations. - partitioned_young_splits: HashMap>, + partitioned_young_splits: HashMap>, /// This set contains all of the split ids that we "acknowledged". /// The point of this set is to rapidly dismiss redundant `NewSplit` message. @@ -228,7 +244,7 @@ impl MergePlanner { fn record_split(&mut self, new_split: SplitMetadata) { let splits_for_partition: &mut Vec = self .partitioned_young_splits - .entry(new_split.partition_id) + .entry(MergePartition::from_split_meta(&new_split)) .or_default(); splits_for_partition.push(new_split); } @@ -326,7 +342,7 @@ mod tests { use quickwit_config::IndexingSettings; use quickwit_metastore::{SplitMaturity, SplitMetadata}; use quickwit_proto::indexing::MergePipelineId; - use quickwit_proto::types::{IndexUid, NodeId}; + use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId}; use time::OffsetDateTime; use crate::actors::MergePlanner; @@ -339,6 +355,7 @@ mod tests { index_uid: &IndexUid, split_id: &str, partition_id: u64, + doc_mapping_uid: DocMappingUid, num_docs: usize, num_merge_ops: usize, ) -> SplitMetadata { @@ -354,6 +371,7 @@ mod tests { maturity: SplitMaturity::Immature { maturation_period: Duration::from_secs(3600), }, + doc_mapping_uid, ..Default::default() } } @@ -363,6 +381,8 @@ mod tests { let node_id = NodeId::from("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); + let doc_mapping_uid1 = DocMappingUid::random(); + let doc_mapping_uid2 = DocMappingUid::random(); let pipeline_id = MergePipelineId { node_id, index_uid: index_uid.clone(), @@ -394,8 +414,9 @@ mod tests { // send one split let message = NewSplits { new_splits: vec![ - split_metadata_for_test(&index_uid, "1_1", 1, 2500, 0), - split_metadata_for_test(&index_uid, "1_2", 2, 3000, 0), + split_metadata_for_test(&index_uid, "1_1", 1, doc_mapping_uid1, 2500, 0), + split_metadata_for_test(&index_uid, "1v2_1", 1, doc_mapping_uid2, 2500, 0), + split_metadata_for_test(&index_uid, "1_2", 2, doc_mapping_uid1, 3000, 0), ], }; merge_planner_mailbox.send_message(message).await?; @@ -406,8 +427,9 @@ mod tests { // send two splits with a duplicate let message = NewSplits { new_splits: vec![ - split_metadata_for_test(&index_uid, "2_1", 1, 2000, 0), - split_metadata_for_test(&index_uid, "1_2", 2, 3000, 0), + split_metadata_for_test(&index_uid, "2_1", 1, doc_mapping_uid1, 2000, 0), + split_metadata_for_test(&index_uid, "2v2_1", 1, doc_mapping_uid2, 2500, 0), + split_metadata_for_test(&index_uid, "1_2", 2, doc_mapping_uid1, 3000, 0), ], }; merge_planner_mailbox.send_message(message).await?; @@ -418,27 +440,41 @@ mod tests { // send four more splits to generate merge let message = NewSplits { new_splits: vec![ - split_metadata_for_test(&index_uid, "3_1", 1, 1500, 0), - split_metadata_for_test(&index_uid, "4_1", 1, 1000, 0), - split_metadata_for_test(&index_uid, "2_2", 2, 2000, 0), - split_metadata_for_test(&index_uid, "3_2", 2, 4000, 0), + split_metadata_for_test(&index_uid, "3_1", 1, doc_mapping_uid1, 1500, 0), + split_metadata_for_test(&index_uid, "4_1", 1, doc_mapping_uid1, 1000, 0), + split_metadata_for_test(&index_uid, "3v2_1", 1, doc_mapping_uid2, 1500, 0), + split_metadata_for_test(&index_uid, "2_2", 2, doc_mapping_uid1, 2000, 0), + split_metadata_for_test(&index_uid, "3_2", 2, doc_mapping_uid1, 4000, 0), ], }; merge_planner_mailbox.send_message(message).await?; merge_planner_handle.process_pending_and_observe().await; let operations = merge_split_downloader_inbox.drain_for_test_typed::(); - assert_eq!(operations.len(), 2); - let mut merge_operations = operations.into_iter().sorted_by(|left_op, right_op| { - left_op.splits[0] - .partition_id - .cmp(&right_op.splits[0].partition_id) - }); + assert_eq!(operations.len(), 3); + let mut merge_operations = operations + .into_iter() + .sorted_by_key(|op| (op.splits[0].partition_id, op.splits[0].doc_mapping_uid)); let first_merge_operation = merge_operations.next().unwrap(); assert_eq!(first_merge_operation.splits.len(), 4); + assert!(first_merge_operation + .splits + .iter() + .all(|split| split.partition_id == 1 && split.doc_mapping_uid == doc_mapping_uid1)); let second_merge_operation = merge_operations.next().unwrap(); assert_eq!(second_merge_operation.splits.len(), 3); + assert!(second_merge_operation + .splits + .iter() + .all(|split| split.partition_id == 1 && split.doc_mapping_uid == doc_mapping_uid2)); + + let third_merge_operation = merge_operations.next().unwrap(); + assert_eq!(third_merge_operation.splits.len(), 3); + assert!(third_merge_operation + .splits + .iter() + .all(|split| split.partition_id == 2 && split.doc_mapping_uid == doc_mapping_uid1)); } universe.assert_quit().await; @@ -451,6 +487,7 @@ mod tests { let node_id = NodeId::from("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); + let doc_mapping_uid = DocMappingUid::random(); let pipeline_id = MergePipelineId { node_id, index_uid: index_uid.clone(), @@ -472,12 +509,20 @@ mod tests { }; let immature_splits = vec![ split_metadata_for_test( - &index_uid, "a_small", 0, // partition_id - 1_000_000, 2, + &index_uid, + "a_small", + 0, // partition_id + doc_mapping_uid, + 1_000_000, + 2, ), split_metadata_for_test( - &index_uid, "b_small", 0, // partition_id - 1_000_000, 2, + &index_uid, + "b_small", + 0, // partition_id + doc_mapping_uid, + 1_000_000, + 2, ), ]; let merge_policy: Arc = merge_policy_from_settings(&indexing_settings); @@ -527,6 +572,7 @@ mod tests { let node_id = NodeId::from("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); + let doc_mapping_uid = DocMappingUid::random(); let pipeline_id = MergePipelineId { node_id, index_uid, @@ -558,6 +604,7 @@ mod tests { &other_index_uid, "a_small", 0, // partition_id + doc_mapping_uid, 1_000_000, 2, ), @@ -565,6 +612,7 @@ mod tests { &other_index_uid, "b_small", 0, // partition_id + doc_mapping_uid, 1_000_000, 2, ), @@ -595,6 +643,7 @@ mod tests { let node_id = NodeId::from("test-node"); let index_uid = IndexUid::new_with_random_ulid("test-index"); let source_id = "test-source".to_string(); + let doc_mapping_uid = DocMappingUid::random(); let pipeline_id = MergePipelineId { node_id, index_uid: index_uid.clone(), @@ -617,12 +666,20 @@ mod tests { }; let immature_splits = vec![ split_metadata_for_test( - &index_uid, "a_small", 0, // partition_id - 1_000_000, 2, + &index_uid, + "a_small", + 0, // partition_id + doc_mapping_uid, + 1_000_000, + 2, ), split_metadata_for_test( - &index_uid, "b_small", 0, // partition_id - 1_000_000, 2, + &index_uid, + "b_small", + 0, // partition_id + doc_mapping_uid, + 1_000_000, + 2, ), ]; let merge_policy: Arc = merge_policy_from_settings(&indexing_settings);