Skip to content

Commit a2ffa08

Browse files
committed
Reaquire shards only if they are stale
1 parent cbfd340 commit a2ffa08

File tree

3 files changed

+228
-86
lines changed

3 files changed

+228
-86
lines changed

quickwit/quickwit-indexing/src/source/queue_sources/coordinator.rs

Lines changed: 88 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ pub struct QueueCoordinator {
7878
shared_state: QueueSharedState,
7979
local_state: QueueLocalState,
8080
publish_token: String,
81-
visible_settings: VisibilitySettings,
81+
visibility_settings: VisibilitySettings,
8282
}
8383

8484
impl fmt::Debug for QueueCoordinator {
@@ -102,6 +102,9 @@ impl QueueCoordinator {
102102
metastore: source_runtime.metastore,
103103
source_id: source_runtime.pipeline_id.source_id.clone(),
104104
index_uid: source_runtime.pipeline_id.index_uid.clone(),
105+
reacquire_grace_period: Duration::from_secs(
106+
2 * source_runtime.indexing_setting.commit_timeout_secs as u64,
107+
),
105108
},
106109
local_state: QueueLocalState::default(),
107110
pipeline_id: source_runtime.pipeline_id,
@@ -113,7 +116,7 @@ impl QueueCoordinator {
113116
message_type,
114117
publish_lock: PublishLock::default(),
115118
publish_token: Ulid::new().to_string(),
116-
visible_settings: VisibilitySettings::from_commit_timeout(
119+
visibility_settings: VisibilitySettings::from_commit_timeout(
117120
source_runtime.indexing_setting.commit_timeout_secs,
118121
),
119122
}
@@ -157,7 +160,7 @@ impl QueueCoordinator {
157160
async fn poll_messages(&mut self, ctx: &SourceContext) -> Result<(), ActorExitStatus> {
158161
let raw_messages = self
159162
.queue_receiver
160-
.receive(1, self.visible_settings.deadline_for_receive)
163+
.receive(1, self.visibility_settings.deadline_for_receive)
161164
.await?;
162165

163166
let mut format_errors = Vec::new();
@@ -215,7 +218,7 @@ impl QueueCoordinator {
215218
self.queue.clone(),
216219
message.metadata.ack_id.clone(),
217220
message.metadata.initial_deadline,
218-
self.visible_settings.clone(),
221+
self.visibility_settings.clone(),
219222
),
220223
content: message,
221224
position,
@@ -254,7 +257,7 @@ impl QueueCoordinator {
254257
.await?;
255258
if in_progress_ref.batch_reader.is_eof() {
256259
self.local_state
257-
.drop_currently_read(self.visible_settings.deadline_for_last_extension)
260+
.drop_currently_read(self.visibility_settings.deadline_for_last_extension)
258261
.await?;
259262
self.observable_state.num_messages_processed += 1;
260263
}
@@ -319,7 +322,7 @@ mod tests {
319322
use crate::source::doc_file_reader::file_test_helpers::{generate_dummy_doc_file, DUMMY_DOC};
320323
use crate::source::queue_sources::memory_queue::MemoryQueueForTests;
321324
use crate::source::queue_sources::message::PreProcessedPayload;
322-
use crate::source::queue_sources::shared_state::shared_state_for_tests::shared_state_for_tests;
325+
use crate::source::queue_sources::shared_state::shared_state_for_tests::init_state;
323326
use crate::source::{SourceActor, BATCH_NUM_BYTES_LIMIT};
324327

325328
fn setup_coordinator(
@@ -347,7 +350,7 @@ mod tests {
347350
source_type: SourceType::Unspecified,
348351
storage_resolver: StorageResolver::for_test(),
349352
publish_token: Ulid::new().to_string(),
350-
visible_settings: VisibilitySettings::from_commit_timeout(5),
353+
visibility_settings: VisibilitySettings::from_commit_timeout(5),
351354
}
352355
}
353356

@@ -401,7 +404,7 @@ mod tests {
401404
#[tokio::test]
402405
async fn test_process_empty_queue() {
403406
let queue = Arc::new(MemoryQueueForTests::new());
404-
let shared_state = shared_state_for_tests("test-index", Default::default());
407+
let shared_state = init_state("test-index", Default::default());
405408
let mut coordinator = setup_coordinator(queue.clone(), shared_state);
406409
let batches = process_messages(&mut coordinator, queue, &[]).await;
407410
assert_eq!(batches.len(), 0);
@@ -410,7 +413,7 @@ mod tests {
410413
#[tokio::test]
411414
async fn test_process_one_small_message() {
412415
let queue = Arc::new(MemoryQueueForTests::new());
413-
let shared_state = shared_state_for_tests("test-index", Default::default());
416+
let shared_state = init_state("test-index", Default::default());
414417
let mut coordinator = setup_coordinator(queue.clone(), shared_state.clone());
415418
let (dummy_doc_file, _) = generate_dummy_doc_file(false, 10).await;
416419
let test_uri = Uri::from_str(dummy_doc_file.path().to_str().unwrap()).unwrap();
@@ -424,7 +427,7 @@ mod tests {
424427
#[tokio::test]
425428
async fn test_process_one_big_message() {
426429
let queue = Arc::new(MemoryQueueForTests::new());
427-
let shared_state = shared_state_for_tests("test-index", Default::default());
430+
let shared_state = init_state("test-index", Default::default());
428431
let mut coordinator = setup_coordinator(queue.clone(), shared_state);
429432
let lines = BATCH_NUM_BYTES_LIMIT as usize / DUMMY_DOC.len() + 1;
430433
let (dummy_doc_file, _) = generate_dummy_doc_file(true, lines).await;
@@ -437,7 +440,7 @@ mod tests {
437440
#[tokio::test]
438441
async fn test_process_two_messages_different_compression() {
439442
let queue = Arc::new(MemoryQueueForTests::new());
440-
let shared_state = shared_state_for_tests("test-index", Default::default());
443+
let shared_state = init_state("test-index", Default::default());
441444
let mut coordinator = setup_coordinator(queue.clone(), shared_state);
442445
let (dummy_doc_file_1, _) = generate_dummy_doc_file(false, 10).await;
443446
let test_uri_1 = Uri::from_str(dummy_doc_file_1.path().to_str().unwrap()).unwrap();
@@ -456,7 +459,7 @@ mod tests {
456459
#[tokio::test]
457460
async fn test_process_local_duplicate_message() {
458461
let queue = Arc::new(MemoryQueueForTests::new());
459-
let shared_state = shared_state_for_tests("test-index", Default::default());
462+
let shared_state = init_state("test-index", Default::default());
460463
let mut coordinator = setup_coordinator(queue.clone(), shared_state);
461464
let (dummy_doc_file, _) = generate_dummy_doc_file(false, 10).await;
462465
let test_uri = Uri::from_str(dummy_doc_file.path().to_str().unwrap()).unwrap();
@@ -477,11 +480,15 @@ mod tests {
477480
let partition_id = PreProcessedPayload::ObjectUri(test_uri.clone()).partition_id();
478481

479482
let queue = Arc::new(MemoryQueueForTests::new());
480-
let shared_state = shared_state_for_tests(
483+
let shared_state = init_state(
481484
"test-index",
482485
&[(
483486
partition_id.clone(),
484-
("existing_token".to_string(), Position::eof(file_size)),
487+
(
488+
"existing_token".to_string(),
489+
Position::eof(file_size),
490+
false,
491+
),
485492
)],
486493
);
487494
let mut coordinator = setup_coordinator(queue.clone(), shared_state.clone());
@@ -492,30 +499,82 @@ mod tests {
492499
assert!(coordinator.local_state.is_completed(&partition_id));
493500
}
494501

502+
#[tokio::test]
503+
async fn test_process_existing_messages() {
504+
let (dummy_doc_file_1, _) = generate_dummy_doc_file(false, 10).await;
505+
let test_uri_1 = Uri::from_str(dummy_doc_file_1.path().to_str().unwrap()).unwrap();
506+
let partition_id_1 = PreProcessedPayload::ObjectUri(test_uri_1.clone()).partition_id();
507+
508+
let (dummy_doc_file_2, _) = generate_dummy_doc_file(false, 10).await;
509+
let test_uri_2 = Uri::from_str(dummy_doc_file_2.path().to_str().unwrap()).unwrap();
510+
let partition_id_2 = PreProcessedPayload::ObjectUri(test_uri_2.clone()).partition_id();
511+
512+
let (dummy_doc_file_3, _) = generate_dummy_doc_file(false, 10).await;
513+
let test_uri_3 = Uri::from_str(dummy_doc_file_3.path().to_str().unwrap()).unwrap();
514+
let partition_id_3 = PreProcessedPayload::ObjectUri(test_uri_3.clone()).partition_id();
515+
516+
let queue = Arc::new(MemoryQueueForTests::new());
517+
let shared_state = init_state(
518+
"test-index",
519+
&[
520+
(
521+
partition_id_1.clone(),
522+
("existing_token_1".to_string(), Position::Beginning, true),
523+
),
524+
(
525+
partition_id_2.clone(),
526+
(
527+
"existing_token_2".to_string(),
528+
Position::offset((DUMMY_DOC.len() + 1) * 2),
529+
true,
530+
),
531+
),
532+
(
533+
partition_id_3.clone(),
534+
(
535+
"existing_token_3".to_string(),
536+
Position::offset((DUMMY_DOC.len() + 1) * 6),
537+
false, // should not be processed because not stale yet
538+
),
539+
),
540+
],
541+
);
542+
let mut coordinator = setup_coordinator(queue.clone(), shared_state.clone());
543+
let batches = process_messages(
544+
&mut coordinator,
545+
queue,
546+
&[
547+
(&test_uri_1, "ack-id-1"),
548+
(&test_uri_2, "ack-id-2"),
549+
(&test_uri_3, "ack-id-3"),
550+
],
551+
)
552+
.await;
553+
assert_eq!(batches.len(), 2);
554+
assert_eq!(batches.iter().map(|b| b.docs.len()).sum::<usize>(), 18);
555+
assert!(coordinator.local_state.is_awaiting_commit(&partition_id_1));
556+
assert!(coordinator.local_state.is_awaiting_commit(&partition_id_2));
557+
}
558+
495559
#[tokio::test]
496560
async fn test_process_multiple_coordinator() {
497561
let queue = Arc::new(MemoryQueueForTests::new());
498-
let shared_state = shared_state_for_tests("test-index", Default::default());
499-
let mut proc_1 = setup_coordinator(queue.clone(), shared_state.clone());
500-
let mut proc_2 = setup_coordinator(queue.clone(), shared_state.clone());
562+
let shared_state = init_state("test-index", Default::default());
563+
let mut coord_1 = setup_coordinator(queue.clone(), shared_state.clone());
564+
let mut coord_2 = setup_coordinator(queue.clone(), shared_state.clone());
501565
let (dummy_doc_file, _) = generate_dummy_doc_file(false, 10).await;
502566
let test_uri = Uri::from_str(dummy_doc_file.path().to_str().unwrap()).unwrap();
503567
let partition_id = PreProcessedPayload::ObjectUri(test_uri.clone()).partition_id();
504568

505-
let batches_1 = process_messages(&mut proc_1, queue.clone(), &[(&test_uri, "ack1")]).await;
506-
let batches_2 = process_messages(&mut proc_2, queue, &[(&test_uri, "ack2")]).await;
569+
let batches_1 = process_messages(&mut coord_1, queue.clone(), &[(&test_uri, "ack1")]).await;
570+
let batches_2 = process_messages(&mut coord_2, queue, &[(&test_uri, "ack2")]).await;
507571

508572
assert_eq!(batches_1.len(), 1);
509573
assert_eq!(batches_1[0].docs.len(), 10);
510-
assert!(proc_1.local_state.is_awaiting_commit(&partition_id));
511-
// proc_2 doesn't know for sure what is happening with the message
512-
// (proc_1 might have crashed), so it just acquires it and takes over
513-
// processing
514-
//
515-
// TODO: this test should fail once we implement the grace
516-
// period before a partition can be re-acquired
517-
assert_eq!(batches_2.len(), 1);
518-
assert_eq!(batches_2[0].docs.len(), 10);
519-
assert!(proc_2.local_state.is_awaiting_commit(&partition_id));
574+
assert!(coord_1.local_state.is_awaiting_commit(&partition_id));
575+
// proc_2 learns from shared state that the message is likely still
576+
// being processed and skips it
577+
assert_eq!(batches_2.len(), 0);
578+
assert!(!coord_2.local_state.is_tracked(&partition_id));
520579
}
521580
}

0 commit comments

Comments
 (0)