From 40a4defc822660ed77f6c71921852b03afb5e163 Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Fri, 1 Sep 2023 17:56:13 +0800 Subject: [PATCH 01/13] fix(duplication): fix slog gc crash --- src/replica/duplication/load_from_private_log.cpp | 4 ++++ src/replica/duplication/replica_duplicator.cpp | 6 ++++++ src/replica/duplication/replica_duplicator.h | 4 +++- src/replica/replica.h | 7 +++++++ 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/replica/duplication/load_from_private_log.cpp b/src/replica/duplication/load_from_private_log.cpp index aa458b22a4..e53177de7f 100644 --- a/src/replica/duplication/load_from_private_log.cpp +++ b/src/replica/duplication/load_from_private_log.cpp @@ -122,6 +122,8 @@ void load_from_private_log::run() void load_from_private_log::find_log_file_to_start() { + _duplicator->set_duplication_plog_checking(true); + // `file_map` has already excluded the useless log files during replica init. auto file_map = _private_log->get_log_file_map(); @@ -166,6 +168,8 @@ void load_from_private_log::find_log_file_to_start(std::map l } } start_from_log_file(_current); + + _duplicator->set_duplication_plog_checking(false); } void load_from_private_log::replay_log_block() diff --git a/src/replica/duplication/replica_duplicator.cpp b/src/replica/duplication/replica_duplicator.cpp index a19947fe20..0beeea5e09 100644 --- a/src/replica/duplication/replica_duplicator.cpp +++ b/src/replica/duplication/replica_duplicator.cpp @@ -257,5 +257,11 @@ uint64_t replica_duplicator::get_pending_mutations_count() const return cnt > 0 ? static_cast(cnt) : 0; } +void replica_duplicator::set_duplication_plog_checking(bool checking) +{ + _replica->set_duplication_plog_checking(checking); +} + + } // namespace replication } // namespace dsn diff --git a/src/replica/duplication/replica_duplicator.h b/src/replica/duplication/replica_duplicator.h index 1b2526d290..04fb89563c 100644 --- a/src/replica/duplication/replica_duplicator.h +++ b/src/replica/duplication/replica_duplicator.h @@ -138,7 +138,9 @@ class replica_duplicator : public replica_base, public pipeline::base // For metric "dup.pending_mutations_count" uint64_t get_pending_mutations_count() const; - duplication_status::type status() const { return _status; }; + duplication_status::type status() const { return _status; } + + void set_duplication_plog_checking(bool checking); private: friend class duplication_test_base; diff --git a/src/replica/replica.h b/src/replica/replica.h index 3933784da0..04f3983d8b 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -254,6 +254,12 @@ class replica : public serverlet, public ref_counter, public replica_ba replica_duplicator_manager *get_duplication_manager() const { return _duplication_mgr.get(); } bool is_duplication_master() const { return _is_duplication_master; } bool is_duplication_follower() const { return _is_duplication_follower; } + bool is_duplication_plog_checking() const { return _is_duplication_plog_checking.load(); } + void set_duplication_plog_checking(bool checking) + { + _is_duplication_plog_checking.store(checking); + } + // // Backup @@ -633,6 +639,7 @@ class replica : public serverlet, public ref_counter, public replica_ba bool _is_manual_emergency_checkpointing{false}; bool _is_duplication_master{false}; bool _is_duplication_follower{false}; + std::atomic _is_duplication_plog_checking{false}; // backup std::unique_ptr _backup_mgr; From 24b312162c860d0f964839929a095b30fcbc7eba Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Wed, 13 Sep 2023 14:52:36 +0800 Subject: [PATCH 02/13] add mistake plog checking --- src/replica/replica_chkpt.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/replica/replica_chkpt.cpp b/src/replica/replica_chkpt.cpp index b24f11157b..0aff779095 100644 --- a/src/replica/replica_chkpt.cpp +++ b/src/replica/replica_chkpt.cpp @@ -154,6 +154,13 @@ void replica::on_checkpoint_timer() return; } + if (is_duplication_plog_checking()) { + LOG_DEBUG_PREFIX("gc_private {}: skip gc because duplication is checking plog files", + enum_to_string(status())); + return; + } + + tasking::enqueue(LPC_GARBAGE_COLLECT_LOGS_AND_REPLICAS, &_tracker, [this, plog, cleanable_decree, valid_start_offset] { From 30b895722f3f9e51d7faf6c5f87ceb4e7f7a4152 Mon Sep 17 00:00:00 2001 From: guoningshen Date: Wed, 13 Sep 2023 15:38:52 +0800 Subject: [PATCH 03/13] reformat --- src/replica/duplication/replica_duplicator.cpp | 1 - src/replica/replica.h | 1 - src/replica/replica_chkpt.cpp | 3 +-- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/replica/duplication/replica_duplicator.cpp b/src/replica/duplication/replica_duplicator.cpp index 0beeea5e09..e7db4308b0 100644 --- a/src/replica/duplication/replica_duplicator.cpp +++ b/src/replica/duplication/replica_duplicator.cpp @@ -262,6 +262,5 @@ void replica_duplicator::set_duplication_plog_checking(bool checking) _replica->set_duplication_plog_checking(checking); } - } // namespace replication } // namespace dsn diff --git a/src/replica/replica.h b/src/replica/replica.h index 04f3983d8b..9145362a44 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -260,7 +260,6 @@ class replica : public serverlet, public ref_counter, public replica_ba _is_duplication_plog_checking.store(checking); } - // // Backup // diff --git a/src/replica/replica_chkpt.cpp b/src/replica/replica_chkpt.cpp index 0aff779095..309422e75e 100644 --- a/src/replica/replica_chkpt.cpp +++ b/src/replica/replica_chkpt.cpp @@ -156,11 +156,10 @@ void replica::on_checkpoint_timer() if (is_duplication_plog_checking()) { LOG_DEBUG_PREFIX("gc_private {}: skip gc because duplication is checking plog files", - enum_to_string(status())); + enum_to_string(status())); return; } - tasking::enqueue(LPC_GARBAGE_COLLECT_LOGS_AND_REPLICAS, &_tracker, [this, plog, cleanable_decree, valid_start_offset] { From ab78d62ebd3e7366ef6d1cd55b6f3ac782ade263 Mon Sep 17 00:00:00 2001 From: guoningshen Date: Thu, 19 Oct 2023 11:21:17 +0800 Subject: [PATCH 04/13] add some comment --- src/replica/replica.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/replica/replica.h b/src/replica/replica.h index 9145362a44..f5baf0a92e 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -638,7 +638,9 @@ class replica : public serverlet, public ref_counter, public replica_ba bool _is_manual_emergency_checkpointing{false}; bool _is_duplication_master{false}; bool _is_duplication_follower{false}; - std::atomic _is_duplication_plog_checking{false}; + std::atomic _is_duplication_plog_checking{false}; // replica is finding some private logs + // to load for duplication,avoid + // unexpected plog gc // backup std::unique_ptr _backup_mgr; From cec10c0a1d7f43defdce88b9782e7ff3d2ec4d17 Mon Sep 17 00:00:00 2001 From: guoningshen Date: Thu, 19 Oct 2023 11:28:00 +0800 Subject: [PATCH 05/13] reformat code --- src/replica/replica.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/replica/replica.h b/src/replica/replica.h index f5baf0a92e..e3de825e59 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -638,9 +638,8 @@ class replica : public serverlet, public ref_counter, public replica_ba bool _is_manual_emergency_checkpointing{false}; bool _is_duplication_master{false}; bool _is_duplication_follower{false}; - std::atomic _is_duplication_plog_checking{false}; // replica is finding some private logs - // to load for duplication,avoid - // unexpected plog gc + // replica is finding some private logs to load for duplication,avoid unexpected plog gc + std::atomic _is_duplication_plog_checking{false}; // backup std::unique_ptr _backup_mgr; From 91a44f1c4a1e07eb1e31b85a8218b97a90b8984b Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Mon, 29 Jan 2024 11:01:06 +0800 Subject: [PATCH 06/13] add comment --- src/replica/replica.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/replica/replica.h b/src/replica/replica.h index 04f3983d8b..4c310666fe 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -639,7 +639,7 @@ class replica : public serverlet, public ref_counter, public replica_ba bool _is_manual_emergency_checkpointing{false}; bool _is_duplication_master{false}; bool _is_duplication_follower{false}; - std::atomic _is_duplication_plog_checking{false}; + std::atomic _is_duplication_plog_checking{false}; //replica is checking plog on duplication stage // backup std::unique_ptr _backup_mgr; From 4b12582d3e9031f76568909423848b00148ebf32 Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Mon, 29 Jan 2024 21:06:30 +0800 Subject: [PATCH 07/13] format code --- src/replica/replica.h | 1 - src/replica/replica_chkpt.cpp | 42 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/replica/replica.h b/src/replica/replica.h index 7442457c4a..9a83b49953 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -633,7 +633,6 @@ class replica : public serverlet, public ref_counter, public replica_ba // replica is finding some private logs to load for duplication,avoid unexpected plog gc std::atomic _is_duplication_plog_checking{false}; - // backup std::unique_ptr _backup_mgr; diff --git a/src/replica/replica_chkpt.cpp b/src/replica/replica_chkpt.cpp index ab22ad9f21..74b665d3ee 100644 --- a/src/replica/replica_chkpt.cpp +++ b/src/replica/replica_chkpt.cpp @@ -231,11 +231,12 @@ void replica::init_checkpoint(bool is_emergency) // // we may issue a new task to do backgroup_async_checkpoint // even if the old one hasn't finished yet - tasking::enqueue(LPC_CHECKPOINT_REPLICA, - &_tracker, - [this, is_emergency] { background_async_checkpoint(is_emergency); }, - 0, - 10_ms); + tasking::enqueue( + LPC_CHECKPOINT_REPLICA, + &_tracker, + [this, is_emergency] { background_async_checkpoint(is_emergency); }, + 0, + 10_ms); if (is_emergency) { METRIC_VAR_INCREMENT(emergency_checkpoints); @@ -307,11 +308,12 @@ error_code replica::background_async_checkpoint(bool is_emergency) LOG_INFO_PREFIX("call app.async_checkpoint() returns ERR_TRY_AGAIN, time_used_ns = {}" ", schedule later checkpoint after 10 seconds", used_time); - tasking::enqueue(LPC_PER_REPLICA_CHECKPOINT_TIMER, - &_tracker, - [this] { init_checkpoint(false); }, - get_gpid().thread_hash(), - std::chrono::seconds(10)); + tasking::enqueue( + LPC_PER_REPLICA_CHECKPOINT_TIMER, + &_tracker, + [this] { init_checkpoint(false); }, + get_gpid().thread_hash(), + std::chrono::seconds(10)); return err; } @@ -375,11 +377,11 @@ void replica::catch_up_with_private_logs(partition_status::type s) auto err = apply_learned_state_from_private_log(state); if (s == partition_status::PS_POTENTIAL_SECONDARY) { - _potential_secondary_states.learn_remote_files_completed_task = - tasking::create_task(LPC_CHECKPOINT_REPLICA_COMPLETED, - &_tracker, - [this, err]() { this->on_learn_remote_state_completed(err); }, - get_gpid().thread_hash()); + _potential_secondary_states.learn_remote_files_completed_task = tasking::create_task( + LPC_CHECKPOINT_REPLICA_COMPLETED, + &_tracker, + [this, err]() { this->on_learn_remote_state_completed(err); }, + get_gpid().thread_hash()); _potential_secondary_states.learn_remote_files_completed_task->enqueue(); } else if (s == partition_status::PS_PARTITION_SPLIT) { _split_states.async_learn_task = tasking::enqueue( @@ -388,11 +390,11 @@ void replica::catch_up_with_private_logs(partition_status::type s) std::bind(&replica_split_manager::child_catch_up_states, get_split_manager()), get_gpid().thread_hash()); } else { - _secondary_states.checkpoint_completed_task = - tasking::create_task(LPC_CHECKPOINT_REPLICA_COMPLETED, - &_tracker, - [this, err]() { this->on_checkpoint_completed(err); }, - get_gpid().thread_hash()); + _secondary_states.checkpoint_completed_task = tasking::create_task( + LPC_CHECKPOINT_REPLICA_COMPLETED, + &_tracker, + [this, err]() { this->on_checkpoint_completed(err); }, + get_gpid().thread_hash()); _secondary_states.checkpoint_completed_task->enqueue(); } } From 775ced80fa6b7700c3b4548d085eb555f4497ba3 Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Mon, 5 Feb 2024 20:19:18 +0800 Subject: [PATCH 08/13] format 3.9 --- src/replica/replica_chkpt.cpp | 42 +++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/replica/replica_chkpt.cpp b/src/replica/replica_chkpt.cpp index 74b665d3ee..ab22ad9f21 100644 --- a/src/replica/replica_chkpt.cpp +++ b/src/replica/replica_chkpt.cpp @@ -231,12 +231,11 @@ void replica::init_checkpoint(bool is_emergency) // // we may issue a new task to do backgroup_async_checkpoint // even if the old one hasn't finished yet - tasking::enqueue( - LPC_CHECKPOINT_REPLICA, - &_tracker, - [this, is_emergency] { background_async_checkpoint(is_emergency); }, - 0, - 10_ms); + tasking::enqueue(LPC_CHECKPOINT_REPLICA, + &_tracker, + [this, is_emergency] { background_async_checkpoint(is_emergency); }, + 0, + 10_ms); if (is_emergency) { METRIC_VAR_INCREMENT(emergency_checkpoints); @@ -308,12 +307,11 @@ error_code replica::background_async_checkpoint(bool is_emergency) LOG_INFO_PREFIX("call app.async_checkpoint() returns ERR_TRY_AGAIN, time_used_ns = {}" ", schedule later checkpoint after 10 seconds", used_time); - tasking::enqueue( - LPC_PER_REPLICA_CHECKPOINT_TIMER, - &_tracker, - [this] { init_checkpoint(false); }, - get_gpid().thread_hash(), - std::chrono::seconds(10)); + tasking::enqueue(LPC_PER_REPLICA_CHECKPOINT_TIMER, + &_tracker, + [this] { init_checkpoint(false); }, + get_gpid().thread_hash(), + std::chrono::seconds(10)); return err; } @@ -377,11 +375,11 @@ void replica::catch_up_with_private_logs(partition_status::type s) auto err = apply_learned_state_from_private_log(state); if (s == partition_status::PS_POTENTIAL_SECONDARY) { - _potential_secondary_states.learn_remote_files_completed_task = tasking::create_task( - LPC_CHECKPOINT_REPLICA_COMPLETED, - &_tracker, - [this, err]() { this->on_learn_remote_state_completed(err); }, - get_gpid().thread_hash()); + _potential_secondary_states.learn_remote_files_completed_task = + tasking::create_task(LPC_CHECKPOINT_REPLICA_COMPLETED, + &_tracker, + [this, err]() { this->on_learn_remote_state_completed(err); }, + get_gpid().thread_hash()); _potential_secondary_states.learn_remote_files_completed_task->enqueue(); } else if (s == partition_status::PS_PARTITION_SPLIT) { _split_states.async_learn_task = tasking::enqueue( @@ -390,11 +388,11 @@ void replica::catch_up_with_private_logs(partition_status::type s) std::bind(&replica_split_manager::child_catch_up_states, get_split_manager()), get_gpid().thread_hash()); } else { - _secondary_states.checkpoint_completed_task = tasking::create_task( - LPC_CHECKPOINT_REPLICA_COMPLETED, - &_tracker, - [this, err]() { this->on_checkpoint_completed(err); }, - get_gpid().thread_hash()); + _secondary_states.checkpoint_completed_task = + tasking::create_task(LPC_CHECKPOINT_REPLICA_COMPLETED, + &_tracker, + [this, err]() { this->on_checkpoint_completed(err); }, + get_gpid().thread_hash()); _secondary_states.checkpoint_completed_task->enqueue(); } } From 05d29d234aecafdc421f3c8285e966901a8b3cc8 Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Thu, 7 Mar 2024 14:49:49 +0800 Subject: [PATCH 09/13] Update src/replica/replica.h --- src/replica/replica.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/replica/replica.h b/src/replica/replica.h index 9a83b49953..5e0d107d8c 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -630,7 +630,8 @@ class replica : public serverlet, public ref_counter, public replica_ba bool _is_manual_emergency_checkpointing{false}; bool _is_duplication_master{false}; bool _is_duplication_follower{false}; - // replica is finding some private logs to load for duplication,avoid unexpected plog gc + // Indicate whether the replica is during finding out some private logs to + // load for duplication. It useful to prevent plog GCed unexpectedly. std::atomic _is_duplication_plog_checking{false}; // backup From a36ee60bf3534c4342b16b1273edf5642f0af3a7 Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Tue, 26 Mar 2024 17:18:47 +0800 Subject: [PATCH 10/13] handle open read ec when find log file to start --- src/replica/duplication/load_from_private_log.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/replica/duplication/load_from_private_log.cpp b/src/replica/duplication/load_from_private_log.cpp index bb595c3106..a273b97165 100644 --- a/src/replica/duplication/load_from_private_log.cpp +++ b/src/replica/duplication/load_from_private_log.cpp @@ -159,6 +159,7 @@ void load_from_private_log::find_log_file_to_start() error_s es = log_utils::open_read(pr.second->path(), file); if (!es.is_ok()) { LOG_ERROR_PREFIX("{}", es); + _duplicator->set_duplication_plog_checking(false); return; } new_file_map.emplace(pr.first, file); From 7b7f5d12b36680c30e304fd80dcd259f547491d7 Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Tue, 26 Mar 2024 19:13:54 +0800 Subject: [PATCH 11/13] use defer to set plog flag to false --- src/replica/duplication/load_from_private_log.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/replica/duplication/load_from_private_log.cpp b/src/replica/duplication/load_from_private_log.cpp index a273b97165..e8e2d26e17 100644 --- a/src/replica/duplication/load_from_private_log.cpp +++ b/src/replica/duplication/load_from_private_log.cpp @@ -147,6 +147,7 @@ void load_from_private_log::run() void load_from_private_log::find_log_file_to_start() { _duplicator->set_duplication_plog_checking(true); + auto cleanup = dsn::defer([this]() { _duplicator->set_duplication_plog_checking(false); }); // `file_map` has already excluded the useless log files during replica init. const auto &file_map = _private_log->get_log_file_map(); @@ -159,7 +160,6 @@ void load_from_private_log::find_log_file_to_start() error_s es = log_utils::open_read(pr.second->path(), file); if (!es.is_ok()) { LOG_ERROR_PREFIX("{}", es); - _duplicator->set_duplication_plog_checking(false); return; } new_file_map.emplace(pr.first, file); @@ -171,6 +171,8 @@ void load_from_private_log::find_log_file_to_start() void load_from_private_log::find_log_file_to_start( const mutation_log::log_file_map_by_index &log_file_map) { + auto cleanup = dsn::defer([this]() { _duplicator->set_duplication_plog_checking(false); }); + _current = nullptr; if (dsn_unlikely(log_file_map.empty())) { LOG_ERROR_PREFIX("unable to start duplication since no log file is available"); @@ -194,8 +196,6 @@ void load_from_private_log::find_log_file_to_start( } } start_from_log_file(_current); - - _duplicator->set_duplication_plog_checking(false); } void load_from_private_log::replay_log_block() From 7985df1c8adb09fb2b92a4ff65914ceaac005684 Mon Sep 17 00:00:00 2001 From: nins Date: Wed, 27 Mar 2024 19:49:54 +0800 Subject: [PATCH 12/13] make find_log_file_to_start(logfile) privatly --- src/replica/duplication/load_from_private_log.cpp | 2 -- src/replica/duplication/load_from_private_log.h | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/replica/duplication/load_from_private_log.cpp b/src/replica/duplication/load_from_private_log.cpp index e8e2d26e17..8a049c627a 100644 --- a/src/replica/duplication/load_from_private_log.cpp +++ b/src/replica/duplication/load_from_private_log.cpp @@ -171,8 +171,6 @@ void load_from_private_log::find_log_file_to_start() void load_from_private_log::find_log_file_to_start( const mutation_log::log_file_map_by_index &log_file_map) { - auto cleanup = dsn::defer([this]() { _duplicator->set_duplication_plog_checking(false); }); - _current = nullptr; if (dsn_unlikely(log_file_map.empty())) { LOG_ERROR_PREFIX("unable to start duplication since no log file is available"); diff --git a/src/replica/duplication/load_from_private_log.h b/src/replica/duplication/load_from_private_log.h index 314951354a..bf288ec545 100644 --- a/src/replica/duplication/load_from_private_log.h +++ b/src/replica/duplication/load_from_private_log.h @@ -61,7 +61,6 @@ class load_from_private_log final : public replica_base, /// Find the log file that contains `_start_decree`. void find_log_file_to_start(); - void find_log_file_to_start(const mutation_log::log_file_map_by_index &log_files); void replay_log_block(); @@ -82,6 +81,9 @@ class load_from_private_log final : public replica_base, static constexpr int MAX_ALLOWED_BLOCK_REPEATS{3}; static constexpr int MAX_ALLOWED_FILE_REPEATS{10}; +private: + void find_log_file_to_start(const mutation_log::log_file_map_by_index &log_files); + private: friend class load_from_private_log_test; friend class load_fail_mode_test; From ed0b3c782daebf89d9d7544bf8a7b247631bacda Mon Sep 17 00:00:00 2001 From: ninsmiracle Date: Thu, 28 Mar 2024 11:40:23 +0800 Subject: [PATCH 13/13] add header file --- src/replica/duplication/load_from_private_log.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/replica/duplication/load_from_private_log.cpp b/src/replica/duplication/load_from_private_log.cpp index 8a049c627a..6b0af82251 100644 --- a/src/replica/duplication/load_from_private_log.cpp +++ b/src/replica/duplication/load_from_private_log.cpp @@ -30,6 +30,7 @@ #include "replica/replica.h" #include "replica_duplicator.h" #include "utils/autoref_ptr.h" +#include "utils/defer.h" #include "utils/error_code.h" #include "utils/errors.h" #include "utils/fail_point.h"