Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
hx235 committed Jun 22, 2024
1 parent 981fd43 commit b7eb35f
Show file tree
Hide file tree
Showing 10 changed files with 126 additions and 94 deletions.
10 changes: 5 additions & 5 deletions db_stress_tool/db_stress_driver.cc
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,13 @@ bool RunStressTestImpl(SharedState* shared) {
stress->TrackExpectedState(shared);
}

// Since wrie fault and sync fault implementations are coupled with each
// other in `TestFSWritableFile()`, we can not enable or disable only one
// of the two.
// TODO(hx235): decouple implementations of write fault injection and sync
// fault injection.
if (FLAGS_sync_fault_injection || FLAGS_write_fault_one_in > 0) {
fault_fs_guard->SetFilesystemDirectWritable(false);
fault_fs_guard->SetInjectUnsyncedDataLoss(FLAGS_sync_fault_injection);
if (FLAGS_exclude_wal_from_write_fault_injection) {
fault_fs_guard->SetFileTypesExcludedFromFaultInjection(
{FileType::kWalFile});
}
}
now = clock->NowMicros();
fprintf(stdout, "%s Starting database operations\n",
Expand Down
3 changes: 3 additions & 0 deletions db_stress_tool/db_stress_gflags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1095,6 +1095,9 @@ DEFINE_int32(write_fault_one_in, 0,
"On non-zero, enables fault injection on write. Currently only"
"injects write error when writing to SST files.");

DEFINE_bool(exclude_wal_from_write_fault_injection, false,
"If true, we won't inject write fault when writing to WAL file");

DEFINE_int32(metadata_write_fault_one_in, 1000,
"On non-zero, enables fault injection on metadata write (i.e, "
"directory and file metadata write)");
Expand Down
4 changes: 2 additions & 2 deletions db_stress_tool/db_stress_listener.h
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ class DbStressListener : public EventListener {
FaultInjectionIOType::kMetadataWrite);
// TODO(hx235): only exempt the flush thread during error recovery instead
// of all the flush threads from error injection
fault_fs_guard->SetIOActivtiesExemptedFromFaultInjection(
fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection(
{Env::IOActivity::kFlush});
}
}
Expand All @@ -300,7 +300,7 @@ class DbStressListener : public EventListener {
FaultInjectionIOType::kMetadataRead);
fault_fs_guard->EnableThreadLocalErrorInjection(
FaultInjectionIOType::kMetadataWrite);
fault_fs_guard->SetIOActivtiesExemptedFromFaultInjection({});
fault_fs_guard->SetIOActivtiesExcludedFromFaultInjection({});
}
}

Expand Down
1 change: 1 addition & 0 deletions db_stress_tool/db_stress_shared_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ DECLARE_int32(metadata_read_fault_one_in);
DECLARE_int32(metadata_write_fault_one_in);
DECLARE_int32(read_fault_one_in);
DECLARE_int32(write_fault_one_in);
DECLARE_bool(exclude_wal_from_write_fault_injection);
DECLARE_int32(open_metadata_read_fault_one_in);
DECLARE_int32(open_metadata_write_fault_one_in);
DECLARE_int32(open_write_fault_one_in);
Expand Down
29 changes: 18 additions & 11 deletions db_stress_tool/db_stress_test_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,7 @@ void StressTest::FinishInitDb(SharedState* shared) {
void StressTest::TrackExpectedState(SharedState* shared) {
// When data loss is simulated, recovery from potential data loss is a prefix
// recovery that requires tracing
if (MightHaveDataLoss() && IsStateTracked()) {
if (MightHaveUnsyncedDataLoss() && IsStateTracked()) {
Status s = shared->SaveAtAndAfter(db_);
if (!s.ok()) {
fprintf(stderr, "Error enabling history tracing: %s\n",
Expand Down Expand Up @@ -3426,17 +3426,23 @@ void StressTest::Open(SharedState* shared, bool reopen) {
}
// TODO; test transaction DB Open with fault injection
if (!FLAGS_use_txn) {
bool inject_sync_fault = FLAGS_sync_fault_injection;
bool inject_open_meta_read_error =
FLAGS_open_metadata_read_fault_one_in > 0;
bool inject_open_meta_write_error =
FLAGS_open_metadata_write_fault_one_in > 0;
bool inject_open_read_error = FLAGS_open_read_fault_one_in > 0;
bool inject_open_write_error = FLAGS_open_write_fault_one_in > 0;
if ((inject_open_meta_read_error || inject_open_meta_write_error ||
inject_open_read_error || inject_open_write_error) &&
if ((inject_sync_fault || inject_open_meta_read_error ||
inject_open_meta_write_error || inject_open_read_error ||
inject_open_write_error) &&
fault_fs_guard
->FileExists(FLAGS_db + "/CURRENT", IOOptions(), nullptr)
.ok()) {
if (inject_sync_fault || inject_open_write_error) {
fault_fs_guard->SetFilesystemDirectWritable(false);
fault_fs_guard->SetInjectUnsyncedDataLoss(inject_sync_fault);
}
fault_fs_guard->SetThreadLocalErrorContext(
FaultInjectionIOType::kMetadataRead,
static_cast<uint32_t>(FLAGS_seed),
Expand All @@ -3460,7 +3466,6 @@ void StressTest::Open(SharedState* shared, bool reopen) {
fault_fs_guard->EnableThreadLocalErrorInjection(
FaultInjectionIOType::kRead);

fault_fs_guard->SetFilesystemDirectWritable(false);
fault_fs_guard->SetThreadLocalErrorContext(
FaultInjectionIOType::kWrite, static_cast<uint32_t>(FLAGS_seed),
FLAGS_open_write_fault_one_in, false /* retryable */,
Expand Down Expand Up @@ -3495,11 +3500,12 @@ void StressTest::Open(SharedState* shared, bool reopen) {
}
}

if (inject_open_meta_read_error || inject_open_meta_write_error ||
inject_open_read_error || inject_open_write_error) {
if (inject_sync_fault || inject_open_meta_read_error ||
inject_open_meta_write_error || inject_open_read_error ||
inject_open_write_error) {
fault_fs_guard->SetInjectUnsyncedDataLoss(false);
fault_fs_guard->DisableThreadLocalErrorInjection(
FaultInjectionIOType::kRead);
fault_fs_guard->SetFilesystemDirectWritable(true);
fault_fs_guard->DisableThreadLocalErrorInjection(
FaultInjectionIOType::kWrite);
fault_fs_guard->DisableThreadLocalErrorInjection(
Expand All @@ -3522,9 +3528,10 @@ void StressTest::Open(SharedState* shared, bool reopen) {
}
}
if (!s.ok()) {
// After failure to opening a DB due to IO error, retry should
// successfully open the DB with correct data if no IO error shows
// up.
// After failure to opening a DB due to IO error or unsynced data
// loss, retry should successfully open the DB with correct data if
// no IO error shows up.
inject_sync_fault = false;
inject_open_meta_read_error = false;
inject_open_meta_write_error = false;
inject_open_read_error = false;
Expand Down Expand Up @@ -3721,7 +3728,7 @@ void StressTest::Reopen(ThreadState* thread) {
clock_->TimeToString(now / 1000000).c_str(), num_times_reopened_);
Open(thread->shared, /*reopen=*/true);

if (thread->shared->GetStressTest()->MightHaveDataLoss() &&
if (thread->shared->GetStressTest()->MightHaveUnsyncedDataLoss() &&
IsStateTracked()) {
Status s = thread->shared->SaveAtAndAfter(db_);
if (!s.ok()) {
Expand Down
5 changes: 2 additions & 3 deletions db_stress_tool/db_stress_test_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,8 @@ class StressTest {
virtual void VerifyDb(ThreadState* thread) const = 0;
virtual void ContinuouslyVerifyDb(ThreadState* /*thread*/) const = 0;
void PrintStatistics();
bool MightHaveDataLoss() {
return FLAGS_sync_fault_injection || FLAGS_write_fault_one_in > 0 ||
FLAGS_metadata_write_fault_one_in > 0 || FLAGS_disable_wal ||
bool MightHaveUnsyncedDataLoss() {
return FLAGS_sync_fault_injection || FLAGS_disable_wal ||
FLAGS_manual_wal_flush_one_in > 0;
}

Expand Down
8 changes: 4 additions & 4 deletions db_stress_tool/db_stress_tool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,10 @@ int db_stress_tool(int argc, char** argv) {
FaultInjectionTestFS* fs =
new FaultInjectionTestFS(raw_env->GetFileSystem());
fault_fs_guard.reset(fs);
// Set it to direct writable here to not lose files created during DB open
// when no open fault injection is not enabled.
// This will be overwritten in StressTest::Open() for open fault injection
// and in RunStressTestImpl() for proper write fault injection setup.
// Set it to direct writable here to initially bypass any fault injection
// during DB open This will correspondingly be overwritten in
// StressTest::Open() for open fault injection and in RunStressTestImpl()
// for proper fault injection setup.
fault_fs_guard->SetFilesystemDirectWritable(true);
fault_env_guard =
std::make_shared<CompositeEnvWrapper>(raw_env, fault_fs_guard);
Expand Down
14 changes: 7 additions & 7 deletions tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@
"metadata_write_fault_one_in": lambda: random.choice([0, 128, 1000]),
"read_fault_one_in": lambda: random.choice([0, 32, 1000]),
"write_fault_one_in": lambda: random.choice([0, 128, 1000]),
"exclude_wal_from_write_fault_injection": 0,
"open_metadata_write_fault_one_in": lambda: random.choice([0, 0, 8]),
"open_metadata_read_fault_one_in": lambda: random.choice([0, 0, 8]),
"open_write_fault_one_in": lambda: random.choice([0, 0, 16]),
Expand Down Expand Up @@ -622,11 +623,11 @@ def is_direct_io_supported(dbname):
"enable_compaction_filter": 0,
"create_timestamped_snapshot_one_in": 50,
"sync_fault_injection": 0,
"metadata_write_fault_one_in": 0,
"manual_wal_flush": 0,
# This test has aggressive flush frequency and small write buffer size.
# Disabling write fault to avoid writes being stopped.
"write_fault_one_in": 0,
"metadata_write_fault_one_in": 0,
# PutEntity in transactions is not yet implemented
"use_put_entity_one_in": 0,
"use_get_entity": 0,
Expand Down Expand Up @@ -736,14 +737,10 @@ def finalize_and_sanitize(src_params):
# logic for unsynced data loss relies on max sequence number stored
# in MANIFEST, so they don't work together.
dest_params["sync_fault_injection"] = 0
dest_params["write_fault_one_in"] = 0
dest_params["metadata_write_fault_one_in"] = 0
dest_params["disable_wal"] = 0
dest_params["manual_wal_flush_one_in"] = 0
if (
dest_params.get("sync_fault_injection") == 1
or dest_params.get("write_fault_one_in") > 0
or dest_params.get("metadata_write_fault_one_in") > 0
or dest_params.get("disable_wal") == 1
or dest_params.get("manual_wal_flush_one_in") > 0
):
Expand All @@ -759,6 +756,11 @@ def finalize_and_sanitize(src_params):
# files, which would be problematic when unsynced data can be lost in
# crash recoveries.
dest_params["enable_compaction_filter"] = 0
# Prefix-recoverability relies on tracing successful user writes. Currently we trace all user writes regardless of whether it later succeeds or not.
# To simplify, we disable any user write failure injection.
# TODO(hx235): support tracing user writes with failure injection.
dest_params["metadata_write_fault_one_in"] = 0
dest_params["exclude_wal_from_write_fault_injection"] = 1
# Only under WritePrepared txns, unordered_write would provide the same guarnatees as vanilla rocksdb
# unordered_write is only enabled with --txn, and txn_params disables inplace_update_support, so
# setting allow_concurrent_memtable_write=1 won't conflcit with inplace_update_support.
Expand Down Expand Up @@ -813,8 +815,6 @@ def finalize_and_sanitize(src_params):
# compatible with only write committed policy
if dest_params.get("use_txn") == 1 and dest_params.get("txn_write_policy", 0) != 0:
dest_params["sync_fault_injection"] = 0
dest_params["write_fault_one_in"] = 0
dest_params["metadata_write_fault_one_in"] = 0
dest_params["disable_wal"] = 0
dest_params["manual_wal_flush_one_in"] = 0
# Wide-column pessimistic transaction APIs are initially supported for
Expand Down
Loading

0 comments on commit b7eb35f

Please sign in to comment.