diff --git a/.gitignore b/.gitignore index 489ad62a5..64c6d9e3b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ make_config.mk rocksdb.pc +out.txt + + *.a *.arc *.d diff --git a/Makefile b/Makefile index c7662a6ce..53fdccb4c 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ BASH_EXISTS := $(shell which bash) SHELL := $(shell which bash) include common.mk +USE_RTTI = 1 CLEAN_FILES = # deliberately empty, so we can append below. CFLAGS += ${EXTRA_CFLAGS} CXXFLAGS += ${EXTRA_CXXFLAGS} diff --git a/README.md b/README.md index 25989d346..c6f38afb6 100644 --- a/README.md +++ b/README.md @@ -1,31 +1,38 @@ -## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage +## Prophet -[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb) -[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main) -[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb) +Build Prophet: -RocksDB is developed and maintained by Facebook Database Engineering Team. -It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com) -and Jeff Dean (jeff@google.com) +Please make sure you have installed the dependencies required by [RocksDB](https://github.com/facebook/rocksdb/blob/main/INSTALL.md), and replace `<zoned block device>` with the name of your real ZNS SSD device. -This code is a library that forms the core building block for a fast -key-value server, especially suited for storing data on flash drives. -It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs -between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF) -and Space-Amplification-Factor (SAF). 
It has multi-threaded compactions, -making it especially suitable for storing multiple terabytes of data in a -single database. +```bash +sudo git clone https://github.com/Flappybird11101001/prophet-rocksdb.git rocksdb +cd rocksdb +sudo git clone https://github.com/Flappybird11101001/prophet-zenfs.git plugin/zenfs +sudo DISABLE_WARNING_AS_ERROR=1 ROCKSDB_PLUGINS=zenfs make -j db_bench install DEBUG_LEVEL=0 +pushd . +cd plugin/zenfs/util +sudo make +popd +``` -Start with example usage here: https://github.com/facebook/rocksdb/tree/main/examples +Initialize the ZNS SSD device: -See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation. +```bash +echo deadline > /sys/class/block/<zoned block device>/queue/scheduler +sudo ./plugin/zenfs/util/zenfs mkfs --zbd=<zoned block device> --aux_path=./temp --force +``` -The public interface is in `include/`. Callers should not include or -rely on the details of any other header files in this package. Those -internal APIs may be changed without warning. +# Benchmark -Questions and discussions are welcome on the [RocksDB Developers Public](https://www.facebook.com/groups/rocksdb.dev/) Facebook group and [email list](https://groups.google.com/g/rocksdb) on Google Groups. +Run db_bench to test (the same configuration as in the paper, with a 64MB SST file size). 
-## License +```bash +sudo ./db_bench -num=400000000 -key_size=8 -value_size=256 -statistics=true -max_bytes_for_level_base=268435456 -target_file_size_base=67108864 -write_buffer_size=134217728 -writable_file_max_buffer_size=134217728 -max_bytes_for_level_multiplier=4 -max_background_compactions=1 -max_background_flushes=1 -max_background_jobs=1 -soft_pending_compaction_bytes_limit=67108864 -hard_pending_compaction_bytes_limit=67108864 -level0_stop_writes_trigger=12 -level0_slowdown_writes_trigger=8 -level0_file_num_compaction_trigger=4 -max_write_buffer_number=1 -threads=1 -compaction_pri=4 -open_files=1000 -target_file_size_multiplier=1 --fs_uri=zenfs://dev:<zoned block device> --benchmarks='fillrandom,stats' --use_direct_io_for_flush_and_compaction +``` -RocksDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory). You may select, at your option, one of the above-listed licenses. + +![allocation_migrated_data](./allocation_migrated_data.jpg) + +![allocation_wa](./allocation_wa.jpg) + +![allocation_zone_number_page-0001](./allocation_zone_number.jpg) \ No newline at end of file diff --git a/allocation_migrated_data.jpg b/allocation_migrated_data.jpg new file mode 100644 index 000000000..07c481a9c Binary files /dev/null and b/allocation_migrated_data.jpg differ diff --git a/allocation_wa.jpg b/allocation_wa.jpg new file mode 100644 index 000000000..62c876985 Binary files /dev/null and b/allocation_wa.jpg differ diff --git a/allocation_zone_number.jpg b/allocation_zone_number.jpg new file mode 100644 index 000000000..b75f2981f Binary files /dev/null and b/allocation_zone_number.jpg differ diff --git a/clear.sh b/clear.sh new file mode 100644 index 000000000..28dbc336f --- /dev/null +++ b/clear.sh @@ -0,0 +1,7 @@ +rm -f level.out +rm -f lifetime.out +rm -f number_life.out +rm -f factor.out +rm -f last_compact.out +rm -f rank.out +rm -rf clock.out diff --git a/clock_pic.py 
b/clock_pic.py new file mode 100644 index 000000000..3be474937 --- /dev/null +++ b/clock_pic.py @@ -0,0 +1,31 @@ +import matplotlib.pyplot as plt +import numpy as np + +prev_list = [] +tmp_prev_flush_list = [] +prev_flush_list = [] +type_list = [] +tot = 0 +for line in open("clock.out"): + tot = tot + 1 + if(tot != 1): + prev_list.append(int(line.split(' ')[0])) + tmp_prev_flush_list.append(int(line.split(' ')[1])) + type_list.append(int(line.split(' ')[2])) + + +y = np.array(prev_list) +plt.hist(prev_list, bins=100, color="brown") +plt.show() + + +tot = 0 +for i in range(0, len(type_list)): + tot = tot + 1 + if i + 1 < len(type_list) and type_list[i] == 2 and type_list[i + 1] == 1: + prev_flush_list.append(tmp_prev_flush_list[i]) + + +# y = np.array(prev_flush_list) +plt.hist(prev_flush_list, bins=100, color="brown") +plt.show() diff --git a/db/builder.cc b/db/builder.cc index 03760ec91..a70b9b9a4 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -43,6 +43,12 @@ namespace ROCKSDB_NAMESPACE { +extern void get_predict(int level, const FileMetaData &file, Version *v, const Compaction* compaction_, int &predict_, int &predict_type_, int &tmp_rank); +extern void set_deleted_time(int fnumber, int clock); +extern void update_fname(uint64_t id, std::string name); +extern std::string get_fname(uint64_t id); +extern int get_clock(); + class TableFactory; TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, @@ -147,10 +153,31 @@ Status BuildTable( bool use_direct_writes = file_options.use_direct_writes; TEST_SYNC_POINT_CALLBACK("BuildTable:create_file", &use_direct_writes); #endif // !NDEBUG - IOStatus io_s = NewWritableFile(fs, fname, &file, file_options); + //file_options.lifetime = 1000; + FileOptions tmp_file_options = file_options; + tmp_file_options.lifetime = 100; + + update_fname(meta->fd.GetNumber(), fname); + //在这里写入 + IOStatus io_s = NewWritableFile(fs, fname, &file, tmp_file_options); + + + int predict; + int predict_type; + int rank; + const int 
output_level = 0; + + get_predict(output_level, *meta, versions->GetColumnFamilySet()->GetDefault()->current(), nullptr, predict, predict_type, rank); + set_deleted_time(meta->fnumber, predict + get_clock()); + printf("meta->fname=%s get_clock=%d lifetime=%d\n", fname.c_str(), get_clock(), predict + get_clock()); + fs->SetFileLifetime(fname, predict + get_clock(), get_clock(), 0, output_level, std::vector {}); + + + + assert(s.ok()); s = io_s; - if (io_status->ok()) { + if (io_status->ok()) { *io_status = io_s; } if (!s.ok()) { diff --git a/db/column_family.cc b/db/column_family.cc index ebeb574fa..489031f8e 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1118,7 +1118,7 @@ Compaction* ColumnFamilyData::PickCompaction( imm_.current()->GetEarliestSequenceNumber(false)); auto* result = compaction_picker_->PickCompaction( GetName(), mutable_options, mutable_db_options, current_->storage_info(), - log_buffer, earliest_mem_seqno); + log_buffer, earliest_mem_seqno); //PickCompaction来选择需要被Compact的文件 if (result != nullptr) { result->SetInputVersion(current_); } diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index b624035e6..6bcaa69f9 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -49,6 +49,7 @@ struct AtomicCompactionUnitBoundary { const InternalKey* largest = nullptr; }; +//使用此结构维护同一个level中所有的SST files // The structure that manages compaction input files associated // with the same physical level. struct CompactionInputFiles { @@ -438,6 +439,7 @@ class Compaction { bool l0_files_might_overlap_; // Compaction input files organized by level. 
Constant after construction + //Compaction中的输入变量 const std::vector inputs_; // A copy of inputs_, organized more closely in memory diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 39550e212..5f98514fe 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -59,6 +59,18 @@ namespace ROCKSDB_NAMESPACE { + +enum LOG_TYPE { + FLUSH = 1, + COMPACTION = 2, + OTHER = 10 +}; + +extern void log_print(const char *s, LOG_TYPE log_type, int level, Compaction *c); +extern void after_flush_or_compaction(VersionStorageInfo *vstorage, int level, std::vector files_output, ColumnFamilyData* cfd, Compaction* const compaction); +extern void SetRocksIO(uint64_t rocks_io); +extern int get_clock(); +extern void update_fname(uint64_t id, std::string name); const char* GetCompactionReasonString(CompactionReason compaction_reason) { switch (compaction_reason) { case CompactionReason::kUnknown: @@ -225,81 +237,82 @@ void CompactionJob::ReportStartedCompaction(Compaction* compaction) { compaction_job_stats_->is_full_compaction = compaction->is_full_compaction(); } -void CompactionJob::Prepare() { - AutoThreadOperationStageUpdater stage_updater( - ThreadStatus::STAGE_COMPACTION_PREPARE); + void CompactionJob::Prepare() { + AutoThreadOperationStageUpdater stage_updater( + ThreadStatus::STAGE_COMPACTION_PREPARE); - // Generate file_levels_ for compaction before making Iterator - auto* c = compact_->compaction; - ColumnFamilyData* cfd = c->column_family_data(); - assert(cfd != nullptr); - assert(cfd->current()->storage_info()->NumLevelFiles( - compact_->compaction->level()) > 0); - - write_hint_ = cfd->CalculateSSTWriteHint(c->output_level()); - bottommost_level_ = c->bottommost_level(); - - if (c->ShouldFormSubcompactions()) { - StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); - GenSubcompactionBoundaries(); - } - if (boundaries_.size() > 1) { - for (size_t i = 0; i <= boundaries_.size(); i++) { - 
compact_->sub_compact_states.emplace_back( - c, (i != 0) ? std::optional(boundaries_[i - 1]) : std::nullopt, - (i != boundaries_.size()) ? std::optional(boundaries_[i]) - : std::nullopt, - static_cast(i)); - // assert to validate that boundaries don't have same user keys (without - // timestamp part). - assert(i == 0 || i == boundaries_.size() || - cfd->user_comparator()->CompareWithoutTimestamp( - boundaries_[i - 1], true, boundaries_[i], true) < 0); + // Generate file_levels_ for compaction before making Iterator + auto* c = compact_->compaction; + ColumnFamilyData* cfd = c->column_family_data(); + assert(cfd != nullptr); + assert(cfd->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + + write_hint_ = cfd->CalculateSSTWriteHint(c->output_level()); + bottommost_level_ = c->bottommost_level(); + + if (c->ShouldFormSubcompactions()) { + StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME); + GenSubcompactionBoundaries(); } - RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, - compact_->sub_compact_states.size()); - } else { - compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt, - /*sub_job_id*/ 0); - } - - if (c->immutable_options()->preclude_last_level_data_seconds > 0) { - // TODO(zjay): move to a function - seqno_time_mapping_.SetMaxTimeDuration( - c->immutable_options()->preclude_last_level_data_seconds); - // setup seqno_time_mapping_ - for (const auto& each_level : *c->inputs()) { - for (const auto& fmd : each_level.files) { - std::shared_ptr tp; - Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr); - if (s.ok()) { - seqno_time_mapping_.Add(tp->seqno_to_time_mapping) - .PermitUncheckedError(); - seqno_time_mapping_.Add(fmd->fd.smallest_seqno, - fmd->oldest_ancester_time); - } + printf("boundaries.size() %ld\n", boundaries_.size()); + if (boundaries_.size() > 1) { + for (size_t i = 0; i <= boundaries_.size(); i++) { + compact_->sub_compact_states.emplace_back( + c, (i != 0) ? 
std::optional(boundaries_[i - 1]) : std::nullopt, + (i != boundaries_.size()) ? std::optional(boundaries_[i]) + : std::nullopt, + static_cast(i)); + // assert to validate that boundaries don't have same user keys (without + // timestamp part). + assert(i == 0 || i == boundaries_.size() || + cfd->user_comparator()->CompareWithoutTimestamp( + boundaries_[i - 1], true, boundaries_[i], true) < 0); } + RecordInHistogram(stats_, NUM_SUBCOMPACTIONS_SCHEDULED, + compact_->sub_compact_states.size()); + } else { + compact_->sub_compact_states.emplace_back(c, std::nullopt, std::nullopt, + /*sub_job_id*/ 0); } - auto status = seqno_time_mapping_.Sort(); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Invalid sequence number to time mapping: Status: %s", - status.ToString().c_str()); - } - int64_t _current_time = 0; - status = db_options_.clock->GetCurrentTime(&_current_time); - if (!status.ok()) { - ROCKS_LOG_WARN(db_options_.info_log, - "Failed to get current time in compaction: Status: %s", - status.ToString().c_str()); - penultimate_level_cutoff_seqno_ = 0; - } else { - penultimate_level_cutoff_seqno_ = - seqno_time_mapping_.TruncateOldEntries(_current_time); + if (c->immutable_options()->preclude_last_level_data_seconds > 0) { + // TODO(zjay): move to a function + seqno_time_mapping_.SetMaxTimeDuration( + c->immutable_options()->preclude_last_level_data_seconds); + // setup seqno_time_mapping_ + for (const auto& each_level : *c->inputs()) { + for (const auto& fmd : each_level.files) { + std::shared_ptr tp; + Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr); + if (s.ok()) { + seqno_time_mapping_.Add(tp->seqno_to_time_mapping) + .PermitUncheckedError(); + seqno_time_mapping_.Add(fmd->fd.smallest_seqno, + fmd->oldest_ancester_time); + } + } + } + + auto status = seqno_time_mapping_.Sort(); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Invalid sequence number to time mapping: Status: %s", + status.ToString().c_str()); + } + 
int64_t _current_time = 0; + status = db_options_.clock->GetCurrentTime(&_current_time); + if (!status.ok()) { + ROCKS_LOG_WARN(db_options_.info_log, + "Failed to get current time in compaction: Status: %s", + status.ToString().c_str()); + penultimate_level_cutoff_seqno_ = 0; + } else { + penultimate_level_cutoff_seqno_ = + seqno_time_mapping_.TruncateOldEntries(_current_time); + } } } -} uint64_t CompactionJob::GetSubcompactionsLimit() { return extra_num_subcompaction_threads_reserved_ + @@ -576,12 +589,18 @@ Status CompactionJob::Run() { ThreadStatus::STAGE_COMPACTION_RUN); TEST_SYNC_POINT("CompactionJob::Run():Start"); log_buffer_->FlushBufferToLog(); - LogCompaction(); + LogCompaction(); //print log before compaction - const size_t num_threads = compact_->sub_compact_states.size(); + const size_t num_threads = compact_->sub_compact_states.size(); //线程数量 assert(num_threads > 0); const uint64_t start_micros = db_options_.clock->NowMicros(); + printf("CompactionJob::Run clock=%d level=%d\n" , get_clock(), compact_->compaction->start_level()); + + + /* + 这里多线程的操作比较妙,为了更有效的使用资源,将index=0的sub_compaction交给当前线程执行,将其余的compaction交给其他线程执行 + */ // Launch a thread for each of subcompactions 1...num_threads-1 std::vector thread_pool; thread_pool.reserve(num_threads - 1); @@ -598,7 +617,7 @@ Status CompactionJob::Run() { for (auto& thread : thread_pool) { thread.join(); } - + //到这里实际上所有的sub_compaction都已经结束了 compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros); for (auto& state : compact_->sub_compact_states) { @@ -655,14 +674,24 @@ Status CompactionJob::Run() { if (status.ok()) { status = io_s; } + + + if (status.ok()) { + //i get it thread_pool.clear(); + + //在这里统计所有sub_compaction的output信息 + + //有一个很amazing的problem是这些output_file在cfd的version中都找不到。 + //update:知道了,原因是version是在之后的CompactionJob::Install中update的 std::vector files_output; for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.GetOutputs()) { 
files_output.emplace_back(&output); } } + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); auto& prefix_extractor = compact_->compaction->mutable_cf_options()->prefix_extractor; @@ -680,9 +709,11 @@ Status CompactionJob::Run() { // we will regard this verification as user reads since the goal is // to cache it here for further user reads ReadOptions read_options; + + InternalIterator* iter = cfd->table_cache()->NewIterator( read_options, file_options_, cfd->internal_comparator(), - files_output[file_idx]->meta, /*range_del_agg=*/nullptr, + files_output[file_idx]->meta, /*range_del_agg= */nullptr, prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( @@ -694,6 +725,8 @@ Status CompactionJob::Run() { /*smallest_compaction_key=*/nullptr, /*largest_compaction_key=*/nullptr, /*allow_unprepared_value=*/false); + + auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { @@ -723,6 +756,8 @@ Status CompactionJob::Run() { } } }; + + for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) { thread_pool.emplace_back(verify_table, std::ref(compact_->sub_compact_states[i].status)); @@ -738,6 +773,9 @@ Status CompactionJob::Run() { break; } } + + tmp_files_output = files_output; + } ReleaseSubcompactionResources(); @@ -762,8 +800,9 @@ Status CompactionJob::Run() { RecordCompactionIOStats(); LogFlush(db_options_.info_log); TEST_SYNC_POINT("CompactionJob::Run():End"); - + compact_->status = status; + return status; } @@ -783,7 +822,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { compaction_stats_); if (status.ok()) { - status = InstallCompactionResults(mutable_cf_options); + status = InstallCompactionResults(mutable_cf_options);//这里才是真正的把output_files update的函数 } if (!versions_->io_status().ok()) { io_status_ = versions_->io_status(); @@ -791,6 +830,11 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { VersionStorageInfo::LevelSummaryStorage tmp; auto 
vstorage = cfd->current()->storage_info(); + + + //vstorage->UpdateFilesByCompactionPri(*cfd->ioptions(), mutable_cf_options); + after_flush_or_compaction(vstorage, compact_->compaction->level(0), tmp_files_output, cfd, compact_->compaction); + const auto& stats = compaction_stats_.stats; double read_write_amp = 0.0; @@ -1059,6 +1103,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { // GenSubcompactionBoundaries doesn't strip away the timestamp. size_t ts_sz = cfd->user_comparator()->timestamp_size(); if (start.has_value()) { + printf("start has value"); read_options.iterate_lower_bound = &start.value(); if (ts_sz > 0) { start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz); @@ -1066,6 +1111,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } } if (end.has_value()) { + printf("end has value"); read_options.iterate_upper_bound = &end.value(); if (ts_sz > 0) { end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz); @@ -1073,6 +1119,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { } } + + // // Although the v2 aggregator is what the level iterator(s) know about, // the AddTombstones calls will be propagated down to the v1 aggregator. std::unique_ptr raw_input(versions_->MakeInputIterator( @@ -1237,6 +1285,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { RecordCompactionIOStats(); } + // 在这里openWritableFile // Add current compaction_iterator key to target compaction output, if the // output file needs to be close or open, it will call the `open_file_func` // and `close_file_func`. 
@@ -1425,7 +1474,7 @@ Status CompactionJob::FinishCompactionOutputFile( FileMetaData* meta = outputs.GetMetaData(); uint64_t output_number = meta->fd.GetNumber(); assert(output_number != 0); - + //printf("FinishCompactionOutputFile fname=%s number=%ld\n", meta->fname.c_str(), output_number); ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); std::string file_checksum = kUnknownFileChecksum; std::string file_checksum_func_name = kUnknownFileChecksumFuncName; @@ -1459,6 +1508,7 @@ Status CompactionJob::FinishCompactionOutputFile( const uint64_t current_entries = outputs.NumEntries(); + outputs.fs_ = fs_; s = outputs.Finish(s, seqno_time_mapping_); if (s.ok()) { @@ -1588,6 +1638,7 @@ Status CompactionJob::FinishCompactionOutputFile( return s; } +//被CompactionJob::Install调用 Status CompactionJob::InstallCompactionResults( const MutableCFOptions& mutable_cf_options) { assert(compact_); @@ -1674,6 +1725,8 @@ Status CompactionJob::InstallCompactionResults( } void CompactionJob::RecordCompactionIOStats() { + //printf("IOSTATS=%ld STATS=%s\n", IOSTATS(bytes_written), stats_->ToString().c_str()); + SetRocksIO(stats_->getTickerCount(COMPACT_WRITE_BYTES) + stats_->getTickerCount(FLUSH_WRITE_BYTES)); RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); CompactionReason compaction_reason = @@ -1703,6 +1756,8 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); std::string fname = GetTableFileName(file_number); + update_fname(file_number, fname); + //printf("fname=%s", fname.c_str()); // Fire events. 
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); #ifndef ROCKSDB_LITE @@ -1729,9 +1784,12 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, temperature = sub_compact->compaction->mutable_cf_options()->last_level_temperature; } + fo_copy.lifetime = 1000; fo_copy.temperature = temperature; Status s; + + //writable_file是得到的result IOStatus io_s = NewWritableFile(fs_.get(), fname, &writable_file, fo_copy); s = io_s; if (sub_compact->io_status.ok()) { @@ -1784,6 +1842,8 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, // Initialize a SubcompactionState::Output and add it to sub_compact->outputs { FileMetaData meta; + meta.fname = fname; + meta.fnumber = file_number; meta.fd = FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0); meta.oldest_ancester_time = oldest_ancester_time; @@ -1936,6 +1996,8 @@ void CompactionJob::LogCompaction() { Compaction* compaction = compact_->compaction; ColumnFamilyData* cfd = compaction->column_family_data(); + log_print("Compaction", COMPACTION, compaction->level(0), compaction); + // Let's check if anything will get logged. 
Don't prepare all the info if // we're not logging if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) { @@ -1950,18 +2012,21 @@ void CompactionJob::LogCompaction() { cfd->GetName().c_str(), scratch); // build event logger report auto stream = event_logger_->Log(); + stream << "job" << job_id_ << "event" << "compaction_started" << "compaction_reason" << GetCompactionReasonString(compaction->compaction_reason()); for (size_t i = 0; i < compaction->num_input_levels(); ++i) { stream << ("files_L" + std::to_string(compaction->level(i))); + // printf(" %d", compaction->level(i)); stream.StartArray(); for (auto f : *compaction->inputs(i)) { stream << f->fd.GetNumber(); } stream.EndArray(); } + puts(""); stream << "score" << compaction->score() << "input_data_size" << compaction->CalculateTotalInputSize() << "oldest_snapshot_seqno" << (existing_snapshots_.empty() diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index d281b4c79..430a3c47c 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -191,6 +191,9 @@ class CompactionJob { // Return the IO status IOStatus io_status() const { return io_status_; } + //自己加的,为了记录一下在CompactionJob::Run中得到的output_file + std::vector tmp_files_output; + protected: void UpdateCompactionStats(); void LogCompaction(); diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index 849a583fb..77d21bb55 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -11,9 +11,15 @@ #include "db/compaction/compaction_outputs.h" #include "db/builder.h" +#include "rocksdb/plugin/zenfs/fs/zbd_zenfs.h" +#include namespace ROCKSDB_NAMESPACE { - +extern void get_predict(int level, const FileMetaData &file, Version *v, const Compaction* compaction_, int &predict_, int &predict_type_, int &tmp_rank); +extern void get_overlap(const FileMetaData &file, int target_level, Version *v, std::vector &overlap_list); +extern void 
set_deleted_time(int fnumber, int clock); +extern int get_clock(); +extern std::string get_fname(uint64_t id); void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) { builder_.reset(NewTableBuilder(tboptions, file_writer_.get())); } @@ -21,6 +27,44 @@ void CompactionOutputs::NewBuilder(const TableBuilderOptions& tboptions) { Status CompactionOutputs::Finish(const Status& intput_status, const SeqnoToTimeMapping& seqno_time_mapping) { FileMetaData* meta = GetMetaData(); + int predict; + int predict_type; + int rank; + const int output_level = GetCompaction()->output_level(); + printf("CompactionOutputs::Finish number=%ld get_clock=%d output_level=%d start_level=%d num_input_level=%ld\n", meta->fnumber, get_clock(), output_level, GetCompaction()->start_level(), GetCompaction()->num_input_levels()); + get_predict(output_level, *meta, GetCompaction()->column_family_data()->current(), GetCompaction(), predict, predict_type, rank); + std::vector overlap_list; + if(output_level + 1 <= 6) { + get_overlap(*meta, output_level + 1, GetCompaction()->column_family_data()->current(), overlap_list); + } + set_deleted_time(meta->fnumber, predict + get_clock()); + + //fs_->SetFileLifetime(get_fname(meta->fd.GetNumber()), predict + get_clock(), get_clock(), 0, (predict < 50) ? 1: output_level, overlap_list); + if(ENABLE_SHORT_WITH_TYPE0 != -1) { + fs_->SetFileLifetime(get_fname(meta->fd.GetNumber()), predict + get_clock(), get_clock(), 0, (predict < ENABLE_SHORT_WITH_TYPE0) ? 
1 : output_level, overlap_list); + } else { + fs_->SetFileLifetime(get_fname(meta->fd.GetNumber()), predict + get_clock(), get_clock(), 0, output_level, overlap_list); + } + if(!update_input_file_lifetime) { + for(size_t i = 0; i < GetCompaction()->num_input_levels(); i++) { + //printf("vector[%ld] element:\n", i); + for(size_t j = 0; j < GetCompaction()->num_input_files(i); j++) { + FileMetaData *tmp = GetCompaction()->input(i, j); + fs_->SetFileLifetime(get_fname(tmp->fd.GetNumber()), get_clock(), get_clock(), 1, output_level, std::vector {}); + } + } + update_input_file_lifetime = 1; + } + + + + std::cout << "Finish:" + << meta->fd.GetNumber() + << '[' << meta->smallest.user_key().ToString() << ',' + << meta->largest.user_key().ToString() << ']' + << "lifetime=" << predict + << '\n'; + assert(meta != nullptr); Status s = intput_status; if (s.ok()) { @@ -76,6 +120,7 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status, return io_s; } +//重点观察对象 Status CompactionOutputs::AddToOutput( const CompactionIterator& c_iter, const CompactionFileOpenFunc& open_file_func, @@ -100,7 +145,7 @@ Status CompactionOutputs::AddToOutput( // Open output file if necessary if (!HasBuilder()) { - s = open_file_func(*this); + s = open_file_func(*this); //在这里调用OpenCompactionOutputFile } if (!s.ok()) { return s; diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h index 635924989..a3ef6020d 100644 --- a/db/compaction/compaction_outputs.h +++ b/db/compaction/compaction_outputs.h @@ -23,10 +23,14 @@ using CompactionFileOpenFunc = std::function; using CompactionFileCloseFunc = std::function; +//这个类非常重要,里面记录了output_的信息 // Files produced by subcompaction, most of the functions are used by // compaction_job Open/Close compaction file functions. 
class CompactionOutputs { public: + + FileSystemPtr fs_;//自己加的 + bool update_input_file_lifetime = 0; // compaction output file struct Output { Output(FileMetaData&& _meta, const InternalKeyComparator& _icmp, @@ -41,7 +45,6 @@ class CompactionOutputs { bool finished; std::shared_ptr table_properties; }; - CompactionOutputs() = delete; explicit CompactionOutputs(const Compaction* compaction, @@ -189,6 +192,9 @@ class CompactionOutputs { bool HasRangeDel() const { return range_del_agg_ && !range_del_agg_->IsEmpty(); } + const Compaction* GetCompaction() const { + return compaction_; + } private: friend class SubcompactionState; diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index 5895209e5..643319606 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -19,6 +19,8 @@ namespace ROCKSDB_NAMESPACE { +extern int get_clock(); + bool LevelCompactionPicker::NeedsCompaction( const VersionStorageInfo* vstorage) const { if (!vstorage->ExpiredTtlFiles().empty()) { @@ -152,12 +154,12 @@ class LevelCompactionBuilder { void LevelCompactionBuilder::PickFileToCompact( const autovector>& level_files, bool compact_to_next_level) { - for (auto& level_file : level_files) { + for (auto& level_file : level_files) { //枚举所有level的files // If it's being compacted it has nothing to do here. // If this assert() fails that means that some function marked some // files as being_compacted, but didn't call ComputeCompactionScore() - assert(!level_file.second->being_compacted); - start_level_ = level_file.first; + assert(!level_file.second->being_compacted);//有其他的thread正在compact + start_level_ = level_file.first; //当前level的第一个SST file if ((compact_to_next_level && start_level_ == vstorage_->num_non_empty_levels() - 1) || (start_level_ == 0 && @@ -166,7 +168,7 @@ void LevelCompactionBuilder::PickFileToCompact( } if (compact_to_next_level) { output_level_ = - (start_level_ == 0) ? 
vstorage_->base_level() : start_level_ + 1; + (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; //设置输出的level } else { output_level_ = start_level_; } @@ -183,7 +185,7 @@ void LevelCompactionBuilder::PickFileToCompact( void LevelCompactionBuilder::SetupInitialFiles() { // Find the compactions by size on all levels. bool skipped_l0_to_base = false; - for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { + for (int i = 0; i < compaction_picker_->NumberLevels() - 1; i++) { //这里枚举的是所有参与compaction的level,也就是i=0是start_level start_level_score_ = vstorage_->CompactionScore(i); start_level_ = vstorage_->CompactionScoreLevel(i); assert(i == 0 || start_level_score_ <= vstorage_->CompactionScore(i - 1)); @@ -451,7 +453,7 @@ bool LevelCompactionBuilder::SetupOtherInputsIfNeeded() { Compaction* LevelCompactionBuilder::PickCompaction() { // Pick up the first file to start compaction. It may have been extended // to a clean cut. - SetupInitialFiles(); + SetupInitialFiles(); //初始化需要compact的文件 if (start_level_inputs_.empty()) { return nullptr; } @@ -459,13 +461,13 @@ Compaction* LevelCompactionBuilder::PickCompaction() { // If it is a L0 -> base level compaction, we need to set up other L0 // files if needed. - if (!SetupOtherL0FilesIfNeeded()) { + if (!SetupOtherL0FilesIfNeeded()) { //如果需要的话,选择L0的文件 return nullptr; } // Pick files in the output level and expand more files in the start level // if needed. - if (!SetupOtherInputsIfNeeded()) { + if (!SetupOtherInputsIfNeeded()) { //选择对应的输出文件 return nullptr; } @@ -634,6 +636,7 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) { if (start_level_inputs_.size() == 1 && (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) && (mutable_cf_options_.compression_per_level.empty())) { + // Only file of `index`, and it is likely a trivial move. 
Try to // expand if it is still a trivial move, but not beyond // max_compaction_bytes or 4 files, so that we don't create too @@ -679,6 +682,12 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) { } start_level_inputs_.files.push_back(next_file); } + + printf("TryExtendNonL0TrivialMove called clock=%d start_level=%d start_level_input_size=%ld output_level=%d [", get_clock(), start_level_, start_level_inputs_.size(), output_level_); + for(auto &x: start_level_inputs_.files) { + printf("%ld ", x->fd.GetNumber()); + } + printf("] \n"); return start_level_inputs_.size() > 1; } return false; @@ -700,9 +709,9 @@ bool LevelCompactionBuilder::PickFileToCompact() { assert(start_level_ >= 0); - if (TryPickL0TrivialMove()) { - return true; - } + // if (TryPickL0TrivialMove()) { + // return true; + // } const std::vector& level_files = vstorage_->LevelFiles(start_level_); @@ -714,9 +723,9 @@ bool LevelCompactionBuilder::PickFileToCompact() { unsigned int cmp_idx; for (cmp_idx = vstorage_->NextCompactionIndex(start_level_); - cmp_idx < file_scores.size(); cmp_idx++) { - int index = file_scores[cmp_idx]; - auto* f = level_files[index]; + cmp_idx < file_scores.size(); cmp_idx++) { //枚举所有将要被compact但是还未被compact的文件 + int index = file_scores[cmp_idx]; //获取到实际要compact的文件的index + auto* f = level_files[index]; //获取到实际的SST file // do not pick a file to compact if it is being compacted // from n-1 level. @@ -747,6 +756,8 @@ bool LevelCompactionBuilder::PickFileToCompact() { continue; } + //现在已经选取好了input level file + //接下来需要选取output level file // Now that input level is fully expanded, we check whether any output // files are locked due to pending compaction. 
// @@ -760,9 +771,9 @@ bool LevelCompactionBuilder::PickFileToCompact() { vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest, &output_level_inputs.files); if (output_level_inputs.empty()) { - if (TryExtendNonL0TrivialMove(index)) { - break; - } + // if (TryExtendNonL0TrivialMove(index)) { + // break; + // } } else { if (!compaction_picker_->ExpandInputsToCleanCut(cf_name_, vstorage_, &output_level_inputs)) { diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 6ec257836..7f8e65cfa 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -13,6 +13,7 @@ #include #endif +#include #include #include #include @@ -24,6 +25,7 @@ #include #include #include +#include #include "db/arena_wrapped_db_iter.h" #include "db/builder.h" @@ -107,9 +109,24 @@ #include "util/stop_watch.h" #include "util/string_util.h" #include "utilities/trace/replayer_impl.h" +#include "utilities/my_logger.h" namespace ROCKSDB_NAMESPACE { + +enum LOG_TYPE { + FLUSH = 1, + COMPACTION = 2, + OTHER = 10 +}; +void log_print(const char *s, LOG_TYPE log_type, int level, Compaction *c); +void print_compaction(Compaction *compaction, int level); +void after_flush_or_compaction(VersionStorageInfo *vstorage, int level, std::vector files_output, ColumnFamilyData* cfd, Compaction* const compaction); +void all_profiling_print(); +void profiling_print(); +void get_predict(int level, const FileMetaData &file, Version *v, const Compaction* compaction_, int &predict_, int &predict_type_, int &tmp_rank); +void set_deleted_time(int fnumber, int clock); +int get_clock(); const std::string kDefaultColumnFamilyName("default"); const std::string kPersistentStatsColumnFamilyName( "___rocksdb_stats_history___"); @@ -738,6 +755,11 @@ Status DBImpl::CloseHelper() { Status DBImpl::CloseImpl() { return CloseHelper(); } DBImpl::~DBImpl() { + printf("DB Impl Destory Function Called\n"); + profiling_print(); + all_profiling_print(); + + // TODO: remove this. 
init_logger_creation_s_.PermitUncheckedError(); @@ -755,6 +777,7 @@ DBImpl::~DBImpl() { closing_status_ = CloseImpl(); closing_status_.PermitUncheckedError(); + printf("Close DB\n"); } void DBImpl::MaybeIgnoreError(Status* s) const { @@ -4330,6 +4353,7 @@ Status DBImpl::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { void DBImpl::GetColumnFamilyMetaData(ColumnFamilyHandle* column_family, ColumnFamilyMetaData* cf_meta) { + assert(column_family); auto* cfd = static_cast_with_check(column_family)->cfd(); @@ -5847,4 +5871,738 @@ void DBImpl::RecordSeqnoToTimeMapping() { } #endif // ROCKSDB_LITE + +const int LEVEL = 101; +int flush_num, compaction_num; +const int FlushLevel = 2; +const int CompactLevel = 6; +const int INF = 1e9; +const int AVERAGE_LIFETIME_THRESHOLD = 1; +double ans_wp; +double ans_wp_no_set; +int ans_reset_num; +int ans_allocated_num; +std::vector compaction_level_list; +int flush_level[LEVEL]; //每个level被Flush的次数 +int compact_level[LEVEL]; //每个level被compact的次数 +uint64_t compact_level_lifetime[LEVEL]; //被compact的SST file的总的lifetime +std::queue recent_level_lifetime_queue[LEVEL]; +uint64_t recent_level_lifetime[LEVEL]; +uint32_t correct_predict_time[LEVEL]; //进行了预测的SST file中正确预测的文件数量 +uint32_t compacted_number[LEVEL]; //进行了预测的SST file中被合并的文件数量 +uint32_t level_file_num[LEVEL]; +const int PREDICT_THRESHOLD = 25; +std::map pre; //key: file_number value: file被创建的时间 +std::map predict; //key: file_number value: 预测的lifetime的值 +std::map predict_type; +std::map number_life; +std::map number_level; +std::map deleted_time; +std::map rank_; +std::map id_to_name; +std::vector time_level; //Flush/Compaction的level按时间递增的分布 +//int global_clock; +int get_clock() { + return flush_num + compaction_num; +} +void add_level_file_num(int level) { + level_file_num[level]++; +} +int get_ave_time(int level) { + if(compacted_number[level] <= 0) return 0; + return compact_level_lifetime[level] / compacted_number[level]; +} + +void update_fname(uint64_t id, std::string 
name) { + //printf("update_fname id=%ld name=%s", id, name.c_str()); + id_to_name[id] = name; +} +std::string get_fname(uint64_t id) { + if(id_to_name.find(id) == id_to_name.end()) { + printf("ERROR: can't find fname id=%ld\n", id); + return "/"; + } else { + return id_to_name[id]; + } + +} +void update_average_lifetime(int level, int lifetime) { + recent_level_lifetime_queue[level].push(lifetime); + recent_level_lifetime[level] += lifetime; + if(recent_level_lifetime_queue[level].size() > AVERAGE_LIFETIME_THRESHOLD) { + recent_level_lifetime[level] -= recent_level_lifetime_queue[level].front(); + recent_level_lifetime_queue[level].pop(); + } +} +int get_recent_average_lifetime(int level) { + if(recent_level_lifetime_queue[level].size() == 0) return 0; + return recent_level_lifetime[level] / recent_level_lifetime_queue[level].size(); +} +double get_predict_rate(int level) { + if(compacted_number[level] <= 0) return 0; + return (double) correct_predict_time[level] / compacted_number[level]; +} +int min_int(int a, int b) { + return a < b ? a : b; +} +int max_int(int a, int b) { + return a > b ? a : b; +} +int last_compact[LEVEL]; +int star_time[LEVEL]; +int level_round[LEVEL]; + +int level_len[LEVEL] = {5, 1, 1, 1, 1, 1, 1}; +int CYCLE = 9; + +class life_meta { +public: + life_meta(int type_, int lifetime_, int predict_lifetime_, int real_type_, uint64_t fnumber_, int time_clock_) { + type = type_; + lifetime = lifetime_; + predict_lifetime = predict_lifetime_; + real_type = real_type_ == 0 ? 
0 : -1; + fnumber = fnumber_; + time_clock = time_clock_; + // printf("number=%ld predict_time=%d\n", fnumber, predict_lifetime_); + } + int type; //predict compacted reason + int lifetime; + int predict_lifetime; + int real_type; //real compacted reason + uint64_t fnumber; + int time_clock; +}; +std::map > life_profiling; + + +//每50次Compact会调用此函数打印状态 +//printf profiling information +void profiling_print() { + printf("Profiling wp1=%lf wp2=%lf allocated_num=%d reset_num=%d\n", ans_wp_no_set, ans_wp, ans_allocated_num, ans_reset_num); + for(int i = 0; i <= FlushLevel; i++) { + printf("FlushLevel %d=%d\n", i, flush_level[i]); + } + for(int i = 0; i <= CompactLevel; i++) { + printf("CompactLevel %d=%d ave_lifetime=%d rate=%.2lf level_file_num=%d\n", i, compact_level[i], get_ave_time(i), get_predict_rate(i), level_file_num[i]); + } +} + +void all_profiling_print() { + FILE * fp = fopen("lifetime.out", "a"); + for(int i = 0; i <= CompactLevel; i++) {; + for(auto &x: life_profiling[i]) { + // int diff_time = x.predict_lifetime - x.lifetime; + fprintf(fp, "%d %d %d %d %d %ld %d %d\n", i, x.predict_lifetime, x.type, x.lifetime, x.real_type, x.fnumber, x.time_clock, level_file_num[i]); + + } + } + fclose(fp); +} + +struct timeval time; +uint64_t prev_time, prev_flush_time; +int prev_type; +//after flush or compaction +//print compaction input/output file information +std::mutex print_mutex; +void log_print(const char *s, LOG_TYPE log_type, int level, Compaction *c) { + const std::lock_guard lock(print_mutex); + //bool add = 1; + if(log_type == FLUSH) { + flush_num++; + flush_level[level]++; + time_level.push_back(level); + } else if(log_type == COMPACTION) { + compaction_num++; + compact_level[level]++; + time_level.push_back(level); + } + printf("%10s flush_num=%d compaction_num=%d time=%d level=%d\n", s, flush_num, compaction_num, get_clock(), level); + + + + gettimeofday(&time, NULL); + long long us = (time.tv_sec*1000 + time.tv_usec/1000); + if(prev_type != FLUSH && 
log_type == FLUSH) prev_flush_time = us; + prev_type = log_type; + + FILE * fp2 = fopen("clock.out", "a"); + fprintf(fp2, "%lld %lld %d flush_num=%d compaction_num=%d time=%d level=%d\n", us - prev_time, us - prev_flush_time, log_type, flush_num, compaction_num, get_clock(), level); + fclose(fp2); + prev_time = us; + + + + FILE * fp = fopen("level.out", "a"); + fprintf(fp, "%d %d\n", get_clock(), level); + fclose(fp); + if(log_type == COMPACTION) { + print_compaction(c, level); + } +} + + +void update_factor_predict(int level) { + compaction_level_list.emplace_back(level); + if(last_compact[level] != 0 && (get_clock() - last_compact[level] > level_round[level])) { + level_round[level] = get_clock() - last_compact[level]; + star_time[level] = get_clock(); + } + last_compact[level] = get_clock(); +} + + +//level层的某个文件被compact后调用 +//统计相关信息 +void add_calc(int level, int lifetime, int predict_lifetime, int type, int real_type, uint64_t fnumber, int time_clock) { + compacted_number[level]++; + correct_predict_time[level] += (abs(lifetime - predict_lifetime) <= PREDICT_THRESHOLD); + compact_level_lifetime[level] += lifetime; + update_average_lifetime(level, lifetime); + life_profiling[level].push_back(life_meta(type, lifetime, predict_lifetime, real_type, fnumber, time_clock)); +} + + +uint64_t get_number(const FileMetaData tmp) { + return tmp.fd.GetNumber(); +} + +void print_compaction(Compaction *compaction, int level) { + + if(compaction == nullptr) return ; + printf("level=%d\n", level); + update_factor_predict(level); + for(size_t i = 0; i < compaction->num_input_levels(); i++) { + printf("vector[%ld] element:\n", i); + for(size_t j = 0; j < compaction->num_input_files(i); j++) { + FileMetaData *tmp = compaction->input(i, j); + uint64_t number = get_number(*tmp); + printf("number=%lu clock=%d ", number, get_clock()); + + if(pre.find(number) != pre.end()) { + int lifetime = get_clock() - pre[number]; + printf(" real_time=%d predict_time=%d predict_deleted_time=%d 
predict_type=%d level_file_num=%d", lifetime, predict[number], deleted_time[number], predict_type[number], level_file_num[level + i]); + number_life[number] = lifetime; + number_level[number] = level; + add_calc(compaction->level() + i, lifetime, predict[number], predict_type[number], i, number, get_clock()); + } else { + printf("ERROR: can't find SST's lifetime"); + } + putchar('\n'); + } + } +} + +struct Fsize { + size_t index; + FileMetaData* file; +}; + +void SortFileByRoundRobin(const InternalKeyComparator& icmp, + std::vector* compact_cursor, + bool level0_non_overlapping, int level, + std::vector* temp) { + std::sort(temp->begin(), temp->end(), + [icmp](const Fsize& f1, const Fsize& f2) -> bool { + return icmp.Compare(f1.file->smallest, f2.file->smallest) < 0; + }); + + if (level == 0 && !level0_non_overlapping) { + // Using kOldestSmallestSeqFirst when level === 0, since the + // files may overlap (not fully sorted) + std::sort(temp->begin(), temp->end(), + [](const Fsize& f1, const Fsize& f2) -> bool { + return f1.file->fd.smallest_seqno < f2.file->fd.smallest_seqno; + }); + return; + } + + bool should_move_files = + compact_cursor->at(level).size() > 0 && temp->size() > 1; + + // The iterator points to the Fsize with smallest key larger than or equal to + // the given cursor + std::vector::iterator current_file_iter; + if (should_move_files) { + // Find the file of which the smallest key is larger than or equal to + // the cursor (the smallest key in the successor file of the last + // chosen file), skip this if the cursor is invalid or there is only + // one file in this level + current_file_iter = std::lower_bound( + temp->begin(), temp->end(), compact_cursor->at(level), + [&](const Fsize& f, const InternalKey& cursor) -> bool { + return icmp.Compare(cursor, f.file->smallest) > 0; + }); + + should_move_files = + current_file_iter != temp->end() && current_file_iter != temp->begin(); + } + if (should_move_files) { + // Construct a local temporary vector 
+ std::vector local_temp; + local_temp.reserve(temp->size()); + // Move the selected File into the first position and its successors + // into the second, third, ..., positions + for (auto iter = current_file_iter; iter != temp->end(); iter++) { + local_temp.push_back(*iter); + } + // Move the origin predecessors of the selected file in a round-robin + // manner + for (auto iter = temp->begin(); iter != current_file_iter; iter++) { + local_temp.push_back(*iter); + } + // Replace all the items in temp + for (size_t i = 0; i < local_temp.size(); i++) { + temp->at(i) = local_temp[i]; + } + } +} + +//获取在满足largestKey > cp的前提下,有多少比这个file的LargetKey小 +int get_rank(int level, const FileMetaData &file, Version *v, const Compaction* compaction_) { + int BEGIN_LEVEL_NUM = (level == 0 ? 1 : 4); + int result = -1; + + VersionStorageInfo* vstorage_t = v->storage_info(); + FileMetaData file_tmp = file; + //需要把input中的file都忽略掉 + const std::vector& files = vstorage_t->files_[level]; + if(level == 0) { + return files.size() / BEGIN_LEVEL_NUM;; + } + uint32_t n = files.size(); + std::vector temp; + int num = 0; + for (size_t i = 0; i < n; i++) { + bool flag = 0; + if(compaction_ != nullptr) { //主要注意,level=0的情况依然获取不到compaction + size_t target_level = level - compaction_->start_level(); + for(size_t j = 0; j < compaction_->num_input_files(target_level); j++) { + if(files[i] != nullptr && compaction_->input(target_level, j) != nullptr + && get_number(*files[i]) == get_number(*(compaction_->input(target_level, 0))) + ) { + flag = 1; + break; + } + } + } + if(!flag) { + num++; + temp.push_back({i, files[i]}); + } + + } + temp.push_back({static_cast(num), &file_tmp}); + + if(temp.size() != 1) { + SortFileByRoundRobin(*(vstorage_t->internal_comparator_), &(vstorage_t->compact_cursor_), vstorage_t->level0_non_overlapping_, level, &temp); + } + for(long unsigned int i = 0 ; i < temp.size(); i++) { + if(file_tmp.unique_id[1] == temp[i].file->unique_id[1]) { + result = i; + break; + } + } + 
printf("GetRankFinished: level=%d number=%ld rank=%d level_size=%ld BEGIN_LEVEL_NUM=%d\n", level, file.fnumber, result, temp.size(), BEGIN_LEVEL_NUM); + return result / BEGIN_LEVEL_NUM;; +} + +bool query_is_compacting(int level) { + if(!compact_level[level]) return 0; + int last = compaction_level_list.size() - 1; + for(int i = last; i >= std::max(0, last - CYCLE * 3); i--) { + if(compaction_level_list[i] == level) + return 1; + } + return 0; + +} + +//pre_time是之前的层所消耗的时间 +//target是要最小化的时间 +void dfs(int level, int deep, const FileMetaData &file, Version *v, const Compaction* compaction_, int pre_time, int &predict_, int &predict_type_, int &tmp_rank) { + if(level == 0) return ; + int upper_level = level - 1; + const InternalKey* begin = &file.smallest; + const InternalKey* end = &file.largest; + auto vstorage = v->storage_info(); + auto user_cmp = vstorage->InternalComparator()->user_comparator(); + std::vector level_file = vstorage->LevelFiles(upper_level); + for(int i = 0; i < vstorage->NumLevelFiles(upper_level); i++) { + FileMetaData *f = level_file[i]; + const Slice file_start = f->smallest.user_key(); + const Slice file_end = f->largest.user_key(); + if (begin != nullptr && user_cmp->CompareWithoutTimestamp(file_end, begin->user_key()) < 0) { + } else if (end != nullptr && user_cmp->CompareWithoutTimestamp(file_start, end->user_key()) > 0) { + } else { + bool flag = 0; + if(compaction_ != nullptr) { //主要注意,level=0的情况依然获取不到compaction + for(size_t target_level = 0; target_level < compaction_->num_input_levels(); target_level++) { + for(size_t j = 0; j < compaction_->num_input_files(target_level); j++) { + if(get_number(*f) == get_number(*(compaction_->input(target_level, j))) + ) { + flag = 1; + break; + } + } + if(flag == 1) break; + } + + } + if(flag) continue; + + int rank = get_rank(upper_level, *f, v, compaction_); + tmp_rank = rank; + int T1 = CYCLE; + printf("Case1 Prediction upper_level=%d number=%ld Case1=%d\n", upper_level, file.fnumber, T1); + if(T1 
< predict_) { + predict_ = T1; + predict_type_ = 1; + } + } + } +} + +bool has_overlap(const FileMetaData &file, int target_level, Version *v) { + const InternalKey* begin = &file.smallest; + const InternalKey* end = &file.largest; + auto vstorage = v->storage_info(); + auto user_cmp = vstorage->InternalComparator()->user_comparator(); + std::vector level_file = vstorage->LevelFiles(target_level); + for(int i = 0; i < vstorage->NumLevelFiles(target_level); i++) { + FileMetaData *f = level_file[i]; + const Slice file_start = f->smallest.user_key(); + const Slice file_end = f->largest.user_key(); + if (begin != nullptr && user_cmp->CompareWithoutTimestamp(file_end, begin->user_key()) < 0) { + } else if (end != nullptr && user_cmp->CompareWithoutTimestamp(file_start, end->user_key()) > 0) { + } else { + return true; + } + } + return false; +} +void get_overlap(const FileMetaData &file, int target_level, Version *v, std::vector &overlap_list) { + const InternalKey* begin = &file.smallest; + const InternalKey* end = &file.largest; + auto vstorage = v->storage_info(); + auto user_cmp = vstorage->InternalComparator()->user_comparator(); + std::vector level_file = vstorage->LevelFiles(target_level); + for(int i = 0; i < vstorage->NumLevelFiles(target_level); i++) { + FileMetaData *f = level_file[i]; + const Slice file_start = f->smallest.user_key(); + const Slice file_end = f->largest.user_key(); + if (begin != nullptr && user_cmp->CompareWithoutTimestamp(file_end, begin->user_key()) < 0) { + } else if (end != nullptr && user_cmp->CompareWithoutTimestamp(file_start, end->user_key()) > 0) { + } else { + overlap_list.emplace_back(get_fname(f->fd.GetNumber())); + } + } +} + +void set_deleted_time(int fnumber, int clock) { + if(deleted_time.find(fnumber) == deleted_time.end()) { + deleted_time[fnumber] = clock; + } +} + +void get_predict(int level, const FileMetaData &file, Version *v, const Compaction* compaction_, int &predict_, int &predict_type_, int &tmp_rank) { + 
printf("get_predict begin: number=%ld clock=%d level=%d compact_level_number=%d\n", file.fnumber, get_clock(), level, compact_level[level]); + + predict_ = INF; + predict_type_ = 0; + int T1_rank = 0; + if(strstr(get_fname(file.fd.GetNumber()).c_str(), ".log") != nullptr) { + predict_ = 1; + predict_type_ = 0; + } + else if(level == 0) { + predict_ = get_rank(level, file, v, compaction_); + predict_type_ = 0; + } else { + + // Case2B: + dfs(level, -1, file, v, compaction_, 0, predict_, predict_type_, tmp_rank); + int T1 = 1e9, T4 = 1e9; + if(predict_ == INF) { + // Case 1 + if(level + 1 <= CompactLevel && query_is_compacting(level)) { + T1_rank = get_rank(level, file, v, compaction_); + T1 = CYCLE * T1_rank; + printf("Case2 number=%ld Predict=%d\n", file.fnumber, T1); + if(T1 < predict_) { + predict_ = T1; + predict_type_ = 2; // + } + } + + // Case 2A + T4 = get_recent_average_lifetime(level); ///no way to predict the future compaction; + if(T4) { + printf("Case3 number=%ld Predict=%d\n", file.fnumber, T4); + if(T4 < predict_) { + predict_ = T4; + predict_type_ = 3; + } + } + } + + //Case 3 trivial move + if(predict_type_ == 2 && T1 < T4 && level + 1 <= CompactLevel && !has_overlap(file, level + 1, v) && get_recent_average_lifetime(level + 1) != 0) { + printf("Case4 trivial move T4=%d T5=%d\n", CYCLE * get_rank(level + 1, file, v, compaction_), get_recent_average_lifetime(level + 1)); + predict_type_ = 4; + predict_ = get_recent_average_lifetime(level + 1) + T1; + } + } + + if(predict_ == INF || predict_ < 0) predict_ = 1; //lifetime can't = 0 + uint64_t number = get_number(file); + rank_[number] = T1_rank; + predict[number] = predict_; + printf("get_predict finish: number=%ld clock=%d level=%d predict_time=%d\n", number, get_clock(), level, predict[number]); + predict_type[number] = predict_type_; + if(level == 0) { //Flush的时候获取不到output,只能在这里设置了 + pre[number] = get_clock(); + } + +} + +//flush/compact的最后后调用此函数, 此函数可以调用最新的Version信息 +//we know the output file 
information at this moment +void after_flush_or_compaction(VersionStorageInfo *vstorage, int level, std::vector files_output, ColumnFamilyData* cfd, Compaction* const compaction) { + + puts("AllFiles"); + //这里打log要打所有file的log + if(vstorage == nullptr) { + vstorage = cfd->current()->storage_info(); + } + if(vstorage != nullptr) { + for (int l = 0; l < vstorage->num_levels(); ++l) { + int level_file_number = vstorage->NumLevelFiles(l); + printf("level %d files num:%d: [", l, level_file_number); + std::vector level_file = vstorage->LevelFiles(l); + for(auto &x: level_file) { + printf("%ld ", get_number(*x)); + } + printf("]"); + level_file_num[level + l] = level_file_number; //level + puts(""); + } + } + + puts("Input Files"); + if(compaction != nullptr) { + for (size_t i = 0; i < compaction->num_input_levels(); ++i) { + printf("level=%d\n", compaction->level(i)); + for (auto f : *compaction->inputs(i)) { + printf("number=%ld fname=%s\n", f->fd.GetNumber(), get_fname(f->fd.GetNumber()).c_str()); + } + puts(""); + } + } + puts("Output Files"); + + if(!files_output.empty()) { + for(auto &x: files_output) { + const FileMetaData y = x->meta; + + uint64_t number = get_number(y); + pre[number] = get_clock(); + int rank = 0; + if(rank_.find(number) != rank_.end()) { + rank = rank_[number]; + } + printf("number=%lu level=%d rank=%d clock=%d predict_lifetime=%d predict_type=%d fname=%s\n", + number, compaction->output_level(), rank, get_clock(), predict[number], predict_type[number], y.fname.c_str()); + + } + } + + puts("------End-----------"); + + +} +DBImpl *rocksdb_impl; +void SetDBImpl(DBImpl *db) { + printf("SetDBImpl called\n"); + //if(rocksdb_impl == nullptr) + rocksdb_impl = db; +} + +void set_write_amplification(double wp) { + ans_wp = wp; +} +void set_write_amplification_no_set(double wp) { + ans_wp_no_set = wp; +} +void set_reset_num(int reset_num) { + ans_reset_num = reset_num; +} +void set_allocated_num(int allocated_num) { + ans_allocated_num = allocated_num; +} 
+ + +extern int pre_compaction_num; +extern int precompaction_file_num; +int get_bg_compaction_scheduled_() { + ColumnFamilyMetaData meta; + if(rocksdb_impl->DefaultColumnFamily() == nullptr) { + return -1; + } + rocksdb_impl->GetColumnFamilyMetaData(rocksdb_impl->DefaultColumnFamily(), &meta); + printf("get_bg_compaction_scheduled_=%d", rocksdb_impl->get_bg_compaction_scheduled_()); + return rocksdb_impl->get_bg_compaction_scheduled_(); +} +bool DoPreCompaction(std::vector file_list, int ENABLE_LIMIT_LEVEL, int MAX_LIFETIME) { + printf("Recieve Compaction Request Time=%d\n", get_clock()); + printf("file_list.size()=%ld\n", file_list.size()); + + assert(rocksdb_impl != nullptr); + + printf("GetName=%s\n", rocksdb_impl->GetName().c_str()); + if(file_list.size() == 0) return true; + ColumnFamilyMetaData meta; + if(rocksdb_impl->DefaultColumnFamily() == nullptr) { + return false; + } + rocksdb_impl->GetColumnFamilyMetaData(rocksdb_impl->DefaultColumnFamily(), &meta); + std::vector input_file_names; + std::vector output_level_list; + int output_level = -1, count = 0; + //TODO: optimize + + printf("Begining FileList: "); + for(auto &x: file_list) printf("%ld ", x); + puts(""); + if(ENABLE_LIMIT_LEVEL) { + std::vector tobe_compacted_list; + for(auto &id: file_list) { + bool flag = 0; + int l = 0; + for(auto &x: meta.levels) { + int level = x.level; + for(auto &file: x.files) { + if(id == file.file_number) { + l = level; + flag = 1; + break; + } + } + if(flag == 1) break; + } + if(flag == 1) + printf("file id=%ld deletion_time=%d level=%d\n", id, pre[id] + predict[id], l); + if(flag == 1 && l <= ENABLE_LIMIT_LEVEL && pre[id] + predict[id] <= get_clock() && predict_type[id] == 2 && pre[id] + predict[id] <= MAX_LIFETIME) + tobe_compacted_list.emplace_back(id); + } + file_list.clear(); + file_list.insert(file_list.begin(), tobe_compacted_list.begin(), tobe_compacted_list.end()); + } + + printf("After FileList: "); + for(auto &x: file_list) printf("%ld ", x); + puts(""); + + 
if(file_list.empty()) { + printf("FileList is empty\n"); + return false; + } + + + + puts("Rocksdb File:"); + for(auto &x: meta.levels) { + int level = x.level; + printf("level=%d: size=%ld ", level, x.files.size()); + for(auto &file: x.files) { + printf("%ld ", file.file_number); + } + puts(""); + } + + + for(auto &id: file_list) { + bool flag = 0; + for(auto &x: meta.levels) { + int level = x.level; + for(auto &file: x.files) { + if(id == file.file_number) { + count++; + //printf("all_level=%ld all_file=%ld level=%d file_number=%ld\n", meta.levels.size(), x.files.size(), level, file.file_number); + input_file_names.emplace_back(file.name); + output_level = std::max(output_level, level + 1); + output_level_list.emplace_back(level + 1); + flag = 1; + break; + } + } + if(flag == 1) break; + } + } + if(count != static_cast(file_list.size())) { + printf("ERROR:count not equal count=%d file_list.size()=%d\n", count, static_cast(file_list.size())); + //return false; + } + if(count == 0) { + printf("count == 0, no need to do precompaction"); + return false; + } + if(ENABLE_LIMIT_LEVEL && output_level > ENABLE_LIMIT_LEVEL) { + printf("output_level is %d limit_level= %d\n", output_level, ENABLE_LIMIT_LEVEL); + return false; + } + + printf("Ready To PreCompaction=%d bg_compaction_scheduled_=%d num_running_compactions_=%d unscheduled_compactions_=%d bg_bottom_compaction_scheduled_=%d output_level=%d\n", pre_compaction_num, rocksdb_impl->get_bg_compaction_scheduled_(), rocksdb_impl->get_num_running_compactions_(), rocksdb_impl->get_unscheduled_compactions_(), rocksdb_impl->get_bg_bottom_compaction_scheduled_(), output_level); + + if(ENABLE_LIMIT_LEVEL) { + for(uint32_t i = 0; i < input_file_names.size(); i++) { + CompactionOptions options; + std::vector input_file_list; + input_file_list.emplace_back(input_file_names[i]); + printf("Ready To Compac %s output_level=%d\n", input_file_names[i].c_str(), output_level_list[i]); + Status s = rocksdb_impl->CompactFiles(options, 
input_file_list, output_level_list[i]); + if(!s.ok()) { + printf("ERROR PreCompaction fails %s\n", s.getState()); + //return false; + } else { + precompaction_file_num ++; + } + } + } else { + CompactionOptions options; + printf("Ready To FullCompac num=%ld output_level=%d\n", input_file_names.size(), output_level); + Status s = rocksdb_impl->CompactFiles(options, input_file_names, output_level); + if(!s.ok()) { + printf("ERROR PreCompaction fails %s\n", s.getState()); + //return false; + } else { + precompaction_file_num += input_file_names.size(); + } + } + + + + puts("After PreCompaction"); + ColumnFamilyMetaData new_meta; + rocksdb_impl->GetColumnFamilyMetaData(rocksdb_impl->DefaultColumnFamily(), &new_meta); + for(auto &x: new_meta.levels) { + int level = x.level; + printf("level=%d: ", level); + for(auto &file: x.files) { + printf("%ld ", file.file_number); + } + puts(""); + } + return true; +} + +uint64_t rocks_io; +void SetRocksIO(uint64_t rocks_io_) { + rocks_io = rocks_io_; +} + +uint64_t GetIOSTATS() { + return rocks_io; +} } // namespace ROCKSDB_NAMESPACE diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 03418c1d5..ba94d51f0 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -68,6 +68,8 @@ #include "util/repeatable_thread.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_util.h" namespace ROCKSDB_NAMESPACE { @@ -185,7 +187,23 @@ class DBImpl : public DB { virtual ~DBImpl(); + + int get_unscheduled_compactions_() { + return unscheduled_compactions_; + } // ---- Implementations of the DB interface ---- + int get_bg_bottom_compaction_scheduled_() { + return bg_bottom_compaction_scheduled_; + } + + // count how many background compactions are running or have been scheduled + int get_bg_compaction_scheduled_() { + return bg_compaction_scheduled_; + } + int get_num_running_compactions_() { + return num_running_compactions_; + } + using 
DB::Resume; Status Resume() override; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index b6882df06..8e3579b35 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1343,13 +1343,13 @@ Status DBImpl::CompactFilesImpl( "yet supported in CompactFiles()"); } } - + printf("Before Pick InputFiles\n"); Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( &input_set, cf_meta, output_level); if (!s.ok()) { return s; } - + printf("After Pick InputFiles\n"); std::vector input_files; s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( &input_files, &input_set, version->storage_info(), compact_options); @@ -1419,6 +1419,8 @@ Status DBImpl::CompactFilesImpl( &blob_callback_, &bg_compaction_scheduled_, &bg_bottom_compaction_scheduled_); + printf("PreCompaction Job Ready\n"); + // Creating a compaction influences the compaction score because the score // takes running compactions into account (by skipping files that are already // being compacted). 
Since we just changed compaction score, we recalculate it @@ -1437,7 +1439,14 @@ Status DBImpl::CompactFilesImpl( TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); + + + Status status = compaction_job.Install(*c->mutable_cf_options()); + + + + if (status.ok()) { assert(compaction_job.io_status().ok()); InstallSuperVersionAndScheduleWork(c->column_family_data(), @@ -2607,9 +2616,9 @@ ColumnFamilyData* DBImpl::PickCompactionFromQueue( assert(*token == nullptr); autovector throttled_candidates; ColumnFamilyData* cfd = nullptr; - while (!compaction_queue_.empty()) { - auto first_cfd = *compaction_queue_.begin(); - compaction_queue_.pop_front(); + while (!compaction_queue_.empty()) { //当队列不为空时 + auto first_cfd = *compaction_queue_.begin(); //取队列头的任务 + compaction_queue_.pop_front(); //删除队列头的任务 assert(first_cfd->queued_for_compaction()); if (!RequestCompactionToken(first_cfd, false, token, log_buffer)) { throttled_candidates.push_back(first_cfd); @@ -2924,6 +2933,7 @@ void DBImpl::BackgroundCallFlush(Env::Priority thread_pri) { } } +//触发compaction的后台线程 void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, Env::Priority bg_thread_pri) { bool made_progress = false; @@ -2932,6 +2942,9 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, immutable_db_options_.info_log.get()); { + //在Pick的过程中加db锁 + //在BackgroundCompaction构造完compaction_job之前都需要加锁 + //Compaction_job.Run()会解锁,跑完再加锁 InstrumentedMutexLock l(&mutex_); // This call will unlock/lock the mutex to wait for current running @@ -2947,6 +2960,7 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, assert((bg_thread_pri == Env::Priority::BOTTOM && bg_bottom_compaction_scheduled_) || (bg_thread_pri == Env::Priority::LOW && bg_compaction_scheduled_)); + //进行Compaction Status s = BackgroundCompaction(&made_progress, &job_context, &log_buffer, prepicked_compaction, bg_thread_pri); 
TEST_SYNC_POINT("BackgroundCallCompaction:1"); @@ -3051,6 +3065,8 @@ void DBImpl::BackgroundCallCompaction(PrepickedCompaction* prepicked_compaction, } } +//BackgroundCompaction是整个Compaction的PostMaster +//其中最主要的三个阶段是Compaction::Prepare、Compaction::Run and Compaction::Install Status DBImpl::BackgroundCompaction(bool* made_progress, JobContext* job_context, LogBuffer* log_buffer, @@ -3059,7 +3075,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, ManualCompactionState* manual_compaction = prepicked_compaction == nullptr ? nullptr - : prepicked_compaction->manual_compaction_state; + : prepicked_compaction->manual_compaction_state; //看一下有没有手动触发的Compaction *made_progress = false; mutex_.AssertHeld(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); @@ -3169,6 +3185,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, return Status::OK(); } + //从Queue中拿取Compaction任务 auto cfd = PickCompactionFromQueue(&task_token, log_buffer); if (cfd == nullptr) { // Can't find any executable task from the compaction queue. @@ -3425,6 +3442,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, db_id_, db_session_id_, c->column_family_data()->GetFullHistoryTsLow(), c->trim_ts(), &blob_callback_, &bg_compaction_scheduled_, &bg_bottom_compaction_scheduled_); + + //Step1: Prepare() compaction_job.Prepare(); NotifyOnCompactionBegin(c->column_family_data(), c.get(), status, @@ -3432,11 +3451,13 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, mutex_.Unlock(); TEST_SYNC_POINT_CALLBACK( "DBImpl::BackgroundCompaction:NonTrivial:BeforeRun", nullptr); + //Step2: Run() // Should handle erorr? 
compaction_job.Run().PermitUncheckedError(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); + //Step3: Install() status = compaction_job.Install(*c->mutable_cf_options()); io_s = compaction_job.io_status(); if (status.ok()) { diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 1f1dcb6a1..710f067cd 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1752,7 +1752,7 @@ IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number, } return io_s; } - +extern void SetDBImpl(DBImpl *db); Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr, @@ -1778,6 +1778,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn); + SetDBImpl(impl); if (!impl->immutable_db_options_.info_log) { s = impl->init_logger_creation_s_; delete impl; diff --git a/db/db_test_util.h b/db/db_test_util.h index 0a35d9ffc..022f8a2cd 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -764,6 +764,7 @@ class FileTemperatureTestFS : public FileSystemWrapper { return target()->NewWritableFile(fname, opts, result, dbg); } + void CopyCurrentSstFileTemperatures(std::map* out) { MutexLock lock(&mu_); *out = current_sst_file_temperatures_; diff --git a/db/flush_job.cc b/db/flush_job.cc index d2099c337..f03cdb5c1 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -48,6 +48,16 @@ namespace ROCKSDB_NAMESPACE { +enum LOG_TYPE { + FLUSH = 1, + COMPACTION = 2, + OTHER = 10 +}; + +extern void log_print(const char *s, LOG_TYPE log_type, int level, Compaction *c); +extern void after_flush_or_compaction(VersionStorageInfo *vstorage, int level, std::vector files_output, ColumnFamilyData* cfd, Compaction* const compaction); + + const char* GetFlushReasonString (FlushReason flush_reason) { switch (flush_reason) { case 
FlushReason::kOthers: @@ -299,6 +309,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, } else if (write_manifest_) { TEST_SYNC_POINT("FlushJob::InstallResults"); // Replace immutable memtable with the generated Table + // Install s = cfd_->imm()->TryInstallMemtableFlushResults( cfd_, mutable_cf_options_, mems_, prep_tracker, versions_, db_mutex_, meta_.fd.GetNumber(), &job_context_->memtables_to_free, db_directory_, @@ -313,6 +324,10 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, } RecordFlushIOStats(); + if(edit_->GetNewFiles().size() != 0) { + log_print("Flush", LOG_TYPE::FLUSH, edit_->GetNewFiles().back().first, nullptr); + } + // When measure_io_stats_ is true, the default 512 bytes is not enough. auto stream = event_logger_->LogToBuffer(log_buffer_, 1024); stream << "job" << job_context_->job_id << "event" @@ -327,6 +342,10 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, } stream.EndArray(); + + after_flush_or_compaction(cfd_->current()->storage_info(), 0, std::vector{}, nullptr, nullptr); + + const auto& blob_files = vstorage->GetBlobFiles(); if (!blob_files.empty()) { assert(blob_files.front()); @@ -946,6 +965,9 @@ Status FlushJob::WriteLevel0Table() { job_context_->job_id, io_priority, &table_properties_, write_hint, full_history_ts_low, blob_callback_, &num_input_entries, &memtable_payload_bytes, &memtable_garbage_bytes); + + + // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || io_s.ok()); io_s.PermitUncheckedError(); diff --git a/db/version_edit.h b/db/version_edit.h index aba9c0957..7fedfb6a7 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -165,6 +165,12 @@ struct FileSampledStats { }; struct FileMetaData { + + // The next two fields are additions made by this patch + std::string fname; + uint64_t fnumber; + + FileDescriptor fd; InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table 
diff --git a/db/version_set.cc b/db/version_set.cc index 49f41d4b8..19635ee44 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1679,7 +1679,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, } Status Version::GetPropertiesOfTablesInRange( - const Range* range, std::size_t n, TablePropertiesCollection* props) const { + const Range* range, std::size_t n, TablePropertiesCollection* props) { for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { for (decltype(n) i = 0; i < n; i++) { // Convert user_key into a corresponding internal key. @@ -1759,10 +1759,11 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { auto* ioptions = cfd_->ioptions(); auto* vstorage = storage_info(); - + //printf("Version::GetColumnFamilyMetaData: NumberLevels()=%d\n", cfd_->NumberLevels()); for (int level = 0; level < cfd_->NumberLevels(); level++) { uint64_t level_size = 0; cf_meta->file_count += vstorage->LevelFiles(level).size(); + //printf("Version::GetColumnFamilyMetaData: Level=%d LevelFiles=%ld\n", level, vstorage->LevelFiles(level).size()); std::vector files; for (const auto& file : vstorage->LevelFiles(level)) { uint32_t path_id = file->fd.GetPathId(); @@ -3756,6 +3757,44 @@ void SortFileByRoundRobin(const InternalKeyComparator& icmp, } } // namespace +void VersionStorageInfo::SortFileRR() { + printf("SorFileRR Begin"); + for (int level = 0; level < num_levels() - 1; level++) { + const std::vector& files = files_[level]; + auto& files_by_compaction_pri = files_by_compaction_pri_[level]; + assert(files_by_compaction_pri.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (size_t i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + // size_t num = VersionStorageInfo::kNumberFilesToSort; + //if (num > temp.size()) { + //uint64_t num = temp.size(); + //} 
+ + + SortFileByRoundRobin(*internal_comparator_, &compact_cursor_, + level0_non_overlapping_, level, &temp); + + + + // initialize files_by_compaction_pri_ + for (size_t i = 0; i < temp.size(); i++) { + files_by_compaction_pri.push_back(static_cast(temp[i].index)); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_compaction_pri_[level].size()); + } + printf("SorFileRR End"); + +} + + void VersionStorageInfo::UpdateFilesByCompactionPri( const ImmutableOptions& ioptions, const MutableCFOptions& options) { if (compaction_style_ == kCompactionStyleNone || @@ -3779,9 +3818,9 @@ void VersionStorageInfo::UpdateFilesByCompactionPri( // sort the top number_of_files_to_sort_ based on file size size_t num = VersionStorageInfo::kNumberFilesToSort; - if (num > temp.size()) { + //if (num > temp.size()) { num = temp.size(); - } + //} switch (ioptions.compaction_pri) { case kByCompensatedSize: std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), @@ -3959,12 +3998,12 @@ bool VersionStorageInfo::OverlapInLevel(int level, void VersionStorageInfo::GetOverlappingInputs( int level, const InternalKey* begin, const InternalKey* end, std::vector* inputs, int hint_index, int* file_index, - bool expand_range, InternalKey** next_smallest) const { + bool expand_range, InternalKey** next_smallest) { if (level >= num_non_empty_levels_) { // this level is empty, no overlapping inputs return; } - + SetInputsTmp(inputs); inputs->clear(); if (file_index) { *file_index = -1; diff --git a/db/version_set.h b/db/version_set.h index 077bb26c3..c4664d5d7 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -128,10 +128,18 @@ class VersionStorageInfo { VersionStorageInfo* src_vstorage, bool _force_consistency_checks); // No copying allowed - VersionStorageInfo(const VersionStorageInfo&) = delete; - void operator=(const VersionStorageInfo&) = delete; + // VersionStorageInfo(const VersionStorageInfo&) = delete; +// void operator=(const 
VersionStorageInfo&) = delete; ~VersionStorageInfo(); + std::vector* inputs_tmp; + + void SortFileRR(); + + void SetInputsTmp(std::vector* inputs_) { + inputs_tmp = inputs_; + } + void Reserve(int level, size_t size) { files_[level].reserve(size); } void AddFile(int level, FileMetaData* f); @@ -258,7 +266,7 @@ class VersionStorageInfo { // range and overlap each other. If false, // then just files intersecting the range InternalKey** next_smallest = nullptr) // if non-null, returns the - const; // smallest key of next file not included + ; // smallest key of next file not included void GetCleanInputsWithinInterval( int level, const InternalKey* begin, // nullptr means before all keys const InternalKey* end, // nullptr means after all keys @@ -581,14 +589,17 @@ class VersionStorageInfo { const Slice& largest_user_key, int last_level, int last_l0_idx); - private: + + void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options, + const MutableCFOptions& mutable_cf_options); + + + public: void ComputeCompensatedSizes(); void UpdateNumNonEmptyLevels(); void CalculateBaseBytes(const ImmutableOptions& ioptions, const MutableCFOptions& options); - void UpdateFilesByCompactionPri(const ImmutableOptions& immutable_options, - const MutableCFOptions& mutable_cf_options); - + void GenerateFileIndexer() { file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); } @@ -918,7 +929,7 @@ class Version { Status GetPropertiesOfAllTables(TablePropertiesCollection* props); Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, - TablePropertiesCollection* props) const; + TablePropertiesCollection* props) ; // Print summary of range delete tombstones in SST files into out_str, // with maximum max_entries_to_print entries printed out. 
@@ -944,6 +955,9 @@ class Version { int TEST_refs() const { return refs_; } + VersionStorageInfo storage_info_instance() { + return storage_info_; + } VersionStorageInfo* storage_info() { return &storage_info_; } const VersionStorageInfo* storage_info() const { return &storage_info_; } diff --git a/env/composite_env_wrapper.h b/env/composite_env_wrapper.h index 78da6f0ed..2834e4847 100644 --- a/env/composite_env_wrapper.h +++ b/env/composite_env_wrapper.h @@ -44,6 +44,11 @@ class CompositeEnv : public Env { Status NewWritableFile(const std::string& f, std::unique_ptr* r, const EnvOptions& options) override; + + Status SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list) { + return file_system_->SetFileLifetime(fname, lifetime, clock, flag, level, overlap_list); + } Status ReopenWritableFile(const std::string& fname, std::unique_ptr* result, diff --git a/env/env.cc b/env/env.cc index c322acde9..5528f7d19 100644 --- a/env/env.cc +++ b/env/env.cc @@ -153,7 +153,7 @@ class LegacyRandomAccessFileWrapper : public FSRandomAccessFile { IOStatus MultiRead(FSReadRequest* fs_reqs, size_t num_reqs, const IOOptions& /*options*/, - IODebugContext* /*dbg*/) override { + IODebugContext* /*dbg*/) override { std::vector reqs; Status status; @@ -406,6 +406,11 @@ class LegacyFileSystemWrapper : public FileSystem { } return status_to_io_status(std::move(s)); } + IOStatus SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list) override { + Status s = target_->SetFileLifetime(fname, lifetime, clock, flag, level, overlap_list); + return status_to_io_status(std::move(s)); + } IOStatus ReopenWritableFile(const std::string& fname, const FileOptions& file_opts, std::unique_ptr* result, diff --git a/env/env_hybrid.cc b/env/env_hybrid.cc new file mode 100644 index 000000000..f9bdf0cae --- /dev/null +++ b/env/env_hybrid.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2011-present, 
Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors + +#include "env_hybrid.h" + +#include +#include + +#include "env/io_posix.h" +#include "monitoring/thread_status_updater.h" +#include "port/lang.h" +#include "rocksdb/env.h" +#include "rocksdb/fs_hybrid.h" +#include "util/string_util.h" + +namespace ROCKSDB_NAMESPACE { + +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; + +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast(arg); + state->user_function(state->arg); + delete state; + return nullptr; +} + +HybridFSEnv::HybridFSEnv(const std::string& bdevname) + : CompositeEnv(PosixZenFSHybridFileSystem::Default(bdevname), + SystemClock::Default()), + thread_pools_(thread_pools_storage_), + thread_pools_storage_(Env::Priority::TOTAL), + threads_to_join_(threads_to_join_storage_), + mu_(mu_storage_) { + ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast(pool_id)); + // This allows later initializing the thread-local-env of each thread. + thread_pools_[pool_id].SetHostEnv(this); + } + thread_status_updater_ = new ThreadStatusUpdater(); +} + +HybridFSEnv::~HybridFSEnv() { + // All threads must be joined before the deletion of + // thread_status_updater_. 
+ delete thread_status_updater_; +} + +void HybridFSEnv::Schedule(void (*function)(void* arg1), void* arg, + Env::Priority pri, void* tag, + void (*unschedFunction)(void* arg)) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); +} + +void HybridFSEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + ThreadPoolImpl::PthreadCall( + "start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state)); + ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_)); + threads_to_join_.push_back(t); + ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +Status HybridFSEnv::GetHostName(char* name, uint64_t len) { + int ret = gethostname(name, static_cast(len)); + if (ret < 0) { + if (errno == EFAULT || errno == EINVAL) { + return Status::InvalidArgument(errnoStr(errno).c_str()); + } else { + return IOError("GetHostName", name, errno); + } + } + return Status::OK(); +} + +// Allow increasing the number of worker threads. +void HybridFSEnv::SetBackgroundThreads(int num, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); +} + +int HybridFSEnv::GetBackgroundThreads(Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + return thread_pools_[pri].GetBackgroundThreads(); +} + +// Allow increasing the number of worker threads. 
+void HybridFSEnv::IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) { + assert(pri >= Env::Priority::BOTTOM && pri <= Env::Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); +} + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/env/env_hybrid.h b/env/env_hybrid.h new file mode 100644 index 000000000..87d7664f3 --- /dev/null +++ b/env/env_hybrid.h @@ -0,0 +1,42 @@ +#pragma once + +#include + +#include + +#include "composite_env_wrapper.h" +#include "util/threadpool_imp.h" + +namespace ROCKSDB_NAMESPACE { +class HybridFSEnv : public CompositeEnv { + public: + static const char* kClassName() { return "HybridFSEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kClassName(); } + + // Constructs the default Env, a singleton + HybridFSEnv(const std::string& bdevname); + ~HybridFSEnv(); + + void Schedule(void (*function)(void*), void* arg, Env::Priority pri, + void* tag, void (*unschedFunction)(void* arg)) override; + + void StartThread(void (*function)(void* arg), void* arg) override; + + Status GetHostName(char* name, uint64_t len) override; + + // Allow increasing the number of worker threads. + void SetBackgroundThreads(int num, Env::Priority pri) override; + int GetBackgroundThreads(Env::Priority pri) override; + + void IncBackgroundThreadsIfNeeded(int num, Env::Priority pri) override; + + private: + std::vector& thread_pools_; + std::vector thread_pools_storage_; + std::vector threads_to_join_storage_; + std::vector& threads_to_join_; + pthread_mutex_t& mu_; + pthread_mutex_t mu_storage_; +}; +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/env/env_zenfs.cc b/env/env_zenfs.cc new file mode 100644 index 000000000..5fd0c3e00 --- /dev/null +++ b/env/env_zenfs.cc @@ -0,0 +1,481 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors + + +#if !defined(OS_WIN) + +#include +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +#include +#endif +#include +#include + +#if defined(ROCKSDB_IOURING_PRESENT) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID) +#include +#endif +#include +#include +#include +#if defined(ROCKSDB_IOURING_PRESENT) +#include +#endif +#include +#include + +#include +// Get nano time includes +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) +#elif defined(__MACH__) +#include +#include +#include +#else +#include +#endif +#include +#include +#include + +#include "env/composite_env_wrapper.h" +#include "env/io_posix.h" +#include "logging/posix_logger.h" +#include "monitoring/iostats_context_imp.h" +#include "monitoring/thread_status_updater.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/system_clock.h" +#include "rocksdb/plugin/zenfs/fs/fs_zenfs.h" +#include "test_util/sync_point.h" +#include "util/coding.h" +#include "util/compression_context_cache.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/thread_local.h" +#include "util/threadpool_imp.h" + +#if !defined(TMPFS_MAGIC) +#define TMPFS_MAGIC 0x01021994 +#endif +#if !defined(XFS_SUPER_MAGIC) +#define XFS_SUPER_MAGIC 0x58465342 +#endif +#if !defined(EXT4_SUPER_MAGIC) +#define EXT4_SUPER_MAGIC 0xEF53 +#endif + +namespace ROCKSDB_NAMESPACE { +#if defined(OS_WIN) +static const std::string kSharedLibExt 
= ".dll"; +static const char kPathSeparator = ';'; +#else +static const char kPathSeparator = ':'; +#if defined(OS_MACOSX) +static const std::string kSharedLibExt = ".dylib"; +#else +static const std::string kSharedLibExt = ".so"; +#endif +#endif + +ThreadStatusUpdater* CreateThreadStatusUpdater() { + return new ThreadStatusUpdater(); +} + +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION +class ZenfsDynamicLibrary : public DynamicLibrary { + public: + ZenfsDynamicLibrary(const std::string& name, void* handle) + : name_(name), handle_(handle) {} + ~ZenfsDynamicLibrary() override { dlclose(handle_); } + + Status LoadSymbol(const std::string& sym_name, void** func) override { + assert(nullptr != func); + dlerror(); // Clear any old error + *func = dlsym(handle_, sym_name.c_str()); + if (*func != nullptr) { + return Status::OK(); + } else { + char* err = dlerror(); + return Status::NotFound("Error finding symbol: " + sym_name, err); + } + } + + const char* Name() const override { return name_.c_str(); } + + private: + std::string name_; + void* handle_; +}; +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + +class ZenfsClock : public SystemClock { + public: + static const char* kClassName() { return "ZenfsClock"; } + const char* Name() const override { return kDefaultName(); } + const char* NickName() const override { return kClassName(); } + + uint64_t NowMicros() override { + port::TimeVal tv; + port::GetTimeOfDay(&tv, nullptr); + return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + } + + uint64_t NowNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#elif defined(OS_SOLARIS) + return gethrtime(); +#elif defined(__MACH__) + clock_serv_t cclock; + mach_timespec_t ts; + host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); + clock_get_time(cclock, &ts); + 
mach_port_deallocate(mach_task_self(), cclock); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#else + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()) + .count(); +#endif + } + + uint64_t CPUMicros() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return (static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000; +#endif + return 0; + } + + uint64_t CPUNanos() override { +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \ + defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12)) + struct timespec ts; + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; +#endif + return 0; + } + + void SleepForMicroseconds(int micros) override { usleep(micros); } + + Status GetCurrentTime(int64_t* unix_time) override { + time_t ret = time(nullptr); + if (ret == (time_t)-1) { + return IOError("GetCurrentTime", "", errno); + } + *unix_time = (int64_t)ret; + return Status::OK(); + } + + std::string TimeToString(uint64_t secondsSince1970) override { + const time_t seconds = (time_t)secondsSince1970; + struct tm t; + int maxsize = 64; + std::string dummy; + dummy.reserve(maxsize); + dummy.resize(maxsize); + char* p = &dummy[0]; + port::LocalTimeR(&seconds, &t); + snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900, + t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec); + return dummy; + } +}; + +class ZenFSEnv : public CompositeEnv { + public: + static const char* kClassName() { return "ZenFSEnv"; } + const char* Name() const override { return kClassName(); } + const char* NickName() const override { return kClassName(); } + + ~ZenFSEnv() override { + if (this == Env::Default()) { + for (const auto tid : threads_to_join_) { + pthread_join(tid, 
nullptr); + } + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].JoinAllThreads(); + } + // Do not delete the thread_status_updater_ in order to avoid the + // free after use when Env::Default() is destructed while some other + // child threads are still trying to update thread status. All + // ZenFSEnv instances use the same thread_status_updater_, so never + // explicitly delete it. + } + } + + void SetFD_CLOEXEC(int fd, const EnvOptions* options) { + if ((options == nullptr || options->set_fd_cloexec) && fd > 0) { + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC); + } + } + +#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION + // Loads the named library into the result. + // If the input name is empty, the current executable is loaded + // On *nix systems, a "lib" prefix is added to the name if one is not supplied + // Comparably, the appropriate shared library extension is added to the name + // if not supplied. If search_path is not specified, the shared library will + // be loaded using the default path (LD_LIBRARY_PATH) If search_path is + // specified, the shared library will be searched for in the directories + // provided by the search path + Status LoadLibrary(const std::string& name, const std::string& path, + std::shared_ptr* result) override { + assert(result != nullptr); + if (name.empty()) { + void* hndl = dlopen(NULL, RTLD_NOW); + if (hndl != nullptr) { + result->reset(new ZenfsDynamicLibrary(name, hndl)); + return Status::OK(); + } + } else { + std::string library_name = name; + if (library_name.find(kSharedLibExt) == std::string::npos) { + library_name = library_name + kSharedLibExt; + } +#if !defined(OS_WIN) + if (library_name.find('/') == std::string::npos && + library_name.compare(0, 3, "lib") != 0) { + library_name = "lib" + library_name; + } +#endif + if (path.empty()) { + void* hndl = dlopen(library_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new ZenfsDynamicLibrary(library_name, 
hndl)); + return Status::OK(); + } + } else { + std::string local_path; + std::stringstream ss(path); + while (getline(ss, local_path, kPathSeparator)) { + if (!path.empty()) { + std::string full_name = local_path + "/" + library_name; + void* hndl = dlopen(full_name.c_str(), RTLD_NOW); + if (hndl != nullptr) { + result->reset(new ZenfsDynamicLibrary(full_name, hndl)); + return Status::OK(); + } + } + } + } + } + return Status::IOError( + IOErrorMsg("Failed to open shared library: xs", name), dlerror()); + } +#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION + + void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW, + void* tag = nullptr, + void (*unschedFunction)(void* arg) = nullptr) override; + + int UnSchedule(void* arg, Priority pri) override; + + void StartThread(void (*function)(void* arg), void* arg) override; + + void WaitForJoin() override; + + unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override; + + Status GetThreadList(std::vector* thread_list) override { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); + } + + uint64_t GetThreadID() const override { + uint64_t thread_id = 0; +#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 30) + thread_id = ::gettid(); +#else // __GLIBC_PREREQ(2, 30) + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // __GLIBC_PREREQ(2, 30) +#else // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + pthread_t tid = pthread_self(); + memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); +#endif // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ) + return thread_id; + } + + Status GetHostName(char* name, uint64_t len) override { + int ret = gethostname(name, static_cast(len)); + if (ret < 0) { + if (errno == EFAULT || errno == EINVAL) { + return Status::InvalidArgument(errnoStr(errno).c_str()); + } else { + return IOError("GetHostName", name, errno); + } + } + 
return Status::OK(); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return Env::GetThreadStatusUpdater(); + } + + std::string GenerateUniqueId() override { return Env::GenerateUniqueId(); } + + // Allow increasing the number of worker threads. + void SetBackgroundThreads(int num, Priority pri) override { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + thread_pools_[pri].SetBackgroundThreads(num); + } + + int GetBackgroundThreads(Priority pri) override { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].GetBackgroundThreads(); + } + + Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override { + allow_non_owner_access_ = allow_non_owner_access; + return Status::OK(); + } + + // Allow increasing the number of worker threads. + void IncBackgroundThreadsIfNeeded(int num, Priority pri) override { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); + } + + void LowerThreadPoolIOPriority(Priority pool) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); +#ifdef OS_LINUX + thread_pools_[pool].LowerIOPriority(); +#else + (void)pool; +#endif + } + + void LowerThreadPoolCPUPriority(Priority pool) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); + thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow); + } + + Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override { + assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH); + thread_pools_[pool].LowerCPUPriority(pri); + return Status::OK(); + } + + // Constructs the default Env, a singleton + ZenFSEnv(const std::string& bdevname); + + private: + friend Env* Env::Default(); + + // The below 4 members are only used by the default ZenFSEnv instance. 
+ // Non-default instances simply maintain references to the backing + // members in the default instance + std::vector thread_pools_storage_; + pthread_mutex_t mu_storage_; + std::vector threads_to_join_storage_; + bool allow_non_owner_access_storage_; + + std::vector& thread_pools_; + pthread_mutex_t& mu_; + std::vector& threads_to_join_; + // If true, allow non owner read access for db files. Otherwise, non-owner + // has no access to db files. + bool& allow_non_owner_access_; +}; + +std::shared_ptr getDefaultFS(const std::string& bdevname){ // Opens a ZenFS file system on bdevname and returns it as a shared_ptr + std::shared_ptr fs; + FileSystem* fs_ptr = nullptr; // stays null when NewZenFS fails without writing it, so reset() below never adopts an indeterminate pointer + Status s = ROCKSDB_NAMESPACE::NewZenFS(&fs_ptr, bdevname); + assert(s.ok()); + fs.reset(fs_ptr); + return fs; +} + +ZenFSEnv::ZenFSEnv(const std::string& bdevname) + : CompositeEnv(getDefaultFS(bdevname), SystemClock::Default()), + thread_pools_storage_(Priority::TOTAL), + allow_non_owner_access_storage_(true), + thread_pools_(thread_pools_storage_), + mu_(mu_storage_), + threads_to_join_(threads_to_join_storage_), + allow_non_owner_access_(allow_non_owner_access_storage_) { + ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast(pool_id)); + // This allows later initializing the thread-local-env of each thread. 
+ thread_pools_[pool_id].SetHostEnv(this); + } + thread_status_updater_ = CreateThreadStatusUpdater(); +} + +void ZenFSEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri, + void* tag, void (*unschedFunction)(void* arg)) { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + thread_pools_[pri].Schedule(function, arg, tag, unschedFunction); +} + +int ZenFSEnv::UnSchedule(void* arg, Priority pri) { + return thread_pools_[pri].UnSchedule(arg); +} + +unsigned int ZenFSEnv::GetThreadPoolQueueLen(Priority pri) const { + assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH); + return thread_pools_[pri].GetQueueLen(); +} + +struct StartThreadState { + void (*user_function)(void*); + void* arg; +}; + +static void* StartThreadWrapper(void* arg) { + StartThreadState* state = reinterpret_cast(arg); + state->user_function(state->arg); + delete state; + return nullptr; +} + +void ZenFSEnv::StartThread(void (*function)(void* arg), void* arg) { + pthread_t t; + StartThreadState* state = new StartThreadState; + state->user_function = function; + state->arg = arg; + ThreadPoolImpl::PthreadCall( + "start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state)); + ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_)); + threads_to_join_.push_back(t); + ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_)); +} + +void ZenFSEnv::WaitForJoin() { + for (const auto tid : threads_to_join_) { + pthread_join(tid, nullptr); + } + threads_to_join_.clear(); +} +} // namespace ROCKSDB_NAMESPACE + +#endif \ No newline at end of file diff --git a/env/file_system_tracer.h b/env/file_system_tracer.h index 979a0bf12..d2b34fb6b 100644 --- a/env/file_system_tracer.h +++ b/env/file_system_tracer.h @@ -99,6 +99,7 @@ class FileSystemTracingWrapper : public FileSystemWrapper { // is disabled. 
class FileSystemPtr { public: + FileSystemPtr() = default; FileSystemPtr(std::shared_ptr fs, const std::shared_ptr& io_tracer) : fs_(fs), io_tracer_(io_tracer) { diff --git a/env/fs_posix.cc b/env/fs_posix.cc index e1b1400d0..fe7c29589 100644 --- a/env/fs_posix.cc +++ b/env/fs_posix.cc @@ -372,6 +372,9 @@ class PosixFileSystem : public FileSystem { IODebugContext* dbg) override { return OpenWritableFile(fname, options, false, result, dbg); } + IOStatus SetFileLifetime(std::string /*fname*/, uint64_t /*lifetime*/, int /*clock*/, bool /*flag*/, int /*level*/, std::vector /*overlap_list*/) override { + return IOStatus::OK();  // no-op: this override does nothing with the lifetime hint (it previously called itself and never returned) + } IOStatus ReopenWritableFile(const std::string& fname, const FileOptions& options, diff --git a/env/mock_env.h b/env/mock_env.h index a8d5283c5..fb80d7989 100644 --- a/env/mock_env.h +++ b/env/mock_env.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "env/composite_env_wrapper.h" #include "port/port.h" @@ -50,6 +51,10 @@ class MockFileSystem : public FileSystem { const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) override; + IOStatus SetFileLifetime(std::string /*fname*/, uint64_t /*lifetime*/, int /*clock*/, bool /*flag*/, int /*level*/, std::vector /*overlap_list*/) override { + //std::cout << fname << lifetime << '\n'; + return IOStatus::NotSupported("SetFileLifetime"); + } IOStatus ReopenWritableFile(const std::string& fname, const FileOptions& options, std::unique_ptr* result, diff --git a/examples/Makefile b/examples/Makefile index b056508a6..c80d9a01f 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -8,18 +8,28 @@ ifndef DISABLE_JEMALLOC PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) endif +USE_RTTI = 1 + ifneq ($(USE_RTTI), 1) CXXFLAGS += -fno-rtti endif +CXXFLAGS += -g + CFLAGS += -Wstrict-prototypes .PHONY: clean librocksdb -all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example 
options_file_example rocksdb_backup_restore_example +all: simple_example column_families_example compact_files_example c_simple_example optimistic_transaction_example transaction_example compaction_filter_example options_file_example + +zenfs: librocksdb_zenfs zenfs_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -o$@ /usr/lib/libzbd.a -I../ -I../include -I/usr/include/libzbd -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + +zenfs_example: librocksdb_zenfs zenfs_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb_debug.a -o$@ /usr/lib/libzbd.a -I../ -I../include -I/usr/include/libzbd -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) simple_example: librocksdb simple_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb_debug.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) column_families_example: librocksdb column_families_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) @@ -28,7 +38,7 @@ compaction_filter_example: librocksdb compaction_filter_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) compact_files_example: librocksdb compact_files_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb_debug.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) .c.o: $(CC) $(CFLAGS) -c $< -o $@ -I../include @@ -48,11 +58,11 @@ options_file_example: librocksdb options_file_example.cc multi_processes_example: librocksdb multi_processes_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a 
-I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) -rocksdb_backup_restore_example: librocksdb rocksdb_backup_restore_example.cc - $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++17 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) - clean: - rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example ./rocksdb_backup_restore_example + rm -rf ./zenfs_example ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example zenfs_example.o c_simple_example.o ./optimistic_transaction_example ./transaction_example ./options_file_example ./multi_processes_example librocksdb: - cd .. && $(MAKE) static_lib + cd .. && $(MAKE) dbg + +librocksdb_zenfs: + cd .. && DISABLE_WARNING_AS_ERROR=1 ROCKSDB_PLUGINS=zenfs $(MAKE) dbg diff --git a/examples/rocksdb_backup_restore_example.cc b/examples/rocksdb_backup_restore_example.cc deleted file mode 100644 index c833ed1c2..000000000 --- a/examples/rocksdb_backup_restore_example.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. -// This source code is licensed under both the GPLv2 (found in the -// COPYING file in the root directory) and Apache 2.0 License -// (found in the LICENSE.Apache file in the root directory). 
- -#include -#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/options.h" -#include "rocksdb/utilities/backup_engine.h" - -using ROCKSDB_NAMESPACE::BackupEngine; -using ROCKSDB_NAMESPACE::BackupEngineOptions; -using ROCKSDB_NAMESPACE::BackupEngineReadOnly; -using ROCKSDB_NAMESPACE::BackupInfo; -using ROCKSDB_NAMESPACE::DB; -using ROCKSDB_NAMESPACE::Env; -using ROCKSDB_NAMESPACE::Options; -using ROCKSDB_NAMESPACE::ReadOptions; -using ROCKSDB_NAMESPACE::Status; -using ROCKSDB_NAMESPACE::WriteOptions; - -#if defined(OS_WIN) -std::string kDBPath = "C:\\Windows\\TEMP\\rocksdb_example"; -#else -std::string kDBPath = "/tmp/rocksdb_example"; -#endif - -int main() { - DB* db; - Options options; - // Optimize RocksDB. This is the easiest way to get RocksDB to perform well - options.IncreaseParallelism(); - options.OptimizeLevelStyleCompaction(); - // create the DB if it's not already present - options.create_if_missing = true; - - // open DB - Status s = DB::Open(options, kDBPath, &db); - assert(s.ok()); - - // Put key-value - db->Put(WriteOptions(), "key1", "value1"); - assert(s.ok()); - - // create backup - BackupEngine* backup_engine; - s = BackupEngine::Open(Env::Default(), - BackupEngineOptions("/tmp/rocksdb_example_backup"), - &backup_engine); - assert(s.ok()); - - backup_engine->CreateNewBackup(db); - assert(s.ok()); - - std::vector backup_info; - backup_engine->GetBackupInfo(&backup_info); - - s = backup_engine->VerifyBackup(1); - assert(s.ok()); - - // Put key-value - db->Put(WriteOptions(), "key2", "value2"); - assert(s.ok()); - - db->Close(); - delete db; - db = nullptr; - - // restore db to backup 1 - BackupEngineReadOnly* backup_engine_ro; - s = BackupEngineReadOnly::Open( - Env::Default(), BackupEngineOptions("/tmp/rocksdb_example_backup"), - &backup_engine_ro); - assert(s.ok()); - - s = backup_engine_ro->RestoreDBFromBackup(1, "/tmp/rocksdb_example", - "/tmp/rocksdb_example"); - assert(s.ok()); - - // open db again - s = DB::Open(options, 
kDBPath, &db); - assert(s.ok()); - - std::string value; - s = db->Get(ReadOptions(), "key1", &value); - assert(!s.IsNotFound()); - - s = db->Get(ReadOptions(), "key2", &value); - assert(s.IsNotFound()); - - delete backup_engine; - delete backup_engine_ro; - delete db; - - return 0; -} diff --git a/examples/zenfs.cc b/examples/zenfs.cc new file mode 100644 index 000000000..3a2c2fe44 --- /dev/null +++ b/examples/zenfs.cc @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include +#include +#include "env/env_hybrid.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/plugin/zenfs/fs/fs_zenfs.h" + +using namespace std; + +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DestroyDB; +using ROCKSDB_NAMESPACE::HybridFSEnv; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteOptions; +using ROCKSDB_NAMESPACE::Slice; + +const std::string PATH = "/tmp/test_rocksdb"; + +const long long limit = 1e5; +const int N = 2e5; +long long get_random() { + long long rd = rand(); + long long rd2 = rand(); + return (rd << 31ll | rd2) % limit + 1; + // return rand() % 1000000; +} +int main() { + freopen("out.txt", "w", stdout); + clock_t start, end; //定义clock_t变量 + start = clock(); //开始时间 + DB* db; + Options options; + options.create_if_missing = true; + options.env = new HybridFSEnv("nullb0"); + + const int B = 1; + const int KB = 1024; + const int MB = 1024 * 1024; + // options.max_bytes_for_level_base=10 * 1048; + // options.target_file_size_base=2097152; + // options.write_buffer_size = 4194304; + options.max_bytes_for_level_base= 1 * KB; + options.target_file_size_base = 1 * KB; + options.write_buffer_size = 2 * KB; + + options.max_bytes_for_level_multiplier=5; + + options.max_background_compactions=1; + options.max_background_flushes=1; + options.max_background_jobs=1; + + options.soft_pending_compaction_bytes_limit = 
options.target_file_size_base; + options.hard_pending_compaction_bytes_limit = options.target_file_size_base; //这里限制compaction的速率是因为rocksdb的flush速率特别快,不然来不及compaction + + options.num_levels=7; + options.level0_stop_writes_trigger=12; + options.level0_slowdown_writes_trigger=8; + options.level0_file_num_compaction_trigger=4; + options.max_write_buffer_number=1; + options.compaction_style=rocksdb::kCompactionStyleLevel; + options.compaction_pri=rocksdb::kRoundRobin; + options.max_open_files=1000; + options.target_file_size_multiplier=1; + + Status status = DB::Open(options, PATH, &db); + assert(status.ok()); + for (int i = 0; i < N; i++) { + std::string key = std::to_string(get_random()); + std::string value = std::to_string(get_random()); + status = db->Put(WriteOptions(), key, value); + std::string get_value; + if (status.ok()) { + status = db->Get(ReadOptions(), key, &get_value); + if (status.ok()) { + //printf("get %s\n", get_value.c_str()); + } else { + printf("get failed\n"); + } + //printf("success %d\n", i); + } else { + printf("put failed\n"); + } + } + + delete db; + end = clock(); //结束时间 + cout << "time = " << double(end - start) / CLOCKS_PER_SEC << "s" + << endl; //输出时间 + return 0; +} diff --git a/examples/zenfs_example.cc b/examples/zenfs_example.cc new file mode 100644 index 000000000..83593a96b --- /dev/null +++ b/examples/zenfs_example.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +// Examples with Zenfs. 
+ +#include +#include + +#include "env/env_hybrid.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/plugin/zenfs/fs/fs_zenfs.h" + +using ROCKSDB_NAMESPACE::DB; +using ROCKSDB_NAMESPACE::DestroyDB; +using ROCKSDB_NAMESPACE::HybridFSEnv; +using ROCKSDB_NAMESPACE::Options; +using ROCKSDB_NAMESPACE::ReadOptions; +using ROCKSDB_NAMESPACE::Status; +using ROCKSDB_NAMESPACE::WriteOptions; +using ROCKSDB_NAMESPACE::Slice; + +std::string dbPath = "/hybrid_example"; + +void findAndListAllZenFS() { + // List all ZenFS in current system + std::map list; + Status s = ROCKSDB_NAMESPACE::ListZenFileSystems(list); + assert(s.ok()); // check if okay + + // Print all available ZenFS if possible + std::map::iterator it; + for (it = list.begin(); it != list.end(); it++) { + std::cout << it->first // string (key) + << ':' << it->second // string's value + << std::endl; + } + assert(list.size() > 0); +} + +int main() { + // Find and list all ZenFS in current system + findAndListAllZenFS(); + + DB* hybrid = nullptr; + Options hybridOptions; + hybridOptions.env = new HybridFSEnv("nullb0"); + hybridOptions.create_if_missing = true; + + // Optimize RocksDB. This is the easiest way to get RocksDB to perform well + hybridOptions.IncreaseParallelism(); + hybridOptions.level0_slowdown_writes_trigger = 3; + hybridOptions.level0_file_num_compaction_trigger = 4; + hybridOptions.level0_stop_writes_trigger = 5; + hybridOptions.max_bytes_for_level_base = 1000; + hybridOptions.max_bytes_for_level_multiplier = 2; + + // Clean files from previous tests + DestroyDB(dbPath, hybridOptions); + + // Open db + Status s = DB::Open(hybridOptions, dbPath, &hybrid); + assert(s.ok()); + + // Delete any dirty files. ex. linked files... 
+ std::vector dirty_files; + hybridOptions.env->GetChildren(dbPath, &dirty_files); + for(const std::string &name : dirty_files){ + hybridOptions.env->DeleteFile(dbPath + "/" + name); + } + + // Put key-value + s = hybrid->Put(WriteOptions(), "key1", "value"); + assert(s.ok()); + + // Get value + std::string value; + s = hybrid->Get(ReadOptions(), "key1", &value); + assert(s.ok()); + assert(value == "value"); + + // Delete value + s = hybrid->Delete(WriteOptions(), "key1"); + assert(s.ok()); + + // Verify the delete is completed + s = hybrid->Get(ReadOptions(), "key1", &value); + assert(s.IsNotFound()); + + // Compaction test + // if background compaction is not working, write will stall + // because of options.level0_stop_writes_trigger + for (int i = 0; i < 999999; ++i) { + hybrid->Put(WriteOptions(), std::to_string(i), + std::string(500, 'a' + (i % 26))); + } + + // verify the values are still there + for (int i = 0; i < 999999; ++i) { + hybrid->Get(ReadOptions(), std::to_string(i), &value); + assert(value == std::string(500, 'a' + (i % 26))); + } + + // Flush all data from memory to drive + s = hybrid->Flush(ROCKSDB_NAMESPACE::FlushOptions()); + assert(s.ok()); + + // TEST LinkFile() and AreFilesSame()--------------------------- + std::vector f_names; + std::vector sst_names; + hybridOptions.env->GetChildren(dbPath, &f_names); + for(const std::string &name : f_names){ + Slice rest(name); + if(rest.ends_with(".sst")){ + sst_names.push_back(name); + } + } + + // sst_names should contains both posix and zenfs filenames where posix names in + // front and zenfs names in back. 
+ assert(!sst_names.empty()); + + std::string first_posix_sst_name = dbPath + "/" + sst_names[0]; + s = hybridOptions.env->LinkFile(first_posix_sst_name, + first_posix_sst_name + "_another.sst"); + assert(s.ok()); + bool same = false; + s = hybridOptions.env->AreFilesSame( + first_posix_sst_name, first_posix_sst_name + "_another.sst", &same); + assert(s.ok() && same); + + std::string first_zenfs_sst_name = dbPath + "/" + sst_names[sst_names.size() - 1]; + s = hybridOptions.env->LinkFile(first_zenfs_sst_name, + first_zenfs_sst_name + "_another.sst"); + assert(s.ok()); + s = hybridOptions.env->AreFilesSame( + first_zenfs_sst_name, first_zenfs_sst_name + "_another.sst", &same); + assert(s.ok() && same); + //------------------------------------------------------------- + + // Test transfer function since these 2 table files are in diff. mediums. + s = hybridOptions.env->AreFilesSame(first_posix_sst_name, + first_zenfs_sst_name, &same); + assert(s.ok()); + + // Close db + s = hybrid->Close(); + assert(s.ok()); + + // Remove and free ptr + delete hybrid; + + return 0; +} \ No newline at end of file diff --git a/file/read_write_util.cc b/file/read_write_util.cc index cc4f6b849..a131a1cfa 100644 --- a/file/read_write_util.cc +++ b/file/read_write_util.cc @@ -19,6 +19,7 @@ IOStatus NewWritableFile(FileSystem* fs, const std::string& fname, const FileOptions& options) { TEST_SYNC_POINT_CALLBACK("NewWritableFile::FileOptions.temperature", const_cast(&options.temperature)); + IOStatus s = fs->NewWritableFile(fname, options, result, nullptr); TEST_KILL_RANDOM_WITH_WEIGHT("NewWritableFile:0", REDUCE_ODDS2); return s; diff --git a/file/writable_file_writer.cc b/file/writable_file_writer.cc index 3afc51c56..cac06d7ef 100644 --- a/file/writable_file_writer.cc +++ b/file/writable_file_writer.cc @@ -76,15 +76,21 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, cap *= 2) { // See whether the next available size is large enough. 
// Buffer will never be increased to more than max_buffer_size_. - size_t desired_capacity = std::min(cap * 2, max_buffer_size_); + //size_t desired_capacity = std::min(cap * 2, max_buffer_size_); + size_t desired_capacity = 134217728; if (desired_capacity - buf_.CurrentSize() >= left || (use_direct_io() && desired_capacity == max_buffer_size_)) { + //printf("Extent buffer true now_capacity=%ld Capacity=%ld CurrentSize=%ld left=%ld\n", + // buf_.Capacity() + desired_capacity, buf_.Capacity(), buf_.CurrentSize(), left); buf_.AllocateNewBuffer(desired_capacity, true); + break; } } } +// printf("use_direct_io? %d\n", use_direct_io()); + // Flush only when buffered I/O if (!use_direct_io() && (buf_.Capacity() - buf_.CurrentSize()) < left) { if (buf_.CurrentSize() > 0) { @@ -139,6 +145,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, // or we simply use it for its original purpose to accumulate many small // chunks if (use_direct_io() || (buf_.Capacity() >= left)) { + // printf("postion3\n"); while (left > 0) { size_t appended = buf_.Append(src, left); if (perform_data_verification_ && buffered_data_with_checksum_) { @@ -149,6 +156,7 @@ IOStatus WritableFileWriter::Append(const Slice& data, uint32_t crc32c_checksum, src += appended; if (left > 0) { + printf("position4 Capacity=%ld CurrentSize=%ld left=%ld\n", buf_.Capacity(), buf_.CurrentSize(), left); //avoid enter here s = Flush(op_rate_limiter_priority); if (!s.ok()) { break; diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index bef60a212..40839941b 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -276,6 +276,8 @@ class Env : public Customizable { std::unique_ptr* result, const EnvOptions& options) = 0; + virtual Status SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list) = 0; // Create an object that writes to a file with the specified name. 
// `WritableFile::Append()`s will append after any existing content. If the // file does not already exist, creates it. @@ -1435,6 +1437,10 @@ class EnvWrapper : public Env { const EnvOptions& options) override { return target_.env->NewWritableFile(f, r, options); } + Status SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list) override { + return target_.env->SetFileLifetime(fname, lifetime, clock, flag, level, overlap_list); + } Status ReopenWritableFile(const std::string& fname, std::unique_ptr* result, const EnvOptions& options) override { diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index ee8362eab..01f4267a7 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -160,6 +160,8 @@ struct FileOptions : EnvOptions { // handoff during file writes. ChecksumType handoff_checksum_type; + uint64_t lifetime = 0;  // defaulted so constructors that do not set it never leave an indeterminate value + FileOptions() : EnvOptions(), handoff_checksum_type(ChecksumType::kCRC32c) {} FileOptions(const DBOptions& opts) @@ -363,7 +365,8 @@ class FileSystem : public Customizable { const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) = 0; - + virtual IOStatus SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list) = 0; // Create an object that writes to a file with the specified name. // `FSWritableFile::Append()`s will append after any existing content. If the // file does not already exist, creates it. 
@@ -1340,6 +1343,10 @@ class FileSystemWrapper : public FileSystem { IODebugContext* dbg) override { return target_->NewWritableFile(f, file_opts, r, dbg); } + IOStatus SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list) override { + return target_->SetFileLifetime(fname, lifetime, clock, flag, level, overlap_list); + } IOStatus ReopenWritableFile(const std::string& fname, const FileOptions& file_opts, std::unique_ptr* result, diff --git a/level_pic.py b/level_pic.py new file mode 100644 index 000000000..bff690c17 --- /dev/null +++ b/level_pic.py @@ -0,0 +1,27 @@ +import matplotlib.pyplot as plt +import numpy as np + + +num_list = [] +x_list = [] +for line in open("level.out"): + x_list.append(int(line.split(' ')[0])) + num_list.append(int(line.split(' ')[1])) +print(len(num_list)) +print(len(x_list)) +x = np.array(x_list) +y = np.array(num_list) + + + +font_size=18 +plt.xticks(fontsize=18) +plt.yticks(fontsize=18) +plt.rcParams["font.family"] = "serif" +plt.rcParams["font.serif"] = ["Times New Roman"] +plt.ylim([-0.5, 5.5]) +#plt.xlim([9800, 9850]) +plt.xlabel('FC-Ticks', fontsize=font_size) +plt.ylabel('Level', fontsize=font_size) +plt.scatter(x, y, s=40) +plt.show() \ No newline at end of file diff --git a/lifetime_pic.py b/lifetime_pic.py new file mode 100644 index 000000000..42243cb11 --- /dev/null +++ b/lifetime_pic.py @@ -0,0 +1,329 @@ +from curses import keyname +from pickle import BINSTRING +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.ticker import MaxNLocator +from matplotlib.pyplot import MultipleLocator +x_list = [] +type_list = [] +lifetime_list = [] +predict_list = [] +real_type = [] + +def addtwodimdict(thedict, key_a, key_b, val): + if key_a in thedict:  # was the undefined name `adic`, which raised NameError on any call + thedict[key_a].update({key_b: val}) + else: + thedict.update({key_a:{key_b: val}}) + +for line in open("lifetime.out"): + #print(line.split(' ')) + x_list.append(int(line.split(' ')[0])) + 
predict_list.append(int(line.split(' ')[1])) + type_list.append(int(line.split(' ')[2])) + lifetime_list.append(int(line.split(' ')[3])) + real_type.append(int(line.split(' ')[4])) + + +ans_list = [] +real_distribution = [] +right = 0 +for i in range(0, len(x_list)): + ans_list.append(predict_list[i] - lifetime_list[i]) + if(predict_list[i] - lifetime_list[i] >= -25 and predict_list[i] - lifetime_list[i] <= 25): + right = right + 1 + real_distribution.append(lifetime_list[i]) + + +print("total right rate=%.2lf"%(right / len(x_list))) +font_size=24 +ticks_size=18 + +plt.rcParams["font.family"] = "serif" +plt.rcParams["font.serif"] = ["Times New Roman"] +plt.figure(figsize=(6.5, 3.2)) +plt.xticks(fontsize=ticks_size, weight="bold") +plt.yticks(fontsize=ticks_size, weight="bold") +plt.xlim([-200, 200]) +plt.ylim([0, 25000]) +plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") +plt.ylabel('Number', fontsize=font_size, weight="bold") +plt.subplots_adjust(top=0.94, right=0.965, left=0.14, bottom=0.26) +plt.gca().set_yticklabels([str(int(y / 1000)) + "k" for y in plt.gca().get_yticks()]) +plt.hist(ans_list, bins=1000, color="gold") +plt.show() + + +plt.rcParams["font.family"] = "serif" +plt.rcParams["font.serif"] = ["Times New Roman"] +plt.figure(figsize=(6.5, 3.2)) +plt.xticks(fontsize=ticks_size, weight="bold") +plt.yticks(fontsize=ticks_size, weight="bold") +plt.xlim([-100, 1200]) +plt.ylim([0, 15000]) +plt.gca().set_yticklabels([str(int(y / 1000)) + "k" for y in plt.gca().get_yticks()]) +plt.xlabel('Real lifetime', fontsize=font_size, weight="bold") +plt.ylabel('Number', fontsize=font_size, weight="bold") +plt.subplots_adjust(top=0.94, right=0.96, left=0.14, bottom=0.26) +plt.hist(real_distribution, bins=1000, color="purple") +plt.show() + +for l in range(0, 7): #each level + kind_correct = { + "0": {"0": 0, "-1": 0}, + "-1": {"0": 0, "-1": 0}, + "1": {"0": 0, "-1": 0}, + "2": {"0": 0, "-1": 0}, + "3": {"0": 0, "-1": 0}, + "4": {"0": 
0, "-1": 0}, + "5": {"0": 0, "-1": 0}, + "-2": {"0": 0, "-1": 0}, + "-3": {"0": 0, "-1": 0}, + "-4": {"0": 0, "-1": 0}, + "-5": {"0": 0, "-1": 0}, + "-6": {"0": 0, "-1": 0} + } + kind_num = { + "0": {"0": 0, "-1": 0}, + "-1": {"0": 0, "-1": 0}, + "1": {"0": 0, "-1": 0}, + "2": {"0": 0, "-1": 0}, + "3": {"0": 0, "-1": 0}, + "4": {"0": 0, "-1": 0}, + "5": {"0": 0, "-1": 0}, + "-2": {"0": 0, "-1": 0}, + "-3": {"0": 0, "-1": 0}, + "-4": {"0": 0, "-1": 0}, + "-5": {"0": 0, "-1": 0}, + "-6": {"0": 0, "-1": 0} + } + kind_ave = { + "0": {"0": 0, "-1": 0}, + "-1": {"0": 0, "-1": 0}, + "1": {"0": 0, "-1": 0}, + "2": {"0": 0, "-1": 0}, + "3": {"0": 0, "-1": 0}, + "4": {"0": 0, "-1": 0}, + "5": {"0": 0, "-1": 0}, + "-2": {"0": 0, "-1": 0}, + "-3": {"0": 0, "-1": 0}, + "-4": {"0": 0, "-1": 0}, + "-5": {"0": 0, "-1": 0}, + "-6": {"0": 0, "-1": 0} + } + tot = 0 + correct = 0 + sum = 0 + data1_list = [] #short-lived + data1_list_miss = [] + data2_list = [] #current level compaction + data2_list_miss = [] + data3_list = [] #upper level compaction + data3_list_miss = [] + data4_list = [] #trivial compaction + data4_list_miss = [] #trivial compaction + data5_list = [] + data5_list_miss = [] + THRESHOLD = 5 * (l + 1) + + level0_num = 0 #compacted by current level num + leveln1_num = 0 #compacted by top level num + level1_num = 0 #compacted by unknow file + for i in range(0, len(x_list)): + + if x_list[i] == l: + key1=str(type_list[i]) + key2=str(real_type[i]) + sum = sum + lifetime_list[i] + kind_ave[key1][key2] += lifetime_list[i] + diff = predict_list[i] - lifetime_list[i] + + if type_list[i] == 1: + if real_type[i] == -1: + data1_list.append(diff) + else: + data1_list_miss.append(diff) + elif type_list[i] == 2: + if real_type[i] == -1: + data2_list.append(diff) + else: + data2_list_miss.append(diff) + elif type_list[i] == 3: + if real_type[i] == -1: #compacted + data3_list.append(diff) + else: + data3_list_miss.append(diff) + elif type_list[i] == 4: + if real_type[i] == -1: + 
data4_list.append(diff) + else: + data4_list_miss.append(diff) #type_list[i] == 0 but real_type[i] == 1 + elif type_list[i] == 5: + if real_type[i] == -1: + data5_list.append(diff) + else: + data5_list_miss.append(diff) + tot = tot + 1 + kind_num[key1][key2] += 1 + if real_type[i] == -1: + leveln1_num += 1 + elif real_type[i] == 1: + level1_num += 1 + else: + level0_num += 1 + if diff >= -THRESHOLD and diff <= THRESHOLD: + correct = correct + 1 + kind_correct[key1][key2] += 1 + + if tot == 0: + continue + print("level %d correct_rate=%.3lf average_lifetime=%.3lf type-1_num=%d type0_num=%d type1_num=%d" % (l, correct / tot, sum / tot, leveln1_num, level0_num, level1_num)) #right number + for key1 in kind_num: + for key2 in kind_num[key1]: + if(kind_num[key1][key2] != 0): + key11 = "" + key22 = "" + if key1 == "1": + key11 = "2B" + elif key1 == "2": + key11 = "1" + elif key1 == "3": + key11 = "2A" + else: + key11 = "3" + if key2 == "0": + key22 = "1" + else: + key22 = "2" + print("P%s D%s Accuracy=%.3lf Predict_Ave=%.3lf num=%d" % (key11, key22, kind_correct[key1][key2] / kind_num[key1][key2], kind_ave[key1][key2] / kind_num[key1][key2], kind_num[key1][key2])) + if(l != 4): + continue + bins_num = 50 + + #P1D1 + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True)) + plt.subplots_adjust(top=0.94, right=0.99, left=0.14, bottom=0.26) + if((len(data2_list_miss) != 0)): + plt.hist(data2_list_miss, bins=bins_num, color="green") #current + plt.show() + + #P1D2 + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New 
Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True)) + plt.subplots_adjust(top=0.94, right=0.99, left=0.125, bottom=0.26) + if(len(data2_list) != 0): #short-lived + plt.hist(data2_list, bins=bins_num, color="yellow") #upper + plt.show() + + + #P2AD1 + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MultipleLocator(10)) + plt.subplots_adjust(top=0.94, right=0.99, left=0.125, bottom=0.26) + if(len(data3_list_miss) != 0): + plt.hist(data3_list_miss, bins=bins_num, color="purple") #current + plt.show() + + #P2AD2 + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MultipleLocator(300)) + plt.subplots_adjust(top=0.94, right=0.99, left=0.16, bottom=0.26) + if(len(data3_list) != 0): + plt.hist(data3_list, bins=bins_num, color="blue") #upper + plt.show() + + + #P2BD1 + plt.rcParams["font.family"] = "serif" 
+ plt.rcParams["font.serif"] = ["Times New Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True)) + plt.subplots_adjust(top=0.94, right=0.99, left=0.11, bottom=0.26) + if((len(data1_list_miss) != 0)): + plt.hist(data1_list_miss, bins=bins_num, color="orange") #current + plt.show() + + #P2BD2 + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True)) + plt.subplots_adjust(top=0.94, right=0.99, left=0.14, bottom=0.26) + if(len(data1_list) != 0): #short-lived + plt.hist(data1_list, bins=bins_num, color="red") #upper + plt.show() + + #P3AD1 + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True)) + plt.subplots_adjust(top=0.94, right=0.97, left=0.1, bottom=0.265) + if(len(data4_list_miss) != 0): + plt.hist(data4_list_miss, bins=bins_num, color="black") #current + 
plt.show() + + #P3AD2 + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.figure(figsize=(6.5, 3.2)) + plt.xlabel('Lifetime prediction error (FC-ticks)', fontsize=font_size, weight="bold") + plt.ylabel('Number', fontsize=font_size, weight="bold") + plt.xticks(fontsize=ticks_size, weight="bold") + plt.yticks(fontsize=ticks_size, weight="bold") + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.gca().yaxis.set_major_locator(MaxNLocator(integer=True)) + plt.subplots_adjust(top=0.94, right=0.99, left=0.125, bottom=0.26) + if(len(data4_list) != 0): + plt.hist(data4_list, bins=bins_num, color="pink") # upper + plt.show() + + + + + +#4 [0-9]* 0 [0-9]* -1 diff --git a/logging/posix_logger.h b/logging/posix_logger.h new file mode 100644 index 000000000..44ff3d32c --- /dev/null +++ b/logging/posix_logger.h @@ -0,0 +1,179 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Logger implementation that can be shared by all environments +// where enough posix functionality is available. 
+ +#pragma once +#include +#include +#include "port/sys_time.h" +#include +#include + +#ifdef OS_LINUX +#ifndef FALLOC_FL_KEEP_SIZE +#include +#endif +#endif + +#include +#include "env/io_posix.h" +#include "monitoring/iostats_context_imp.h" +#include "rocksdb/env.h" +#include "test_util/sync_point.h" + +namespace ROCKSDB_NAMESPACE { + +class PosixLogger : public Logger { + private: + Status PosixCloseHelper() { + int ret; + + ret = fclose(file_); + if (ret) { + return IOError("Unable to close log file", "", ret); + } + return Status::OK(); + } + FILE* file_; + uint64_t (*gettid_)(); // Return the thread id for the current thread + std::atomic_size_t log_size_; + int fd_; + const static uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + std::atomic flush_pending_; + + protected: + virtual Status CloseImpl() override { return PosixCloseHelper(); } + + public: + PosixLogger(FILE* f, uint64_t (*gettid)(), Env* env, + const InfoLogLevel log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(f), + gettid_(gettid), + log_size_(0), + fd_(fileno(f)), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + virtual ~PosixLogger() { + if (!closed_) { + closed_ = true; + PosixCloseHelper().PermitUncheckedError(); + } + } + virtual void Flush() override { + TEST_SYNC_POINT("PosixLogger::Flush:Begin1"); + TEST_SYNC_POINT("PosixLogger::Flush:Begin2"); + if (flush_pending_) { + flush_pending_ = false; + fflush(file_); + } + last_flush_micros_ = env_->NowMicros(); + } + + using Logger::Logv; + virtual void Logv(const char* format, va_list ap) override { + IOSTATS_TIMER_GUARD(logger_nanos); + + const uint64_t thread_id = (*gettid_)(); + + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. 
+ char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 65536; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + port::TimeVal now_tv; + port::GetTimeOfDay(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + port::LocalTimeR(&seconds, &t); + p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %llu ", + t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, + t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), + static_cast(thread_id)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + +#ifdef ROCKSDB_FALLOCATE_PRESENT + const int kDebugLogChunkSize = 128 * 1024; + + // If this write would cross a boundary of kDebugLogChunkSize + // space, pre-allocate more space to avoid overly large + // allocations from filesystem allocsize options. 
+ const size_t log_size = log_size_; + const size_t last_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize); + const size_t desired_allocation_chunk = + ((kDebugLogChunkSize - 1 + log_size + write_size) / + kDebugLogChunkSize); + if (last_allocation_chunk != desired_allocation_chunk) { + fallocate( + fd_, FALLOC_FL_KEEP_SIZE, 0, + static_cast(desired_allocation_chunk * kDebugLogChunkSize)); + } +#endif + + size_t sz = fwrite(base, 1, write_size, file_); + flush_pending_ = true; + if (sz > 0) { + log_size_ += write_size; + } + uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + Flush(); + } + if (base != buffer) { + delete[] base; + } + break; + } + } + size_t GetLogFileSize() const override { return log_size_; } +}; + +} // namespace ROCKSDB_NAMESPACE \ No newline at end of file diff --git a/memory/arena.h b/memory/arena.h index 1de04c477..74fd45279 100644 --- a/memory/arena.h +++ b/memory/arena.h @@ -28,7 +28,7 @@ namespace ROCKSDB_NAMESPACE { class Arena : public Allocator { public: // No copying allowed - Arena(const Arena&) = delete; + //Arena(const Arena&) = delete; void operator=(const Arena&) = delete; static const size_t kInlineSize = 2048; diff --git a/nullblk-zoned-remove.sh b/nullblk-zoned-remove.sh new file mode 100644 index 000000000..2436d30dd --- /dev/null +++ b/nullblk-zoned-remove.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +if [ $# != 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +nid=$1 + +if [ ! 
-b "/dev/nullb$nid" ]; then + echo "/dev/nullb$nid: No such device" + exit 1 +fi + +echo 0 > /sys/kernel/config/nullb/nullb$nid/power +rmdir /sys/kernel/config/nullb/nullb$nid + +echo "Destroyed /dev/nullb$nid" + diff --git a/nullblk-zoned.sh b/nullblk-zoned.sh new file mode 100644 index 000000000..a1aa29d2d --- /dev/null +++ b/nullblk-zoned.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +if [ $# != 7 ]; then + echo "Usage: $0 " + exit 1 +fi + +scriptdir=$(cd $(dirname "$0") && pwd) + +modprobe null_blk nr_devices=0 || return $? + +function create_zoned_nullb() +{ + local nid=0 + local bs=$1 + local zs=$2 + local zc=$3 + local max_open_zones=$4 + local max_active_zones=$5 + local nr_conv=$6 + local nr_seq=$7 + + cap=$(( zs * (nr_conv + nr_seq) )) + + while [ 1 ]; do + if [ ! -b "/dev/nullb$nid" ]; then + break + fi + nid=$(( nid + 1 )) + done + + dev="/sys/kernel/config/nullb/nullb$nid" + mkdir "$dev" + + echo $bs > "$dev"/blocksize + echo 0 > "$dev"/completion_nsec + echo 0 > "$dev"/irqmode + echo 2 > "$dev"/queue_mode + echo 1024 > "$dev"/hw_queue_depth + echo 1 > "$dev"/memory_backed + echo 1 > "$dev"/zoned + + echo $cap > "$dev"/size + echo $zs > "$dev"/zone_size + echo $zc > "$dev"/zone_capacity + echo $max_open_zones > "$dev"/zone_max_open + echo $max_active_zones > "$dev"/zone_max_active + echo $nr_conv > "$dev"/zone_nr_conv + + echo 1 > "$dev"/power + + echo mq-deadline > /sys/block/nullb$nid/queue/scheduler + + echo "$nid" +} + +nulldev=$(create_zoned_nullb $1 $2 $3 $4 $5 $6 $7) +echo "Created /dev/nullb$nulldev" + diff --git a/out.txtand_compaction b/out.txtand_compaction new file mode 100644 index 000000000..e69de29bb diff --git a/real_lifetime.py b/real_lifetime.py new file mode 100644 index 000000000..d304f8409 --- /dev/null +++ b/real_lifetime.py @@ -0,0 +1,74 @@ +from curses import keyname +from pickle import BINSTRING +import matplotlib.pyplot as plt +import numpy as np +x_list = [] +type_list = [] +lifetime_list = [] +predict_list = [] +real_type = [] 
+level_number = [] + +for line in open("lifetime.out"): + #print(line.split(' ')) + x_list.append(int(line.split(' ')[0])) + predict_list.append(int(line.split(' ')[1])) + type_list.append(int(line.split(' ')[2])) + lifetime_list.append(int(line.split(' ')[3])) + real_type.append(int(line.split(' ')[4])) + level_number.append(int(line.split(' ')[7])) + +tot_cnt = 0 +for l in range(0, 6): #each level + real_lifetime_list_0 = [] + real_lifetime_list_n1 = [] + sum = 0 + sum0 = 0 + sum1 = 0 + tot = 0 + cnt0 = 0 + cntn1 = 0 + num = 0 + for i in range(0, len(x_list)): + if x_list[i] == l: + if real_type[i] == 0: + real_lifetime_list_0.append(lifetime_list[i]) + cnt0 += 1 + sum0 += lifetime_list[i] + else: + real_lifetime_list_n1.append(lifetime_list[i]) + cntn1 += 1 + sum1 += lifetime_list[i] + sum += lifetime_list[i] + tot += 1 + if level_number[i] > num: + num = level_number[i] + d = 0 if tot == 0 else sum / tot + d1 = 0 if cnt0 == 0 else sum0 / cnt0 + d2 = 0 if cntn1 == 0 else sum1 / cntn1 + print("level=%d num=%d all_ave=%d ave_0=%d ave_n1=%d cnt0=%d cntn1=%d" % (l, num, d, d1, d2, cnt0, cntn1)) + tot_cnt += num + + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.xlabel('Real lifetime', fontsize=24, weight="bold") + plt.ylabel('Number', fontsize=24, weight="bold") + plt.xticks(fontsize=18, weight="bold") + plt.yticks(fontsize=18, weight="bold") + plt.subplots_adjust(top=0.94, right=0.96, left=0.16, bottom=0.15) + if(len(real_lifetime_list_0) != 0): + plt.hist(real_lifetime_list_0, bins=50, color="gold") + plt.show() + + plt.rcParams["font.family"] = "serif" + plt.rcParams["font.serif"] = ["Times New Roman"] + plt.xlabel('Real lifetime', fontsize=24, weight="bold") + plt.ylabel('Number', fontsize=24, weight="bold") + plt.xticks(fontsize=18, weight="bold") + plt.yticks(fontsize=18, weight="bold") + plt.subplots_adjust(top=0.94, right=0.96, left=0.16, bottom=0.15) + if(len(real_lifetime_list_n1) != 0): + 
plt.hist(real_lifetime_list_n1, bins=50, color="purple") + plt.show() + +print(tot_cnt) \ No newline at end of file diff --git a/sudo b/sudo new file mode 100644 index 000000000..75f3738d9 --- /dev/null +++ b/sudo @@ -0,0 +1 @@ +deadline /sys/class/block/nullb0/queue/scheduler diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index da81cb254..237f14b0b 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -922,7 +922,9 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { #endif // !NDEBUG auto should_flush = r->flush_block_policy->Update(key, value); + //auto should_flush = 0; if (should_flush) { + //printf("should_flush_called\n"); assert(!r->data_block.empty()); r->first_key_in_next_block = &key; Flush(); diff --git a/test_util/testutil.h b/test_util/testutil.h index dc02b84b1..a42a9647c 100644 --- a/test_util/testutil.h +++ b/test_util/testutil.h @@ -459,7 +459,7 @@ class FilterNumber : public CompactionFilter { private: mutable std::string last_merge_operand_key_; - uint64_t num_; +uint64_t num_; }; inline std::string EncodeInt(uint64_t x) { diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 5d662ab79..ad462a86a 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -3231,7 +3231,13 @@ class Benchmark { // ---------------------------- // | key 00000 | // ---------------------------- + //num_keys: key_number that need to generate + // void GenerateKeyFromInt(uint64_t v, int64_t num_keys, Slice* key) { + // const int range = 1500000; + // if(v > range) { + // v %= range; + // } if (!keys_.empty()) { assert(FLAGS_use_existing_keys); assert(keys_.size() == static_cast(num_keys)); @@ -3239,6 +3245,7 @@ class Benchmark { *key = keys_[v]; return; } + char* start = const_cast(key->data()); char* pos = start; if (keys_per_prefix_ > 0) { @@ -3260,6 +3267,7 @@ class Benchmark { } int bytes_to_fill = 
std::min(key_size_ - static_cast(pos - start), 8); + //printf("GenerateKeyFromInt v=%ld key_size_=%d bytes_to_fill=%d num_keys=%ld keys_per_prefix=%ld port::kLittleEndian=%d start=%s\n", v, key_size_, bytes_to_fill, num_keys, keys_per_prefix_, port::kLittleEndian, start); if (port::kLittleEndian) { for (int i = 0; i < bytes_to_fill; ++i) { pos[i] = (v >> ((bytes_to_fill - i - 1) << 3)) & 0xFF; diff --git a/utilities/counted_fs.h b/utilities/counted_fs.h index cb8a8968f..27ecb9153 100644 --- a/utilities/counted_fs.h +++ b/utilities/counted_fs.h @@ -11,7 +11,7 @@ #include "rocksdb/file_system.h" #include "rocksdb/io_status.h" #include "rocksdb/rocksdb_namespace.h" - +#include namespace ROCKSDB_NAMESPACE { class Logger; @@ -94,7 +94,11 @@ class CountedFileSystem : public FileSystemWrapper { IOStatus NewSequentialFile(const std::string& f, const FileOptions& options, std::unique_ptr* r, IODebugContext* dbg) override; - + IOStatus SetFileLifetime(std::string fname, + uint64_t lifetime, int clock, bool flag, int level, std::vector overlap_list) { + std::cout << fname << lifetime << '\n'; + return IOStatus::NotSupported("SetFileLifetime"); + } IOStatus NewRandomAccessFile(const std::string& f, const FileOptions& file_opts, std::unique_ptr* r, diff --git a/utilities/my_logger.h b/utilities/my_logger.h new file mode 100644 index 000000000..b472dbe9e --- /dev/null +++ b/utilities/my_logger.h @@ -0,0 +1,18 @@ +#include +#include +#include +#include +#include + + +const int LOGMODE = 0; + +void my_printf(const char* format, ...) +{ + printf("Debug => "); + va_list vp; + va_start(vp, format); + vprintf (format, vp); + va_end (vp); + printf ("\n"); +}