From 2b50e3528fb374d375cb4aa7f5d84d60719db312 Mon Sep 17 00:00:00 2001
From: Hui Xiao
Date: Thu, 15 Aug 2024 12:25:48 -0700
Subject: [PATCH] fix

---
 db/db_impl/db_impl.h       |  8 ++++
 db/db_impl/db_impl_open.cc | 38 ++++++++++++++++++
 env/io_posix.cc            | 84 ++++++++++++++++++++++++++++++++------
 env/io_posix.h             | 18 ++++++++
 4 files changed, 134 insertions(+), 14 deletions(-)

diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index e3eb3253e6ad..2ce5efae2f19 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -2943,6 +2943,14 @@ DBOptions SanitizeOptions(const std::string& db, const DBOptions& src,
                         bool read_only = false,
                         Status* logger_creation_s = nullptr);
 
+#ifdef OS_LINUX
+// Returns, in bytes, the smallest `max_sectors_kb` limit among the directories
+// in `db_paths` that were inspected before the first failure; 0 if none was.
+// Used by SanitizeOptions() to cap `compaction_readahead_size`.
+size_t GetCompactionReadaheadSizeSystemLimit(
+    const std::vector<DbPath>& db_paths);
+#endif  // OS_LINUX
+
 CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions,
                                     const MutableCFOptions& mutable_cf_options);
 
diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc
index 8ef024e7ba8d..938a59e01228 100644
--- a/db/db_impl/db_impl_open.cc
+++ b/db/db_impl/db_impl_open.cc
@@ -13,6 +13,7 @@
 #include "db/error_handler.h"
 #include "db/periodic_task_scheduler.h"
 #include "env/composite_env_wrapper.h"
+#include "env/io_posix.h"
 #include "file/filename.h"
 #include "file/read_write_util.h"
 #include "file/sst_file_manager_impl.h"
@@ -144,6 +145,22 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
     result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
   }
 
+#ifdef OS_LINUX
+  if (result.compaction_readahead_size > 0) {
+    size_t system_limit =
+        GetCompactionReadaheadSizeSystemLimit(result.db_paths);
+    if (system_limit > 0 && result.compaction_readahead_size > system_limit) {
+      result.compaction_readahead_size = system_limit;
+      std::stringstream msg;
+      msg << "Compaction readahead size is set to no more than the POSIX "
+             "system limit (i.e, max_sectors_kb * 1024) "
+             ": "
+          << result.compaction_readahead_size;
+      ROCKS_LOG_INFO(result.info_log, "%s", msg.str().c_str());
+    }
+  }
+#endif  // OS_LINUX
+
   // Force flush on DB open if 2PC is enabled, since with 2PC we have no
   // guarantee that consecutive log files have consecutive sequence id, which
   // make recovery complicated.
@@ -200,6 +217,27 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
   return result;
 }
 
+#ifdef OS_LINUX
+size_t GetCompactionReadaheadSizeSystemLimit(
+    const std::vector<DbPath>& db_paths) {
+  Status s;
+  size_t max_sectors_kb = 0;
+
+  for (const auto& db_path : db_paths) {
+    size_t dir_max_sectors_kb = 0;
+    s = PosixHelper::GetMaxSectorsKBOfDirectory(db_path.path,
+                                                &dir_max_sectors_kb);
+    if (!s.ok()) {
+      break;
+    }
+    max_sectors_kb = (max_sectors_kb == 0)
+                         ? dir_max_sectors_kb
+                         : std::min(max_sectors_kb, dir_max_sectors_kb);
+  }
+  return max_sectors_kb * 1024;
+}
+#endif  // OS_LINUX
+
 namespace {
 Status ValidateOptionsByTable(
     const DBOptions& db_opts,
diff --git a/env/io_posix.cc b/env/io_posix.cc
index a509a1aa260d..650562b89be4 100644
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@@ -323,6 +323,11 @@ IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
  * PosixRandomAccessFile
  */
+// Defined ahead of the OS_LINUX-only section: GetLogicalBlockSizeOfFd() and
+// GetMaxSectorsKBOfFd() reference these names on every platform.
+const std::string PosixHelper::kLogicalBlockSizeFileName = "logical_block_size";
+const std::string PosixHelper::kMaxSectorsKBFileName = "max_sectors_kb";
+
 #if defined(OS_LINUX)
 size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
   if (max_size < kMaxVarint64Length * 3) {
     return 0;
@@ -455,38 +460,62 @@ size_t LogicalBlockSizeCache::GetLogicalBlockSize(const std::string& fname,
 
 Status PosixHelper::GetLogicalBlockSizeOfDirectory(const std::string& directory,
                                                    size_t* size) {
+  return GetQueueSysfsFileValueofDirectory(directory, kLogicalBlockSizeFileName,
+                                           size);
+}
+
+Status PosixHelper::GetMaxSectorsKBOfDirectory(const std::string& directory,
+                                               size_t* kb) {
+  return GetQueueSysfsFileValueofDirectory(directory, kMaxSectorsKBFileName,
+                                           kb);
+}
+
+Status PosixHelper::GetQueueSysfsFileValueofDirectory(
+    const std::string& directory, const std::string& file_name, size_t* value) {
   int fd = open(directory.c_str(), O_DIRECTORY | O_RDONLY);
   if (fd == -1) {
     return Status::IOError("Cannot open directory " + directory);
   }
-  *size = PosixHelper::GetLogicalBlockSizeOfFd(fd);
+  if (file_name == PosixHelper::kLogicalBlockSizeFileName) {
+    *value = PosixHelper::GetLogicalBlockSizeOfFd(fd);
+  } else if (file_name == PosixHelper::kMaxSectorsKBFileName) {
+    *value = PosixHelper::GetMaxSectorsKBOfFd(fd);
+  } else {
+    assert(false);
+  }
   close(fd);
   return Status::OK();
 }
 
-size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
+size_t PosixHelper::GetQueueSysfsFileValueOfFd(int fd,
+                                               const std::string& file_name,
+                                               size_t default_return_value) {
 #ifdef OS_LINUX
   struct stat buf;
   int result = fstat(fd, &buf);
   if (result == -1) {
-    return kDefaultPageSize;
+    return default_return_value;
   }
+
+  // Get device number
   if (major(buf.st_dev) == 0) {
     // Unnamed devices (e.g. non-device mounts), reserved as null device number.
     // These don't have an entry in /sys/dev/block/. Return a sensible default.
-    return kDefaultPageSize;
+    return default_return_value;
   }
 
-  // Reading queue/logical_block_size does not require special permissions.
+  // Get device path
   const int kBufferSize = 100;
   char path[kBufferSize];
   char real_path[PATH_MAX + 1];
   snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
            minor(buf.st_dev));
   if (realpath(path, real_path) == nullptr) {
-    return kDefaultPageSize;
+    return default_return_value;
   }
   std::string device_dir(real_path);
+
+  // Get the queue sysfs file path
   if (!device_dir.empty() && device_dir.back() == '/') {
     device_dir.pop_back();
   }
@@ -500,11 +529,11 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
   // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
   size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
   if (parent_end == std::string::npos) {
-    return kDefaultPageSize;
+    return default_return_value;
   }
   size_t parent_begin = device_dir.rfind('/', parent_end - 1);
   if (parent_begin == std::string::npos) {
-    return kDefaultPageSize;
+    return default_return_value;
   }
   std::string parent =
       device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
@@ -513,25 +542,52 @@ size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
       (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
     device_dir = device_dir.substr(0, parent_end);
   }
-  std::string fname = device_dir + "/queue/logical_block_size";
+  std::string fname = device_dir + "/queue/" + file_name;
+
+  // Get value in the queue sysfs file
   FILE* fp;
-  size_t size = 0;
+  size_t value = 0;
   fp = fopen(fname.c_str(), "r");
   if (fp != nullptr) {
     char* line = nullptr;
     size_t len = 0;
     if (getline(&line, &len, fp) != -1) {
-      sscanf(line, "%zu", &size);
+      sscanf(line, "%zu", &value);
     }
     free(line);
     fclose(fp);
   }
-  if (size != 0 && (size & (size - 1)) == 0) {
-    return size;
+  // Accept the parsed value only if it is valid for the requested file. A
+  // missing, unreadable or unparsable sysfs file leaves `value` at 0; that is
+  // a runtime condition rather than a programming error, so fall through to
+  // `default_return_value` instead of asserting.
+  if (file_name == kLogicalBlockSizeFileName) {
+    // A logical block size must be a non-zero power of two.
+    if (value != 0 && (value & (value - 1)) == 0) {
+      return value;
+    }
+  } else if (file_name == kMaxSectorsKBFileName) {
+    if (value != 0) {
+      return value;
+    }
+  } else {
+    // Unknown queue file name: caller bug.
+    assert(false);
   }
 #endif
   (void)fd;
-  return kDefaultPageSize;
+  (void)file_name;
+  return default_return_value;
+}
+
+size_t PosixHelper::GetLogicalBlockSizeOfFd(int fd) {
+  return GetQueueSysfsFileValueOfFd(fd, kLogicalBlockSizeFileName,
+                                    kDefaultPageSize);
+}
+
+size_t PosixHelper::GetMaxSectorsKBOfFd(int fd) {
+  return GetQueueSysfsFileValueOfFd(fd, kMaxSectorsKBFileName,
+                                    kDefaultMaxSectorsKB);
 }
 
 /*
diff --git a/env/io_posix.h b/env/io_posix.h
index 603af2f885ac..dca7732a99b4 100644
--- a/env/io_posix.h
+++ b/env/io_posix.h
@@ -57,6 +57,24 @@ class PosixHelper {
   static size_t GetLogicalBlockSizeOfFd(int fd);
   static Status GetLogicalBlockSizeOfDirectory(const std::string& directory,
                                                size_t* size);
+  static size_t GetMaxSectorsKBOfFd(int fd);
+  static Status GetMaxSectorsKBOfDirectory(const std::string& directory,
+                                           size_t* kb);
+
+ private:
+  static const std::string kLogicalBlockSizeFileName;
+  static const std::string kMaxSectorsKBFileName;
+  static const size_t kDefaultMaxSectorsKB = 2 * 1024;
+
+  // Similar to `GetQueueSysfsFileValueOfFd()` but for directory
+  static Status GetQueueSysfsFileValueofDirectory(const std::string& directory,
+                                                  const std::string& file_name,
+                                                  size_t* value);
+  // Return the value in the specified file `file_name` under
+  // `/sys/block/xxx/queue/` for the device where the file of `fd` is on.
+  // If not found, then return the specified `default_return_value`
+  static size_t GetQueueSysfsFileValueOfFd(int fd, const std::string& file_name,
+                                           size_t default_return_value);
 };
 
 /*