From 3a90482f04b97069c28964c082dc1efd3dc3baf7 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 21:33:49 +0000 Subject: [PATCH] Fix wlog header init issues --- src/image.cc | 4 +- src/imgtool.cc | 80 +------------------- src/write_cache.cc | 179 +++++++++++++++++++++++---------------------- src/write_cache.h | 7 +- 4 files changed, 95 insertions(+), 175 deletions(-) diff --git a/src/image.cc b/src/image.cc index 7e5098ef..c2ba0d25 100644 --- a/src/image.cc +++ b/src/image.cc @@ -35,7 +35,7 @@ lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) map_lock, bufmap, bufmap_lock, last_data_seq, clones, obj_info, checkpoints); - wlog = open_wlog(cfg.wlog_path(name), cfg.wlog_size / 4096, *xlate, cfg); + wlog = open_wlog(cfg.wlog_path(name), *xlate, cfg); THROW_MSG_ON(!wlog, "Failed to open write log"); // recover_from_wlog(); @@ -468,7 +468,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request { assert(parent == nullptr); - img->wlog->get_room(req_bytes / 512); + img->wlog->reserve_room(req_bytes / 512); img->xlate->backend_backpressure(); sector_t size_sectors = req_bytes / 512; diff --git a/src/imgtool.cc b/src/imgtool.cc index 11377984..b3517884 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -15,18 +15,10 @@ #include "translate.h" #include "utils.h" -enum tool_operation { - OP_CREATE = 1, - OP_DELETE = 2, - OP_INFO = 3, - OP_MKCACHE = 4, - OP_CLONE = 5 -}; +enum tool_operation { OP_CREATE = 1, OP_DELETE = 2, OP_INFO = 3, OP_CLONE = 5 }; const char *backend = "rados"; const char *image_name; -const char *cache_dir; -const char *cache_dev; cfg_cache_type cache_type = LSVD_CFG_READ; enum tool_operation op; const char *pool_name = "lsvd"; @@ -46,11 +38,7 @@ static long parseint(const char *_s) } static struct argp_option options[] = { - {"cache-dir", 'd', "DIR", 0, "cache directory", 0}, {"create", 'C', 0, 0, "create image", 0}, - {"mkcache", 'k', "DEV", 0, "use DEV as cache", 0}, - {"cache-type", 't', "R/W", 0, - "R for read cache, W for write cache (default: R)", 0}, {"size", 'z', "SIZE", 0, "size in bytes (M/G=2^20,2^30)", 0}, {"delete", 'D', 0, 0, "delete image", 0}, {"info", 'I', 0, 0, "show image information", 0}, @@ -61,18 +49,12 @@ static struct argp_option options[] = { static char args_doc[] = "IMAGE"; -extern int init_wcache(int fd, uuid_t &uuid, int n_pages); -int (*make_cache)(int fd, uuid_t &uuid, int n_pages) = init_wcache; - static error_t parse_opt(int key, char *arg, struct argp_state *state) { switch (key) { case ARGP_KEY_ARG: image_name = arg; break; - case 'd': - cache_dir = arg; - break; case 'C': op = OP_CREATE; break; @@ -85,21 +67,6 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) case 'I': op = OP_INFO; break; - case 't': - if (arg[0] == 'R') { - cache_type = LSVD_CFG_READ; - log_error("read cache no longer supported"); - exit(1); - } else if (arg[0] == 'W') { - cache_type = LSVD_CFG_WRITE; - make_cache = init_wcache; - } else - argp_usage(state); - break; - case 'k': - op = OP_MKCACHE; - cache_dev = arg; - break; case 'c': op = OP_CLONE; break; @@ -171,41 +138,6 @@ void info(rados_ioctx_t io, const char *image_name) } } -void mk_cache(rados_ioctx_t io, const char *image_name, const char *dev_name, - cfg_cache_type type) -{ - int rv, fd = open(dev_name, O_RDWR); - if (fd < 0) { - perror("device file open"); - exit(1); - } - auto sz = getsize64(fd); - - lsvd_config cfg; - if ((rv = cfg.read()) < 0) { - printf("error reading config: %d\n", rv); - exit(1); - } - auto objstore = make_rados_backend(io); - uuid_t uu; - if ((rv = translate_get_uuid(objstore, image_name, uu)) < 0) { - printf("error reading superblock: %d\n", rv); - exit(1); - } - auto cache_file = cfg.cache_filename(uu, image_name, type); - - auto n_pages = sz / 4096; - if (make_cache(fd, uu, n_pages) < 0) { - printf("make_cache failed\n"); - exit(1); - } - if ((rv = symlink(dev_name, cache_file.c_str())) < 0) { - perror("symbolic link"); - exit(1); - } - close(fd); -} - int main(int argc, char **argv) { std::set_terminate([]() { @@ -218,14 +150,6 @@ int main(int argc, char **argv) argp_parse(&argp, argc, argv, 0, 0, 0); - setenv("LSVD_BACKEND", backend, 1); - if (cache_dir != NULL) { - if (cache_type == LSVD_CFG_READ) - setenv("LSVD_RCACHE_DIR", cache_dir, 1); - else - setenv("LSVD_WCACHE_DIR", cache_dir, 1); - } - rados_t cluster; int err = rados_create2(&cluster, "ceph", "client.admin", 0); check_ret_neg(err, "Failed to create cluster handle"); @@ -246,8 +170,6 @@ int main(int argc, char **argv) rbd_remove(io_ctx, image_name); else if (op == OP_INFO) info(io_ctx, image_name); - else if (op == OP_MKCACHE) - mk_cache(io_ctx, image_name, cache_dev, cache_type); else if (op == OP_CLONE) { auto src_img = image_name; auto dst_img = argv[argc - 1]; diff --git a/src/write_cache.cc b/src/write_cache.cc index 2f9ac06d..de1e9724 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -17,15 +17,15 @@ #include "utils.h" #include "write_cache.h" +const usize SUPER_BLOCKNO = 1; + /* ------------- Write cache structure ------------- */ class wcache_write_req; class write_cache_impl : public write_cache { size_t dev_max; - uint32_t super_blkno; int fd = -1; - lsvd_config &cfg; std::atomic sequence = 1; // write sequence # @@ -38,24 +38,17 @@ class write_cache_impl : public write_cache size_t write_batch = 0; std::condition_variable write_cv; - /* initialization stuff - */ + // initialization stuff int roll_log_forward(); - char *_hdrbuf; // for reading at startup - - thread_pool *misc_threads; - void write_checkpoint(void); - /* allocate journal entry, create a header - */ + // allocate journal entry, create a header uint32_t allocate(page_t n, page_t &pad, page_t &n_pad, page_t &prev); j_write_super *super; page_t previous_hdr = 0; page_t next_alloc = 0; - /* these are used by wcache_write_req - */ + // these are used by wcache_write_req friend class wcache_write_req; std::mutex m; translate &be; @@ -63,13 +56,12 @@ class write_cache_impl : public write_cache nvme *nvme_w = NULL; public: - /* throttle writes with window of max_write_pages - */ - void get_room(sector_t sectors); + // throttle writes with window of max_write_pages + void reserve_room(sector_t sectors); void release_room(sector_t sectors); void flush(void); - write_cache_impl(uint32_t blkno, int _fd, translate &_be, lsvd_config &cfg); + write_cache_impl(int fd, translate &_be, lsvd_config &cfg); ~write_cache_impl(); request *writev(sector_t lba, smartiov *iov); @@ -219,7 +211,7 @@ void wcache_write_req::run(request *parent_) * * TODO record how long this takes per request, unlikely to be bottleneck though */ -void write_cache_impl::get_room(sector_t sectors) +void write_cache_impl::reserve_room(sector_t sectors) { int pages = sectors / 8; std::unique_lock lk(m2); @@ -308,11 +300,11 @@ j_hdr *write_cache_impl::mk_header(char *buf, uint32_t type, page_t blks, */ void write_cache_impl::write_checkpoint(void) { - /* shouldn't really need the copy, since it's only called on - * shutdown, except that some unit tests call this and expect things - * to work afterwards - */ - j_write_super *super_copy = (j_write_super *)aligned_alloc(4096, 4096); + // shouldn't really need the copy, since it's only called on + // shutdown, except that some unit tests call this and expect things + // to work afterwards + vec buf(4096); + auto super_copy = (j_write_super *)buf.data(); memcpy(super_copy, super, 4096); super_copy->seq = sequence; @@ -328,10 +320,8 @@ void write_cache_impl::write_checkpoint(void) super_copy->clean = true; - if (nvme_w->write((char *)super_copy, 4096, 4096L * super_blkno) < 0) - throw_fs_error("wckpt_s"); - - free(super_copy); + auto res = nvme_w->write(buf.data(), buf.size(), 4096); + THROW_ERRNO_ON(res < 0, errno, "Failed to write wlog header"); } /* needs to set the following variables: @@ -454,27 +444,23 @@ int write_cache_impl::roll_log_forward() #endif } -write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate &be, - lsvd_config &cfg) - : fd(fd), cfg(cfg), be(be) +write_cache_impl::write_cache_impl(int fd, translate &be, lsvd_config &cfg) + : fd(fd), be(be) { - - super_blkno = blkno; dev_max = getsize64(fd); - _hdrbuf = (char *)aligned_alloc(4096, 4096); - - const char *name = "wlog_uring"; - nvme_w = make_nvme_uring(fd, name); + nvme_w = make_nvme_uring(fd, "wlog_uring"); char *buf = (char *)aligned_alloc(4096, 4096); - if (nvme_w->read(buf, 4096, 4096L * super_blkno) < 4096) - throw_fs_error("wcache"); + auto res = nvme_w->read(buf, 4096, 4096L * SUPER_BLOCKNO); + THROW_ERRNO_ON(res < 0, -res, "Failed to read wlog header"); + THROW_MSG_ON(res < 4096, "Short read {}/4096 on wlog header", res); + super = (j_write_super *)buf; + THROW_MSG_ON(super->magic != LSVD_MAGIC, "Invalid magic in wlog sub-hdr"); - /* if it's clean we can read in the map and lengths, otherwise - * do crash recovery. Then set the dirty flag - */ + // if it's clean we can read in the map and lengths, otherwise + // do crash recovery. Then set the dirty flag if (super->clean) { sequence = super->seq; next_alloc = super->base; @@ -483,22 +469,18 @@ write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate &be, next_alloc = super->base; super->clean = false; - if (nvme_w->write(buf, 4096, 4096L * super_blkno) < 4096) - throw_fs_error("wcache"); + res = nvme_w->write(buf, 4096, 4096L * SUPER_BLOCKNO); + THROW_ERRNO_ON(res < 0, -res, "Failed to write wlog subhdr"); int n_pages = super->limit - super->base; - max_write_pages = n_pages / 2 + n_pages / 4; + max_write_pages = n_pages / 2 + n_pages / 4; // no idea why this is 3/4ths write_batch = cfg.wcache_batch; - - misc_threads = new thread_pool(&m); } write_cache_impl::~write_cache_impl() { - delete misc_threads; close(fd); free(super); - free(_hdrbuf); delete nvme_w; } @@ -530,75 +512,94 @@ request *write_cache_impl::writev(sector_t lba, smartiov *iovs) void write_cache_impl::do_write_checkpoint(void) { write_checkpoint(); } -int init_wcache(int fd, uuid_t &uuid, int n_pages) +int init_wcache(int fd, uuid_t &uuid, usize cache_size) { - page_t w_pages = n_pages - 1; - page_t _map = div_round_up(w_pages, 256); - page_t _len = div_round_up(w_pages, 512); - page_t w_meta = 2 * (_map + _len); - char buf[4096]; - - w_pages -= w_meta; - - memset(buf, 0, sizeof(buf)); - auto w_super = (j_write_super *)buf; - *w_super = (j_write_super){LSVD_MAGIC, - LSVD_J_W_SUPER, - 1, - 1, - 1, - 1, - 1 + w_meta, - 1 + w_meta, - 1 + w_meta + w_pages, - 1 + w_meta, - 0, - 0, - 0, - 0, - 0, - 0, - {0}}; - memcpy(w_super->vol_uuid, uuid, sizeof(uuid_t)); - - int ret = pwrite(fd, buf, 4096, 0); + // write log file has 2 header blocks: the first 4k block is the j_hdr, + // the second 4k block is the j_write_super + // not entirely sure why they are separate, but I'm leaving it for now + + page_t total_pages = cache_size / 4096; + page_t content_pages = total_pages - 2; + page_t _map = div_round_up(content_pages, 256); + page_t _len = div_round_up(content_pages, 512); + page_t meta_pages = 2 * (_map + _len); + page_t data_pages = content_pages - meta_pages; + + vec buf(4096 * 2); + auto hdr = (j_hdr *)buf.data(); + *hdr = { + .magic = LSVD_MAGIC, + .type = LSVD_J_DATA, + .version = 1, + .len = total_pages, + .seq = 0, + .crc32 = 0, + .extent_offset = 0, + .extent_len = 0, + .prev = 0, + }; + + auto sup = (j_write_super *)(buf.data() + 4096); + *sup = { + .magic = LSVD_MAGIC, + .type = LSVD_J_W_SUPER, + .version = 1, + .clean = 1, + .seq = 1, + .meta_base = 1, + .meta_limit = 1 + meta_pages, + .base = 1 + meta_pages, + .limit = 1 + meta_pages + data_pages, + .next = 1 + meta_pages, + .map_start = 0, + .map_blocks = 0, + .map_entries = 0, + .len_start = 0, + .len_blocks = 0, + .len_entries = 0, + .vol_uuid = {0}, + }; + uuid_copy(sup->vol_uuid, uuid); + + int ret = pwrite(fd, buf.data(), buf.size(), 0); PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to write wlog header"); // just truncate to right length, don't bother writing zeroes - ret = ftruncate(fd, 4096 * (1 + w_pages + w_meta)); + ret = ftruncate(fd, 4096 * total_pages); PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to truncate wlog file"); return 0; } -uptr open_wlog(fspath path, usize size, translate &xlate, - lsvd_config &cfg) +uptr open_wlog(fspath path, translate &xlate, lsvd_config &cfg) { + log_info("Opening write log at '{}'", path.string()); + int fd = 0; if (!std::filesystem::exists(path)) { - log_info("Creating write cache file '{}'", path.string()); + log_info("Creating write log file '{}'", path.string()); fd = open(path.c_str(), O_RDWR | O_CREAT, 0644); PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to create cache file"); - auto err = init_wcache(fd, xlate.uuid, size / 4096); + auto err = init_wcache(fd, xlate.uuid, cfg.wlog_size); PR_ERR_RET_IF(err < 0, nullptr, -err, "Failed to init wlog"); + } else { + fd = open(path.c_str(), O_RDWR); + PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to open wlog file"); } - fd = open(path.c_str(), O_RDWR); - PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to open wlog file"); - char buf[4096]; int err = pread(fd, buf, 4096, 0); PR_ERR_RET_IF(err < 0, nullptr, errno, "Failed to read wlog header"); - j_write_super *super = (j_write_super *)buf; + auto super = (j_hdr *)buf; PR_RET_IF(super->magic != LSVD_MAGIC, nullptr, "Invalid write cache magic number: {}", super->magic); - PR_RET_IF(super->type != LSVD_J_W_SUPER, nullptr, "Invalid cache type: {}", + PR_RET_IF(super->type != LSVD_J_DATA, nullptr, "Invalid cache type: {}", super->type); try { - return std::make_unique(1, fd, xlate, cfg); + return std::make_unique(fd, xlate, cfg); } catch (std::exception &e) { log_error("Failed to open write cache: {}", e.what()); close(fd); diff --git a/src/write_cache.h b/src/write_cache.h index 21ca0222..42955a70 100644 --- a/src/write_cache.h +++ b/src/write_cache.h @@ -10,7 +10,7 @@ class write_cache { public: - virtual void get_room(sector_t sectors) = 0; + virtual void reserve_room(sector_t sectors) = 0; virtual void release_room(sector_t sectors) = 0; virtual void flush(void) = 0; @@ -24,7 +24,4 @@ class write_cache uptr make_write_cache(uint32_t blkno, int fd, translate *be, lsvd_config *cfg); -int init_wcache(int fd, uuid_t &uuid, int n_pages); - -uptr open_wlog(fspath path, usize size, translate &xlate, - lsvd_config &cfg); +uptr open_wlog(fspath path, translate &xlate, lsvd_config &cfg);