diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7141b65c..742297a2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -13,16 +13,17 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Setup Clang 17 + - name: Setup Clang 18 run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh - sudo ./llvm.sh 17 + sudo ./llvm.sh 18 - name: Install dependencies run: | sudo apt update make install-deps + pip3 install --upgrade meson - name: Build run: | @@ -45,4 +46,4 @@ jobs: - name: Logs if: always() run: | - cat build-dbg/meson-logs/testlog.txt + cat build-rel/meson-logs/meson-log.txt diff --git a/.gitignore b/.gitignore index 7414046b..8b84efbf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,10 +5,11 @@ __pycache__/ venv/ .ipynb_checkpoints/ -subprojects/ - builddir +build qemu/*.img qemu/*.iso qemu/bzImage +subprojects/liburing-* +subprojects/packagecache diff --git a/.gitmodules b/.gitmodules index 55f4c711..e03182ad 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "spdk"] - path = spdk - url = https://github.com/spdk/spdk [submodule "lsvd-atc24"] path = atc2024 url = git@github.com:CCI-MOC/lsvd-atc24.git diff --git a/Makefile b/Makefile index 1e0d3d76..24f790f1 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,12 @@ .PHONY: setup setup-debug release debug paper clean setup: - meson setup --native-file meson.ini build-rel --buildtype=release - meson setup --native-file meson.ini build-dbg --buildtype=debug + meson setup --native-file meson.ini build-rel --buildtype=release -Db_sanitize=none + meson setup --native-file meson.ini build-dbg --buildtype=debug ln -s build-dbg builddir -debug: setup +debug: + meson setup --native-file meson.ini build-dbg --buildtype=debug cd build-dbg; meson compile paper: @@ -17,6 +18,14 @@ clean: cd build-dbg; meson compile --clean install-deps: - sudo apt install -y meson libfmt-dev libaio-dev librados-dev mold \ - libtcmalloc-minimal4 libboost-dev libradospp-dev \ 
- liburing-dev pkg-config uuid-dev + # Folly deps + sudo apt install -y libboost-all-dev libdouble-conversion-dev libevent-dev \ + libgflags-dev libgmock-dev libgoogle-glog-dev libgtest-dev \ + liblz4-dev liblzma-dev libsnappy-dev libsodium-dev libunwind-dev \ + libzstd-dev ninja-build zlib1g-dev + # SPDK deps + sudo apt install -y libnuma-dev libarchive-dev libibverbs-dev librdmacm-dev \ + python3-pyelftools libcunit1-dev libaio-dev + # LSVD deps + sudo apt install -y meson mold libfmt-dev librados-dev \ + libjemalloc-dev libradospp-dev liburing-dev pkg-config uuid-dev diff --git a/README.md b/README.md index 057eb940..be3218f7 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Parameters are: - `batch_size`, `LSVD_BATCH_SIZE`: size of objects written to the backend, in bytes (K/M recognized as 1024, 1024\*1024). Default: 8MiB - `wcache_batch`: write cache batching (see below) - `wcache_chunk': maximum size of atomic write, in bytes - larger writes will be split and may be non-atomic. -- `rcache_dir` - directory used for read cache file and GC temporary files. Note that `lsvd_imgtool` can format a partition for cache and symlink it into this directory, although the performance improvement seems limited. +- `rcache_dir` - directory used for read cache file and GC temporary files. Note that `imgtool` can format a partition for cache and symlink it into this directory, although the performance improvement seems limited. - `wcache_dir` - directory used for write cache file - `xlate_window`: max writes (i.e. objects) in flight to the backend. Note that this value is coupled to the size of the write cache, which must be big enough to hold all outstanding writes in case of a crash. - `hard_sync` (untested): "flush" forces all batched writes to the backend. @@ -70,7 +70,7 @@ figure out how to optimize at runtime instead of bothering the user for a value. 
First create a volume: ``` -build$ sudo bin/lsvd_imgtool --create --rados --size=20g pool/imgname +build$ sudo imgtool create poolname imgname --size=20g ``` Then you can start a SPDK NVMe-oF gateway: @@ -158,21 +158,17 @@ The read cache typically fetches 64K blocks, so there may be a bit of extra load Most of the testing to date has been with an 8,3 code with 64K stripe size. ## Tools -`lsvd_imgtool` mostly just calls the LSVD versions of `rbd_create` and `rbd_remove`, although it can also format a cache file (e.g. if you're using a raw partition) -``` -build$ bin/lsvd_imgtool --help -Usage: lsvd_imgtool [OPTION...] -IMAGE - -C, --create create image - -d, --cache-dir=DIR cache directory - -D, --delete delete image - -I, --info show image information - -k, --mkcache=DEV use DEV as cache - -O, --rados use RADOS - -z, --size=SIZE size in bytes (M/G=2^20,2^30) - -?, --help Give this help list - --usage Give a short usage message +``` +build$ ./imgtool --help +❯ ./imgtool --help +Allowed options: + --help produce help message + --cmd arg subcommand: create, clone, delete, info + --img arg name of the iname + --pool arg pool where the image resides + --size arg (=1G) size in bytes (M=2^20,G=2^30) + --dest arg destination (for clone) ``` Other tools live in the `tools` subdirectory - see the README there for more details. diff --git a/docs/qemu-launch.md b/docs/qemu-launch.md index 13995164..0f7dfd4a 100644 --- a/docs/qemu-launch.md +++ b/docs/qemu-launch.md @@ -5,7 +5,7 @@ install config, run QEMU with `-drive format=raw,file=seed.iso,cache=none,if=virtio`. 1. Create a lsvd image if you don't already have one: - `./imgtool --create --rados --size 10g $pool_name/$img_name` + `./imgtool create --size 10g $pool_name $img_name` 2. Launch LSVD as a NVMF target `qemu-gateway.bash $pool_name $img_name` 3. Lanuch QEMU with the NVMF target `qemu-client.bash`. 
This does the following: - `nvme connect` to the nvmf target on the gateway diff --git a/experiments/multigw/gateway-1.bash b/experiments/multigw/gateway-1.bash index f973de98..463710bf 100755 --- a/experiments/multigw/gateway-1.bash +++ b/experiments/multigw/gateway-1.bash @@ -42,7 +42,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw1 -a -s SPDKMULT scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw1 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/experiments/multigw/gateway-2.bash b/experiments/multigw/gateway-2.bash index c704e660..30b098f8 100755 --- a/experiments/multigw/gateway-2.bash +++ b/experiments/multigw/gateway-2.bash @@ -42,7 +42,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw2 -a -s SPDKMULT scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw2 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/experiments/multigw/rbd-gateway-1.bash b/experiments/multigw/rbd-gateway-1.bash index 4f147486..ee49bc22 100755 --- a/experiments/multigw/rbd-gateway-1.bash +++ b/experiments/multigw/rbd-gateway-1.bash @@ -44,7 +44,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw1 -a -s SPDKMULT scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw1 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/experiments/multigw/rbd-gateway-2.bash b/experiments/multigw/rbd-gateway-2.bash index 6b243d95..082c648b 100755 --- a/experiments/multigw/rbd-gateway-2.bash +++ b/experiments/multigw/rbd-gateway-2.bash @@ -44,7 +44,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw2 -a -s SPDKMULT scripts/rpc.py 
nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw2 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/meson.build b/meson.build index 887a6692..2244811c 100644 --- a/meson.build +++ b/meson.build @@ -1,11 +1,14 @@ project( 'lsvd-rbd', - 'cpp', + ['c', 'cpp'], version: '0.1', default_options: [ - 'cpp_std=c++20', + 'cpp_std=c++23', 'warning_level=2', 'b_colorout=always', + 'b_sanitize=address,undefined', + 'b_lto=true', + 'b_thinlto_cache=true', ], ) @@ -14,34 +17,29 @@ add_project_arguments('-Wno-unused-parameter', language: 'cpp') if get_option('buildtype') == 'debug' add_project_arguments('-fno-inline', language: 'cpp') - add_project_arguments('-DLOGLV=1', language: 'cpp') - - # add sanitizers for debug builds - add_project_arguments( - '-fsanitize=address,undefined,nullability,implicit-conversion', - language: 'cpp', - ) - add_project_link_arguments( - '-fsanitize=address,undefined', - '-Wl,--unresolved-symbols=ignore-in-object-files', - language: 'cpp', - ) + add_project_arguments('-DLOGLV=0', language: 'cpp') endif subdir('src') -liblsvd = library('lsvd', lsvd_src, dependencies: lsvd_deps, install: false) +lsvd_ar = static_library('lsvd', lsvd_src, dependencies: lsvd_deps) + +executable( + 'lsvd', + spdk_fe, + dependencies: lsvd_deps + [dependency('_spdk')], +) executable( 'imgtool', - 'src/imgtool.cc', - link_with: liblsvd, + ['src/imgtool.cc'], + link_whole: lsvd_ar, dependencies: lsvd_deps, ) executable( 'thick-image', - 'src/thick-image.cc', - link_with: liblsvd, + ['src/thick-image.cc'], + link_whole: lsvd_ar, dependencies: lsvd_deps, ) -subdir('test') +subdir('test') \ No newline at end of file diff --git a/meson.ini b/meson.ini index a1a41aec..a1be4df4 100644 --- a/meson.ini +++ b/meson.ini @@ -1,5 +1,5 @@ [binaries] -c = 'clang-17' -c_ld = 'lld-17' -cpp = 'clang++-17' -cpp_ld = 'lld-17' \ No newline at end of file +c = 'clang-18' +c_ld 
= 'lld-18' +cpp = 'clang++-18' +cpp_ld = 'lld-18' \ No newline at end of file diff --git a/spdk b/spdk deleted file mode 160000 index fb13eebf..00000000 --- a/spdk +++ /dev/null @@ -1 +0,0 @@ -Subproject commit fb13eebf53d7f132baa2e39992a6ad79affdcdaa diff --git a/src/backend.h b/src/backend.h index 1f6eb0d1..5a6f65d1 100644 --- a/src/backend.h +++ b/src/backend.h @@ -4,9 +4,9 @@ #include #include -#include "config.h" #include "request.h" #include "smartiov.h" +#include "utils.h" class backend { @@ -45,16 +45,11 @@ class backend smartiov iov((char *)buf, len); return aio_read(name, offset, iov); } + + virtual opt get_size(std::string name) = 0; + virtual opt> read_whole_obj(std::string name) = 0; + virtual bool exists(std::string name) = 0; }; -extern std::shared_ptr make_file_backend(const char *prefix); extern std::shared_ptr make_rados_backend(rados_ioctx_t io); - -inline std::shared_ptr get_backend(lsvd_config *cfg, rados_ioctx_t io, - const char *name) -{ - if (cfg->backend == BACKEND_RADOS) - return make_rados_backend(io); - - throw std::runtime_error("Unknown backend"); -} +rados_ioctx_t connect_to_pool(str pool_name); diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc new file mode 100644 index 00000000..8890edc6 --- /dev/null +++ b/src/bdev_lsvd.cc @@ -0,0 +1,231 @@ +#include "rados/librados.h" +#include "spdk/bdev_module.h" +#include + +#include "bdev_lsvd.h" +#include "image.h" +#include "request.h" +#include "smartiov.h" +#include "spdk/thread.h" +#include "utils.h" + +static int bdev_lsvd_init(void); +static void bdev_lsvd_finish(void); +static int bdev_lsvd_io_ctx_size(void); + +static spdk_bdev_module lsvd_if = { + .module_init = bdev_lsvd_init, + .module_fini = bdev_lsvd_finish, + .name = "LSVD bdev module", + .get_ctx_size = bdev_lsvd_io_ctx_size, +}; +SPDK_BDEV_MODULE_REGISTER(ext_lsvd, &lsvd_if); + +static int bdev_lsvd_init(void) +{ + spdk_io_device_register( + &lsvd_if, [](auto iod, auto buf) { return 0; }, + [](auto iod, auto buf) { return; }, 
0, "lsvd_poll_groups"); + return 0; +} + +static void bdev_lsvd_finish(void) +{ + spdk_io_device_unregister(&lsvd_if, nullptr); +} + +/** + * Function table for the LSVD bdev module. + */ + +static int lsvd_destroy_bdev(void *); +static void lsvd_submit_io(spdk_io_channel *c, spdk_bdev_io *io); +static bool lsvd_io_type_supported(void *ctx, spdk_bdev_io_type io_type); +static spdk_io_channel *lsvd_get_io_channel(void *ctx); + +static const spdk_bdev_fn_table lsvd_fn_table = { + .destruct = lsvd_destroy_bdev, + .submit_request = lsvd_submit_io, + .io_type_supported = lsvd_io_type_supported, + .get_io_channel = lsvd_get_io_channel, +}; + +class lsvd_iodevice +{ + public: + spdk_bdev bdev; + uptr img; + + lsvd_iodevice(uptr img_) : img(std::move(img_)) + { + std::memset(&bdev, 0, sizeof(bdev)); + bdev.product_name = strdup("Log-structured Virtual Disk"); + bdev.name = strdup(img->imgname.c_str()); + bdev.blocklen = 4096; + bdev.blockcnt = img->size / bdev.blocklen; + bdev.ctxt = this; + bdev.module = &lsvd_if; + bdev.fn_table = &lsvd_fn_table; + } + + ~lsvd_iodevice() + { + free(bdev.product_name); + free(bdev.name); + } +}; + +static int lsvd_destroy_bdev(void *ctx) +{ + auto iodev = reinterpret_cast(ctx); + delete iodev; + return 0; +} + +static bool lsvd_io_type_supported(void *ctx, spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: // we only use this to ensure ordering + case SPDK_BDEV_IO_TYPE_UNMAP: // trim + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: // also just trim + return true; + case SPDK_BDEV_IO_TYPE_RESET: // block until all pending io aborts + default: + return false; + } +} + +static spdk_io_channel *lsvd_get_io_channel(void *ctx) +{ + lsvd_iodevice *iodev = reinterpret_cast(ctx); + // SPDK will pass this to the iodevice's registered create/destroy + // io_channel functions that were passed in when the device was registered. 
+ // We don't need to do anything special here, so just return the iodevice. + auto ch = spdk_get_io_channel(iodev); + assert(ch != nullptr); + return ch; +} + +struct lsvd_bdev_io { + spdk_thread *submit_td; + spdk_bdev_io_status status; + request *r; +}; + +static int bdev_lsvd_io_ctx_size(void) { return sizeof(lsvd_bdev_io); } + +static void lsvd_io_done(lsvd_bdev_io *io, int rc) +{ + auto sth = io->submit_td; + assert(sth != nullptr); + + // error is -errno, succ is 0 or bytes read/written + io->status = + rc >= 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_thread_send_msg( + sth, + [](void *ctx) { + auto io = (lsvd_bdev_io *)ctx; + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io), io->status); + }, + io); +} + +static void lsvd_submit_io(spdk_io_channel *c, spdk_bdev_io *io) +{ + auto dev = static_cast(io->bdev->ctxt); + auto &img = dev->img; + auto lio = (lsvd_bdev_io *)(io->driver_ctx); + lio->submit_td = spdk_io_channel_get_thread(c); + + // io details + auto offset = io->u.bdev.offset_blocks * io->bdev->blocklen; + auto len = io->u.bdev.num_blocks * io->bdev->blocklen; + smartiov iov(io->u.bdev.iovs, io->u.bdev.iovcnt); + + auto comp = [lio](int rc) { lsvd_io_done(lio, rc); }; + + switch (io->type) { + case SPDK_BDEV_IO_TYPE_READ: + lio->r = img->read(offset, iov, comp); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + lio->r = img->write(offset, iov, comp); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + lio->r = img->flush(comp); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + lio->r = img->trim(offset, len, comp); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + lio->r = img->trim(offset, len, comp); + break; + default: + log_error("Unknown request type: {}", io->type); + return; + } + + lio->r->run(nullptr); +} + +// Just copying from bdev_rbd, not sure where this is actually used +struct lsvd_bdev_io_channel { + lsvd_iodevice *lsvd_dev; + spdk_io_channel *io_channel; +}; + +int bdev_lsvd_create(str img_name, rados_ioctx_t ioctx, lsvd_config 
cfg) +{ + assert(!img_name.empty()); + + uptr img; + try { + img = uptr(new lsvd_image(img_name, ioctx, cfg)); + } catch (std::runtime_error &e) { + log_error("Failed to create image '{}': {}", img_name, e.what()); + return -1; + } + + auto iodev = new lsvd_iodevice(std::move(img)); + + spdk_io_device_register( + iodev, + [](void *iodev, void *ctx_buf) { + auto *ch = static_cast(ctx_buf); + ch->lsvd_dev = static_cast(iodev); + ch->io_channel = spdk_get_io_channel(&lsvd_if); + return 0; + }, + [](void *iodev, void *ctx_buf) { + auto *ch = static_cast(ctx_buf); + spdk_put_io_channel(ch->io_channel); + }, + sizeof(lsvd_bdev_io_channel), img_name.c_str()); + + auto err = spdk_bdev_register(&iodev->bdev); + if (err) { + log_error("Failed to register bdev: err {}", (err)); + spdk_io_device_unregister( + iodev, [](void *ctx) { delete (lsvd_iodevice *)ctx; }); + return err; + } + + return 0; +} + +int bdev_lsvd_delete(std::string img_name) +{ + auto p = std::promise(); + spdk_bdev_unregister_by_name( + img_name.c_str(), &lsvd_if, + [](void *arg, int rc) { + auto p = (std::promise *)arg; + p->set_value(rc); + }, + &p); + return p.get_future().get(); +} diff --git a/src/bdev_lsvd.h b/src/bdev_lsvd.h new file mode 100644 index 00000000..294298ce --- /dev/null +++ b/src/bdev_lsvd.h @@ -0,0 +1,8 @@ +#pragma once + +#include + +#include "config.h" + +int bdev_lsvd_create(str img_name, rados_ioctx_t io_ctx, lsvd_config cfg); +int bdev_lsvd_delete(str img_name); diff --git a/src/config.cc b/src/config.cc index 22662710..04dfde18 100644 --- a/src/config.cc +++ b/src/config.cc @@ -24,9 +24,9 @@ namespace fs = std::filesystem; #include "config.h" #include "config_macros.h" -std::vector cfg_path({"lsvd.conf", "/usr/local/etc/lsvd.conf"}); +vec cfg_path({"lsvd.conf", "/usr/local/etc/lsvd.conf"}); -static void split(std::string s, std::vector &words) +static void split(std::string s, vec &words) { std::string w = ""; for (auto c : s) { @@ -65,22 +65,22 @@ int lsvd_config::read() while 
(getline(fp, line)) { if (line[0] == '#') continue; - std::vector words; + vec words; split(line, words); if (words.size() != 2) continue; - F_CONFIG_H_INT(words[0], words[1], batch_size); + F_CONFIG_H_INT(words[0], words[1], backend_obj_size); F_CONFIG_INT(words[0], words[1], wcache_batch); F_CONFIG_H_INT(words[0], words[1], wcache_chunk); F_CONFIG_STR(words[0], words[1], rcache_dir); F_CONFIG_STR(words[0], words[1], wcache_dir); - F_CONFIG_INT(words[0], words[1], xlate_window); + F_CONFIG_INT(words[0], words[1], num_parallel_writes); F_CONFIG_TABLE(words[0], words[1], backend, m); F_CONFIG_H_INT(words[0], words[1], cache_size); F_CONFIG_H_INT(words[0], words[1], wlog_size); F_CONFIG_INT(words[0], words[1], hard_sync); F_CONFIG_INT(words[0], words[1], ckpt_interval); - F_CONFIG_INT(words[0], words[1], flush_msec); + F_CONFIG_INT(words[0], words[1], flush_timeout_msec); F_CONFIG_INT(words[0], words[1], gc_threshold); F_CONFIG_INT(words[0], words[1], fetch_window); F_CONFIG_INT(words[0], words[1], fetch_ratio); @@ -91,18 +91,18 @@ int lsvd_config::read() break; } - ENV_CONFIG_H_INT(batch_size); + ENV_CONFIG_H_INT(backend_obj_size); ENV_CONFIG_INT(wcache_batch); ENV_CONFIG_H_INT(wcache_chunk); ENV_CONFIG_STR(rcache_dir); ENV_CONFIG_STR(wcache_dir); - ENV_CONFIG_INT(xlate_window); + ENV_CONFIG_INT(num_parallel_writes); ENV_CONFIG_TABLE(backend, m); ENV_CONFIG_H_INT(cache_size); ENV_CONFIG_H_INT(wlog_size); ENV_CONFIG_INT(hard_sync); ENV_CONFIG_INT(ckpt_interval); - ENV_CONFIG_INT(flush_msec); + ENV_CONFIG_INT(flush_timeout_msec); ENV_CONFIG_INT(gc_threshold); ENV_CONFIG_INT(fetch_window); ENV_CONFIG_INT(fetch_ratio); diff --git a/src/config.h b/src/config.h index 45857a20..446f64cf 100644 --- a/src/config.h +++ b/src/config.h @@ -1,3 +1,5 @@ +#pragma once + /* * file: config.h * description: quick and dirty config file parser @@ -9,12 +11,11 @@ * LGPL-2.1-or-later */ -#ifndef __CONFIG_H__ -#define __CONFIG_H__ - #include #include +#include "utils.h" + enum cfg_backend 
{ BACKEND_FILE = 1, BACKEND_RADOS = 2 }; enum cfg_cache_type { LSVD_CFG_READ = 1, LSVD_CFG_WRITE = 2 }; @@ -22,28 +23,34 @@ enum cfg_cache_type { LSVD_CFG_READ = 1, LSVD_CFG_WRITE = 2 }; class lsvd_config { public: - int batch_size = 8 * 1024 * 1024; // in bytes - int wcache_batch = 8; // requests - int wcache_chunk = 2 * 1024 * 1024; // bytes - std::string rcache_dir = "/tmp"; - std::string wcache_dir = "/tmp"; - int xlate_window = 8; + int backend_obj_size = 8 * 1024 * 1024; // in bytes + int wcache_batch = 8; // requests + int wcache_chunk = 2 * 1024 * 1024; // bytes + std::string rcache_dir = "/tmp/lsvd/"; + std::string wcache_dir = "/tmp/lsvd/"; + u32 num_parallel_writes = 8; int hard_sync = 0; enum cfg_backend backend = BACKEND_RADOS; long cache_size = 500 * 1024 * 1024; // in bytes - long wlog_size = 500 * 1024 * 1024; // in bytes + long wlog_size = 500 * 1024 * 1024; // in bytes int ckpt_interval = 500; // objects - int flush_msec = 2000; // flush timeout + int flush_timeout_msec = 2000; // flush timeout + int flush_interval_msec = 1000; // flush interval int gc_threshold = 60; // GC threshold, percent - int gc_window = 4; // max GC writes outstanding + int gc_window = 4; // max GC writes outstanding int fetch_window = 12; // read cache fetches int fetch_ratio = 67; // anti-thrash served:backend ratio - int no_gc = 0; // turn off GC + int no_gc = 0; // turn off GC lsvd_config() {} ~lsvd_config() {} int read(); - std::string cache_filename(uuid_t &uuid, const char *name, cfg_cache_type type); -}; + std::string cache_filename(uuid_t &uuid, const char *name, + cfg_cache_type type); -#endif + inline fspath wlog_path(str imgname) + { + auto filename = imgname + ".wlog"; + return fspath(wcache_dir) / filename; + } +}; diff --git a/src/extent.h b/src/extent.h index 17f63e47..2d23d73a 100644 --- a/src/extent.h +++ b/src/extent.h @@ -29,7 +29,8 @@ #include #include #include -#include + +#include "utils.h" namespace extmap { @@ -227,9 +228,9 @@ template struct extmap 
{ static const int _load = load; public: - typedef std::vector extent_vector; - std::vector lists; - std::vector maxes; + typedef vec extent_vector; + vec lists; + vec maxes; int count; extmap() { count = 0; } @@ -410,9 +411,9 @@ template struct extmap { // Python-style list slicing - remove [len]..[end] and return it // - static std::vector *_slice(std::vector *A, int len) + static vec *_slice(vec *A, int len) { - auto half = new std::vector(); + auto half = new vec(); half->reserve(_load); for (auto it = A->begin() + len; it != A->end(); it++) half->push_back(*it); @@ -660,7 +661,7 @@ template struct extmap { // various ways of calling _update... // - void update(T_in base, T_in limit, T_out e, std::vector *del) + void update(T_in base, T_in limit, T_out e, vec *del) { _update(base, limit, e, false, del); } @@ -669,7 +670,7 @@ template struct extmap { _update(base, limit, e, false, nullptr); } - void trim(T_in base, T_in limit, std::vector *del) + void trim(T_in base, T_in limit, vec *del) { static T_out unused; if (count > 0) diff --git a/src/image.cc b/src/image.cc index 36ab0246..0db401ab 100644 --- a/src/image.cc +++ b/src/image.cc @@ -1,85 +1,178 @@ #include +#include +#include #include #include #include +#include "backend.h" #include "image.h" -#include "journal.h" #include "lsvd_types.h" +#include "objects.h" +#include "shared_read_cache.h" +#include "utils.h" +#include "write_cache.h" -extern int init_wcache(int fd, uuid_t &uuid, int n_pages); const int block_sectors = CACHE_CHUNK_SIZE / 512; +lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg_) + : imgname(name), cfg(cfg_), io(io) +{ + objstore = make_rados_backend(io); + rcache = get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); + + read_superblock(); + debug("Found checkpoints: {}", checkpoints); + if (checkpoints.size() > 0) + read_from_checkpoint(checkpoints.back()); + + // Roll forward on the log + auto last_data_seq = roll_forward_from_last_checkpoint(); + 
debug("Last data seq: {}", last_data_seq); + + // Successfully recovered everything, now we have enough information to + // init everything else + xlate = make_translate(name, cfg, size, uuid, objstore, rcache, objmap, + map_lock, bufmap, bufmap_lock, last_data_seq, clones, + obj_info, checkpoints); + + wlog = open_wlog(cfg.wlog_path(name), *xlate, cfg); + THROW_MSG_ON(!wlog, "Failed to open write log"); + // recover_from_wlog(); + + log_info("Image '{}' opened successfully", name); +} + lsvd_image::~lsvd_image() { - wcache->flush(); - wcache->do_write_checkpoint(); - if (!cfg.no_gc) - xlate->stop_gc(); - xlate->checkpoint(); + wlog->flush(); + wlog->do_write_checkpoint(); + xlate->shutdown(); + + // TODO figure out who owns the rados connection + rados_ioctx_destroy(io); - close(write_fd); + log_info("Image '{}' closed", imgname); } -int lsvd_image::try_open(std::string name, rados_ioctx_t io) +bool lsvd_image::apply_log(seqnum_t seq) { - this->image_name = name; + object_reader parser(objstore); + // TODO + auto data_hdr = parser.read_data_hdr(oname(imgname, seq)); + if (!data_hdr.has_value()) + return false; + trace("Recovering log with object at seq {}", seq); + + auto ohdr = data_hdr->hdr; + if (ohdr->type == OBJ_CHECKPOINT) { + log_warn("CORRUPTION: Found checkpoint at seq {} that was not " + "present in the superblock.", + seq); + checkpoints.push_back(seq); + return true; + } - if (cfg.read() < 0) - throw std::runtime_error("Failed to read config"); + obj_info[seq] = (data_obj_info){ + .hdr = ohdr->hdr_sectors, + .data = ohdr->data_sectors, + .live = ohdr->data_sectors, + }; + + // Consume log records + sector_t offset = 0; + vec deleted; + for (auto dmap : data_hdr->data_map) { + // Update the extent map + extmap::obj_offset oo = {seq, offset + ohdr->hdr_sectors}; + objmap.update(dmap->lba, dmap->lba + dmap->len, oo, &deleted); + offset += dmap->len; + } - objstore = get_backend(&cfg, io, name.c_str()); - shared_cache = - 
get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); + // Manage deleted extents + for (auto d : deleted) { + auto [base, limit, ptr] = d.vals(); + obj_info[ptr.obj].live -= (limit - base); + THROW_MSG_ON(obj_info[ptr.obj].live < 0, "Negative live sectors."); + } - /* read superblock and initialize translation layer - */ - xlate = make_translate(objstore, &cfg, &map, &bufmap, &map_lock, - &bufmap_lock, shared_cache); - size = xlate->init(name.c_str(), true); - check_cond(size < 0, "Failed to initialize translation layer err={}", size); + return true; +} - /* figure out cache file name, create it if necessary - */ +void lsvd_image::read_superblock() +{ + object_reader parser(objstore); + auto superblock = parser.read_superblock(imgname); + THROW_MSG_ON(!superblock, "Failed to read superblock"); - /* - * TODO: Open 2 files. One for wcache and one for reader - */ - std::string wcache_name = - cfg.cache_filename(xlate->uuid, name.c_str(), LSVD_CFG_WRITE); + size = superblock->vol_size; + uuid_copy(uuid, superblock->uuid); - if (access(wcache_name.c_str(), R_OK | W_OK) < 0) { - log_info("Creating write cache file {}", wcache_name); - int cache_pages = cfg.wlog_size / 4096; + for (auto ckpt : superblock->ckpts) + checkpoints.push_back(ckpt); - int fd = open(wcache_name.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0777); - check_ret_errno(fd, "Can't open wcache file"); + for (auto ci : superblock->clones) { + clone_base c; + c.name = std::string(ci->name, ci->name_len); + c.last_seq = ci->last_seq; + c.first_seq = ci->last_seq + 1; - if (init_wcache(fd, xlate->uuid, cache_pages) < 0) - throw std::runtime_error("Failed to initialize write cache"); - close(fd); + debug("Using base image {} upto seq {}", c.name, c.last_seq); + clones.push_back(c); } +} - write_fd = open(wcache_name.c_str(), O_RDWR); - check_ret_errno(write_fd, "Can't open wcache file"); +void lsvd_image::read_from_checkpoint(seqnum_t seq) +{ + object_reader parser(objstore); + auto parsed = 
parser.read_checkpoint(oname(imgname, seq)); + THROW_MSG_ON(!parsed, "Failed to read checkpoint"); + + for (auto obj : parsed->objects) { + obj_info[obj->seq] = (data_obj_info){ + .hdr = obj->hdr_sectors, + .data = obj->data_sectors, + .live = obj->live_sectors, + }; + } - j_write_super *jws = (j_write_super *)aligned_alloc(512, 4096); + for (auto m : parsed->dmap) { + extmap::obj_offset oo = {m->obj, m->offset}; + objmap.update(m->lba, m->lba + m->len, oo); + } +} - check_ret_errno(pread(write_fd, (char *)jws, 4096, 0), - "Can't read wcache superblock"); - if (jws->magic != LSVD_MAGIC || jws->type != LSVD_J_W_SUPER) - throw std::runtime_error("bad magic/type in write cache superblock\n"); - if (memcmp(jws->vol_uuid, xlate->uuid, sizeof(uuid_t)) != 0) - throw std::runtime_error("object and cache UUIDs don't match"); +// Returns last processed object's seqnum +seqnum_t lsvd_image::roll_forward_from_last_checkpoint() +{ + if (checkpoints.size() == 0) + return 0; - wcache = make_write_cache(0, write_fd, xlate.get(), &cfg); - free(jws); + object_reader parser(objstore); + auto last_ckpt = checkpoints.back(); + auto seq = last_ckpt + 1; - if (!cfg.no_gc) - xlate->start_gc(); - return 0; + for (;; seq++) { + auto ret = apply_log(seq); + if (!ret) + break; + } + + seq -= 1; + + // Delete "dangling" objects if there are any in case they cause trouble + // with corruption + // This must be larger than the max backend batch size to avoid + // potential corruption if subsequent breaks overlap with current dangling + // objects and we get writes from two different "generations" + for (seqnum_t i = 1; i < cfg.num_parallel_writes * 4; i++) + objstore->delete_obj(oname(imgname, seq + i)); + + return seq; } +void lsvd_image::recover_from_wlog() { UNIMPLEMENTED(); } + /** * This is the base for aio read and write requests. 
It's copied from * the old rbd_aio_req omniclass, with the read and write paths split out and @@ -138,7 +231,7 @@ class lsvd_image::read_request : public lsvd_image::aio_request { assert(parent == nullptr); - std::vector requests; + vec requests; img->handle_reads(req_offset, iovs, requests); num_subreqs = requests.size(); @@ -166,7 +259,7 @@ class lsvd_image::read_request : public lsvd_image::aio_request }; void lsvd_image::handle_reads(size_t offset, smartiov iovs, - std::vector &requests) + vec &requests) { sector_t start_sector = offset / 512; sector_t end_sector = start_sector + iovs.bytes() / 512; @@ -181,7 +274,7 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, auto bufmap_it = bufmap.end(); if (bufmap.size() > 0) bufmap_it = bufmap.lookup(start_sector); - auto backend_it = map.lookup(start_sector); + auto backend_it = objmap.lookup(start_sector); size_t _offset = 0; /* @@ -218,7 +311,7 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, sector_t base2 = end_sector, limit2 = end_sector; extmap::obj_offset objptr = {0, 0}; - if (backend_it != map.end()) + if (backend_it != objmap.end()) std::tie(base2, limit2, objptr) = backend_it->vals(start_sector, end_sector); @@ -249,14 +342,14 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, * it2: |----| |----| * |------| < but not this */ - while (backend_it != map.end() && backend_it->limit() <= limit1) + while (backend_it != objmap.end() && backend_it->limit() <= limit1) backend_it++; bufmap_it++; continue; } assert(base2 == start_sector); - assert(backend_it != map.end()); + assert(backend_it != objmap.end()); limit2 = std::min(limit2, base1); sector_t sectors = limit2 - start_sector; @@ -330,9 +423,8 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, // same thing to the shared read cache auto prefix = xlate->prefix(key.obj); - auto req = - shared_cache->make_read_req(prefix, key.obj, key.offset * 512L, - sector_in_blk * 512L, slice); + auto req = 
rcache->make_read_req(prefix, key.obj, key.offset * 512L, + sector_in_blk * 512L, slice); if (req != nullptr) requests.push_back(req); @@ -356,7 +448,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request * and be done with it. The old code had these as pointers, but changed * them to be in the vectwor. */ - std::vector sub_iovs; + vec sub_iovs; public: write_request(lsvd_image *img, size_t offset, smartiov iovs, @@ -372,7 +464,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request if (old > 1) return; - img->wcache->release_room(req_bytes / 512); + img->wlog->release_room(req_bytes / 512); complete_request(0); // TODO shouldn't we return bytes written? } @@ -380,8 +472,8 @@ class lsvd_image::write_request : public lsvd_image::aio_request { assert(parent == nullptr); - img->wcache->get_room(req_bytes / 512); - img->xlate->wait_for_room(); + img->wlog->reserve_room(req_bytes / 512); + img->xlate->backend_backpressure(); sector_t size_sectors = req_bytes / 512; @@ -390,7 +482,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request n_req += div_round_up(req_bytes / 512, max_sectors); // TODO: this is horribly ugly - std::vector requests; + vec requests; auto cur_offset = req_offset; for (sector_t s_offset = 0; s_offset < size_sectors; @@ -400,7 +492,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request iovs.slice(s_offset * 512L, s_offset * 512L + _sectors * 512L); smartiov _iov(tmp.data(), tmp.size()); sub_iovs.push_back(_iov); - auto req = img->wcache->writev(cur_offset / 512, &_iov); + auto req = img->wlog->writev(cur_offset / 512, &_iov); requests.push_back(req); cur_offset += _sectors * 512L; @@ -471,3 +563,60 @@ request *lsvd_image::flush(std::function cb) { return new flush_request(this, cb); } + +int lsvd_image::create_new(std::string name, usize size, rados_ioctx_t io) +{ + auto be = make_rados_backend(io); + auto parser = object_reader(be); + + uuid_t uuid; + uuid_generate_random(uuid); + + vec 
buf(4096); + vec ckpts; + vec clones; + serialise_superblock(buf, ckpts, clones, uuid, size); + + return be->write(name, buf.data(), buf.size()); +} + +int lsvd_image::get_uuid(str name, uuid_t &uuid, rados_ioctx_t io) +{ + auto be = make_rados_backend(io); + auto parser = object_reader(be); + auto osb = parser.read_superblock(name); + PR_RET_IF(!osb, -EEXIST, "Could not read superblock '{}'", name); + + uuid_copy(uuid, osb->uuid); + return 0; +} + +int lsvd_image::delete_image(std::string name, rados_ioctx_t io) +{ + auto be = make_rados_backend(io); + auto parser = object_reader(be); + auto osb = parser.read_superblock(name); + PR_RET_IF(!osb, -EEXIST, "Could not read superblock '{}'", name); + auto sb = *osb; + + seqnum_t seq; + for (auto ckpt : sb.ckpts) { + auto rc = be->delete_obj(oname(name, ckpt)); + PR_RET_IF(rc < 0, rc, "Failed to delete checkpoint '{}'", ckpt); + seq = ckpt; + } + + for (int n = 0; n < 16; seq++, n++) + if (be->delete_obj(oname(name, seq)) >= 0) + n = 0; + + // delete the superblock last so we can recover from partial deletion + return be->delete_obj(name); +} + +int lsvd_image::clone_image(std::string oldname, std::string newname, + rados_ioctx_t io) +{ + UNIMPLEMENTED(); + return -1; +} \ No newline at end of file diff --git a/src/image.h b/src/image.h index 5a87c626..7a1f0a9e 100644 --- a/src/image.h +++ b/src/image.h @@ -2,10 +2,12 @@ #include #include +#include #include "backend.h" #include "config.h" #include "extent.h" +#include "objects.h" #include "shared_read_cache.h" #include "translate.h" #include "write_cache.h" @@ -13,17 +15,56 @@ /** * Core LSVD image class. An LSVD image supports 4 operations: read, write, * trim, and flush. All are async to prevent function colour issues. + * + * Currently, a lot of core image functionality is in the `translate` class. 
+ * The separation between what is here and what is there is not clear, and the + * two classes really should be consolidated, and the GC function splitted out + * into its own class. + * + * For now, all the core information about the image is owned by this class, + * and `translate` only takes references to it. Most of the translate code was + * from long ago, written by people who are no longer around. It's written like + * a C program, and the ownership structure of most resources is unclear, with + * sketchy concurrency control and C++ style. + * + * Eventually we'll have to rewrite the core translation class to clarify + * resource ownership and to overhaul the disastrous locking situation, but + * that's only a dream for now */ class lsvd_image { + private: + // no copying or moving + lsvd_image(const lsvd_image &) = delete; + lsvd_image operator=(const lsvd_image &) = delete; + lsvd_image(const lsvd_image &&) = delete; + lsvd_image operator=(const lsvd_image &&) = delete; + + // Log recovery + void read_superblock(); + void read_from_checkpoint(seqnum_t ckpt_id); + bool apply_log(seqnum_t seq); + + seqnum_t roll_forward_from_last_checkpoint(); + void recover_from_wlog(); + public: - std::string image_name; + lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg); + ~lsvd_image(); + std::string imgname; + uuid_t uuid; + usize size; // bytes lsvd_config cfg; - ssize_t size; // bytes + + rados_ioctx_t io; + + vec clones; // Base images on which we're built + vec checkpoints; // Checkpoints + std::map obj_info; // LBA -> object id, object offset - extmap::objmap map; + extmap::objmap objmap; std::shared_mutex map_lock; // LBA -> in-memory, higher priority than the object map @@ -40,21 +81,15 @@ class lsvd_image std::map buffers; std::shared_ptr objstore; - std::shared_ptr shared_cache; - std::unique_ptr wcache; + std::shared_ptr rcache; + std::unique_ptr wlog; std::unique_ptr xlate; - int write_fd; /* write cache file */ int refcount = 0; std::thread 
dbg; bool done = false; - lsvd_image() {} - ~lsvd_image(); - - int try_open(std::string name, rados_ioctx_t io); - class aio_request; class trivial_request; class read_request; @@ -64,7 +99,14 @@ class lsvd_image request *trim(size_t offset, size_t len, std::function cb); request *flush(std::function cb); + // Image management + // They all return 0 on success, -errno on failure + static int create_new(std::string name, usize size, rados_ioctx_t io); + static int get_uuid(std::string name, uuid_t &uuid, rados_ioctx_t io); + static int delete_image(std::string name, rados_ioctx_t io); + static int clone_image(std::string oldname, std::string newname, + rados_ioctx_t io); + private: - void handle_reads(size_t offset, smartiov iovs, - std::vector &requests); + void handle_reads(size_t offset, smartiov iovs, vec &requests); }; diff --git a/src/imgtool.cc b/src/imgtool.cc index 68f85cd2..00c030df 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -1,252 +1,143 @@ #include +#include +#include +#include #include #include #include -#include +#include #include #include #include #include "backend.h" -#include "config.h" -#include "fake_rbd.h" -#include "lsvd_types.h" +#include "image.h" #include "objects.h" -#include "translate.h" #include "utils.h" -enum tool_operation { - OP_CREATE = 1, - OP_DELETE = 2, - OP_INFO = 3, - OP_MKCACHE = 4, - OP_CLONE = 5 -}; - -const char *backend = "rados"; -const char *image_name; -const char *cache_dir; -const char *cache_dev; -cfg_cache_type cache_type = LSVD_CFG_READ; -enum tool_operation op; -const char *pool_name = "lsvd"; -size_t size = 0; - -static long parseint(const char *_s) +static usize parseint(str i) { - char *s = (char *)_s; - long val = strtol(s, &s, 0); - if (toupper(*s) == 'G') + usize processed; + auto val = std::stoll(i, &processed); + char *postfix = (char *)i.c_str() + processed; + + if (toupper(*postfix) == 'G') val *= (1024 * 1024 * 1024); - if (toupper(*s) == 'M') + if (toupper(*postfix) == 'M') val *= (1024 * 
1024); - if (toupper(*s) == 'K') + if (toupper(*postfix) == 'K') val *= 1024; + return val; } -static struct argp_option options[] = { - {"cache-dir", 'd', "DIR", 0, "cache directory", 0}, - {"create", 'C', 0, 0, "create image", 0}, - {"mkcache", 'k', "DEV", 0, "use DEV as cache", 0}, - {"cache-type", 't', "R/W", 0, - "R for read cache, W for write cache (default: R)", 0}, - {"size", 'z', "SIZE", 0, "size in bytes (M/G=2^20,2^30)", 0}, - {"delete", 'D', 0, 0, "delete image", 0}, - {"info", 'I', 0, 0, "show image information", 0}, - {"clone", 'c', "IMAGE", 0, "clone image", 0}, - {"pool", 'p', "POOL", 0, "pool name", 0}, - {0, 0, 0, 0, 0, 0}, -}; - -static char args_doc[] = "IMAGE"; - -extern int init_wcache(int fd, uuid_t &uuid, int n_pages); -int (*make_cache)(int fd, uuid_t &uuid, int n_pages) = init_wcache; - -static error_t parse_opt(int key, char *arg, struct argp_state *state) +static void create(rados_ioctx_t io, str name, usize size, bool thick) { - switch (key) { - case ARGP_KEY_ARG: - image_name = arg; - break; - case 'd': - cache_dir = arg; - break; - case 'C': - op = OP_CREATE; - break; - case 'z': - size = parseint(arg); - break; - case 'D': - op = OP_DELETE; - break; - case 'I': - op = OP_INFO; - break; - case 't': - if (arg[0] == 'R') { - cache_type = LSVD_CFG_READ; - log_error("read cache no longer supported"); - exit(1); - } else if (arg[0] == 'W') { - cache_type = LSVD_CFG_WRITE; - make_cache = init_wcache; - } else - argp_usage(state); - break; - case 'k': - op = OP_MKCACHE; - cache_dev = arg; - break; - case 'c': - op = OP_CLONE; - break; - case 'p': - pool_name = arg; - case ARGP_KEY_END: - if (op == 0 || (op == OP_CREATE && size == 0)) - argp_usage(state); - break; - } - return 0; + auto rc = lsvd_image::create_new(name, size, io); + THROW_MSG_ON(rc != 0, "Failed to create new image '{}'", name); } -static struct argp argp = {options, parse_opt, NULL, args_doc, 0, 0, 0}; +static void remove(rados_ioctx_t io, str name) +{ + auto rc = 
lsvd_image::delete_image(name, io); + THROW_MSG_ON(rc != 0, "Failed to delete image '{}'", name); +} -void info(rados_ioctx_t io, const char *image_name) +static void clone(rados_ioctx_t io, str src, str dst) { - lsvd_config cfg; - int rv; - if ((rv = cfg.read()) < 0) { - printf("error reading config: %d\n", rv); - exit(1); - } - auto objstore = get_backend(&cfg, io, NULL); - uuid_t uu; - if ((rv = translate_get_uuid(objstore, image_name, uu)) < 0) { - printf("error reading superblock: %d\n", rv); - exit(1); - } - auto rcache_file = cfg.cache_filename(uu, image_name, LSVD_CFG_READ); - auto wcache_file = cfg.cache_filename(uu, image_name, LSVD_CFG_WRITE); - printf("image: %s\n", image_name); - printf("read cache: %s\n", rcache_file.c_str()); - printf("write cache: %s\n", wcache_file.c_str()); - - char base_buf[4096]; - rv = objstore->read(image_name, 0, base_buf, sizeof(base_buf)); - if (rv < 0) - throw std::runtime_error("failed to read superblock"); - - auto base_hdr = (obj_hdr *)base_buf; - auto base_super = (super_hdr *)(base_hdr + 1); - - if (base_hdr->magic != LSVD_MAGIC || base_hdr->type != LSVD_SUPER) - throw std::runtime_error("corrupt superblock"); - - char uuid_str[64]; - uuid_unparse_lower(base_hdr->vol_uuid, uuid_str); - fmt::print("UUID: {}\n", uuid_str); - fmt::print("Size: {} bytes", base_super->vol_size * 512); - fmt::print(" / {} GiB\n", - (double)base_super->vol_size * 512. / 1024. / 1024. / 1024.); - fmt::print("Checkpoints: {}\n", base_super->ckpts_len / 4.); - fmt::print("Snapshots: {}\n", base_super->snaps_len / 4.); - fmt::print("Is a clone: {}\n", base_super->clones_len == 0 ? 
"no" : "yes"); - - // parse clones - if (base_super->clones_len == 0) - return; - - uint32_t consumed = 0; - while (consumed < base_super->clones_len) { - auto ci = - (clone_info *)(base_buf + base_super->clones_offset + consumed); - auto objname = (char *)(ci + 1); - auto upto_seq = ci->last_seq; - fmt::print("Base: {}, upto seq {}\n", objname, upto_seq); - consumed += sizeof(clone_info) + strlen(objname) + 1; - } + auto rc = lsvd_image::clone_image(src, dst, io); + THROW_MSG_ON(rc != 0, "Failed to clone image '{}' to '{}'", src, dst); } -void mk_cache(rados_ioctx_t io, const char *image_name, const char *dev_name, - cfg_cache_type type) +static void info(rados_ioctx_t io, str name) { - int rv, fd = open(dev_name, O_RDWR); - if (fd < 0) { - perror("device file open"); - exit(1); - } - auto sz = getsize64(fd); + auto be = make_rados_backend(io); + auto parser = object_reader(be); - lsvd_config cfg; - if ((rv = cfg.read()) < 0) { - printf("error reading config: %d\n", rv); - exit(1); - } - auto objstore = get_backend(&cfg, io, NULL); - uuid_t uu; - if ((rv = translate_get_uuid(objstore, image_name, uu)) < 0) { - printf("error reading superblock: %d\n", rv); - exit(1); - } - auto cache_file = cfg.cache_filename(uu, image_name, type); + auto sb = parser.read_superblock(name); + THROW_MSG_ON(!sb, "Superblock not found"); - auto n_pages = sz / 4096; - if (make_cache(fd, uu, n_pages) < 0) { - printf("make_cache failed\n"); - exit(1); - } - if ((rv = symlink(dev_name, cache_file.c_str())) < 0) { - perror("symbolic link"); - exit(1); - } - close(fd); + auto i = *sb; + char uuid_str[37]; + uuid_unparse_lower(i.uuid, uuid_str); + + using namespace fmt; + print("=== Image info ===\n"); + print("Name: {}\n", name); + print("UUID: {}\n", uuid_str); + print("Size: {} bytes / {} GiB\n", i.vol_size, + (double)i.vol_size / 1024 / 1024 / 1024); + print("Checkpoints: {}\n", i.ckpts); + + for (auto &c : i.clones) + print("Base: '{}' upto seq {}\n", c->name, c->last_seq); + + for (auto 
&c : i.snaps) + print("Snapshot: '{}' at seq {}\n", c->name, c->seq); } int main(int argc, char **argv) { - argp_parse(&argp, argc, argv, 0, 0, 0); - - setenv("LSVD_BACKEND", backend, 1); - if (cache_dir != NULL) { - if (cache_type == LSVD_CFG_READ) - setenv("LSVD_RCACHE_DIR", cache_dir, 1); - else - setenv("LSVD_WCACHE_DIR", cache_dir, 1); - } - - rados_t cluster; - int err = rados_create2(&cluster, "ceph", "client.admin", 0); - check_ret_neg(err, "Failed to create cluster handle"); - - err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); - check_ret_neg(err, "Failed to read config file"); - - err = rados_connect(cluster); - check_ret_neg(err, "Failed to connect to cluster"); - - rados_ioctx_t io_ctx; - err = rados_ioctx_create(cluster, pool_name, &io_ctx); - check_ret_neg(err, "Failed to connect to pool {}", pool_name); - - if (op == OP_CREATE && size > 0) - rbd_create(io_ctx, image_name, size, NULL); - else if (op == OP_DELETE) - rbd_remove(io_ctx, image_name); - else if (op == OP_INFO) - info(io_ctx, image_name); - else if (op == OP_MKCACHE) - mk_cache(io_ctx, image_name, cache_dev, cache_type); - else if (op == OP_CLONE) { - auto src_img = image_name; - auto dst_img = argv[argc - 1]; - fmt::print("cloning from {} to {}\n", src_img, dst_img); - rbd_clone(io_ctx, src_img, dst_img); + std::set_terminate([]() { + try { + std::cerr << boost::stacktrace::stacktrace(); + } catch (...) 
{ + } + std::abort(); + }); + + namespace po = boost::program_options; + po::options_description desc("Allowed options"); + + // clang-format off + desc.add_options() + ("help", "produce help message") + ("cmd", po::value(), "subcommand: create, clone, delete, info") + ("img", po::value(), "name of the iname") + ("pool", po::value(), "pool where the image resides") + ("size", po::value()->default_value("1G"), + "size in bytes (M=2^20,G=2^30)") + ("thick", po::value()->default_value(false), + "thick provision when creating an image (not currently supported)") + ("dest", po::value(), "destination (for clone)"); + // clang-format on + + po::positional_options_description p; + p.add("cmd", 1).add("pool", 1).add("img", 1); + + po::variables_map vm; + po::store( + po::command_line_parser(argc, argv).options(desc).positional(p).run(), + vm); + po::notify(vm); + + if (vm.count("help") || !vm.count("cmd") || !vm.count("pool") || + !vm.count("img")) { + std::cout << desc << "\n"; + return 1; } - rados_ioctx_destroy(io_ctx); - rados_shutdown(cluster); + auto cmd = vm["cmd"].as(); + auto pool = vm["pool"].as(); + auto img = vm["img"].as(); + + auto io = connect_to_pool(pool); + THROW_MSG_ON(io == nullptr, "Failed to connect to pool '{}'", pool); + + if (cmd == "create") { + auto size = parseint(vm["size"].as()); + auto thick = vm["thick"].as(); + create(io, img, size, thick); + } else if (cmd == "delete") + remove(io, img); + else if (cmd == "clone") { + THROW_MSG_ON(!vm.count("dest"), "Destination image not specified"); + auto dst = vm["dest"].as(); + clone(io, img, dst); + } else if (cmd == "info") + info(io, img); + else + THROW_MSG_ON(true, "Unknown command '{}'", cmd); + + rados_ioctx_destroy(io); } diff --git a/src/liblsvd.cc b/src/liblsvd.cc index dcd61bd3..6c9bd2c1 100644 --- a/src/liblsvd.cc +++ b/src/liblsvd.cc @@ -22,11 +22,10 @@ extern "C" int rbd_open(rados_ioctx_t io, const char *name, rbd_image_t *image, const char *snap_name) { - auto img = 
lsvd_spdk::open_image(io, name); - if (img == nullptr) { - log_error("Failed to open image {}", name); + auto img = lsvd_rbd::open_image(io, name); + if (img == nullptr) return -1; - } + *image = (void *)img; log_info("Opened image: {}, size {}", name, img->get_img().size); return 0; @@ -34,8 +33,8 @@ extern "C" int rbd_open(rados_ioctx_t io, const char *name, rbd_image_t *image, extern "C" int rbd_close(rbd_image_t image) { - lsvd_spdk *img = (lsvd_spdk *)image; - log_info("Closing image {}", img->get_img().image_name); + lsvd_rbd *img = (lsvd_rbd *)image; + log_info("Closing image {}", img->get_img().imgname); // poor man's race prevention. wait for in-flight requests sleep(2); @@ -47,14 +46,14 @@ extern "C" int rbd_close(rbd_image_t image) extern "C" int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; return img->poll_io_events(reinterpret_cast(comps), numcomp); } extern "C" int rbd_set_image_notification(rbd_image_t image, int fd, int type) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; assert(type == EVENT_TYPE_EVENTFD); event_socket ev(fd, EVENT_TYPE_EVENTFD); @@ -65,19 +64,19 @@ extern "C" int rbd_aio_create_completion(void *cb_arg, rbd_callback_t complete_cb, rbd_completion_t *c) { - auto nc = lsvd_spdk::create_completion(complete_cb, cb_arg); + auto nc = lsvd_rbd::create_completion(complete_cb, cb_arg); *c = (rbd_completion_t)nc; return 0; } extern "C" void rbd_aio_release(rbd_completion_t c) { - lsvd_spdk::release_completion((spdk_completion *)c); + lsvd_rbd::release_completion((spdk_completion *)c); } extern "C" int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len) { - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; auto req = img->trim(ofs, len, nullptr); req->run(nullptr); req->wait(); @@ -88,7 +87,7 @@ extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, 
rbd_completion_t c) { auto p = (spdk_completion *)c; - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; img->trim(off, len, p); p->run(); return 0; @@ -97,7 +96,7 @@ extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c) { auto *p = (spdk_completion *)c; - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; img->flush(p); p->run(); return 0; @@ -105,7 +104,7 @@ extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c) extern "C" int rbd_flush(rbd_image_t image) { - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; auto req = img->flush(nullptr); req->run(nullptr); req->wait(); @@ -128,7 +127,7 @@ extern "C" ssize_t rbd_aio_get_return_value(rbd_completion_t c) extern "C" int rbd_aio_read(rbd_image_t image, uint64_t offset, size_t len, char *buf, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto p = (spdk_completion *)c; img->read(offset, smartiov(buf, len), p); p->run(); @@ -138,7 +137,7 @@ extern "C" int rbd_aio_read(rbd_image_t image, uint64_t offset, size_t len, extern "C" int rbd_aio_readv(rbd_image_t image, const iovec *iov, int iovcnt, uint64_t offset, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto p = (spdk_completion *)c; img->read(offset, smartiov(iov, iovcnt), p); p->run(); @@ -148,7 +147,7 @@ extern "C" int rbd_aio_readv(rbd_image_t image, const iovec *iov, int iovcnt, extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, int iovcnt, uint64_t offset, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto *p = (spdk_completion *)c; img->write(offset, smartiov(iov, iovcnt), p); p->run(); @@ -158,7 +157,7 @@ extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, extern "C" int rbd_aio_write(rbd_image_t image, uint64_t offset, 
size_t len, const char *buf, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto *p = (spdk_completion *)c; img->write(offset, smartiov((char *)buf, len), p); p->run(); @@ -169,7 +168,7 @@ extern "C" int rbd_aio_write(rbd_image_t image, uint64_t offset, size_t len, */ extern "C" int rbd_read(rbd_image_t image, uint64_t off, size_t len, char *buf) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto req = img->read(off, smartiov(buf, len), NULL); req->run(NULL); req->wait(); @@ -180,7 +179,7 @@ extern "C" int rbd_read(rbd_image_t image, uint64_t off, size_t len, char *buf) extern "C" int rbd_write(rbd_image_t image, uint64_t off, size_t len, const char *buf) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto req = img->write(off, smartiov((char *)buf, len), NULL); req->run(NULL); req->wait(); @@ -201,7 +200,7 @@ extern "C" int rbd_aio_wait_for_complete(rbd_completion_t c) extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info, size_t infosize) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; memset(info, 0, sizeof(*info)); info->size = img->get_img().size; info->obj_size = 1 << 22; // 2^21 bytes @@ -212,7 +211,7 @@ extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info, extern "C" int rbd_get_size(rbd_image_t image, uint64_t *size) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; *size = img->get_img().size; return 0; } @@ -227,27 +226,13 @@ std::pair split_string(std::string s, extern "C" int rbd_create(rados_ioctx_t io, const char *name, uint64_t size, int *order) { - lsvd_config cfg; - if (cfg.read() < 0) - return -1; - auto objstore = get_backend(&cfg, io, NULL); - auto rv = translate_create_image(objstore, name, size); - return rv; + return lsvd_image::create_new(name, size, io); } extern "C" int rbd_clone(rados_ioctx_t io, const char *source_img, const char *dest_img) { 
- lsvd_config cfg; - if (cfg.read() < 0) { - throw std::runtime_error("Failed to read config"); - return -1; - } - - auto objstore = get_backend(&cfg, io, NULL); - auto rv = translate_clone_image(objstore, source_img, dest_img); - - return rv; + return lsvd_image::clone_image(source_img, dest_img, io); } /* remove all objects and cache file. @@ -257,25 +242,12 @@ extern "C" int rbd_clone(rados_ioctx_t io, const char *source_img, */ extern "C" int rbd_remove(rados_ioctx_t io, const char *name) { - lsvd_config cfg; - auto rv = cfg.read(); - if (rv < 0) - return rv; - auto objstore = get_backend(&cfg, io, NULL); - uuid_t uu; - if ((rv = translate_get_uuid(objstore, name, uu)) < 0) - return rv; - auto rcache_file = cfg.cache_filename(uu, name, LSVD_CFG_READ); - unlink(rcache_file.c_str()); - auto wcache_file = cfg.cache_filename(uu, name, LSVD_CFG_WRITE); - unlink(wcache_file.c_str()); - rv = translate_remove_image(objstore, name); - return rv; + return lsvd_image::delete_image(name, io); } extern "C" void rbd_uuid(rbd_image_t image, uuid_t *uuid) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; memcpy(uuid, img->get_img().xlate->uuid, sizeof(uuid_t)); } diff --git a/src/lsvd_types.h b/src/lsvd_types.h index 7837dac2..0ac51707 100644 --- a/src/lsvd_types.h +++ b/src/lsvd_types.h @@ -1,12 +1,13 @@ #pragma once +#include "utils.h" #include #include #include -#include -typedef int64_t sector_t; -typedef int page_t; +using sector_t = int64_t; +using page_t = int32_t; +using seqnum_t = uint32_t; enum lsvd_op { OP_READ = 2, OP_WRITE = 4 }; @@ -16,8 +17,7 @@ enum { LSVD_MAGIC = 0x4456534c }; * copy them into the provided output vector */ template -void decode_offset_len(char *buf, size_t offset, size_t len, - std::vector &vals) +void decode_offset_len(char *buf, size_t offset, size_t len, vec &vals) { T *p = (T *)(buf + offset), *end = (T *)(buf + offset + len); for (; p < end; p++) @@ -28,8 +28,7 @@ void decode_offset_len(char *buf, size_t 
offset, size_t len, * length field name_len. */ template -void decode_offset_len_ptr(char *buf, size_t offset, size_t len, - std::vector &vals) +void decode_offset_len_ptr(char *buf, size_t offset, size_t len, vec &vals) { T *p = (T *)(buf + offset), *end = (T *)(buf + offset + len); for (; p < end;) { @@ -62,3 +61,8 @@ class objname std::string str() { return name; } const char *c_str() { return name.c_str(); } }; + +static inline std::string oname(std::string prefix, uint32_t seq) +{ + return fmt::format("{}.{:08x}", prefix, seq); +} diff --git a/src/meson.build b/src/meson.build index 2544aa21..4ede957e 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,20 +1,16 @@ -lpthread = dependency('threads') -lz = dependency('zlib') -lfmt = dependency('fmt') -lboost = dependency('boost') -luring = dependency('liburing', static: true) -luuid = dependency('uuid') - cxx = meson.get_compiler('cpp') -lrados = cxx.find_library('rados', required: true) -ltcmalloc = cxx.find_library('tcmalloc', required: false) + +cmake = import('cmake') +# cmvars = cmake.subproject_options() +# cmvars.add_cmake_defines({'POSITION_INDEPENDENT_CODE': true}) +folly_cm = cmake.subproject('folly') +libfolly = folly_cm.dependency('folly') lsvd_src = files( 'config.cc', 'image.cc', 'liblsvd.cc', 'lsvd_debug.cc', - 'mkcache.cc', 'nvme.cc', 'objects.cc', 'rados_backend.cc', @@ -25,4 +21,19 @@ lsvd_src = files( ) lsvd_inc = include_directories('.') -lsvd_deps = [lpthread, lz, lfmt, lboost, luring, lrados, luuid, ltcmalloc] +lsvd_deps = [ + libfolly, + dependency('threads'), + dependency('zlib'), + dependency('fmt'), + dependency('boost'), + dependency('liburing', static: true), + dependency('uuid'), + cxx.find_library('rados', required: true), + cxx.find_library('jemalloc', required: false), +] + +spdk_fe = lsvd_src + files( + 'bdev_lsvd.cc', + 'spdk_frontend.cc', +) \ No newline at end of file diff --git a/src/misc_cache.h b/src/misc_cache.h index 633e3332..53f3c637 100644 --- a/src/misc_cache.h 
+++ b/src/misc_cache.h @@ -101,12 +101,12 @@ static inline void throw_fs_error(std::string msg) */ template class sized_vector { - std::vector *elements; + vec *elements; public: ~sized_vector() { delete elements; } - void init(int n) { elements = new std::vector(n); } - void init(int n, T val) { elements = new std::vector(n, val); } + void init(int n) { elements = new vec(n); } + void init(int n, T val) { elements = new vec(n, val); } T &operator[](int index) { return (*elements)[index]; } }; diff --git a/src/mkcache.cc b/src/mkcache.cc deleted file mode 100644 index e53bb7b2..00000000 --- a/src/mkcache.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* - * file: mkcache.cc - * description: create file containing write+read caches - * - * author: Peter Desnoyers, Northeastern University - * Copyright 2021, 2022 Peter Desnoyers - * license: GNU LGPL v2.1 or newer - * LGPL-2.1-or-later - */ - -#include -#include -#include -#include - -#include "journal.h" -#include "lsvd_types.h" - -int init_wcache(int fd, uuid_t &uuid, int n_pages) -{ - page_t w_pages = n_pages - 1; - page_t _map = div_round_up(w_pages, 256); - page_t _len = div_round_up(w_pages, 512); - page_t w_meta = 2 * (_map + _len); - char buf[4096]; - - w_pages -= w_meta; - - memset(buf, 0, sizeof(buf)); - auto w_super = (j_write_super *)buf; - *w_super = (j_write_super){LSVD_MAGIC, LSVD_J_W_SUPER, - 1, 1, - 1, 1, - 1 + w_meta, 1 + w_meta, - 1 + w_meta + w_pages, 1 + w_meta, - 0, 0, - 0, 0, - 0, 0, {0}}; - memcpy(w_super->vol_uuid, uuid, sizeof(uuid_t)); - if (!write(fd, buf, 4096)) { - perror("write cache write"); - return -1; - } - - memset(buf, 0, 4096); - for (int i = 1; i < 1 + w_pages + w_meta; i++) { - if (!write(fd, buf, 4096)) { - perror("write cache write"); - return -1; - } - } - - return 0; -} \ No newline at end of file diff --git a/src/nvme.cc b/src/nvme.cc index 07dd0b90..1c44a1ec 100644 --- a/src/nvme.cc +++ b/src/nvme.cc @@ -1,18 +1,16 @@ +#include +#include #include +#include #include #include #include 
-#include -#include - #include "lsvd_types.h" +#include "nvme.h" #include "request.h" #include "smartiov.h" #include "utils.h" -#include - -#include "nvme.h" void do_log(const char *, ...); diff --git a/src/objects.cc b/src/objects.cc index 70012db6..98b80c28 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -1,122 +1,241 @@ -#include +#include +#include #include +#include #include -#include "lsvd_debug.h" #include "lsvd_types.h" #include "objects.h" +#include "utils.h" -char *object_reader::read_object_hdr(const char *name, bool fast) +void serialise_common_hdr(vec &buf, obj_type t, seqnum_t s, u32 hdr, + u32 data, uuid_t &uuid) { - obj_hdr *h = (obj_hdr *)malloc(4096); - int rv; - if ((rv = objstore->read(name, 0, h, 4096)) < 0) - goto fail; - if (fast) - return (char *)h; - if (h->hdr_sectors > 8) { - size_t len = h->hdr_sectors * 512; - h = (obj_hdr *)realloc(h, len); - if (objstore->read(name, 0, h, len) < 0) - goto fail; - } - return (char *)h; -fail: - free((char *)h); - return NULL; + if (buf.size() < sizeof(common_obj_hdr)) + buf.resize(sizeof(common_obj_hdr)); + + auto h = (common_obj_hdr *)buf.data(); + h->magic = LSVD_MAGIC; + h->version = 1; + h->type = t; + h->seq = s; + h->hdr_sectors = hdr; + h->data_sectors = data; + h->crc = 0; + uuid_copy(h->vol_uuid, uuid); } -/* read all info from superblock, returns a vast number of things: - * [super, vol_size] = f(name, &ckpts, &clones, *&snaps): - * - super - pointer to buffer (must be freed) - * - vol_size - in bytes (-1 on failure) - * - ckpts, clones, snaps - what you'd expect - */ -std::pair -object_reader::read_super(const char *name, std::vector &ckpts, - std::vector &clones, - std::vector &snaps, uuid_t &uuid) +void serialise_superblock(vec &buf, vec &checkpoints, + vec &clones, uuid_t &uuid, usize vol_size) { - char *super_buf = read_object_hdr(name, false); - if (super_buf == NULL) - return std::make_pair((char *)NULL, -1); + // Reserve required space ahead of time + usize req_size = 
sizeof(common_obj_hdr) + sizeof(super_hdr); + req_size += checkpoints.size() * sizeof(seqnum_t); + for (auto &c : clones) + req_size += sizeof(clone_info) + round_up(c.name.size(), 8); + // TODO snapshots + req_size = std::max(req_size, 8ul); // minimum of 4096 bytes + req_size = round_up(req_size, 512); // round to sector boundary (why??) + + if (buf.size() < req_size) + buf.resize(req_size); + + auto bufp = buf.data(); // start of buffer + auto hdrp = (super_hdr *)(bufp + sizeof(common_obj_hdr)); + hdrp->vol_size = vol_size / 512; + + serialise_common_hdr(buf, OBJ_SUPERBLOCK, 0, req_size / 512, 0, uuid); - auto super_h = (obj_hdr *)super_buf; + // There are three variable-length arrays in the superblock: checkpoints, + // snapshots, and clones. The order doesn't matter, but we put clones first + // since that's effectively immutable. This means that the offset into + // everything else will not change over the lifetime of an image + // The checkpoints and snapshots are appended after that - if (super_h->magic != LSVD_MAGIC || super_h->version != 1 || - super_h->type != LSVD_SUPER) - return std::make_pair((char *)NULL, -1); - memcpy(uuid, super_h->vol_uuid, sizeof(uuid_t)); + // Also note that we should make sure that each clone/snapshot is 8-byte + // aligned in the buffer, as when we read them back to deserialise we end + // up with a bunch of pointers into the buffer and unaligned pointers will + // just make all of us sad. Fortunately we have c-style null-terminated + // strings so we can just pad with more null - super_hdr *super_sh = (super_hdr *)(super_h + 1); + // Part 1: clones. 
TODO skip on partial serialise + byte *clonep; + clonep = bufp + sizeof(common_obj_hdr) + sizeof(super_hdr); + hdrp->clones_offset = clonep - bufp; + for (auto &c : clones) { + auto padded_namelen = round_up(c.name.size(), 8); + auto record_len = sizeof(clone_info) + padded_namelen; + auto cip = (clone_info *)clonep; - decode_offset_len(super_buf, super_sh->ckpts_offset, - super_sh->ckpts_len, ckpts); - decode_offset_len_ptr(super_buf, super_sh->clones_offset, - super_sh->clones_len, clones); - decode_offset_len_ptr(super_buf, super_sh->snaps_offset, - super_sh->snaps_len, snaps); + cip->last_seq = c.last_seq; + uuid_copy(cip->vol_uuid, nullptr); // TODO + cip->name_len = padded_namelen; - return std::make_pair(super_buf, super_sh->vol_size * 512); + std::memset(cip->name, 0, padded_namelen); + std::memcpy(cip->name, c.name.c_str(), c.name.size()); + + clonep += record_len; + hdrp->clones_len += record_len; + } + + // Part 2: checkpoints + hdrp->ckpts_offset = clonep - bufp; + hdrp->ckpts_len = checkpoints.size() * sizeof(seqnum_t); + auto p = (seqnum_t *)(bufp + hdrp->ckpts_offset); + for (auto &c : checkpoints) + *p++ = c; + + // Part 3: snapshots + // TODO implement this when we get around to snapshots + hdrp->snaps_offset = 0; + hdrp->snaps_len = 0; } -/* read and decode the header of an object. 
Copies into arguments, - * frees all allocated memory - */ -ssize_t object_reader::read_data_hdr(const char *name, obj_hdr &h, - obj_data_hdr &dh, - std::vector &cleaned, - std::vector &dmap) +opt> object_reader::fetch_object_header(std::string objname) { - char *buf = read_object_hdr(name, false); - if (buf == NULL) - return -1; - auto tmp_h = (obj_hdr *)buf; - auto tmp_dh = (obj_data_hdr *)(tmp_h + 1); - if (tmp_h->type != LSVD_DATA) { - free(buf); - return -1; - } + vec buf(4096); + auto err = objstore->read(objname, 0, buf.data(), 4096); + RET_IF(err == -ENOENT, std::nullopt); + THROW_ERRNO_ON(err < 0, -err, "Failed to read object '{}' header", objname); + THROW_MSG_ON(err < 512, "Short read {}/512 on obj '{}'", err, objname); + + auto h = (common_obj_hdr *)buf.data(); + + // Validate magic + PR_RET_IF(h->magic != LSVD_MAGIC || h->version != 1, std::nullopt, + "Invalid magic or version in object '{}'", objname); - h = *tmp_h; - dh = *tmp_dh; + if (h->hdr_sectors <= 8) + return buf; - decode_offset_len(buf, tmp_dh->objs_cleaned_offset, - tmp_dh->objs_cleaned_len, cleaned); - decode_offset_len(buf, tmp_dh->data_map_offset, - tmp_dh->data_map_len, dmap); + // Header is longer than 4096, we have to fetch the rest + auto len = h->hdr_sectors * 512; + buf.reserve(len); + err = objstore->read(objname, 0, buf.data(), len); + PR_ERR_RET_IF(std::cmp_not_equal(err, len), std::nullopt, err, + "Failed to read object '{}' header", objname); - free(buf); - return 0; + return buf; } -/* read and decode a checkpoint object identified by sequence number +/* buf[offset ... offset+len] contains array of type T, with variable + * length field name_len. 
*/ -ssize_t object_reader::read_checkpoint(const char *name, uint64_t &cache_seq, - std::vector &ckpts, - std::vector &objects, - std::vector &deletes, - std::vector &dmap) +template +void deserialise_offset_ptr(char *buf, size_t offset, size_t len, + vec &vals) +{ + T *p = (T *)(buf + offset), *end = (T *)(buf + offset + len); + for (; p < end;) { + vals.push_back(p); + p = (T *)((char *)p + sizeof(T) + p->name_len); + } +} + +template vec deserialise_cpy(byte *buf, usize offset, usize len) +{ + vec ret; + for (usize i = 0; i < len / sizeof(T); i++) { + T *p = (T *)(buf + offset + i * sizeof(T)); + ret.push_back(*p); + } + return ret; +} + +template +vec deserialise_ptrs(byte *buf, usize offset, usize len) { - char *buf = read_object_hdr(name, false); - if (buf == NULL) { - do_log("buf == NULL\n"); - return -1; + vec ret; + for (usize i = 0; i < len / sizeof(T); i++) { + T *p = (T *)(buf + offset + i * sizeof(T)); + ret.push_back(p); } - auto h = (obj_hdr *)buf; - auto ch = (obj_ckpt_hdr *)(h + 1); - if (h->type != LSVD_CKPT) { - do_log("%s: WRONG TYPE %d\n", name, h->type); - free(buf); - return -1; + return ret; +} + +template +vec deserialise_ptrs_with_len(byte *buf, usize offset, usize len) +{ + vec ret; + byte *p = buf + offset; + for (; p < buf + offset + len;) { + ret.push_back((T *)p); + p += sizeof(T) + ((T *)p)->name_len; } - cache_seq = ch->cache_seq; - decode_offset_len(buf, ch->ckpts_offset, ch->ckpts_len, ckpts); - decode_offset_len(buf, ch->objs_offset, ch->objs_len, objects); - decode_offset_len(buf, ch->deletes_offset, ch->deletes_len, - deletes); - decode_offset_len(buf, ch->map_offset, ch->map_len, dmap); - - free(buf); - return 0; + return ret; } + +opt object_reader::read_superblock(std::string oname) +{ + auto buf = objstore->read_whole_obj(oname); + PASSTHRU_NULLOPT(buf); + auto hdr = (common_obj_hdr *)buf->data(); + auto hbuf = buf->data(); + + PR_RET_IF(hdr->magic != LSVD_MAGIC, std::nullopt, + "Corrupt object; invalid magic at '{}', found 
{:x}", oname, + hdr->magic); + PR_RET_IF(hdr->version != 1, std::nullopt, + "Invalid version in object '{}', only 1 is supported", oname); + PR_RET_IF(hdr->type != OBJ_SUPERBLOCK, std::nullopt, + "Obj '{}' not a superblock", oname); + + parsed_superblock ret; + super_hdr *shdr = (super_hdr *)(hdr + 1); + + ret.ckpts = deserialise_cpy(hbuf, shdr->ckpts_offset, shdr->ckpts_len); + ret.clones = deserialise_ptrs_with_len( + hbuf, shdr->clones_offset, shdr->clones_len); + ret.snaps = deserialise_ptrs_with_len(hbuf, shdr->snaps_offset, + shdr->snaps_len); + + ret.superblock_buf = *buf; + uuid_copy(ret.uuid, hdr->vol_uuid); + ret.vol_size = shdr->vol_size * 512; + + return ret; +} + +opt object_reader::read_data_hdr(std::string oname) +{ + auto hdr = fetch_object_header(oname); + PASSTHRU_NULLOPT(hdr); + + parsed_data_hdr h; + h.buf = std::move(*hdr); + h.hdr = (common_obj_hdr *)h.buf.data(); + PR_RET_IF(h.hdr->type != OBJ_LOGDATA, std::nullopt, + "Invalid object type in '{}'", oname); + h.data_hdr = (obj_data_hdr *)(h.buf.data() + sizeof(common_obj_hdr)); + + auto buf = h.buf.data(); + h.cleaned = deserialise_ptrs( + buf, h.data_hdr->objs_cleaned_offset, h.data_hdr->objs_cleaned_len); + h.data_map = deserialise_ptrs(buf, h.data_hdr->data_map_offset, + h.data_hdr->data_map_len); + return h; +} + +opt object_reader::read_checkpoint(std::string oname) +{ + auto hdr = fetch_object_header(oname); + PASSTHRU_NULLOPT(hdr); + + parsed_checkpoint ret; + ret.hdr = (common_obj_hdr *)hdr->data(); + PR_RET_IF(ret.hdr->type != OBJ_CHECKPOINT, std::nullopt, + "Invalid object type it '{}'", oname); + ret.ckpt_hdr = (obj_ckpt_hdr *)(&hdr->at(sizeof(common_obj_hdr))); + + auto buf = hdr->data(); + ret.ckpts = deserialise_cpy(buf, ret.ckpt_hdr->ckpts_offset, + ret.ckpt_hdr->ckpts_len); + ret.objects = deserialise_ptrs(buf, ret.ckpt_hdr->objs_offset, + ret.ckpt_hdr->objs_len); + ret.deletes = deserialise_ptrs( + buf, ret.ckpt_hdr->deletes_offset, ret.ckpt_hdr->deletes_len); + ret.dmap = 
deserialise_ptrs(buf, ret.ckpt_hdr->map_offset, + ret.ckpt_hdr->map_len); + ret.buf = std::move(*hdr); + return ret; +} \ No newline at end of file diff --git a/src/objects.h b/src/objects.h index e588b4a3..a49204de 100644 --- a/src/objects.h +++ b/src/objects.h @@ -1,25 +1,23 @@ #pragma once -#include +#include #include -#include #include "backend.h" +#include "lsvd_types.h" +#include "utils.h" #if __BYTE_ORDER != __LITTLE_ENDIAN #error "this code is little-endian only" #endif -/* for now we'll use 32-bit object sequence numbers. So sue me... - */ - -enum obj_type { LSVD_SUPER = 1, LSVD_DATA = 2, LSVD_CKPT = 3 }; +enum obj_type { OBJ_SUPERBLOCK = 1, OBJ_LOGDATA = 2, OBJ_CHECKPOINT = 3 }; /* hdr - standard header for all backend objects * total length is hdr_sectors + data_sectors, in 512-byte units * name is for superblock, (.%08x % seq) otherwise */ -struct obj_hdr { +struct common_obj_hdr { uint32_t magic; uint32_t version; // 1 uuid_t vol_uuid; @@ -42,7 +40,7 @@ struct obj_hdr { * snaps : TBD */ struct super_hdr { - uint64_t vol_size; + uint64_t vol_size; // in 512 byte sectors uint32_t ckpts_offset; uint32_t ckpts_len; uint32_t clones_offset; // array of struct clone @@ -138,28 +136,64 @@ struct ckpt_mapentry { /* ------ helper functions -------- */ +struct parsed_superblock { + vec superblock_buf; // buffer containing superblock + usize vol_size; // size in bytes + vec ckpts; // checkpoint sequence numbers + vec clones; // ptrs are into the buffer + vec snaps; // ptrs are into the buffer + uuid_t uuid; +}; + +struct parsed_data_hdr { + vec buf; + common_obj_hdr *hdr; + obj_data_hdr *data_hdr; + vec cleaned; + vec data_map; +}; + +struct parsed_checkpoint { + vec buf; + common_obj_hdr *hdr; + obj_ckpt_hdr *ckpt_hdr; + vec ckpts; + vec objects; + vec deletes; + vec dmap; +}; + class object_reader { - std::shared_ptr objstore; + sptr objstore; public: object_reader(std::shared_ptr be) : objstore(be) {} - char *read_object_hdr(const char *name, bool fast); + opt> 
fetch_object_header(std::string oname); + opt read_superblock(std::string oname); + opt read_data_hdr(std::string oname); + opt read_checkpoint(std::string oname); +}; - std::pair read_super(const char *name, - std::vector &ckpts, - std::vector &clones, - std::vector &snaps, - uuid_t &uuid); +// ----- common image types, temporary(T&Cs apply) workaround ----- - ssize_t read_data_hdr(const char *name, obj_hdr &h, obj_data_hdr &dh, - std::vector &cleaned, - std::vector &dmap); +struct clone_base { + std::string name; + seqnum_t last_seq; + seqnum_t first_seq = 0; +}; - ssize_t read_checkpoint(const char *name, uint64_t &cache_seq, - std::vector &ckpts, - std::vector &objects, - std::vector &deletes, - std::vector &dmap); +struct data_obj_info { + sector_t hdr; + sector_t data; + sector_t live; }; + +void serialise_common_hdr(vec &buf, obj_type t, seqnum_t s, u32 hdr, + u32 data, uuid_t &uuid); + +// Serialise a superblock object. +void serialise_superblock(vec &buf, vec &checkpoints, + vec &clones, uuid_t &uuid, + usize vol_size); diff --git a/src/rados_backend.cc b/src/rados_backend.cc index 6e62373d..74ab05a0 100644 --- a/src/rados_backend.cc +++ b/src/rados_backend.cc @@ -131,6 +131,8 @@ class rados_backend : public backend librados::IoCtx::from_rados_ioctx_t(ctx_, this->ctx); } + ~rados_backend() override {} + int write(std::string name, smartiov &iov) override { auto req = dynamic_cast(aio_write(name, iov)); @@ -163,9 +165,62 @@ class rados_backend : public backend { return new rados_delete_req(ctx, name); } + + bool exists(std::string name) override + { + return ctx.stat(name, nullptr, nullptr) == 0; + } + + opt get_size(std::string name) override + { + u64 size; + time_t mtime; + int rv = ctx.stat(name, &size, &mtime); + switch (rv) { + case 0: + return size; + case -ENOENT: + return std::nullopt; + default: + THROW_ERRNO_ON(true, -rv, "Failed to stat object '{}'", name); + } + } + + opt> read_whole_obj(std::string name) override + { + auto size = 
get_size(name); + PASSTHRU_NULLOPT(size); + + vec buf(size.value()); + smartiov iov((char *)buf.data(), buf.size()); + auto r = read(name, 0, iov); + if (r < 0) + return std::nullopt; + + return buf; + } }; std::shared_ptr make_rados_backend(rados_ioctx_t io) { return std::make_shared(io); } + +rados_ioctx_t connect_to_pool(str pool_name) +{ + rados_t cluster; + int err = rados_create2(&cluster, "ceph", "client.admin", 0); + check_ret_neg(err, "Failed to create cluster handle"); + + err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); + check_ret_neg(err, "Failed to read config file"); + + err = rados_connect(cluster); + check_ret_neg(err, "Failed to connect to cluster"); + + rados_ioctx_t io_ctx; + err = rados_ioctx_create(cluster, pool_name.c_str(), &io_ctx); + check_ret_neg(err, "Failed to connect to pool {}", pool_name); + + return io_ctx; +} diff --git a/src/shared_read_cache.cc b/src/shared_read_cache.cc index ee0b148f..0a74887a 100644 --- a/src/shared_read_cache.cc +++ b/src/shared_read_cache.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "lsvd_types.h" #include "nvme.h" @@ -85,13 +86,13 @@ class shared_read_cache void *pending_fill_data = nullptr; // Keep track of pending reads - std::vector pending_reads; + vec pending_reads; // Keep track of the reverse map so we can evict this entry chunk_key key; }; - std::vector cache_state; + vec cache_state; std::mutex global_cache_lock; std::mutex cache_stats_lock; @@ -351,7 +352,7 @@ class shared_read_cache::cache_miss_request : public self_refcount_request { is_backend_done = true; - std::vector reqs; + vec reqs; { std::unique_lock lock(cache.global_cache_lock); @@ -439,7 +440,7 @@ shared_read_cache::shared_read_cache(std::string cache_path, user_bytes(tag::rolling_window::window_size = CACHE_STATS_WINDOW), backend_bytes(tag::rolling_window::window_size = CACHE_STATS_WINDOW) { - debug("Opening {} for the read cache", cache_path); + trace("Opening {} for the read cache", cache_path); fd = 
open(cache_path.c_str(), O_RDWR | O_CREAT | O_TRUNC, 0777); check_ret_errno(fd, "failed to open cache file"); @@ -458,7 +459,7 @@ shared_read_cache::shared_read_cache(std::string cache_path, // CACHE_CHUNK_SIZE); cache_store = std::unique_ptr(make_nvme_uring(fd, "rcache_uring")); - cache_state = std::vector(num_cache_blocks); + cache_state = vec(num_cache_blocks); } shared_read_cache::~shared_read_cache() {} @@ -685,7 +686,7 @@ class sharded_cache : public read_cache // TODO in-place obj instead of uptrs, don't understand why we can't have // just a vector of the plain object - std::vector> shards; + vec> shards; // centralise the reporter std::thread cache_stats_reporter; diff --git a/src/smartiov.h b/src/smartiov.h index efc720b7..f32a34a6 100644 --- a/src/smartiov.h +++ b/src/smartiov.h @@ -9,7 +9,7 @@ */ class smartiov { - std::vector iovs; + vec iovs; public: smartiov() {} diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc new file mode 100644 index 00000000..e22681be --- /dev/null +++ b/src/spdk_frontend.cc @@ -0,0 +1,245 @@ +#include "spdk/event.h" +#include "spdk/nvme.h" +#include "spdk/nvmf.h" +#include +#include +#include +#include + +#include "backend.h" +#include "bdev_lsvd.h" +#include "spdk/nvmf_spec.h" +#include "utils.h" + +const char *NVME_SS_NQN = "nqn.2019-05.io.lsvd:cnode1"; +const char *HOSTNAME = "127.0.0.1"; +const char *PORT = "4420"; + +spdk_nvme_transport_id get_trid(const char *host, const char *port) +{ + spdk_nvme_transport_id trid; + // They're fixed-size char[] bufs in the struct, so make sure we have space + assert(strlen(host) < sizeof(trid.traddr)); + assert(strlen(port) < sizeof(trid.trsvcid)); + std::copy(host, host + strlen(host), trid.traddr); + std::copy(port, port + strlen(port), trid.trsvcid); + trid.trtype = SPDK_NVME_TRANSPORT_TCP; + trid.adrfam = SPDK_NVMF_ADRFAM_IPV4; + // This is required because spdk looks at trstring, not the trtype + spdk_nvme_transport_id_populate_trstring( + &trid, 
spdk_nvme_transport_id_trtype_str(trid.trtype)); + return trid; +} + +using IntCallbackFn = std::function; +IntCallbackFn *alloc_cb(std::function cb) +{ + return new IntCallbackFn(cb); +} + +void invoke_and_free_cb(void *ctx, int status) +{ + auto cb = static_cast *>(ctx); + (*cb)(status); + delete cb; +} + +struct start_lsvd_args { + const char *pool_name; + const char *image_name; +}; + +spdk_nvmf_tgt *create_target() +{ + debug("Creating NVMF target"); + spdk_nvmf_target_opts opts = { + .name = "lsvd_nvmf_tgt", + .discovery_filter = SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY, + }; + auto tgt = spdk_nvmf_tgt_create(&opts); + assert(tgt != nullptr); + + auto pg = spdk_nvmf_poll_group_create(tgt); + assert(pg != nullptr); + + return tgt; +} + +spdk_nvmf_subsystem *add_discovery_ss(spdk_nvmf_tgt *tgt) +{ + debug("Creating NVMF discovery subsystem"); + auto ss = spdk_nvmf_subsystem_create( + tgt, SPDK_NVMF_DISCOVERY_NQN, SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT, 0); + assert(ss != nullptr); + spdk_nvmf_subsystem_set_allow_any_host(ss, true); + return ss; +} + +spdk_nvmf_subsystem *add_nvme_ss(spdk_nvmf_tgt *tgt) +{ + debug("Creating SPDK controller subsystem"); + auto ss = + spdk_nvmf_subsystem_create(tgt, NVME_SS_NQN, SPDK_NVMF_SUBTYPE_NVME, 1); + assert(ss != nullptr); + spdk_nvmf_subsystem_set_allow_any_host(ss, true); + spdk_nvmf_subsystem_set_sn(ss, "SPDK_000001"); + spdk_nvmf_subsystem_set_mn(ss, "LSVD NVMe controller"); + spdk_nvmf_subsystem_set_ana_reporting(ss, true); + return ss; +} + +using TranspCb = std::function; +void create_tcp_transport(TranspCb *cb) +{ + debug("Creating TCP transport"); + spdk_nvmf_transport_opts opts; + auto succ = spdk_nvmf_transport_opts_init("TCP", &opts, sizeof(opts)); + assert(succ == true); + opts.io_unit_size = 131072; + opts.max_qpairs_per_ctrlr = 8; + opts.in_capsule_data_size = 8192; + debug("TCP transport opts: io_unit_size={}, max_qpairs_per_ctrlr={}, " + "in_capsule_data_size={}", + opts.io_unit_size, opts.max_qpairs_per_ctrlr, + 
opts.in_capsule_data_size); + + auto rc = spdk_nvmf_transport_create_async( + "TCP", &opts, + [](auto ctx, auto r) { + auto cb = static_cast(ctx); + (*cb)(r); + delete cb; + }, + cb); + assert(rc == 0); +} + +void add_tgt_transport(spdk_nvmf_tgt *tgt, spdk_nvmf_transport *tr, + std::function *cb) +{ + debug("Adding transport to target"); + spdk_nvmf_tgt_add_transport(tgt, tr, invoke_and_free_cb, cb); +} + +void start_tgt_listen(spdk_nvmf_tgt *tgt, spdk_nvme_transport_id trid) +{ + spdk_nvmf_listen_opts lopts; + spdk_nvmf_listen_opts_init(&lopts, sizeof(lopts)); + auto rc = spdk_nvmf_tgt_listen_ext(tgt, &trid, &lopts); + assert(rc == 0); +} + +void add_ss_listener(spdk_nvmf_tgt *tgt, spdk_nvmf_subsystem *ss, + spdk_nvme_transport_id trid, std::function *cb) +{ + debug("Adding listener to subsystem"); + + spdk_nvmf_listener_opts lopts; + spdk_nvmf_subsystem_listener_opts_init(&lopts, sizeof(lopts)); + lopts.secure_channel = false; + spdk_nvmf_subsystem_add_listener_ext(ss, &trid, invoke_and_free_cb, cb, + &lopts); +} + +void add_bdev_ns(spdk_nvmf_subsystem *ss, str bdev_name) +{ + debug("Adding bdev namespace to subsystem"); + spdk_nvmf_ns_opts nopts; + spdk_nvmf_ns_opts_get_defaults(&nopts, sizeof(nopts)); + auto nsid = spdk_nvmf_subsystem_add_ns_ext(ss, bdev_name.c_str(), &nopts, + sizeof(nopts), nullptr); + assert(nsid != 0); +} + +void start_ss(spdk_nvmf_subsystem *ss, std::function *cb) +{ + // debug("Starting subsystem"); + spdk_nvmf_subsystem_start( + ss, + [](auto ss, auto arg, auto rc) { + auto cb = static_cast *>(arg); + (*cb)(rc); + delete cb; + }, + cb); +} + +static void start_lsvd(void *arg) +{ + log_info("Starting LSVD SPDK program ..."); + auto args = (start_lsvd_args *)arg; + + auto io_ctx = connect_to_pool(args->pool_name); + + // Setup spdk nvmf + auto tgt = create_target(); + auto disc_ss = add_discovery_ss(tgt); + auto nvme_ss = add_nvme_ss(tgt); + auto trid = get_trid(HOSTNAME, PORT); + + // Add lsvd bdev + lsvd_config cfg; // TODO read this in 
from a config file + cfg.cache_size = 160 * 1024 * 1024; // small 160mb cache for testing + auto err = bdev_lsvd_create(args->image_name, io_ctx, cfg); + assert(err == 0); + add_bdev_ns(nvme_ss, args->image_name); + + // some stupid formatting decisions up ahead due to tower-of-callback + // it also looks cleaner without indents + // clang-format off + create_tcp_transport(new TranspCb([=](auto *tr) { + assert(tr != nullptr); + + add_tgt_transport(tgt, tr, alloc_cb([=](int rc) { + assert(rc == 0); + + start_tgt_listen(tgt, trid); + add_ss_listener(tgt, disc_ss, trid, alloc_cb([=](int) { + add_ss_listener(tgt, nvme_ss, trid, alloc_cb([=](int rc) { + assert(rc == 0); + + // Start both subsystems + start_ss(nvme_ss, alloc_cb([=](int) { + start_ss(disc_ss, alloc_cb([=](int) { + + log_info("LSVD SPDK program started successfully"); + + })); })); })); })); })); })); + // clang-format on +} + +int main(int argc, const char **argv) +{ + std::set_terminate([]() { + try { + std::cerr << boost::stacktrace::stacktrace(); + } catch (...) 
{ + } + std::abort(); + }); + + if (argc < 3) { + log_error("Usage: {} ", argv[0]); + return 1; + } + + auto args = (start_lsvd_args){ + .pool_name = argv[1], + .image_name = argv[2], + }; + debug("Args: pool={}, image={}", args.pool_name, args.image_name); + + spdk_app_opts opts = {.shutdown_cb = []() { + log_info("Shutting down LSVD SPDK program ..."); + spdk_app_stop(0); + }}; + + spdk_app_opts_init(&opts, sizeof(opts)); + opts.name = "spdk_frontend"; + + int rc = spdk_app_start(&opts, start_lsvd, &args); + spdk_app_fini(); + + log_info("Exiting ..."); + return rc; +} diff --git a/src/spdk_wrap.cc b/src/spdk_wrap.cc index e0103ca5..d7d90941 100644 --- a/src/spdk_wrap.cc +++ b/src/spdk_wrap.cc @@ -1,4 +1,6 @@ #include "spdk_wrap.h" +#include "config.h" +#include "src/utils.h" spdk_completion::spdk_completion(rbd_callback_t cb, void *cb_arg) : cb(cb), cb_arg(cb_arg) @@ -11,7 +13,7 @@ spdk_completion::~spdk_completion() req->release(); } -void spdk_completion::delayed_init(lsvd_spdk *img, request *req) +void spdk_completion::delayed_init(lsvd_rbd *img, request *req) { this->img = img; this->req = req; @@ -54,31 +56,37 @@ inline void spdk_completion::dec_and_free() delete this; } -lsvd_spdk *lsvd_spdk::open_image(rados_ioctx_t io, std::string name) +lsvd_rbd *lsvd_rbd::open_image(rados_ioctx_t io, std::string name) { - auto img = new lsvd_spdk(); - try { - img->img.try_open(name, io); + lsvd_config cfg; + auto err = cfg.read(); + PR_ERR_RET_IF(err < 0, nullptr, -err, "Failed to read config"); + + return new lsvd_rbd(name, io, cfg); } catch (std::runtime_error &e) { log_error("Failed to open image: {}", e.what()); - delete img; return nullptr; } +} - return img; +void lsvd_rbd::close_image() { delete this; } + +lsvd_rbd::lsvd_rbd(std::string name, rados_ioctx_t io, lsvd_config cfg) + : img(name, io, cfg) +{ } -void lsvd_spdk::close_image() { delete this; } +lsvd_rbd::~lsvd_rbd() {} -spdk_completion *lsvd_spdk::create_completion(rbd_callback_t cb, void *cb_arg) 
+spdk_completion *lsvd_rbd::create_completion(rbd_callback_t cb, void *cb_arg) { return new spdk_completion(cb, cb_arg); } -void lsvd_spdk::release_completion(spdk_completion *c) { c->release(); } +void lsvd_rbd::release_completion(spdk_completion *c) { c->release(); } -void lsvd_spdk::on_request_complete(spdk_completion *c) +void lsvd_rbd::on_request_complete(spdk_completion *c) { std::unique_lock lk(completions_mtx); if (ev.has_value()) { @@ -87,13 +95,13 @@ void lsvd_spdk::on_request_complete(spdk_completion *c) } } -int lsvd_spdk::switch_to_poll(event_socket &&ev) +int lsvd_rbd::switch_to_poll(event_socket &&ev) { this->ev = std::move(ev); return 0; } -int lsvd_spdk::poll_io_events(spdk_completion **comps, int numcomp) +int lsvd_rbd::poll_io_events(spdk_completion **comps, int numcomp) { assert(ev.has_value()); @@ -117,34 +125,34 @@ std::function make_cb(spdk_completion *c) return [c](int rv) { c->complete(rv); }; } -void init_completion(spdk_completion *c, lsvd_spdk *img, request *req) +void init_completion(spdk_completion *c, lsvd_rbd *img, request *req) { if (c != nullptr) c->delayed_init(img, req); } -request *lsvd_spdk::read(size_t offset, smartiov iov, spdk_completion *c) +request *lsvd_rbd::read(size_t offset, smartiov iov, spdk_completion *c) { auto req = img.read(offset, iov, make_cb(c)); init_completion(c, this, req); return req; } -request *lsvd_spdk::write(size_t offset, smartiov iov, spdk_completion *c) +request *lsvd_rbd::write(size_t offset, smartiov iov, spdk_completion *c) { auto req = img.write(offset, iov, make_cb(c)); init_completion(c, this, req); return req; } -request *lsvd_spdk::trim(size_t offset, size_t len, spdk_completion *c) +request *lsvd_rbd::trim(size_t offset, size_t len, spdk_completion *c) { auto req = img.trim(offset, len, make_cb(c)); init_completion(c, this, req); return req; } -request *lsvd_spdk::flush(spdk_completion *c) +request *lsvd_rbd::flush(spdk_completion *c) { auto req = img.flush(make_cb(c)); init_completion(c, 
this, req); diff --git a/src/spdk_wrap.h b/src/spdk_wrap.h index 5f85f01d..82b7ac09 100644 --- a/src/spdk_wrap.h +++ b/src/spdk_wrap.h @@ -5,7 +5,7 @@ #include "fake_rbd.h" #include "image.h" -class lsvd_spdk; +class lsvd_rbd; class lsvd_image; class spdk_completion @@ -19,7 +19,7 @@ class spdk_completion rbd_callback_t cb; - lsvd_spdk *img = nullptr; + lsvd_rbd *img = nullptr; int retval = -1; request *req = nullptr; @@ -31,7 +31,7 @@ class spdk_completion spdk_completion(rbd_callback_t cb, void *cb_arg); ~spdk_completion(); - void delayed_init(lsvd_spdk *img, request *req); + void delayed_init(lsvd_rbd *img, request *req); void run(); void wait(); @@ -74,13 +74,16 @@ struct event_socket { /** * Wrapper around lsvd_image for SPDK's RBD api */ -class lsvd_spdk +class lsvd_rbd { public: - static lsvd_spdk *open_image(rados_ioctx_t io, std::string name); + static lsvd_rbd *open_image(rados_ioctx_t io, std::string name); void close_image(); private: + lsvd_rbd(str name, rados_ioctx_t io, lsvd_config cfg); + ~lsvd_rbd(); + lsvd_image img; std::queue completions; diff --git a/src/thick-image.cc b/src/thick-image.cc index 5c575cb5..8b8cd473 100644 --- a/src/thick-image.cc +++ b/src/thick-image.cc @@ -114,8 +114,8 @@ void create_thick(char *name, long size) size_t data_bytes = data_sectors * 512; auto hdr_buf = (char *)calloc(data_bytes + 4096, 1); - auto h = (obj_hdr *)hdr_buf; - *h = {LSVD_MAGIC, 1, {0}, LSVD_DATA, 0, 8, data_sectors, 0}; + auto h = (common_obj_hdr *)hdr_buf; + *h = {LSVD_MAGIC, 1, {0}, OBJ_LOGDATA, 0, 8, data_sectors, 0}; memcpy(h->vol_uuid, uu, sizeof(uu)); auto dh = (obj_data_hdr *)(h + 1); @@ -160,13 +160,13 @@ void create_thick(char *name, long size) uint32_t objmap_bytes = n_objs * sizeof(ckpt_obj); uint32_t extmap_bytes = n_objs * sizeof(ckpt_mapentry); - int ckpt_bytes = - sizeof(obj_hdr) + sizeof(obj_ckpt_hdr) + objmap_bytes + extmap_bytes; + int ckpt_bytes = sizeof(common_obj_hdr) + sizeof(obj_ckpt_hdr) + + objmap_bytes + extmap_bytes; 
uint32_t ckpt_sectors = div_round_up(ckpt_bytes, 512); auto ckpt_buf = (char *)calloc(ckpt_sectors * 512, 1); - auto ch = (obj_hdr *)ckpt_buf; - *ch = {LSVD_MAGIC, 1, {0}, LSVD_CKPT, 0, ckpt_sectors, 0, 0}; + auto ch = (common_obj_hdr *)ckpt_buf; + *ch = {LSVD_MAGIC, 1, {0}, OBJ_CHECKPOINT, 0, ckpt_sectors, 0, 0}; memcpy(ch->vol_uuid, uu, sizeof(uu)); auto cph = (obj_ckpt_hdr *)(ch + 1); @@ -204,11 +204,11 @@ void create_thick(char *name, long size) /* now write the superblock, with a single checkpoint pointer */ auto sb_data = (char *)calloc(4096, 1); - auto h2 = (obj_hdr *)sb_data; + auto h2 = (common_obj_hdr *)sb_data; auto sh = (super_hdr *)(h2 + 1); int ckpt_offset = sizeof(*h2) + sizeof(*sh); - *h2 = {LSVD_MAGIC, 1, {0}, LSVD_SUPER, 0, 8, 0, 0}; + *h2 = {LSVD_MAGIC, 1, {0}, OBJ_SUPERBLOCK, 0, 8, 0, 0}; memcpy(h2->vol_uuid, uu, sizeof(uu)); sh->vol_size = img_size / 512; diff --git a/src/translate.cc b/src/translate.cc index ff5ab3fb..0806980c 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -1,9 +1,7 @@ #include #include -#include -#include +#include #include -#include #include #include #include @@ -13,10 +11,7 @@ #include #include "extent.h" -#include "lsvd_debug.h" -#include "lsvd_types.h" #include "misc_cache.h" -#include "objects.h" #include "request.h" #include "src/utils.h" #include "translate.h" @@ -89,7 +84,7 @@ class translate_req : public request char *gc_data = NULL; // passed in by GC thread /* lba/len/obj/offset (ignore obj/offset for REQ_PUT) */ - std::vector entries; + vec entries; /* used for removing from map */ char *local_buf_base = NULL; @@ -115,7 +110,11 @@ class translate_req : public request return len + bytes <= max; } - ~translate_req() {} + ~translate_req() + { + if (batch_buf) + free(batch_buf); + } /* NOTE - this assumes the only significant header entry is the map */ @@ -132,7 +131,6 @@ class translate_req : public request local_buf_base = data_ptr; local_buf_limit = data_ptr + bytes; } - translate_req(work_type op_, 
translate_impl *tx_) { op = op_; @@ -155,67 +153,49 @@ class translate_req : public request class translate_impl : public translate { - /* lock ordering: lock m before *map_lock - */ - std::mutex m; // for things in this instance - extmap::objmap *map; // shared object map - extmap::bufmap *bufmap; // shared object map - std::shared_mutex *map_lock; // locks the object map - std::mutex *bufmap_lock; + std::string name; + lsvd_config &cfg; + usize vol_size; + + std::shared_ptr objstore; + std::shared_ptr rcache; + + // lock ordering: lock m before *map_lock + std::mutex m; // for things in this instance + std::condition_variable cv; - lsvd_config *cfg; + extmap::objmap &objmap; // shared object map + std::shared_mutex &omap_mtx; // locks the object map + extmap::bufmap &bufmap; // shared object map + std::mutex &bufmap_lock; - std::atomic seq; + std::atomic cur_seq; uint64_t ckpt_cache_seq = 0; // from last data object friend class translate_req; translate_req *current = NULL; - /* info on live data objects - all sizes in sectors - * checkpoints are tracked in @checkpoints, and in the superblock - */ - struct obj_info { - int hdr; // sectors - int data; // sectors - int live; // sectors - }; - std::map object_info; + vec &clones; + std::map &object_info; + vec &checkpoints; - std::vector checkpoints; + std::atomic outstanding_writes = 0; - std::atomic outstanding_writes = 0; + // GC can't delete an object if the read logic has a + // request outstanding to it - skip, and dead object reaping + // will get it on the next pass. 
+ std::map obj_read_refcount; - std::condition_variable cv; - bool stopped = false; // stop GC from writing - - /* various constant state - */ - struct clone { - char prefix[128]; - int last_seq; - int first_seq = 0; - }; - std::vector clone_list; - char super_name[128]; - - /* superblock has two sections: [obj_hdr] [super_hdr] - */ - char *super_buf = NULL; - obj_hdr *super_h = NULL; - super_hdr *super_sh = NULL; - size_t super_len; - - /* GC can't delete an object if the read logic has a - * request outstanding to it - skip, and dead object reaping - * will get it on the next pass. - */ - std::map reading_objects; + // Used for updating the superblock when writing out new checkpoints + // Reserve it up-front to avoid repeated allocations each time we serialise + vec superblock_buf; thread_pool *workers; - thread_pool *misc_threads; // so we can stop ckpt, gc first - /* for triggering GC - */ + opt flush_worker; + opt gc_worker; + + // for triggering GC sector_t total_sectors = 0; sector_t total_live_sectors = 0; int gc_cycles = 0; @@ -223,19 +203,9 @@ class translate_impl : public translate int gc_sectors_written = 0; int gc_deleted = 0; - /* for shutdown - */ - bool gc_running = false; - std::condition_variable gc_cv; - void stop_gc(void); - - object_reader *parser; - - std::shared_ptr rcache; - - void write_checkpoint(int seq, translate_req *req); - void process_batch(int seq, translate_req *req); - void write_gc(int _seq, translate_req *req); + void write_checkpoint(seqnum_t seq, translate_req *req); + void process_batch(seqnum_t seq, translate_req *req); + void write_gc(seqnum_t _seq, translate_req *req); void worker_thread(thread_pool *p); @@ -243,264 +213,114 @@ class translate_impl : public translate sector_t data_sectors, data_map *extents, int n_extents, bool is_gc); - void do_gc(bool *running); - void gc_thread(thread_pool *p); - void flush_thread(thread_pool *p); - - std::shared_ptr objstore; + void flush_thread(std::stop_token st); + void 
gc_thread(std::stop_token st); + void do_gc(std::stop_token &st); public: - translate_impl(std::shared_ptr _io, lsvd_config *cfg_, - extmap::objmap *map, extmap::bufmap *bufmap, - std::shared_mutex *m, std::mutex *buf_m, - sptr rcache); - ~translate_impl(); - - ssize_t init(const char *name, bool timedflush); - void shutdown(void); - - void flush(void); /* write out current batch */ - void checkpoint(void); /* flush, then write checkpoint */ - - ssize_t writev(uint64_t cache_seq, size_t offset, iovec *iov, int iovcnt); - ssize_t trim(size_t offset, size_t len); - void wait_for_room(void); - - void object_read_start(int obj); // mark object as busy - can't delete - void object_read_end(int obj); - - void start_gc(void); - - const char *prefix(int seq); -}; - -const char *translate_impl::prefix(int seq) -{ - if (clone_list.size() == 0 || seq > clone_list.front().last_seq) - return super_name; - for (auto const &c : clone_list) - if (seq >= c.first_seq) - return c.prefix; - assert(false); -} - -translate_impl::translate_impl(std::shared_ptr _io, lsvd_config *cfg_, - extmap::objmap *map_, extmap::bufmap *bufmap_, - std::shared_mutex *m_, std::mutex *buf_m, - sptr rcache) - : rcache(rcache) -{ - misc_threads = new thread_pool(&m); - workers = new thread_pool(&m); - objstore = _io; - parser = new object_reader(objstore); - map = map_; - bufmap = bufmap_; - map_lock = m_; - bufmap_lock = buf_m; - cfg = cfg_; -} - -uptr make_translate(std::shared_ptr _io, lsvd_config *cfg, - extmap::objmap *map, extmap::bufmap *bufmap, - std::shared_mutex *m, std::mutex *buf_m, - sptr rcache) -{ - return std::make_unique(_io, cfg, map, bufmap, m, buf_m, - rcache); -} - -translate_impl::~translate_impl() -{ - stopped = true; - cv.notify_all(); - if (current) - delete current; - delete parser; - if (super_buf) - free(super_buf); -} - -ssize_t translate_impl::init(const char *prefix_, bool timedflush) -{ - std::vector ckpts; - std::vector clones; - std::vector snaps; - - /* note prefix = 
superblock name - */ - strcpy(super_name, prefix_); - - auto [_buf, bytes] = - parser->read_super(super_name, ckpts, clones, snaps, uuid); - - check_cond(bytes < 0, "read_super failed for obj {}", super_name); - check_cond(_buf == NULL, "no superblock"); + translate_impl(str name, lsvd_config &cfg, usize vol_size, uuid_t &vol_uuid, + sptr be, sptr rcache, + extmap::objmap &objmap, std::shared_mutex &omap_mtx, + extmap::bufmap &bmap, std::mutex &bmap_lck, + seqnum_t last_seq, vec &clones, + std::map &objinfo, + vec &checkpoints) + : translate(vol_uuid), name(name), cfg(cfg), vol_size(vol_size), + objstore(be), rcache(rcache), objmap(objmap), omap_mtx(omap_mtx), + bufmap(bmap), bufmap_lock(bmap_lck), cur_seq(last_seq + 1), + clones(clones), object_info(objinfo), checkpoints(checkpoints), + superblock_buf(4096) + { + // Calculate GC data + for (auto const &[_, oi] : objinfo) { + total_sectors += oi.data; + total_live_sectors += oi.live; + } - int n_ckpts = ckpts.size(); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); + assert(current->batch_buf != nullptr); - super_buf = _buf; - super_h = (obj_hdr *)super_buf; - super_len = super_h->hdr_sectors * 512; - super_sh = (super_hdr *)(super_h + 1); + // start worker, flush, and GC threads + if (cfg.flush_interval_msec > 0) + flush_worker = + std::jthread([this](std::stop_token st) { flush_thread(st); }); - memcpy(&uuid, super_h->vol_uuid, sizeof(uuid)); + if (!cfg.no_gc) + gc_worker = + std::jthread([this](std::stop_token st) { gc_thread(st); }); - current = new translate_req(REQ_PUT, cfg->batch_size, this); - seq = 1; // empty volume case + // honestly have no idea how this works + workers = new thread_pool(&m); + workers->pool.push( + std::thread(&translate_impl::worker_thread, this, workers)); - /* is this a clone? 
- */ - if (super_sh->clones_len > 0) { - debug("Image is a clone, parsing cloneinfo headers"); - - char buf[4096]; - auto ci = (clone_info *)(_buf + super_sh->clones_offset); - auto obj_name = (char *)(ci + 1); - while (true) { - if (has_poolname_prefix(obj_name)) { - log_warn("Found poolname prefix in baseimg name: {}; stripping " - "it out. Cross-pool clones are not supported.", - obj_name); - obj_name = strip_poolname_prefix(obj_name); - log_info("Using base name: {}", obj_name); - } + // Fully serialise superblock once, so we can do partial serialisations + // later on and skip the checkpoint stuff every time + // currently unimplemented + // serialise_superblock(superblock_buf, checkpoints, clones, vol_uuid); + } - auto rv = objstore->read(obj_name, 0, buf, sizeof(buf)); - check_cond(rv < 0, "Failed to read {}", obj_name); - - auto _h = (obj_hdr *)buf; - auto _sh = (super_hdr *)(_h + 1); - - check_cond(_h->magic != LSVD_MAGIC || _h->type != LSVD_SUPER, - "Corrupted superblock in {}", obj_name); - check_cond(memcmp(_h->vol_uuid, ci->vol_uuid, sizeof(uuid_t)) != 0, - "UUID mismatch in {}", obj_name); - clone c; - strcpy(c.prefix, obj_name); - c.last_seq = ci->last_seq; - if (clone_list.size() > 0) - clone_list.back().first_seq = ci->last_seq + 1; - clone_list.push_back(c); - debug("Using base image {} upto seq {}", obj_name, c.last_seq); - - if (_sh->clones_len == 0) - break; - ci = (clone_info *)(buf + _sh->clones_offset); - obj_name = (char *)(ci + 1); - } + ~translate_impl() + { + cv.notify_all(); + if (workers) + delete workers; + if (current) + delete current; } - /* read in the last checkpoint, then roll forward from there; - */ - int last_ckpt = -1; - if (ckpts.size() > 0) { - std::vector objects; - std::vector deletes; - std::vector entries; - - /* hmm, we should never have checkpoints listed in the - * super that aren't persisted on the backend, should we? 
- */ - while (n_ckpts > 0) { - int c = ckpts[n_ckpts - 1]; - objname name(prefix(c), c); - do_log("reading ckpt %s\n", name.c_str()); - if (parser->read_checkpoint(name.c_str(), max_cache_seq, ckpts, - objects, deletes, entries) >= 0) { - last_ckpt = c; - break; - } - do_log("chkpt skip %d\n", c); - n_ckpts--; - } - if (last_ckpt == -1) - return -1; + void flush(void) override; /* write out current batch */ + void checkpoint(void) override; /* flush, then write checkpoint */ - for (int i = 0; i < n_ckpts; i++) { - do_log("chkpt from super: %d\n", ckpts[i]); - checkpoints.push_back(ckpts[i]); // so we can delete them later - } + ssize_t writev(uint64_t cache_seq, size_t offset, iovec *iov, + int iovcnt) override; + ssize_t trim(size_t offset, size_t len) override; + void backend_backpressure(void) override; - for (auto o : objects) { - object_info[o.seq] = (obj_info){.hdr = (int)o.hdr_sectors, - .data = (int)o.data_sectors, - .live = (int)o.live_sectors}; - total_sectors += o.data_sectors; - total_live_sectors += o.live_sectors; - } - for (auto m : entries) { - map->update(m.lba, m.lba + m.len, - (extmap::obj_offset){.obj = m.obj, .offset = m.offset}); - } - seq = last_ckpt + 1; - } + // mark object as busy - can't delete + void object_read_start(int obj) override; + void object_read_end(int obj) override; - /* roll forward - */ - for (;; seq++) { - std::vector cleaned; - std::vector entries; - obj_hdr h; - obj_data_hdr dh; - - objname name(prefix(seq), seq); - if (parser->read_data_hdr(name.c_str(), h, dh, cleaned, entries) < 0) - break; - if (h.type == LSVD_CKPT) { - do_log("ckpt from roll-forward: %d\n", seq.load()); - checkpoints.push_back(seq); - continue; + void shutdown() override + { + if (gc_worker) { + gc_worker->request_stop(); + gc_worker->join(); } - do_log("roll %d\n", seq.load()); - assert(h.type == LSVD_DATA); - object_info[seq] = (obj_info){.hdr = (int)h.hdr_sectors, - .data = (int)h.data_sectors, - .live = (int)h.data_sectors}; - total_sectors += 
h.data_sectors; - total_live_sectors += h.data_sectors; - if (dh.cache_seq) // skip GC writes - max_cache_seq = dh.cache_seq; - - int offset = 0, hdr_len = h.hdr_sectors; - std::vector deleted; - for (auto m : entries) { - extmap::obj_offset oo = {seq, offset + hdr_len}; - map->update(m.lba, m.lba + m.len, oo, &deleted); - offset += m.len; - } - for (auto d : deleted) { - auto [base, limit, ptr] = d.vals(); - object_info[ptr.obj].live -= (limit - base); - assert(object_info[ptr.obj].live >= 0); - total_live_sectors -= (limit - base); + checkpoint(); + + if (flush_worker) { + flush_worker->request_stop(); + flush_worker->join(); } } - /* delete any potential "dangling" objects. - */ - for (int i = 1; i < 32; i++) { - objname name(prefix(i + seq), i + seq); - objstore->delete_obj(name.str()); + str prefix(seqnum_t seq) override + { + if (clones.size() == 0 || seq > clones.front().last_seq) + return name; + for (auto const &c : clones) + if (seq >= c.first_seq) + return c.name; + assert(false); // unreachable } +}; - workers->pool.push( - std::thread(&translate_impl::worker_thread, this, workers)); - if (timedflush) - misc_threads->pool.push( - std::thread(&translate_impl::flush_thread, this, misc_threads)); - return bytes; -} - -void translate_impl::start_gc(void) +uptr make_translate(str name, lsvd_config &cfg, usize vol_size, + uuid_t &vol_uuid, sptr be, + sptr rcache, extmap::objmap &objmap, + std::shared_mutex &omap_mtx, + extmap::bufmap &bmap, std::mutex &bmap_lck, + seqnum_t last_seq, vec &clones, + std::map &objinfo, + vec &checkpoints) { - misc_threads->pool.push( - std::thread(&translate_impl::gc_thread, this, misc_threads)); + return std::unique_ptr(new translate_impl( + name, cfg, vol_size, vol_uuid, be, rcache, objmap, omap_mtx, bmap, + bmap_lck, last_seq, clones, objinfo, checkpoints)); } -void translate_impl::shutdown(void) {} - /* ----------- parsing and serializing various objects -------------*/ /* read object header @@ -514,22 +334,22 @@ void 
translate_impl::make_obj_hdr(char *buf, uint32_t _seq, sector_t hdr_sectors, sector_t data_sectors, data_map *extents, int n_extents, bool is_gc) { - auto h = (obj_hdr *)buf; + auto h = (common_obj_hdr *)buf; auto dh = (obj_data_hdr *)(h + 1); uint32_t map_offset = sizeof(*h) + sizeof(*dh), map_len = n_extents * sizeof(data_map); uint32_t hdr_bytes = map_offset + map_len; assert(hdr_bytes <= hdr_sectors * 512); - *h = (obj_hdr){.magic = LSVD_MAGIC, - .version = 1, - .vol_uuid = {0}, - .type = LSVD_DATA, - .seq = _seq, - .hdr_sectors = (uint32_t)hdr_sectors, - .data_sectors = (uint32_t)data_sectors, - .crc = 0}; - memcpy(h->vol_uuid, &uuid, sizeof(uuid_t)); + *h = (common_obj_hdr){.magic = LSVD_MAGIC, + .version = 1, + .vol_uuid = {0}, + .type = OBJ_LOGDATA, + .seq = _seq, + .hdr_sectors = (uint32_t)hdr_sectors, + .data_sectors = (uint32_t)data_sectors, + .crc = 0}; + uuid_copy(h->vol_uuid, uuid); *dh = (obj_data_hdr){.cache_seq = 0, .objs_cleaned_offset = 0, @@ -580,18 +400,18 @@ ssize_t translate_impl::writev(uint64_t cache_seq, size_t offset, iovec *iov, std::unique_lock lk(m); if (!current->room(bytes)) { workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } // write the data into the in-memory log auto ptr = current->append(base, &siov); // update the bufmap (lba -> in-memory buffer) with the extent - std::unique_lock obj_w_lock(*bufmap_lock); + std::unique_lock obj_w_lock(bufmap_lock); assert(ptr >= current->local_buf_base && ptr + (limit - base) * 512 <= current->local_buf_limit); assert(ptr != NULL); - bufmap->update(base, limit, ptr); + bufmap.update(base, limit, ptr); return 0; } @@ -612,11 +432,11 @@ ssize_t translate_impl::writev(uint64_t cache_seq, size_t offset, iovec *iov, ssize_t translate_impl::trim(size_t offset, size_t len) { std::unique_lock lk(m); - std::unique_lock obj_w_lock(*map_lock); + std::unique_lock obj_w_lock(omap_mtx); // trim 
the map - std::vector deleted; - map->trim(offset / 512, (offset + len) / 512, &deleted); + vec deleted; + objmap.trim(offset / 512, (offset + len) / 512, &deleted); // and then update the GC accounting for (auto d : deleted) { @@ -636,10 +456,10 @@ ssize_t translate_impl::trim(size_t offset, size_t len) * TODO measure how long this takes us, likely to be bottleneck on high * write throughput scenarios */ -void translate_impl::wait_for_room(void) +void translate_impl::backend_backpressure(void) { std::unique_lock lk(m); - while (outstanding_writes > cfg->xlate_window) + while (outstanding_writes > cfg.num_parallel_writes) cv.wait(lk); } @@ -649,20 +469,20 @@ void translate_impl::wait_for_room(void) void translate_impl::object_read_start(int obj) { std::unique_lock lk(m); - if (reading_objects.find(obj) == reading_objects.end()) - reading_objects[obj] = 0; + if (obj_read_refcount.find(obj) == obj_read_refcount.end()) + obj_read_refcount[obj] = 0; else - reading_objects[obj] = reading_objects[obj] + 1; + obj_read_refcount[obj] = obj_read_refcount[obj] + 1; } void translate_impl::object_read_end(int obj) { std::unique_lock lk(m); - auto i = reading_objects[obj]; + auto i = obj_read_refcount[obj]; if (i == 1) - reading_objects.erase(obj); + obj_read_refcount.erase(obj); else - reading_objects[obj] = i - 1; + obj_read_refcount[obj] = i - 1; } /* NOTE - currently not called for REQ_CKPT, which @@ -678,7 +498,7 @@ void translate_req::notify(request *child) * -> lock tx->m before tx->map_lock */ std::unique_lock lk(tx->m); - // if (--tx->outstanding_writes < tx->cfg->xlate_window) + // if (--tx->outstanding_writes < tx->cfg.xlate_window) tx->outstanding_writes--; tx->cv.notify_all(); } @@ -687,24 +507,22 @@ void translate_req::notify(request *child) /* remove extents from tx->bufmap, but only if they still * point to this buffer */ - std::unique_lock obj_w_lock(*tx->bufmap_lock); - std::vector> extents; + std::unique_lock obj_w_lock(tx->bufmap_lock); + vec> extents; for 
(auto const &e : entries) { auto limit = e.lba + e.len; - for (auto it2 = tx->bufmap->lookup(e.lba); - it2 != tx->bufmap->end() && it2->base() < limit; it2++) { + for (auto it2 = tx->bufmap.lookup(e.lba); + it2 != tx->bufmap.end() && it2->base() < limit; it2++) { auto [_base, _limit, ptr] = it2->vals(e.lba, limit); if (ptr.buf >= local_buf_base && ptr.buf < local_buf_limit) extents.push_back(std::pair(_base, _limit)); } } for (auto [base, limit] : extents) { - tx->bufmap->trim(base, limit); + tx->bufmap.trim(base, limit); } } - if (batch_buf != NULL) // allocated in constructor - free(batch_buf); if (gc_buf != NULL) // allocated in write_gc free(gc_buf); if (gc_data != NULL) // allocated in gc threqad @@ -725,17 +543,19 @@ void translate_req::notify(request *child) * - wait for preceding writes to complete before writing? * - write async rather than sync? (not really compatible with prev) */ -void translate_impl::write_checkpoint(int _seq, translate_req *req) +void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) { - std::vector entries; - std::vector objects; + debug("Writing checkpoint {}", cp_seq); + + vec entries; + vec objects; - for (auto it = map->begin(); it != map->end(); it++) { + for (auto it = objmap.begin(); it != objmap.end(); it++) { auto [base, limit, ptr] = it->vals(); entries.push_back((ckpt_mapentry){.lba = base, .len = limit - base, - .obj = (int32_t)ptr.obj, - .offset = (int32_t)ptr.offset}); + .obj = (s32)ptr.obj, + .offset = (s32)ptr.offset}); } size_t map_bytes = entries.size() * sizeof(ckpt_mapentry); @@ -749,22 +569,15 @@ void translate_impl::write_checkpoint(int _seq, translate_req *req) } size_t objs_bytes = objects.size() * sizeof(ckpt_obj); - size_t hdr_bytes = sizeof(obj_hdr) + sizeof(obj_ckpt_hdr); - int sectors = div_round_up(hdr_bytes + map_bytes + objs_bytes, 512); - - auto buf = (char *)calloc(sectors * 512, 1); - auto h = (obj_hdr *)buf; - *h = (obj_hdr){.magic = LSVD_MAGIC, - .version = 1, - .vol_uuid = 
{0}, - .type = LSVD_CKPT, - .seq = (uint32_t)_seq, - .hdr_sectors = (uint32_t)sectors, - .data_sectors = 0}; - memcpy(h->vol_uuid, uuid, sizeof(uuid_t)); - auto ch = (obj_ckpt_hdr *)(h + 1); - - uint32_t o1 = sizeof(obj_hdr) + sizeof(obj_ckpt_hdr), o2 = o1 + objs_bytes; + size_t hdr_bytes = sizeof(common_obj_hdr) + sizeof(obj_ckpt_hdr); + sector_t sectors = div_round_up(hdr_bytes + map_bytes + objs_bytes, 512); + + vec cp_buf(sectors * 512); + serialise_common_hdr(cp_buf, OBJ_CHECKPOINT, cp_seq, sectors, 0, uuid); + auto ch = (obj_ckpt_hdr *)(cp_buf.data() + sizeof(common_obj_hdr)); + + uint32_t o1 = sizeof(common_obj_hdr) + sizeof(obj_ckpt_hdr), + o2 = o1 + objs_bytes; *ch = (obj_ckpt_hdr){.cache_seq = ckpt_cache_seq, .ckpts_offset = 0, .ckpts_len = 0, @@ -776,36 +589,31 @@ void translate_impl::write_checkpoint(int _seq, translate_req *req) .map_len = (uint32_t)map_bytes}; auto objs = (char *)(ch + 1); - memcpy(objs, (char *)objects.data(), objs_bytes); auto maps = objs + objs_bytes; - memcpy(maps, (char *)entries.data(), map_bytes); - - /* and write it - */ - objname name(prefix(_seq), _seq); - objstore->write(name.str(), buf, sectors * 512); - free(buf); - - checkpoints.push_back(_seq); - size_t offset = sizeof(*super_h) + sizeof(*super_sh); - std::vector ckpts_to_delete; + if (objs_bytes > 0) + memcpy(objs, (char *)objects.data(), objs_bytes); + if (map_bytes > 0) + memcpy(maps, (char *)entries.data(), map_bytes); + + // Write out the checkpoint + objstore->write(oname(name, cp_seq), cp_buf.data(), cp_buf.size()); + + // Update superblock with new checkpoint, and keep only the last 3 + // around both in the backend and the superblock + checkpoints.push_back(cp_seq); + vec ckpts_to_delete; while (checkpoints.size() > 3) { ckpts_to_delete.push_back(checkpoints.front()); checkpoints.erase(checkpoints.begin()); } - super_sh->ckpts_offset = offset; - super_sh->ckpts_len = checkpoints.size() * sizeof(uint32_t); - auto pc = (uint32_t *)(super_buf + offset); - for 
(size_t i = 0; i < checkpoints.size(); i++) - *pc++ = checkpoints[i]; + serialise_superblock(superblock_buf, checkpoints, clones, uuid, vol_size); + // debug("Updating superblock with new checkpoint"); + objstore->write(name, superblock_buf.data(), superblock_buf.size()); - objstore->write(super_name, super_buf, 4096); - - for (auto c : ckpts_to_delete) { - objname name(prefix(c), c); - objstore->delete_obj(name.str()); - } + // debug("Deleting old checkpoints {}", ckpts_to_delete); + for (auto c : ckpts_to_delete) + objstore->delete_obj(oname(name, c)); req->done = true; req->cv.notify_all(); @@ -826,7 +634,7 @@ void translate_impl::write_checkpoint(int _seq, translate_req *req) * this guarantees that the contents reflect the map state after all * previous seq#s and before all following ones. */ -void translate_impl::write_gc(int _seq, translate_req *req) +void translate_impl::write_gc(seqnum_t _seq, translate_req *req) { req->_seq = _seq; @@ -834,8 +642,8 @@ void translate_impl::write_gc(int _seq, translate_req *req) for (const auto &e : req->entries) data_sectors += e.len; - int max_hdr_bytes = sizeof(obj_hdr) + sizeof(obj_data_hdr) + - (cfg->batch_size / 2048) * sizeof(data_map); + int max_hdr_bytes = sizeof(common_obj_hdr) + sizeof(obj_data_hdr) + + (cfg.backend_obj_size / 2048) * sizeof(data_map); int max_hdr_sectors = div_round_up(max_hdr_bytes, 512); auto buf = req->gc_buf = @@ -846,13 +654,13 @@ void translate_impl::write_gc(int _seq, translate_req *req) auto in_ptr = req->gc_data; // int _data_sectors = 0; // actual sectors in GC write - std::vector obj_extents; + vec obj_extents; req->local_buf_base = data_ptr; for (auto const &[base, len, obj, offset] : req->entries) { auto limit = base + len; - for (auto it2 = map->lookup(base); - it2 != map->end() && it2->base() < limit; it2++) { + for (auto it2 = objmap.lookup(base); + it2 != objmap.end() && it2->base() < limit; it2++) { /* [_base,_limit] is a piece of the extent * obj_base is where that piece starts 
in the object */ @@ -875,20 +683,20 @@ void translate_impl::write_gc(int _seq, translate_req *req) req->local_buf_limit = data_ptr; data_sectors = (data_ptr - data_ptr0) / 512; - int hdr_bytes = sizeof(obj_hdr) + sizeof(obj_data_hdr) + + int hdr_bytes = sizeof(common_obj_hdr) + sizeof(obj_data_hdr) + obj_extents.size() * sizeof(data_map); int hdr_pages = div_round_up(hdr_bytes, 4096); int hdr_sectors = hdr_pages * 8; sector_t offset = hdr_sectors; data_ptr = data_ptr0; - std::vector deleted; + vec deleted; req->entries.clear(); // replace with actual extents written - std::unique_lock obj_w_lock(*map_lock); // protect the readers + std::unique_lock obj_w_lock(omap_mtx); // protect the readers for (auto const &e : obj_extents) { extmap::obj_offset oo = {_seq, offset}; - map->update(e.lba, e.lba + e.len, oo, &deleted); + objmap.update(e.lba, e.lba + e.len, oo, &deleted); offset += e.len; req->entries.push_back( (ckpt_mapentry){(int64_t)e.lba, (int64_t)e.len, 0, 0}); @@ -909,10 +717,10 @@ void translate_impl::write_gc(int _seq, translate_req *req) make_obj_hdr(hdr, _seq, hdr_sectors, data_sectors, obj_extents.data(), obj_extents.size(), true); - auto h = (obj_hdr *)hdr; + auto h = (common_obj_hdr *)hdr; assert((int)h->hdr_sectors == hdr_sectors); - obj_info oi = { + data_obj_info oi = { .hdr = hdr_sectors, .data = data_sectors, .live = data_sectors}; object_info[_seq] = oi; @@ -923,11 +731,11 @@ void translate_impl::write_gc(int _seq, translate_req *req) req2->run(req); } -void translate_impl::process_batch(int _seq, translate_req *req) +void translate_impl::process_batch(seqnum_t _seq, translate_req *req) { req->_seq = _seq; - int offset = sizeof(obj_hdr) + sizeof(obj_data_hdr), + int offset = sizeof(common_obj_hdr) + sizeof(obj_data_hdr), len = req->entries.size() * sizeof(data_map); int hdr_bytes = offset + len; int hdr_pages = div_round_up(hdr_bytes, 4096); @@ -937,23 +745,22 @@ void translate_impl::process_batch(int _seq, translate_req *req) /* update the object 
info table */ - std::unique_lock obj_w_lock(*map_lock); + std::unique_lock obj_w_lock(omap_mtx); - obj_info oi = { + object_info[_seq] = (data_obj_info){ .hdr = hdr_sectors, .data = data_sectors, .live = data_sectors}; - object_info[_seq] = oi; /* and the object map (copy entries to right format at same time) */ sector_t sector_offset = hdr_sectors; - std::vector deleted; + vec deleted; deleted.reserve(req->entries.size()); - std::vector dm_entries; + vec dm_entries; dm_entries.reserve(req->entries.size()); for (auto e : req->entries) { extmap::obj_offset oo = {_seq, sector_offset}; - map->update(e.lba, e.lba + e.len, oo, &deleted); + objmap.update(e.lba, e.lba + e.len, oo, &deleted); sector_offset += e.len; dm_entries.push_back((data_map){(uint64_t)e.lba, (uint64_t)e.len}); } @@ -983,6 +790,8 @@ void translate_impl::process_batch(int _seq, translate_req *req) auto obj_size = (hdr_sectors + data_sectors) * 512; auto obj_ptr = hdr_ptr; + trace("Writing data obj seq {}", _seq); + rcache->insert_object(pf, _seq, obj_size, obj_ptr); auto req2 = objstore->aio_write(name.str(), obj_ptr, obj_size); @@ -1013,21 +822,21 @@ void translate_impl::worker_thread(thread_pool *p) * map is updated before any following requests are processed */ else if (req->op == REQ_PUT) { - auto _seq = seq++; + auto _seq = cur_seq++; lk.unlock(); process_batch(_seq, req); } // generate a checkpoint before any following requests processed else if (req->op == REQ_CKPT) { - auto _seq = seq++; + auto _seq = cur_seq++; lk.unlock(); write_checkpoint(_seq, req); } // handle output of GC thread else if (req->op == REQ_GC) { - auto _seq = seq++; + auto _seq = cur_seq++; lk.unlock(); write_gc(_seq, req); } @@ -1046,7 +855,7 @@ void translate_impl::flush(void) if (current->len > 0) { workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } auto flush_req = new translate_req(REQ_FLUSH, this); @@ -1061,7 
+870,7 @@ void translate_impl::checkpoint(void) if (current->len > 0) { workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } auto ckpt_req = new translate_req(REQ_CKPT, this); @@ -1074,31 +883,33 @@ void translate_impl::checkpoint(void) * for @timeout then submit it for writing to the backend. * Unlike flush() we don't bother waiting until it completes. */ -void translate_impl::flush_thread(thread_pool *p) +void translate_impl::flush_thread(std::stop_token st) { pthread_setname_np(pthread_self(), "flush_thread"); - auto wait_time = std::chrono::milliseconds(500); - auto timeout = std::chrono::milliseconds(cfg->flush_msec); + auto interval = std::chrono::milliseconds(cfg.flush_interval_msec); + auto timeout = std::chrono::milliseconds(cfg.flush_timeout_msec); auto t0 = std::chrono::system_clock::now(); - auto seq0 = seq.load(); + auto seq0 = cur_seq.load(); - std::unique_lock lk(*p->m); - while (p->running) { - p->cv.wait_for(lk, wait_time); - if (!p->running) + debug("Flush thread {} starting", pthread_self()); + + while (true) { + std::this_thread::sleep_for(interval); + if (st.stop_requested()) break; - if (p->running && seq0 == seq.load() && current->len > 0) { + if (seq0 == cur_seq.load() && current->len > 0) { if (std::chrono::system_clock::now() - t0 < timeout) continue; workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } else { - seq0 = seq.load(); + seq0 = cur_seq.load(); t0 = std::chrono::system_clock::now(); } } - printf("flush thread (%lx) exiting\n", pthread_self()); + + log_info("Flush thread {} exiting", pthread_self()); } /* -------------- Garbage collection ---------------- */ @@ -1111,14 +922,14 @@ struct _extent { /* [describe GC algorithm here] */ -void translate_impl::do_gc(bool *running) +void 
translate_impl::do_gc(std::stop_token &st) { gc_cycles++; // trace("Start GC cycle {}", gc_cycles); - int max_obj = seq.load(); + int max_obj = cur_seq.load(); - std::shared_lock obj_r_lock(*map_lock); - std::vector dead_objects; + std::shared_lock obj_r_lock(omap_mtx); + vec dead_objects; for (auto const &p : object_info) { auto [hdrlen, datalen, live] = p.second; if (live == 0) { @@ -1136,7 +947,7 @@ void translate_impl::do_gc(bool *running) */ { std::unique_lock lk(m); - if (reading_objects.find(o) != reading_objects.end()) + if (obj_read_refcount.find(o) != obj_read_refcount.end()) continue; } objname name(prefix(o), o); @@ -1153,13 +964,14 @@ void translate_impl::do_gc(bool *running) deletes.pop(); } - std::unique_lock obj_w_lock(*map_lock); + std::unique_lock obj_w_lock(omap_mtx); for (auto const &o : dead_objects) object_info.erase(o); obj_w_lock.unlock(); std::unique_lock lk(m); - int last_ckpt = (checkpoints.size() > 0) ? checkpoints.back() : seq.load(); + auto last_ckpt = + (checkpoints.size() > 0) ? 
checkpoints.back() : cur_seq.load(); lk.unlock(); /* create list of object info in increasing order of @@ -1181,8 +993,8 @@ void translate_impl::do_gc(bool *running) /* gather list of objects needing cleaning, return if none */ - const double threshold = cfg->gc_threshold / 100.0; - std::vector> objs_to_clean; + const double threshold = cfg.gc_threshold / 100.0; + vec> objs_to_clean; for (auto [u, o, n] : utilization) { if (u > threshold) continue; @@ -1201,7 +1013,7 @@ void translate_impl::do_gc(bool *running) for (auto it = objs_to_clean.begin(); it != objs_to_clean.end(); it++) objects.insert(it->first); - int max_obj_sectors = 0; + sector_t max_obj_sectors = 0; for (auto o : objects) { auto _sectors = object_info[o].hdr + object_info[o].data; max_obj_sectors = std::max(_sectors, max_obj_sectors); @@ -1209,11 +1021,11 @@ void translate_impl::do_gc(bool *running) obj_r_lock.lock(); extmap::objmap live_extents; - for (auto it = map->begin(); it != map->end(); it++) { + for (auto it = objmap.begin(); it != objmap.end(); it++) { auto [base, limit, ptr] = it->vals(); if (ptr.obj <= max_obj && objects.find(ptr.obj) != objects.end()) live_extents.update(base, limit, ptr); - if (!*running) // forced exit + if (st.stop_requested()) // forced exit return; } obj_r_lock.unlock(); @@ -1226,10 +1038,11 @@ void translate_impl::do_gc(bool *running) if (live_extents.size() > 0) { /* temporary file, delete on close. 
*/ - char temp[cfg->rcache_dir.size() + 20]; - sprintf(temp, "%s/gc.XXXXXX", cfg->rcache_dir.c_str()); - int fd = mkstemp(temp); - unlink(temp); + auto temp = fmt::format("{}/gc.XXXXXX", cfg.rcache_dir); + auto t1 = strdup(temp.c_str()); + int fd = mkstemp(t1); + free(t1); + unlink(temp.c_str()); /* read all objects in completely */ @@ -1245,14 +1058,14 @@ void translate_impl::do_gc(bool *running) if (write(fd, buf, sectors * 512) < 0) throw("no space"); offset += sectors; - if (!*running) + if (st.stop_requested()) return; } free(buf); auto file_end = offset; - std::vector<_extent> all_extents; + vec<_extent> all_extents; for (auto it = live_extents.begin(); it != live_extents.end(); it++) { auto [base, limit, ptr] = it->vals(); all_extents.push_back((_extent){base, limit, ptr}); @@ -1263,8 +1076,8 @@ void translate_impl::do_gc(bool *running) std::queue requests; while (all_extents.size() > 0) { - sector_t sectors = 0, max = cfg->batch_size / 512; - std::vector<_extent> extents; + sector_t sectors = 0, max = cfg.backend_obj_size / 512; + vec<_extent> extents; auto it = all_extents.begin(); while (it != all_extents.end() && sectors < max) { @@ -1298,20 +1111,17 @@ void translate_impl::do_gc(bool *running) workers->put_locked(req); lk.unlock(); - while ((int)requests.size() > cfg->gc_window && *running) { - if (stopped) - return; + while ((int)requests.size() > cfg.gc_window && + !st.stop_requested()) { auto t = requests.front(); t->wait(); requests.pop(); } - if (!*running) + if (st.stop_requested()) return; } - while (requests.size() > 0 && *running) { - if (stopped) - return; + while (requests.size() > 0 && !st.stop_requested()) { auto t = requests.front(); t->wait(); requests.pop(); @@ -1329,7 +1139,7 @@ void translate_impl::do_gc(bool *running) } obj_w_lock.unlock(); - if (stopped) + if (st.stop_requested()) return; /* write checkpoint *before* deleting any objects. 
@@ -1345,7 +1155,7 @@ void translate_impl::do_gc(bool *running) auto obj = it->first; { std::unique_lock lk(m); - if (reading_objects.find(obj) != reading_objects.end()) + if (obj_read_refcount.find(obj) != obj_read_refcount.end()) continue; } objname name(prefix(obj), obj); @@ -1355,199 +1165,27 @@ void translate_impl::do_gc(bool *running) } } -void translate_impl::stop_gc(void) +void translate_impl::gc_thread(std::stop_token st) { - stopped = true; - delete misc_threads; - std::unique_lock lk(m); - while (gc_running) - gc_cv.wait(lk); -} - -void translate_impl::gc_thread(thread_pool *p) -{ - debug("starting gc thread"); - auto interval = std::chrono::milliseconds(100); + debug("Starting GC"); + auto interval = std::chrono::milliseconds(200); // sector_t trigger = 128 * 1024 * 2; // 128 MB - const char *name = "gc_thread"; - pthread_setname_np(pthread_self(), name); + pthread_setname_np(pthread_self(), "gc_thread"); - while (p->running) { - std::unique_lock lk(m); - p->cv.wait_for(lk, interval); - if (!p->running) + while (!st.stop_requested()) { + std::this_thread::sleep_for(interval); + if (st.stop_requested()) return; /* check to see if we should run a GC cycle */ // if (total_sectors - total_live_sectors < trigger) // continue; - // if ((total_live_sectors / (double)total_sectors) > (cfg->gc_threshold + // if ((total_live_sectors / (double)total_sectors) > (cfg.gc_threshold // / 100.0)) continue; - gc_running = true; - lk.unlock(); - - do_gc(&p->running); - - lk.lock(); - gc_running = false; - gc_cv.notify_all(); + do_gc(st); } -} - -/* ---------------- Debug ---------------- */ -/** - * Given a buffer of len at len 4096, create the image header in the buffer - */ -void set_image_header(char *buf, size_t vol_size) -{ - memset(buf, 0, 4096); - - auto hdr = (obj_hdr *)buf; - hdr->magic = LSVD_MAGIC; - hdr->version = 1; - hdr->type = LSVD_SUPER; - hdr->seq = 0; - hdr->hdr_sectors = 8; - hdr->data_sectors = 0; - uuid_generate_random(hdr->vol_uuid); - - auto 
superblock = (super_hdr *)(hdr + 1); - memset(superblock, 0, sizeof(*superblock)); - superblock->vol_size = vol_size / 512; - // superblock->ckpts_offset = 0; - // superblock->ckpts_len = 0; - // superblock->clones_offset = 0; - // superblock->clones_len = 0; - // superblock->snaps_offset = 0; - // superblock->snaps_len = 0; -} - -int translate_create_image(sptr objstore, const char *name, - uint64_t size) -{ - char buf[4096]; - memset(buf, 0, 4096); - - auto _hdr = (obj_hdr *)buf; - *_hdr = (obj_hdr){LSVD_MAGIC, - 1, // version - {0}, // UUID - LSVD_SUPER, // type - 0, // seq - 8, // hdr_sectors - 0, // data_sectors - 0}; - uuid_generate_random(_hdr->vol_uuid); - - auto _super = (super_hdr *)(_hdr + 1); - uint64_t sectors = size / 512; - *_super = (super_hdr){sectors, // vol_size - 0, 0, // checkpoint offset, len - 0, 0, // clone offset, len - 0, 0}; // snap offset, len - - auto rv = objstore->write(name, buf, 4096); - return rv; -} - -int translate_get_uuid(sptr objstore, const char *name, uuid_t &uu) -{ - char buf[4096]; - int rv = objstore->read(name, 0, buf, sizeof(buf)); - if (rv < 0) - return rv; - auto hdr = (obj_hdr *)buf; - memcpy(uu, hdr->vol_uuid, sizeof(uuid_t)); - return 0; -} - -int translate_remove_image(sptr objstore, const char *name) -{ - - /* read the superblock to get the list of checkpoints - */ - char buf[4096]; - int rv = objstore->read(name, 0, buf, sizeof(buf)); - if (rv < 0) - return rv; - auto hdr = (obj_hdr *)buf; - auto sh = (super_hdr *)(hdr + 1); - - if (hdr->magic != LSVD_MAGIC || hdr->type != LSVD_SUPER) - return -1; - - int seq = 1; - std::vector ckpts; - decode_offset_len(buf, sh->ckpts_offset, sh->ckpts_len, ckpts); - - /* read the most recent checkpoint and get its object map - */ - if (ckpts.size() > 0) { - object_reader r(objstore); - seq = ckpts.back(); - objname obj(name, seq); - auto ckpt_buf = r.read_object_hdr(obj.c_str(), false); - auto c_hdr = (obj_hdr *)ckpt_buf; - auto c_data = (obj_ckpt_hdr *)(c_hdr + 1); - if 
(c_hdr->magic != LSVD_MAGIC || c_hdr->type != LSVD_CKPT) - return -1; - std::vector objects; - decode_offset_len(ckpt_buf, c_data->objs_offset, - c_data->objs_len, objects); - - /* delete all the objects in the objmap - */ - for (auto const &o : objects) { - objname obj(name, o.seq); - auto r = objstore->delete_obj(obj.str()); - if (r < 0) - log_warn("Failed to delete obj {}, r={}", obj.str(), r); - } - - /* delete all the checkpoints - */ - for (auto const &c : ckpts) { - objname obj(name, c); - objstore->delete_obj(obj.str()); - } - free(ckpt_buf); - } - /* delete any objects after the last checkpoint, up to the first run of - * 32 missing sequence numbers - */ - for (int n = 0; n < 16; seq++, n++) { - objname obj(name, seq); - if (objstore->delete_obj(obj.str()) >= 0) - n = 0; - } - - /* and delete the superblock - */ - objstore->delete_obj(name); - return 0; -} - -using ckpt_id = uint32_t; - -inline std::vector deserialise_checkpoint_ids(char *buf) -{ - auto obj = (obj_hdr *)buf; - auto super = (super_hdr *)(buf + 1); - - assert(obj->magic == LSVD_MAGIC); - assert(obj->type == LSVD_SUPER); - - std::vector checkpoint_ids; - decode_offset_len(buf, super->ckpts_offset, super->ckpts_len, - checkpoint_ids); - return checkpoint_ids; -} - -int translate_clone_image(sptr objstore, const char *source, - const char *dest) -{ - throw std::runtime_error("unimplemented"); + log_info("Stopping GC"); } diff --git a/src/translate.h b/src/translate.h index b746624e..dd05987f 100644 --- a/src/translate.h +++ b/src/translate.h @@ -1,51 +1,51 @@ #pragma once +#include #include #include #include "backend.h" #include "config.h" #include "extent.h" +#include "lsvd_types.h" +#include "objects.h" #include "shared_read_cache.h" #include "utils.h" class translate { public: - uuid_t uuid; uint64_t max_cache_seq; + uuid_t &uuid; - translate() {} + translate(uuid_t &uuid) : uuid(uuid) {} virtual ~translate() {} - virtual ssize_t init(const char *name, bool timedflush) = 0; virtual void 
shutdown(void) = 0; - virtual void flush(void) = 0; /* write out current batch */ virtual void checkpoint(void) = 0; /* flush, then write checkpoint */ virtual ssize_t writev(uint64_t cache_seq, size_t offset, iovec *iov, int iovcnt) = 0; virtual ssize_t trim(size_t offset, size_t len) = 0; - virtual void wait_for_room(void) = 0; + virtual void backend_backpressure(void) = 0; virtual void object_read_start(int obj) = 0; virtual void object_read_end(int obj) = 0; - virtual const char *prefix(int seq) = 0; /* for read cache */ - - virtual void stop_gc(void) = 0; /* do this before shutdown */ - virtual void start_gc(void) = 0; + virtual str prefix(seqnum_t seq) = 0; /* for read cache */ }; +uptr make_translate(str name, lsvd_config &cfg, usize vol_size, + uuid_t &vol_uuid, sptr be, + sptr rcache, extmap::objmap &objmap, + std::shared_mutex &omap_mtx, + extmap::bufmap &bmap, std::mutex &bmap_lck, + seqnum_t last_seq, vec &clones, + std::map &objinfo, + vec &checkpoints); + uptr make_translate(std::shared_ptr _io, lsvd_config *cfg, extmap::objmap *map, extmap::bufmap *bufmap, std::shared_mutex *m, std::mutex *buf_m, sptr rcache); - -int translate_create_image(sptr objstore, const char *name, - uint64_t size); -int translate_clone_image(sptr objstore, const char *source, - const char *dest); -int translate_remove_image(sptr objstore, const char *name); -int translate_get_uuid(sptr objstore, const char *name, uuid_t &uu); diff --git a/src/utils.h b/src/utils.h index 8d3eb2ec..ee5009af 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,15 +1,19 @@ #pragma once -#include +#include "folly/FBVector.h" +#include +#include #include #include -#include +#include #include #include #include #include +#include #include #include +#include #include #include #include @@ -26,6 +30,39 @@ template using sptr = std::shared_ptr; template using uptr = std::unique_ptr; +template using opt = std::optional; +template using vec = std::vector; +template using fvec = folly::fbvector; + +#define 
CEXTERN extern "C" + +using u64 = uint64_t; +using u32 = uint32_t; +using u16 = uint16_t; +using u8 = uint8_t; +using s64 = int64_t; +using s32 = int32_t; +using s16 = int16_t; +using s8 = int8_t; +using usize = size_t; +using ssize = ssize_t; +using byte = std::byte; +using str = std::string; +using fspath = std::filesystem::path; + +#define PASSTHRU_NULLOPT(opt) \ + do { \ + if (!opt) { \ + return std::nullopt; \ + } \ + } while (0) + +#define PASSTHRU_NULLPTR(ptr) \ + do { \ + if (!ptr) { \ + return nullptr; \ + } \ + } while (0) #define trace(MSG, ...) \ do { \ @@ -51,13 +88,6 @@ template using uptr = std::unique_ptr; __func__, ##__VA_ARGS__); \ } while (0) -#define log_error(MSG, ...) \ - do { \ - fmt::print(stderr, fg(fmt::terminal_color::red) | fmt::emphasis::bold, \ - "[ERR {}:{} {}] " MSG "\n", __FILE__, __LINE__, __func__, \ - ##__VA_ARGS__); \ - } while (0) - #define log_warn(MSG, ...) \ do { \ if (LOGLV <= 3) \ @@ -67,11 +97,68 @@ template using uptr = std::unique_ptr; __func__, ##__VA_ARGS__); \ } while (0) +#define log_error(MSG, ...) \ + do { \ + fmt::print(stderr, fg(fmt::terminal_color::red) | fmt::emphasis::bold, \ + "[ERR {}:{} {}] " MSG "\n", __FILE__, __LINE__, __func__, \ + ##__VA_ARGS__); \ + } while (0) + #define trap_to_debugger() \ do { \ raise(SIGTRAP); \ } while (0) +#define RET_IF(cond, ret) \ + do { \ + if (cond) { \ + return ret; \ + } \ + } while (0) + +#define PR_RET_IF(cond, ret, MSG, ...) \ + do { \ + if (cond) { \ + log_error(MSG, ##__VA_ARGS__); \ + return ret; \ + } \ + } while (0) + +/** + * If `cond` is true, print an error message to stdout with MSG, then return + * `ret` + */ +#define PR_ERR_RET_IF(cond, ret, en, MSG, ...) 
\ + do { \ + if (cond) { \ + auto fs = fmt::format(MSG "\n", ##__VA_ARGS__); \ + auto s = fmt::format("[ERR {}:{} {} | errno {}/{}] {}", __FILE__, \ + __LINE__, __func__, en, strerror(en), fs); \ + fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, s); \ + return ret; \ + } \ + } while (0) + +#define THROW_MSG_ON(cond, MSG, ...) \ + do { \ + if (cond) { \ + auto m = fmt::format(MSG, ##__VA_ARGS__); \ + auto s = fmt::format("[ERR {}:{} {}] {}\n", __FILE__, __LINE__, \ + __func__, m); \ + fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, s); \ + throw std::runtime_error(m); \ + } \ + } while (0) + +#define THROW_ERRNO_ON(cond, en, MSG, ...) \ + do { \ + if (cond) { \ + auto m = \ + fmt::format("{}/{}: " MSG, en, strerror(en), ##__VA_ARGS__); \ + throw std::system_error(en, std::generic_category(), m); \ + } \ + } while (0) + /** * Check return values of libstdc functions. If it's -1, print the error and * throw an exception @@ -114,6 +201,13 @@ template using uptr = std::unique_ptr; } \ } while (0) +#define TODO() \ + do { \ + fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, \ + "[ERR {}:{} {}] TODO\n", __FILE__, __LINE__, __func__); \ + throw std::runtime_error("TODO stub"); \ + } while (0) + #define UNIMPLEMENTED() \ do { \ fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, \ @@ -126,10 +220,9 @@ template struct overloaded : Ts... 
{ using Ts::operator()...; }; -inline std::vector split_string_on_char(const std::string &s, - char delim) +inline vec split_string_on_char(const std::string &s, char delim) { - std::vector result; + vec result; std::stringstream ss(s); std::string item; @@ -140,7 +233,7 @@ inline std::vector split_string_on_char(const std::string &s, return result; } -inline std::string string_join(const std::vector &strings, +inline std::string string_join(const vec &strings, const std::string &delim) { std::string result; @@ -152,25 +245,6 @@ inline std::string string_join(const std::vector &strings, return result; } -inline std::chrono::time_point tnow() -{ - return std::chrono::high_resolution_clock::now(); -} - -constexpr std::chrono::microseconds -tus(std::chrono::time_point start, - std::chrono::time_point end) -{ - return std::chrono::duration_cast(end - start); -} - -constexpr int64_t tdus(std::chrono::time_point start, - std::chrono::time_point end) -{ - return std::chrono::duration_cast(end - start) - .count(); -} - template std::shared_ptr to_shared(std::unique_ptr ptr) { return std::shared_ptr(std::move(ptr)); @@ -204,61 +278,3 @@ inline size_t getsize64(int fd) size = sb.st_size; return size; } - -/** - * This is a thread safe, bounded, blocking, multi-producer multi-consumer, - * single-ended, FIFO queue. Push operations block until there's space, and pop - * blocks until there are entries in the queue to pop. - * - * It uses an underlying std::queue for the actual queue, and then just adds a - * single global lock to both pop and push. Readers are notified when there are - * entries via condition vars, same for writers. - * - * Based on CPython's queue implementation found at - * https://github.com/python/cpython/blob/main/Lib/queue.py - * - * This queue is neither movable nor copyable. Use smart pointers instead. 
- */ -template class BlockingMPMC -{ - public: - BlockingMPMC(size_t size) : _buffer(), _max_capacity(size) {} - ~BlockingMPMC(); - - // TODO Change to take an rvalue to default move instead of copy - void push(T t) - { - { - std::unique_lock lck(_mutex); - _can_push.wait(lck, - [&]() { return _buffer.size() < _max_capacity; }); - _buffer.push(std::move(t)); - } - _can_pop.notify_one(); - } - - T pop() - { - T x; - { - std::unique_lock lck(_mutex); - _can_pop.wait(lck, [&]() { return !_buffer.empty(); }); - x = std::move(_buffer.front()); - _buffer.pop(); - } - _can_push.notify_one(); - return x; - } - - private: - BlockingMPMC(BlockingMPMC &src) = delete; - BlockingMPMC(BlockingMPMC &&src) = delete; - BlockingMPMC &operator=(BlockingMPMC &src) = delete; - BlockingMPMC &operator=(BlockingMPMC &&src) = delete; - - std::queue _buffer; - size_t _max_capacity; - std::mutex _mutex; - std::condition_variable _can_pop; - std::condition_variable _can_push; -}; diff --git a/src/write_cache.cc b/src/write_cache.cc index fb25e288..a4d22aa0 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -13,16 +14,18 @@ #include "request.h" #include "smartiov.h" #include "translate.h" +#include "utils.h" #include "write_cache.h" +const usize SUPER_BLOCKNO = 1; + /* ------------- Write cache structure ------------- */ class wcache_write_req; class write_cache_impl : public write_cache { size_t dev_max; - uint32_t super_blkno; - lsvd_config *cfg; + int fd = -1; std::atomic sequence = 1; // write sequence # @@ -35,38 +38,30 @@ class write_cache_impl : public write_cache size_t write_batch = 0; std::condition_variable write_cv; - /* initialization stuff - */ + // initialization stuff int roll_log_forward(); - char *_hdrbuf; // for reading at startup - - thread_pool *misc_threads; - void write_checkpoint(void); - /* allocate journal entry, create a header - */ + // allocate journal entry, create a header uint32_t 
allocate(page_t n, page_t &pad, page_t &n_pad, page_t &prev); j_write_super *super; page_t previous_hdr = 0; page_t next_alloc = 0; - /* these are used by wcache_write_req - */ + // these are used by wcache_write_req friend class wcache_write_req; std::mutex m; - translate *be; + translate &be; j_hdr *mk_header(char *buf, uint32_t type, page_t blks, page_t prev); nvme *nvme_w = NULL; public: - /* throttle writes with window of max_write_pages - */ - void get_room(sector_t sectors); + // throttle writes with window of max_write_pages + void reserve_room(sector_t sectors); void release_room(sector_t sectors); void flush(void); - write_cache_impl(uint32_t blkno, int _fd, translate *_be, lsvd_config *cfg); + write_cache_impl(int fd, translate &_be, lsvd_config &cfg); ~write_cache_impl(); request *writev(sector_t lba, smartiov *iov); @@ -148,7 +143,7 @@ wcache_write_req::wcache_write_req(sector_t lba, smartiov *iovs, page_t n_pages, r_pad = wcache->nvme_w->make_write_request(&pad_iov, pad * 4096L); } - std::vector extents; + vec extents; extents.push_back((j_extent){(uint64_t)lba, iovs->bytes() / 512}); /* TODO: don't assign seq# in mk_header @@ -216,7 +211,7 @@ void wcache_write_req::run(request *parent_) * * TODO record how long this takes per request, unlikely to be bottleneck though */ -void write_cache_impl::get_room(sector_t sectors) +void write_cache_impl::reserve_room(sector_t sectors) { int pages = sectors / 8; std::unique_lock lk(m2); @@ -305,11 +300,11 @@ j_hdr *write_cache_impl::mk_header(char *buf, uint32_t type, page_t blks, */ void write_cache_impl::write_checkpoint(void) { - /* shouldn't really need the copy, since it's only called on - * shutdown, except that some unit tests call this and expect things - * to work afterwards - */ - j_write_super *super_copy = (j_write_super *)aligned_alloc(4096, 4096); + // shouldn't really need the copy, since it's only called on + // shutdown, except that some unit tests call this and expect things + // to work 
afterwards + vec buf(4096); + auto super_copy = (j_write_super *)buf.data(); memcpy(super_copy, super, 4096); super_copy->seq = sequence; @@ -325,10 +320,8 @@ void write_cache_impl::write_checkpoint(void) super_copy->clean = true; - if (nvme_w->write((char *)super_copy, 4096, 4096L * super_blkno) < 0) - throw_fs_error("wckpt_s"); - - free(super_copy); + auto res = nvme_w->write(buf.data(), buf.size(), 4096); + THROW_ERRNO_ON(res < 0, errno, "Failed to write wlog header"); } /* needs to set the following variables: @@ -406,7 +399,7 @@ int write_cache_impl::roll_log_forward() * - put mappings into cache map * - write data to backend */ - std::vector entries; + vec entries; decode_offset_len(_hdrbuf, h->extent_offset, h->extent_len, entries); @@ -451,28 +444,23 @@ int write_cache_impl::roll_log_forward() #endif } -write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate *_be, - lsvd_config *cfg_) +write_cache_impl::write_cache_impl(int fd, translate &be, lsvd_config &cfg) + : fd(fd), be(be) { - - super_blkno = blkno; dev_max = getsize64(fd); - be = _be; - cfg = cfg_; - _hdrbuf = (char *)aligned_alloc(4096, 4096); - - const char *name = "wlog_uring"; - nvme_w = make_nvme_uring(fd, name); + nvme_w = make_nvme_uring(fd, "wlog_uring"); char *buf = (char *)aligned_alloc(4096, 4096); - if (nvme_w->read(buf, 4096, 4096L * super_blkno) < 4096) - throw_fs_error("wcache"); + auto res = nvme_w->read(buf, 4096, 4096L * SUPER_BLOCKNO); + THROW_ERRNO_ON(res < 0, -res, "Failed to read wlog header"); + THROW_MSG_ON(res < 4096, "Short read {}/4096 on wlog header", res); + super = (j_write_super *)buf; + THROW_MSG_ON(super->magic != LSVD_MAGIC, "Invalid magic in wlog sub-hdr"); - /* if it's clean we can read in the map and lengths, otherwise - * do crash recovery. Then set the dirty flag - */ + // if it's clean we can read in the map and lengths, otherwise + // do crash recovery. 
Then set the dirty flag if (super->clean) { sequence = super->seq; next_alloc = super->base; @@ -481,33 +469,23 @@ write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate *_be, next_alloc = super->base; super->clean = false; - if (nvme_w->write(buf, 4096, 4096L * super_blkno) < 4096) - throw_fs_error("wcache"); + res = nvme_w->write(buf, 4096, 4096L * SUPER_BLOCKNO); + THROW_ERRNO_ON(res < 0, -res, "Failed to write wlog subhdr"); int n_pages = super->limit - super->base; - max_write_pages = n_pages / 2 + n_pages / 4; - write_batch = cfg->wcache_batch; - - misc_threads = new thread_pool(&m); -} - -uptr make_write_cache(uint32_t blkno, int fd, translate *be, - lsvd_config *cfg) -{ - return std::make_unique(blkno, fd, be, cfg); + max_write_pages = n_pages / 2 + n_pages / 4; // no idea why this is 3/4ths + write_batch = cfg.wcache_batch; } write_cache_impl::~write_cache_impl() { - delete misc_threads; + close(fd); free(super); - free(_hdrbuf); delete nvme_w; } request *write_cache_impl::writev(sector_t lba, smartiov *iovs) { - size_t bytes = iovs->bytes(); page_t pages = div_round_up(bytes, 4096); page_t pad, n_pad, prev = 0; @@ -526,9 +504,104 @@ request *write_cache_impl::writev(sector_t lba, smartiov *iovs) lk.unlock(); // writing to in-memory buffer (translation layer) - be->writev(req->seq, lba * 512, iov, iovcnt); + be.writev(req->seq, lba * 512, iov, iovcnt); return req; } void write_cache_impl::do_write_checkpoint(void) { write_checkpoint(); } + +int init_wcache(int fd, uuid_t &uuid, usize cache_size) +{ + // write log file has 2 header blocks: the first 4k block is the j_hdr, + // the second 4k block is the j_write_super + // not entirely sure why they are separate, but I'm leaving it for now + + page_t total_pages = cache_size / 4096; + page_t content_pages = total_pages - 2; + page_t _map = div_round_up(content_pages, 256); + page_t _len = div_round_up(content_pages, 512); + page_t meta_pages = 2 * (_map + _len); + page_t data_pages = 
content_pages - meta_pages; + + vec buf(4096 * 2); + auto hdr = (j_hdr *)buf.data(); + *hdr = { + .magic = LSVD_MAGIC, + .type = LSVD_J_DATA, + .version = 1, + .len = total_pages, + .seq = 0, + .crc32 = 0, + .extent_offset = 0, + .extent_len = 0, + .prev = 0, + }; + + auto sup = (j_write_super *)(buf.data() + 4096); + *sup = { + .magic = LSVD_MAGIC, + .type = LSVD_J_W_SUPER, + .version = 1, + .clean = 1, + .seq = 1, + .meta_base = 1, + .meta_limit = 1 + meta_pages, + .base = 1 + meta_pages, + .limit = 1 + meta_pages + data_pages, + .next = 1 + meta_pages, + .map_start = 0, + .map_blocks = 0, + .map_entries = 0, + .len_start = 0, + .len_blocks = 0, + .len_entries = 0, + .vol_uuid = {0}, + }; + uuid_copy(sup->vol_uuid, uuid); + + int ret = pwrite(fd, buf.data(), buf.size(), 0); + PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to write wlog header"); + + // just truncate to right length, don't bother writing zeroes + ret = ftruncate(fd, 4096 * total_pages); + PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to truncate wlog file"); + + return 0; +} + +uptr open_wlog(fspath path, translate &xlate, lsvd_config &cfg) +{ + log_info("Opening write log at '{}'", path.string()); + + int fd = 0; + if (!std::filesystem::exists(path)) { + log_info("Creating write log file '{}'", path.string()); + fd = open(path.c_str(), O_RDWR | O_CREAT, 0644); + PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to create cache file"); + + auto err = init_wcache(fd, xlate.uuid, cfg.wlog_size); + PR_ERR_RET_IF(err < 0, nullptr, -err, "Failed to init wlog"); + } else { + fd = open(path.c_str(), O_RDWR); + PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to open wlog file"); + } + + char buf[4096]; + int err = pread(fd, buf, 4096, 0); + PR_ERR_RET_IF(err < 0, nullptr, errno, "Failed to read wlog header"); + + auto super = (j_hdr *)buf; + PR_RET_IF(super->magic != LSVD_MAGIC, nullptr, + "Invalid write cache magic number: {}", super->magic); + PR_RET_IF(super->type != LSVD_J_DATA, nullptr, "Invalid cache type: 
{}", + super->type); + + try { + return std::make_unique(fd, xlate, cfg); + } catch (std::exception &e) { + log_error("Failed to open write cache: {}", e.what()); + close(fd); + return nullptr; + } +} diff --git a/src/write_cache.h b/src/write_cache.h index 04b46710..42955a70 100644 --- a/src/write_cache.h +++ b/src/write_cache.h @@ -1,5 +1,6 @@ #pragma once +#include "config.h" #include "lsvd_types.h" #include "translate.h" #include "utils.h" @@ -9,7 +10,7 @@ class write_cache { public: - virtual void get_room(sector_t sectors) = 0; + virtual void reserve_room(sector_t sectors) = 0; virtual void release_room(sector_t sectors) = 0; virtual void flush(void) = 0; @@ -22,3 +23,5 @@ class write_cache uptr make_write_cache(uint32_t blkno, int fd, translate *be, lsvd_config *cfg); + +uptr open_wlog(fspath path, translate &xlate, lsvd_config &cfg); diff --git a/subprojects/folly.wrap b/subprojects/folly.wrap new file mode 100644 index 00000000..90426749 --- /dev/null +++ b/subprojects/folly.wrap @@ -0,0 +1,9 @@ +[wrap-git] +url = https://github.com/facebook/folly.git +revision = v2024.05.20.00 +clone-recursive = true +method = cmake +depth = 1 + +[provide] +_folly = folly_dep \ No newline at end of file diff --git a/subprojects/liburing.wrap b/subprojects/liburing.wrap new file mode 100644 index 00000000..00de3a76 --- /dev/null +++ b/subprojects/liburing.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = liburing-liburing-2.5 +source_url = https://github.com/axboe/liburing/archive/refs/tags/liburing-2.5.tar.gz +source_filename = liburing-2.5.tar.gz +source_hash = 456f5f882165630f0dc7b75e8fd53bd01a955d5d4720729b4323097e6e9f2a98 +patch_filename = liburing_2.5-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/liburing_2.5-1/get_patch +patch_hash = d72f651e0edd8102535af575d682ce86c3fc2fdabb40b8faa2659d0f7d437f44 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/liburing_2.5-1/liburing-2.5.tar.gz +wrapdb_version = 2.5-1 + +[provide] 
+dependency_names = liburing diff --git a/subprojects/packagefiles/spdk/configure-spdk.sh b/subprojects/packagefiles/spdk/configure-spdk.sh new file mode 100644 index 00000000..560af082 --- /dev/null +++ b/subprojects/packagefiles/spdk/configure-spdk.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +debug() { + echo '===Building SPDK in debug mode...' + ./configure --enable-debug --without-fuse --without-nvme-cuse \ + --without-rbd --without-shared --without-xnvme +} + +release() { + echo '===Building SPDK in release mode...' + ./configure --without-fuse --without-nvme-cuse \ + --without-rbd --without-shared --without-xnvme +} + +if [ $# -lt 1 ]; then + echo "Usage: ./configure-spdk.sh [debug, release]" + exit +fi + +"$@" diff --git a/subprojects/packagefiles/spdk/meson.build b/subprojects/packagefiles/spdk/meson.build new file mode 100644 index 00000000..c798d74c --- /dev/null +++ b/subprojects/packagefiles/spdk/meson.build @@ -0,0 +1,155 @@ +# copied from xnvme's spdk subproject configuration and cleaned up a bit + +project('spdk', 'c', version: '24.01') + +if host_machine.system() != 'linux' + error('Unsupported system type "@0@"'.format(exec_env)) +endif + +fs = import('fs') +cc = meson.get_compiler('c') + +math_dep = cc.find_library('m', has_headers: ['math.h']) +ssl_dep = dependency('openssl', version: '>=1.1.1') +dlfcn_dep = cc.find_library('dl', has_headers: ['dlfcn.h']) +uuid_dep = cc.find_library('uuid', dirs: [], has_headers: ['uuid/uuid.h']) +numa_dep = cc.find_library('numa', has_headers: ['numaif.h']) +archive_dep = cc.find_library('archive', has_headers: ['archive.h']) + +if get_option('build_subprojects') and not fs.exists('build') + message('Configuring ..') + if get_option('buildtype') == 'debug' + run_command(['configure-spdk.sh', 'debug'], capture: true, check: true) + else + run_command(['configure-spdk.sh', 'release'], capture: true, check: true) + endif +endif + +if get_option('build_subprojects') and not fs.exists('build' / 'lib' / 
'libspdk_nvme.a') + message('Building ..') + run_command([find_program('make'), '-j', '20'], capture: true, check: true, env: {}) +endif + +message('Setting up dependency ..') +message('build_subprojects:', get_option('build_subprojects')) + +custom_libnames = [ + 'spdk_event', + 'spdk_env_dpdk_rpc', + 'spdk_event_bdev', + 'spdk_bdev', + 'spdk_notify', + # 'spdk_bdev_malloc', + # 'spdk_bdev_null', + # 'spdk_bdev_nvme', + # 'spdk_bdev_passthru', + # 'spdk_bdev_lvol', + # 'spdk_bdev_raid', + # 'spdk_bdev_error', + # 'spdk_bdev_gpt', + # 'spdk_bdev_split', + # 'spdk_bdev_delay', + # 'spdk_bdev_zone_block', + 'spdk_blobfs_bdev', + 'spdk_blobfs', + 'spdk_blob_bdev', + 'spdk_lvol', + 'spdk_blob', + 'spdk_nvme', + 'spdk_nvmf', + # 'spdk_bdev_aio', + 'spdk_bdev_ftl', + 'spdk_ftl', + 'spdk_bdev_virtio', + 'spdk_virtio', + 'spdk_vfio_user', + 'spdk_event_accel', + 'spdk_accel', + 'spdk_dma', + 'spdk_accel_error', + 'spdk_accel_ioat', + 'spdk_ioat', + 'spdk_event_vmd', + 'spdk_vmd', + 'spdk_event_sock', + 'spdk_sock', + 'spdk_sock_posix', + 'spdk_event_iobuf', + 'spdk_init', + 'spdk_thread', + 'spdk_trace', + 'spdk_rpc', + 'spdk_jsonrpc', + 'spdk_json', + 'spdk_env_dpdk', + 'spdk_util', + 'spdk_log', + 'rte_eal', + 'rte_mempool', + 'rte_ring', + 'rte_mbuf', + 'rte_bus_pci', + 'rte_pci', + 'rte_mempool_ring', + 'rte_telemetry', + 'rte_kvargs', + 'rte_rcu', + 'rte_power', + 'rte_ethdev', + 'rte_vhost', + 'rte_net', + 'rte_dmadev', + 'rte_cryptodev', + 'rte_hash', + 'rte_log', + 'isal', + 'isal_crypto', +] + +spdk_deps = [ + dlfcn_dep, + math_dep, + numa_dep, + uuid_dep, + ssl_dep, + ssl_dep, + archive_dep, +] + +spdk_paths = [] +foreach libname : custom_libnames + csd = meson.current_source_dir() + lib_dep = cc.find_library( + libname, + dirs: [ + csd / 'build' / 'lib', + csd / 'dpdk' / 'build' / 'lib', + csd / 'isa-l' / '.libs', + csd / 'isa-l-crypto' / '.libs', + ], + static: true, + ) + + # Create a bunch of paths + paths = [ + csd / 'build' / 'lib' / 'lib' + libname + '.a', 
+ csd / 'dpdk' / 'build' / 'lib' / 'lib' + libname + '.a', + csd / 'isa-l' / '.libs' / 'lib' + libname + '.a', + csd / 'isa-l-crypto' / '.libs' / 'lib' + libname + '.a', + ] + foreach path : paths + if lib_dep.found() and fs.exists(path) + spdk_paths += path + endif + endforeach +endforeach + +spdk_inc = get_option('build_subprojects') ? include_directories('dpdk' / 'build' / 'include', 'build' / 'include') : include_directories('.') +spdk_link_args = ['-Wl,--whole-archive'] + spdk_paths + ['-Wl,--no-whole-archive'] + +# Construct link_args based on the above +spdk_dep = declare_dependency( + dependencies: spdk_deps, + link_args: spdk_link_args, + include_directories: spdk_inc, +) \ No newline at end of file diff --git a/subprojects/packagefiles/spdk/meson_options.txt b/subprojects/packagefiles/spdk/meson_options.txt new file mode 100644 index 00000000..49dd0b32 --- /dev/null +++ b/subprojects/packagefiles/spdk/meson_options.txt @@ -0,0 +1 @@ +option('build_subprojects', type : 'boolean', value : true, yield : true) \ No newline at end of file diff --git a/subprojects/spdk.wrap b/subprojects/spdk.wrap new file mode 100644 index 00000000..1336d4c0 --- /dev/null +++ b/subprojects/spdk.wrap @@ -0,0 +1,9 @@ +[wrap-git] +url = https://github.com/spdk/spdk.git +revision = v24.01 +patch_directory = spdk +clone-recursive = true +depth = 1 + +[provide] +_spdk = spdk_dep \ No newline at end of file diff --git a/test/meson.build b/test/meson.build index 61ef5e0b..4c849fae 100644 --- a/test/meson.build +++ b/test/meson.build @@ -2,7 +2,7 @@ seq = executable( 'test-seq', 'test-seq.cc', include_directories: lsvd_inc, - link_with: liblsvd, + link_with: lsvd_ar, dependencies: lsvd_deps, ) @@ -10,7 +10,7 @@ extentmap = executable( 'test-extentmap', 'test-extentmap.cc', include_directories: lsvd_inc, - link_with: liblsvd, + link_with: lsvd_ar, dependencies: lsvd_deps, ) diff --git a/test/test-seq.cc b/test/test-seq.cc index d94bfef8..66aae582 100644 --- a/test/test-seq.cc +++ 
b/test/test-seq.cc @@ -1,5 +1,5 @@ -#include #include +#include #include #include @@ -16,8 +16,11 @@ using comp_buf = std::array; #ifdef __cplusplus extern "C" #endif -const char* __asan_default_options() { return "detect_leaks=0"; } - + const char * + __asan_default_options() +{ + return "detect_leaks=0"; +} /** * Usage: @@ -33,7 +36,8 @@ void hexdump(std::string desc, const void *addr, const int len, int perLine = 16) { int i; - unsigned char buff[perLine + 1]; + vec buf(perLine + 1); + auto buff = (unsigned char *)buf.data(); const unsigned char *pc = (const unsigned char *)addr; check_cond(len <= 0, "Invalid length {}", len); @@ -172,8 +176,8 @@ void run_test(rados_ioctx_t ctx) int main(int argc, char *argv[]) { // config options - setenv("LSVD_RCACHE_DIR", "/tmp/lsvd-read", 1); - setenv("LSVD_WCACHE_DIR", "/tmp/lsvd-write", 1); + setenv("LSVD_RCACHE_DIR", "/tmp/lsvd", 1); + setenv("LSVD_WCACHE_DIR", "/tmp/lsvd", 1); setenv("LSVD_CACHE_SIZE", "2147483648", 1); std::string pool_name = "pone"; diff --git a/tools/capture-fio-perf-trace.bash b/tools/capture-fio-perf-trace.bash index ed744cc4..a40b16f7 100755 --- a/tools/capture-fio-perf-trace.bash +++ b/tools/capture-fio-perf-trace.bash @@ -9,7 +9,7 @@ make clean make -j$(nproc) release ./tools/remove_objs.py pone perf-fio -# ./imgtool --rados --create --size=1G pone/perf-fio +# ./imgtool create --size 1G pone perf-fio ./thick-image --size=10G pone/perf-fio cd test/ diff --git a/tools/setup-wlog.bash b/tools/setup-wlog.bash index d675e335..a5e0dc34 100644 --- a/tools/setup-wlog.bash +++ b/tools/setup-wlog.bash @@ -3,7 +3,7 @@ set -xeuo pipefail lsvd_dir=$(git rev-parse --show-toplevel) -cd $lsvd_dir/spdk +cd $lsvd_dir/subprojects/spdk ./build/bin/nvmf_tgt -m '[0,1,2,3]' & diff --git a/tools/utils.bash b/tools/utils.bash index 8f03a7b3..d925844a 100644 --- a/tools/utils.bash +++ b/tools/utils.bash @@ -3,12 +3,12 @@ set -xeuo pipefail if [ "$EUID" -ne 0 ]; then - echo "Please run as root" - exit + echo "Please run as 
root" + exit fi function kill_nvmf { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py spdk_kill_instance SIGTERM >/dev/null || true scripts/rpc.py spdk_kill_instance SIGKILL >/dev/null || true pkill -f nvmf_tgt || true @@ -18,7 +18,7 @@ function kill_nvmf { function configure_nvmf_common { local gateway_ip=$1 - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py bdev_rbd_register_cluster rbd_cluster scripts/rpc.py nvmf_create_transport -t TCP -u 16384 -m 8 -c 8192 scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 -d SPDK_Controller1 @@ -26,33 +26,32 @@ function configure_nvmf_common { } function add_rbd_img { - cd $lsvd_dir/spdk - local pool=$1 - local img=$2 - local bdev="bdev_$img" - scripts/rpc.py bdev_rbd_create $pool $img 4096 -c rbd_cluster -b $bdev - scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev + cd $lsvd_dir/subprojects/spdk + local pool=$1 + local img=$2 + local bdev="bdev_$img" + scripts/rpc.py bdev_rbd_create $pool $img 4096 -c rbd_cluster -b $bdev + scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev } function add_rbd_img_new_cluster { - cd $lsvd_dir/spdk - local pool=$1 - local img=$2 - local bdev="bdev_$img" - scripts/rpc.py bdev_rbd_create $pool $img 4096 -b $bdev - scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev + cd $lsvd_dir/subprojects/spdk + local pool=$1 + local img=$2 + local bdev="bdev_$img" + scripts/rpc.py bdev_rbd_create $pool $img 4096 -b $bdev + scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev } - function launch_lsvd_gw_background { local rcache_root=$1 local wlog_root=$2 local cache_size=${3:-5368709120} # 5GiB - + # allucate hugepages for spdk - echo 4096 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + echo 4096 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk mkdir -p $rcache_root/lsvd-read/ 
$wlog_root/lsvd-write/ export LSVD_RCACHE_DIR=$rcache_root/lsvd-read/ export LSVD_WCACHE_DIR=$wlog_root/lsvd-write/ @@ -69,7 +68,7 @@ function launch_lsvd_gw_background { } function launch_gw_background { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk ./build/bin/nvmf_tgt -m '[0,1,2,3,4,5,6,7]' & sleep 5 @@ -78,13 +77,13 @@ function launch_gw_background { function cleanup_nvmf_rbd { local bdev_name=$1 - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py bdev_rbd_delete $bdev_name scripts/rpc.py bdev_rbd_unregister_cluster rbd_cluster } function cleanup_nvmf { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py spdk_kill_instance SIGTERM } @@ -102,7 +101,7 @@ function create_lsvd_thin { # ./builddir/imgtool --delete --rados $pool/$img || true ./tools/remove_objs.py $pool $img - ./builddir/imgtool --create --rados --size=$size $pool/$img + ./builddir/imgtool create --size $size $pool $img # make sure image exists rados -p $pool stat $img @@ -129,7 +128,7 @@ function run_client_bench { cd $lsvd_dir/experiments ssh $client_ip 'mkdir -p /tmp/filebench; rm -rf /tmp/filebench/*' scp ./filebench-workloads/*.f root@$client_ip:/tmp/filebench/ - ssh $client_ip "bash -s gw_ip=$gw_ip $additional_args" < $benchscript 2>&1 | tee -a $outfile + ssh $client_ip "bash -s gw_ip=$gw_ip $additional_args" <$benchscript 2>&1 | tee -a $outfile perl -lane 'print if s/^RESULT: //' $outfile | tee -a $outfile }