From f20847ce722e8a0a537cf4a5ab7e819dd05f834f Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 13 May 2024 17:19:52 +0000 Subject: [PATCH 01/77] Move spdk submodule --- .gitignore | 2 -- .gitmodules | 2 +- experiments/multigw/gateway-1.bash | 2 +- experiments/multigw/gateway-2.bash | 2 +- experiments/multigw/rbd-gateway-1.bash | 2 +- experiments/multigw/rbd-gateway-2.bash | 2 +- spdk => subprojects/spdk | 0 tools/setup-wlog.bash | 2 +- tools/utils.bash | 47 +++++++++++++------------- 9 files changed, 29 insertions(+), 32 deletions(-) rename spdk => subprojects/spdk (100%) diff --git a/.gitignore b/.gitignore index 7414046b..3a812546 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,6 @@ __pycache__/ venv/ .ipynb_checkpoints/ -subprojects/ - builddir qemu/*.img qemu/*.iso diff --git a/.gitmodules b/.gitmodules index 55f4c711..895dfe48 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,5 +1,5 @@ [submodule "spdk"] - path = spdk + path = subprojects/spdk url = https://github.com/spdk/spdk [submodule "lsvd-atc24"] path = atc2024 diff --git a/experiments/multigw/gateway-1.bash b/experiments/multigw/gateway-1.bash index f973de98..463710bf 100755 --- a/experiments/multigw/gateway-1.bash +++ b/experiments/multigw/gateway-1.bash @@ -42,7 +42,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw1 -a -s SPDKMULT scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw1 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/experiments/multigw/gateway-2.bash b/experiments/multigw/gateway-2.bash index c704e660..30b098f8 100755 --- a/experiments/multigw/gateway-2.bash +++ b/experiments/multigw/gateway-2.bash @@ -42,7 +42,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw2 -a -s SPDKMULT scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw2 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/experiments/multigw/rbd-gateway-1.bash b/experiments/multigw/rbd-gateway-1.bash index 4f147486..ee49bc22 100755 --- a/experiments/multigw/rbd-gateway-1.bash +++ b/experiments/multigw/rbd-gateway-1.bash @@ -44,7 +44,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw1 -a -s SPDKMULT scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw1 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/experiments/multigw/rbd-gateway-2.bash b/experiments/multigw/rbd-gateway-2.bash index 6b243d95..082c648b 100755 --- a/experiments/multigw/rbd-gateway-2.bash +++ b/experiments/multigw/rbd-gateway-2.bash @@ -44,7 +44,7 @@ scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:lsvd-gw2 -a -s SPDKMULT scripts/rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:lsvd-gw2 -t tcp -a $gw_ip -s 9922 function add_rbd_img { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk local pool=$1 local img=$2 local bdev="bdev_$img" diff --git a/spdk b/subprojects/spdk similarity index 100% rename from spdk rename to subprojects/spdk diff --git a/tools/setup-wlog.bash b/tools/setup-wlog.bash index d675e335..a5e0dc34 100644 --- a/tools/setup-wlog.bash +++ b/tools/setup-wlog.bash @@ -3,7 +3,7 @@ set -xeuo pipefail lsvd_dir=$(git rev-parse --show-toplevel) -cd $lsvd_dir/spdk +cd $lsvd_dir/subprojects/spdk ./build/bin/nvmf_tgt -m '[0,1,2,3]' & diff --git a/tools/utils.bash b/tools/utils.bash index 8f03a7b3..0aaee13a 100644 --- a/tools/utils.bash +++ b/tools/utils.bash @@ -3,12 +3,12 @@ set -xeuo pipefail if [ "$EUID" -ne 0 ]; then - echo "Please run as root" - exit + echo "Please run as root" + exit fi function kill_nvmf { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py spdk_kill_instance SIGTERM >/dev/null || true scripts/rpc.py spdk_kill_instance SIGKILL >/dev/null || true pkill -f nvmf_tgt || true @@ -18,7 +18,7 @@ function kill_nvmf { function configure_nvmf_common { local gateway_ip=$1 - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py bdev_rbd_register_cluster rbd_cluster scripts/rpc.py nvmf_create_transport -t TCP -u 16384 -m 8 -c 8192 scripts/rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 -d SPDK_Controller1 @@ -26,33 +26,32 @@ function configure_nvmf_common { } function add_rbd_img { - cd $lsvd_dir/spdk - local pool=$1 - local img=$2 - local bdev="bdev_$img" - scripts/rpc.py bdev_rbd_create $pool $img 4096 -c rbd_cluster -b $bdev - scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev + cd $lsvd_dir/subprojects/spdk + local pool=$1 + local img=$2 + local bdev="bdev_$img" + scripts/rpc.py bdev_rbd_create $pool $img 4096 -c rbd_cluster -b $bdev + scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev } function add_rbd_img_new_cluster { - cd $lsvd_dir/spdk - local pool=$1 - local img=$2 - local bdev="bdev_$img" - scripts/rpc.py bdev_rbd_create $pool $img 4096 -b $bdev - scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev + cd $lsvd_dir/subprojects/spdk + local pool=$1 + local img=$2 + local bdev="bdev_$img" + scripts/rpc.py bdev_rbd_create $pool $img 4096 -b $bdev + scripts/rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 $bdev } - function launch_lsvd_gw_background { local rcache_root=$1 local wlog_root=$2 local cache_size=${3:-5368709120} # 5GiB - + # allucate hugepages for spdk - echo 4096 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages + echo 4096 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk mkdir -p $rcache_root/lsvd-read/ $wlog_root/lsvd-write/ export LSVD_RCACHE_DIR=$rcache_root/lsvd-read/ export LSVD_WCACHE_DIR=$wlog_root/lsvd-write/ @@ -69,7 +68,7 @@ function launch_lsvd_gw_background { } function launch_gw_background { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk ./build/bin/nvmf_tgt -m '[0,1,2,3,4,5,6,7]' & sleep 5 @@ -78,13 +77,13 @@ function launch_gw_background { function cleanup_nvmf_rbd { local bdev_name=$1 - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py bdev_rbd_delete $bdev_name scripts/rpc.py bdev_rbd_unregister_cluster rbd_cluster } function cleanup_nvmf { - cd $lsvd_dir/spdk + cd $lsvd_dir/subprojects/spdk scripts/rpc.py spdk_kill_instance SIGTERM } @@ -129,7 +128,7 @@ function run_client_bench { cd $lsvd_dir/experiments ssh $client_ip 'mkdir -p /tmp/filebench; rm -rf /tmp/filebench/*' scp ./filebench-workloads/*.f root@$client_ip:/tmp/filebench/ - ssh $client_ip "bash -s gw_ip=$gw_ip $additional_args" < $benchscript 2>&1 | tee -a $outfile + ssh $client_ip "bash -s gw_ip=$gw_ip $additional_args" <$benchscript 2>&1 | tee -a $outfile perl -lane 'print if s/^RESULT: //' $outfile | tee -a $outfile } From 2e2e2b2ff1f42e9e7b0c04a668c63eb91363734d Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 13 May 2024 17:21:44 +0000 Subject: [PATCH 02/77] Cleanup meson scripts --- meson.build | 32 ++++++++++++++++++++------------ meson.ini | 8 ++++---- src/bdev_lsvd.cc | 18 ++++++++++++++++++ src/image.h | 6 ++++++ src/meson.build | 38 ++++++++++++++++++++++++++++---------- src/spdk_frontend.cc | 16 ++++++++++++++++ 6 files changed, 92 insertions(+), 26 deletions(-) create mode 100644 src/bdev_lsvd.cc create mode 100644 src/spdk_frontend.cc diff --git a/meson.build b/meson.build index 887a6692..d75d0225 100644 --- a/meson.build +++ b/meson.build @@ -1,9 +1,9 @@ project( 'lsvd-rbd', - 'cpp', + ['c', 'cpp'], version: '0.1', default_options: [ - 'cpp_std=c++20', + 'cpp_std=c++2a', 'warning_level=2', 'b_colorout=always', ], @@ -11,26 +11,34 @@ project( add_project_arguments('-fPIC', language: 'cpp') add_project_arguments('-Wno-unused-parameter', language: 'cpp') +# add_project_arguments(['-stdlib=libc++'], language: 'cpp') +# add_project_link_arguments(['-stdlib=libc++'], language: 'cpp') if get_option('buildtype') == 'debug' add_project_arguments('-fno-inline', language: 'cpp') add_project_arguments('-DLOGLV=1', language: 'cpp') # add sanitizers for debug builds - add_project_arguments( - '-fsanitize=address,undefined,nullability,implicit-conversion', - language: 'cpp', - ) - add_project_link_arguments( - '-fsanitize=address,undefined', - '-Wl,--unresolved-symbols=ignore-in-object-files', - language: 'cpp', - ) + # add_project_arguments( + # '-fsanitize=address,undefined,nullability,implicit-conversion', + # language: 'cpp', + # ) + # add_project_link_arguments( + # '-fsanitize=address,undefined', + # '-Wl,--unresolved-symbols=ignore-in-object-files', + # language: 'cpp', + # ) endif subdir('src') liblsvd = library('lsvd', lsvd_src, dependencies: lsvd_deps, install: false) +executable( + 'lsvd', + spdk_fe, + dependencies: lsvd_deps + spdk_deps + spdk_libs, +) + executable( 'imgtool', 'src/imgtool.cc', @@ -44,4 +52,4 @@ executable( dependencies: lsvd_deps, ) -subdir('test') +subdir('test') \ No newline at end of file diff --git a/meson.ini b/meson.ini index a1a41aec..a1be4df4 100644 --- a/meson.ini +++ b/meson.ini @@ -1,5 +1,5 @@ [binaries] -c = 'clang-17' -c_ld = 'lld-17' -cpp = 'clang++-17' -cpp_ld = 'lld-17' \ No newline at end of file +c = 'clang-18' +c_ld = 'lld-18' +cpp = 'clang++-18' +cpp_ld = 'lld-18' \ No newline at end of file diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc new file mode 100644 index 00000000..1338bd28 --- /dev/null +++ b/src/bdev_lsvd.cc @@ -0,0 +1,18 @@ +#include + +#include "spdk/env.h" +#include "spdk/bdev.h" +#include "spdk/thread.h" +#include "spdk/json.h" +#include "spdk/string.h" +#include "spdk/util.h" +#include "spdk/likely.h" +#include "spdk/bdev_module.h" +#include "spdk/log.h" + +#include "image.h" + +int bdev_lsvd_create() +{ + return 0; +} diff --git a/src/image.h b/src/image.h index 5a87c626..7e841aed 100644 --- a/src/image.h +++ b/src/image.h @@ -2,6 +2,7 @@ #include #include +#include #include "backend.h" #include "config.h" @@ -16,6 +17,11 @@ */ class lsvd_image { + private: + // no copying + lsvd_image(const lsvd_image &) = delete; + lsvd_image operator=(const lsvd_image &) = delete; + public: std::string image_name; diff --git a/src/meson.build b/src/meson.build index 2544aa21..a2c8a604 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,13 +1,4 @@ -lpthread = dependency('threads') -lz = dependency('zlib') -lfmt = dependency('fmt') -lboost = dependency('boost') -luring = dependency('liburing', static: true) -luuid = dependency('uuid') - cxx = meson.get_compiler('cpp') -lrados = cxx.find_library('rados', required: true) -ltcmalloc = cxx.find_library('tcmalloc', required: false) lsvd_src = files( 'config.cc', @@ -25,4 +16,31 @@ lsvd_src = files( ) lsvd_inc = include_directories('.') -lsvd_deps = [lpthread, lz, lfmt, lboost, luring, lrados, luuid, ltcmalloc] +lsvd_deps = [ + dependency('threads'), + dependency('zlib'), + dependency('fmt'), + dependency('boost'), + dependency('liburing'), + dependency('uuid'), + cxx.find_library('rados', required: true), + cxx.find_library('tcmalloc', required: false), +] + +spdk_fe = lsvd_src + files('spdk_frontend.cc') + +spdk_deps = [ + dependency('numa'), + cxx.find_library('archive'), +] + +spdk_libs = [ + dependency('spdk_bdev', static: true), + dependency('spdk_env_dpdk', static: true), + dependency('spdk_json', static: true), + dependency('spdk_nvme', static: true), + dependency('spdk_nvmf', static: true), + dependency('spdk_rpc', static: true), + dependency('spdk_trace', static: true), + dependency('spdk_util', static: true), +] \ No newline at end of file diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc new file mode 100644 index 00000000..0a438e84 --- /dev/null +++ b/src/spdk_frontend.cc @@ -0,0 +1,16 @@ +#include "spdk/env.h" +#include "spdk/event.h" + +int main(int argc, char **argv) +{ + spdk_env_opts opts; + spdk_env_opts_init(&opts); + opts.name = "spdk_frontend"; + opts.core_mask = "0x1"; + opts.shm_id = 0; + spdk_env_init(&opts); + + spdk_env_thread_wait_all(); + + return 0; +} From dd91d69dd15465dd0c7bc20a809fe2b7a313f108 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 13 May 2024 20:44:15 +0000 Subject: [PATCH 03/77] Add skeleton for spdk bdev support --- src/bdev_lsvd.cc | 77 ++++++++++++++++++++++++++++++++++++-------- src/bdev_lsvd.h | 7 ++++ src/spdk_frontend.cc | 20 +++++++----- src/spdk_rpc.cc | 51 +++++++++++++++++++++++++++++ src/utils.h | 2 ++ 5 files changed, 136 insertions(+), 21 deletions(-) create mode 100644 src/bdev_lsvd.h create mode 100644 src/spdk_rpc.cc diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 1338bd28..02c819e6 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -1,18 +1,69 @@ -#include - -#include "spdk/env.h" -#include "spdk/bdev.h" -#include "spdk/thread.h" -#include "spdk/json.h" -#include "spdk/string.h" -#include "spdk/util.h" -#include "spdk/likely.h" +#include "rados/librados.h" #include "spdk/bdev_module.h" -#include "spdk/log.h" -#include "image.h" +#include "bdev_lsvd.h" -int bdev_lsvd_create() +static int bdev_lsvd_init(void); +static void bdev_lsvd_finish(void); + +static spdk_bdev_module lsvd_if = { + .module_init = bdev_lsvd_init, + .module_fini = bdev_lsvd_finish, + .name = "lsvd", +}; +SPDK_BDEV_MODULE_REGISTER(ext_lsvd, &lsvd_if); + +static int bdev_lsvd_init(void) { return 0; } +static void bdev_lsvd_finish(void) {} + +static const spdk_bdev_fn_table lsvd_fn_table = { + .destruct = nullptr, + .submit_request = nullptr, + .io_type_supported = nullptr, + .get_io_channel = nullptr, + .dump_info_json = nullptr, +}; + +static int lsvd_bdev_io_type_supported(void *ctx, spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: // we only use this to ensure ordering + case SPDK_BDEV_IO_TYPE_RESET: // block until all pending io aborts + case SPDK_BDEV_IO_TYPE_UNMAP: // trim + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: // also just trim + return true; + default: + return false; + } +} + +CEXTERN int bdev_lsvd_create(const char *pool_name, const char *img_name) +{ + assert(pool_name != nullptr); + assert(img_name != nullptr); + + rados_t cluster; + int err = rados_create2(&cluster, "ceph", "client.admin", 0); + check_ret_neg(err, "Failed to create cluster handle"); + + err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); + check_ret_neg(err, "Failed to read config file"); + + err = rados_connect(cluster); + check_ret_neg(err, "Failed to connect to cluster"); + + rados_ioctx_t io_ctx; + err = rados_ioctx_create(cluster, pool_name, &io_ctx); + check_ret_neg(err, "Failed to connect to pool {}", pool_name); + + UNIMPLEMENTED(); + return 0; +} + +CEXTERN int bdev_lsvd_delete(const char *img_name) { - return 0; + UNIMPLEMENTED(); + return 0; } diff --git a/src/bdev_lsvd.h b/src/bdev_lsvd.h new file mode 100644 index 00000000..8f7c5380 --- /dev/null +++ b/src/bdev_lsvd.h @@ -0,0 +1,7 @@ +#pragma once + +#include "utils.h" + +CEXTERN int bdev_lsvd_create(const char *pool_name, const char *img_name); + +CEXTERN int bdev_lsvd_delete(const char *img_name); diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index 0a438e84..70ce9e81 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -1,16 +1,20 @@ #include "spdk/env.h" #include "spdk/event.h" +#include "utils.h" + +static void start_lsvd() +{ + log_info("Starting LSVD SPDK program ..."); +} + int main(int argc, char **argv) { - spdk_env_opts opts; - spdk_env_opts_init(&opts); + spdk_app_opts opts = {}; + spdk_app_opts_init(&opts, sizeof(opts)); opts.name = "spdk_frontend"; - opts.core_mask = "0x1"; - opts.shm_id = 0; - spdk_env_init(&opts); - - spdk_env_thread_wait_all(); - return 0; + int rc = spdk_app_start(&opts, NULL, NULL); + spdk_app_fini(); + return rc; } diff --git a/src/spdk_rpc.cc b/src/spdk_rpc.cc new file mode 100644 index 00000000..177b9216 --- /dev/null +++ b/src/spdk_rpc.cc @@ -0,0 +1,51 @@ +#include "spdk/rpc.h" +#include "spdk/util.h" + +#include "utils.h" + +// RPC Command: bdev_lsvd_create + +struct rpc_bdev_lsvd_create { + char *pool_name; + char *name; +}; + +static const spdk_json_object_decoder rpc_create_lsvd_decoder[] = { + {"pool_name", offsetof(rpc_bdev_lsvd_create, pool_name), + spdk_json_decode_string, false}, + {"name", offsetof(rpc_bdev_lsvd_create, name), spdk_json_decode_string, + false}, +}; + +CEXTERN void rpc_bdev_lsvd_create(spdk_jsonrpc_request *request, + const spdk_json_val *params) +{ + struct rpc_bdev_lsvd_create req = {}; + auto ret = + spdk_json_decode_object(params, rpc_create_lsvd_decoder, + SPDK_COUNTOF(rpc_create_lsvd_decoder), &req); +} + +SPDK_RPC_REGISTER("bdev_lsvd_create", rpc_bdev_lsvd_create, SPDK_RPC_RUNTIME); + +// RPC Command: bdev_lsvd_delete + +struct rpc_bdev_lsvd_delete { + char *name; +}; + +static const spdk_json_object_decoder rpc_delete_lsvd_decoder[] = { + {"name", offsetof(rpc_bdev_lsvd_delete, name), spdk_json_decode_string, + false}, +}; + +CEXTERN void rpc_bdev_lsvd_delete(spdk_jsonrpc_request *request, + const spdk_json_val *params) +{ + struct rpc_bdev_lsvd_delete req = {}; + auto ret = + spdk_json_decode_object(params, rpc_delete_lsvd_decoder, + SPDK_COUNTOF(rpc_delete_lsvd_decoder), &req); +} + +SPDK_RPC_REGISTER("bdev_lsvd_delete", rpc_bdev_lsvd_delete, SPDK_RPC_RUNTIME); diff --git a/src/utils.h b/src/utils.h index 8d3eb2ec..eb722924 100644 --- a/src/utils.h +++ b/src/utils.h @@ -27,6 +27,8 @@ template using sptr = std::shared_ptr; template using uptr = std::unique_ptr; +#define CEXTERN extern "C" + #define trace(MSG, ...) \ do { \ if (LOGLV <= 0) \ From 38daba3e866ea814891858c776eb27f1637d8f38 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 13 May 2024 21:11:22 +0000 Subject: [PATCH 04/77] Update GH actions to llvm18 --- .github/workflows/test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7141b65c..58de99e4 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -13,11 +13,11 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Setup Clang 17 + - name: Setup Clang 18 run: | wget https://apt.llvm.org/llvm.sh chmod +x llvm.sh - sudo ./llvm.sh 17 + sudo ./llvm.sh 18 - name: Install dependencies run: | From 964f7f0fa2e334cd34f44122376e4141c5844fad Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 14 May 2024 17:28:28 +0000 Subject: [PATCH 05/77] Convince spdk to link properly --- meson.build | 4 +- src/meson.build | 22 +- subprojects/packagefiles/spdk/meson.build | 226 ++++++++++++++++++ .../packagefiles/spdk/meson_options.txt | 1 + subprojects/spdk.wrap | 8 + 5 files changed, 241 insertions(+), 20 deletions(-) create mode 100644 subprojects/packagefiles/spdk/meson.build create mode 100644 subprojects/packagefiles/spdk/meson_options.txt create mode 100644 subprojects/spdk.wrap diff --git a/meson.build b/meson.build index d75d0225..f5d42882 100644 --- a/meson.build +++ b/meson.build @@ -11,8 +11,6 @@ project( add_project_arguments('-fPIC', language: 'cpp') add_project_arguments('-Wno-unused-parameter', language: 'cpp') -# add_project_arguments(['-stdlib=libc++'], language: 'cpp') -# add_project_link_arguments(['-stdlib=libc++'], language: 'cpp') if get_option('buildtype') == 'debug' add_project_arguments('-fno-inline', language: 'cpp') @@ -36,7 +34,7 @@ liblsvd = library('lsvd', lsvd_src, dependencies: lsvd_deps, install: false) executable( 'lsvd', spdk_fe, - dependencies: lsvd_deps + spdk_deps + spdk_libs, + dependencies: lsvd_deps + [dependency('_spdk')], ) executable( diff --git a/src/meson.build b/src/meson.build index a2c8a604..16100f52 100644 --- a/src/meson.build +++ b/src/meson.build @@ -27,20 +27,8 @@ lsvd_deps = [ cxx.find_library('tcmalloc', required: false), ] -spdk_fe = lsvd_src + files('spdk_frontend.cc') - -spdk_deps = [ - dependency('numa'), - cxx.find_library('archive'), -] - -spdk_libs = [ - dependency('spdk_bdev', static: true), - dependency('spdk_env_dpdk', static: true), - dependency('spdk_json', static: true), - dependency('spdk_nvme', static: true), - dependency('spdk_nvmf', static: true), - dependency('spdk_rpc', static: true), - dependency('spdk_trace', static: true), - dependency('spdk_util', static: true), -] \ No newline at end of file +spdk_fe = lsvd_src + files( + 'spdk_frontend.cc', + 'bdev_lsvd.cc', + 'spdk_rpc.cc', +) diff --git a/subprojects/packagefiles/spdk/meson.build b/subprojects/packagefiles/spdk/meson.build new file mode 100644 index 00000000..b5265e31 --- /dev/null +++ b/subprojects/packagefiles/spdk/meson.build @@ -0,0 +1,226 @@ +# copied from xnvme's spdk subproject configuration and cleaned up a bit + +project( + 'spdk', + 'c', + version: '24.01', +) +fs = import('fs') + +system_support = ['linux'] +system = host_machine.system() +message('host_machine.system:', system) +if not system_support.contains(system) + error('Unsupported system type "@0@"'.format(exec_env)) +endif +foreach sys : system_support + set_variable('is_' + sys, system == sys) +endforeach + +cc = meson.get_compiler('c') + +math_dep = cc.find_library( + 'm', + has_headers: ['math.h'], +) + +ssl_dep = dependency( + 'openssl', + version: '>=1.1.1', +) + +dlfcn_dep = cc.find_library( + 'dl', + has_headers: ['dlfcn.h'], +) + +uuid_dep = cc.find_library( + 'uuid', + dirs: [], + has_headers: ['uuid/uuid.h'], +) +numa_dep = cc.find_library( + 'numa', + has_headers: ['numaif.h'], + required: is_linux, +) +archive_dep = cc.find_library( + 'archive', + has_headers: ['archive.h'], +) + +if get_option('build_subprojects') and not fs.exists('build') + message('Configuring ..') + if get_option('buildtype') == 'debug' + run_command('spdk_configure_debug.sh', capture: true, check: true) + else + run_command('spdk_configure.sh', capture: true, check: true) + endif +endif +if get_option('build_subprojects') and not fs.exists('build' / 'lib' / 'libspdk_nvme.a') + cpu_count = run_command( + [ + import('python').find_installation('python3'), + '-c', 'from multiprocessing import cpu_count; print(cpu_count())', + ], + check: true, + ).stdout().strip() + + message('Building ..') + run_command( + [find_program('make'), '-j', cpu_count], + capture: true, + check: true, + env: {}, + ) +endif + +message('Setting up dependency ..') +message('build_subprojects:', get_option('build_subprojects')) + +spdk_libnames = get_option('build_subprojects') ? [ + 'spdk_nvme', + 'spdk_env_dpdk', + 'spdk_sock', + 'spdk_sock_posix', + 'spdk_rpc', + 'spdk_trace', + 'spdk_jsonrpc', + 'spdk_json', + 'spdk_util', + 'spdk_log', + 'spdk_vfio_user', +] : [] + +custom_libnames = [ + 'spdk_event', + 'spdk_env_dpdk_rpc', + 'spdk_event_bdev', + 'spdk_bdev', + 'spdk_notify', + # 'spdk_bdev_malloc', + # 'spdk_bdev_null', + # 'spdk_bdev_nvme', + # 'spdk_bdev_passthru', + # 'spdk_bdev_lvol', + # 'spdk_bdev_raid', + # 'spdk_bdev_error', + # 'spdk_bdev_gpt', + # 'spdk_bdev_split', + # 'spdk_bdev_delay', + # 'spdk_bdev_zone_block', + 'spdk_blobfs_bdev', + 'spdk_blobfs', + 'spdk_blob_bdev', + 'spdk_lvol', + 'spdk_blob', + 'spdk_nvme', + # 'spdk_bdev_aio', + 'spdk_bdev_ftl', + 'spdk_ftl', + 'spdk_bdev_virtio', + 'spdk_virtio', + 'spdk_vfio_user', + 'spdk_event_accel', + 'spdk_accel', + 'spdk_dma', + 'spdk_accel_ioat', + 'spdk_ioat', + 'spdk_event_vmd', + 'spdk_vmd', + 'spdk_event_sock', + 'spdk_sock', + 'spdk_sock_posix', + 'spdk_event_iobuf', + 'spdk_init', + 'spdk_thread', + 'spdk_trace', + 'spdk_rpc', + 'spdk_jsonrpc', + 'spdk_json', + 'spdk_env_dpdk', + 'spdk_util', + 'spdk_log', + 'rte_eal', + 'rte_mempool', + 'rte_ring', + 'rte_mbuf', + 'rte_bus_pci', + 'rte_pci', + 'rte_mempool_ring', + 'rte_telemetry', + 'rte_kvargs', + 'rte_rcu', + 'rte_power', + 'rte_ethdev', + 'rte_vhost', + 'rte_net', + 'rte_dmadev', + 'rte_cryptodev', + 'rte_hash', + 'isal', + 'isal_crypto', +] + +dpdk_libnames = get_option('build_subprojects') ? [ + 'rte_eal', + 'rte_telemetry', + 'rte_bus_pci', + 'rte_pci', + 'rte_ring', + 'rte_mempool', + 'rte_kvargs', +] : [] + +isal_libnames = get_option('build_subprojects') ? [ + 'isal', +] : [] + +spdk_deps = [ + dlfcn_dep, + math_dep, + numa_dep, + uuid_dep, + ssl_dep, + ssl_dep, + archive_dep, +] + +spdk_paths = [] +# foreach libname : spdk_libnames + dpdk_libnames + isal_libnames +foreach libname : custom_libnames + csd = meson.current_source_dir() + lib_dep = cc.find_library( + libname, + dirs: [ + csd / 'build' / 'lib', + csd / 'dpdk' / 'build' / 'lib', + csd / 'isa-l' / '.libs', + csd / 'isa-l-crypto' / '.libs', + ], + static: true, + ) + + # Create a bunch of paths + paths = [ + csd / 'build' / 'lib' / 'lib' + libname + '.a', + csd / 'dpdk' / 'build' / 'lib' / 'lib' + libname + '.a', + csd / 'isa-l' / '.libs' / 'lib' + libname + '.a', + csd / 'isa-l-crypto' / '.libs' / 'lib' + libname + '.a', + ] + foreach path : paths + if lib_dep.found() and fs.exists(path) + spdk_paths += path + endif + endforeach +endforeach + +spdk_inc = get_option('build_subprojects') ? include_directories('dpdk' / 'build' / 'include', 'build' / 'include') : include_directories('.') +spdk_link_args = ['-Wl,--whole-archive'] + spdk_paths + ['-Wl,--no-whole-archive'] + +# Construct link_args based on the above +spdk_dep = declare_dependency( + dependencies: spdk_deps, + link_args: spdk_link_args, + include_directories: spdk_inc, +) \ No newline at end of file diff --git a/subprojects/packagefiles/spdk/meson_options.txt b/subprojects/packagefiles/spdk/meson_options.txt new file mode 100644 index 00000000..49dd0b32 --- /dev/null +++ b/subprojects/packagefiles/spdk/meson_options.txt @@ -0,0 +1 @@ +option('build_subprojects', type : 'boolean', value : true, yield : true) \ No newline at end of file diff --git a/subprojects/spdk.wrap b/subprojects/spdk.wrap new file mode 100644 index 00000000..8890bc98 --- /dev/null +++ b/subprojects/spdk.wrap @@ -0,0 +1,8 @@ +[wrap-git] +url = https://github.com/spdk/spdk.git +revision = v24.01 +patch_directory = spdk +clone-recursive = true + +[provide] +_spdk = spdk_dep \ No newline at end of file From 4bf2a3f1f10a2e2f68e75ecb55dbb6c4df3d9b00 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 14 May 2024 22:17:16 +0000 Subject: [PATCH 06/77] Remove spdk_rpc interface --- src/bdev_lsvd.cc | 72 ++++++++++++++++++++++++++++++++------------ src/bdev_lsvd.h | 7 +++-- src/image.cc | 31 +++++++++++++++---- src/image.h | 1 + src/liblsvd.cc | 5 ++- src/meson.build | 5 ++- src/objects.h | 1 + src/spdk_frontend.cc | 4 +-- src/spdk_rpc.cc | 51 ------------------------------- 9 files changed, 89 insertions(+), 88 deletions(-) delete mode 100644 src/spdk_rpc.cc diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 02c819e6..a579657c 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -2,6 +2,8 @@ #include "spdk/bdev_module.h" #include "bdev_lsvd.h" +#include "image.h" +#include static int bdev_lsvd_init(void); static void bdev_lsvd_finish(void); @@ -9,22 +11,24 @@ static void bdev_lsvd_finish(void); static spdk_bdev_module lsvd_if = { .module_init = bdev_lsvd_init, .module_fini = bdev_lsvd_finish, - .name = "lsvd", + .name = "LSVD bdev module", }; SPDK_BDEV_MODULE_REGISTER(ext_lsvd, &lsvd_if); static int bdev_lsvd_init(void) { return 0; } static void bdev_lsvd_finish(void) {} +static bool lsvd_bdev_io_type_supported(void *ctx, spdk_bdev_io_type io_type); + static const spdk_bdev_fn_table lsvd_fn_table = { .destruct = nullptr, .submit_request = nullptr, - .io_type_supported = nullptr, + .io_type_supported = &lsvd_bdev_io_type_supported, .get_io_channel = nullptr, .dump_info_json = nullptr, }; -static int lsvd_bdev_io_type_supported(void *ctx, spdk_bdev_io_type io_type) +static bool lsvd_bdev_io_type_supported(void *ctx, spdk_bdev_io_type io_type) { switch (io_type) { case SPDK_BDEV_IO_TYPE_READ: @@ -39,31 +43,59 @@ static int lsvd_bdev_io_type_supported(void *ctx, spdk_bdev_io_type io_type) } } -CEXTERN int bdev_lsvd_create(const char *pool_name, const char *img_name) +class lsvd_iodevice { - assert(pool_name != nullptr); - assert(img_name != nullptr); + public: + spdk_bdev bdev; + uptr img; - rados_t cluster; - int err = rados_create2(&cluster, "ceph", "client.admin", 0); - check_ret_neg(err, "Failed to create cluster handle"); + lsvd_iodevice(uptr img) : img(std::move(img)) + { + bdev.product_name = (char *)"Log-structured Virtual Disk"; + bdev.name = (char *)img->image_name.c_str(); + bdev.blocklen = 4096; + bdev.blockcnt = img->size / bdev.blocklen; + bdev.ctxt = this; + bdev.module = &lsvd_if; + bdev.fn_table = &lsvd_fn_table; + } - err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); - check_ret_neg(err, "Failed to read config file"); + ~lsvd_iodevice() {} +}; - err = rados_connect(cluster); - check_ret_neg(err, "Failed to connect to cluster"); +class lsvd_bdev_io_channel +{ +}; + +int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) +{ + assert(!img_name.empty()); - rados_ioctx_t io_ctx; - err = rados_ioctx_create(cluster, pool_name, &io_ctx); - check_ret_neg(err, "Failed to connect to pool {}", pool_name); + auto img = lsvd_image::open_image(img_name, ioctx); + auto iodev = new lsvd_iodevice(std::move(img)); + + spdk_io_device_register(iodev, nullptr, nullptr, + sizeof(lsvd_bdev_io_channel), img_name.c_str()); + auto err = spdk_bdev_register(&iodev->bdev); + if (err) { + log_error("Failed to register bdev: err {}", (err)); + spdk_io_device_unregister(iodev, nullptr); + return err; + } - UNIMPLEMENTED(); return 0; } -CEXTERN int bdev_lsvd_delete(const char *img_name) +static void bdev_lsvd_delete_cb(void *arg, int rc) { - UNIMPLEMENTED(); - return 0; + auto p = (std::promise *)arg; + p->set_value(rc); +} + +int bdev_lsvd_delete(std::string img_name, std::function *cb) +{ + auto p = std::promise(); + spdk_bdev_unregister_by_name(img_name.c_str(), &lsvd_if, + bdev_lsvd_delete_cb, &p); + return p.get_future().get(); } diff --git a/src/bdev_lsvd.h b/src/bdev_lsvd.h index 8f7c5380..ff961d0f 100644 --- a/src/bdev_lsvd.h +++ b/src/bdev_lsvd.h @@ -1,7 +1,8 @@ #pragma once -#include "utils.h" +#include +#include -CEXTERN int bdev_lsvd_create(const char *pool_name, const char *img_name); +int bdev_lsvd_create(std::string img_name, rados_ioctx_t io_ctx); -CEXTERN int bdev_lsvd_delete(const char *img_name); +int bdev_lsvd_delete(std::string img_name); diff --git a/src/image.cc b/src/image.cc index 36ab0246..be8f7703 100644 --- a/src/image.cc +++ b/src/image.cc @@ -3,22 +3,41 @@ #include #include +#include "backend.h" #include "image.h" #include "journal.h" #include "lsvd_types.h" +#include "shared_read_cache.h" extern int init_wcache(int fd, uuid_t &uuid, int n_pages); const int block_sectors = CACHE_CHUNK_SIZE / 512; lsvd_image::~lsvd_image() { - wcache->flush(); - wcache->do_write_checkpoint(); - if (!cfg.no_gc) + // TODO fix to the utterly cursed try_open function so that the object is + // always in valid state instaed of being partially constructed + if (wcache) { + wcache->flush(); + wcache->do_write_checkpoint(); + } + if (xlate && !cfg.no_gc) xlate->stop_gc(); - xlate->checkpoint(); + if (xlate) + xlate->checkpoint(); + if (write_fd >= 0) + close(write_fd); +} - close(write_fd); +uptr lsvd_image::open_image(std::string name, rados_ioctx_t io) +{ + uptr img; + try { + img->try_open(name, io); + return img; + } catch (std::exception &e) { + log_error("Failed to open image {}: {}", name, e.what()); + return nullptr; + } } int lsvd_image::try_open(std::string name, rados_ioctx_t io) @@ -28,7 +47,7 @@ int lsvd_image::try_open(std::string name, rados_ioctx_t io) if (cfg.read() < 0) throw std::runtime_error("Failed to read config"); - objstore = get_backend(&cfg, io, name.c_str()); + objstore = make_rados_backend(io); shared_cache = get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); diff --git a/src/image.h b/src/image.h index 7e841aed..54c1b077 100644 --- a/src/image.h +++ b/src/image.h @@ -60,6 +60,7 @@ class lsvd_image ~lsvd_image(); int try_open(std::string name, rados_ioctx_t io); + static uptr open_image(std::string name, rados_ioctx_t io); class aio_request; class trivial_request; diff --git a/src/liblsvd.cc b/src/liblsvd.cc index dcd61bd3..5b4d8b7a 100644 --- a/src/liblsvd.cc +++ b/src/liblsvd.cc @@ -23,10 +23,9 @@ extern "C" int rbd_open(rados_ioctx_t io, const char *name, rbd_image_t *image, const char *snap_name) { auto img = lsvd_spdk::open_image(io, name); - if (img == nullptr) { - log_error("Failed to open image {}", name); + if (img == nullptr) return -1; - } + *image = (void *)img; log_info("Opened image: {}, size {}", name, img->get_img().size); return 0; diff --git a/src/meson.build b/src/meson.build index 16100f52..31a06e8a 100644 --- a/src/meson.build +++ b/src/meson.build @@ -28,7 +28,6 @@ lsvd_deps = [ ] spdk_fe = lsvd_src + files( - 'spdk_frontend.cc', 'bdev_lsvd.cc', - 'spdk_rpc.cc', -) + 'spdk_frontend.cc', +) \ No newline at end of file diff --git a/src/objects.h b/src/objects.h index e588b4a3..a5d7a0b0 100644 --- a/src/objects.h +++ b/src/objects.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index 70ce9e81..e6d93f7f 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -3,7 +3,7 @@ #include "utils.h" -static void start_lsvd() +static void start_lsvd(void *arg) { log_info("Starting LSVD SPDK program ..."); } @@ -14,7 +14,7 @@ int main(int argc, char **argv) spdk_app_opts_init(&opts, sizeof(opts)); opts.name = "spdk_frontend"; - int rc = spdk_app_start(&opts, NULL, NULL); + int rc = spdk_app_start(&opts, start_lsvd, NULL); spdk_app_fini(); return rc; } diff --git a/src/spdk_rpc.cc b/src/spdk_rpc.cc deleted file mode 100644 index 177b9216..00000000 --- a/src/spdk_rpc.cc +++ /dev/null @@ -1,51 +0,0 @@ -#include "spdk/rpc.h" -#include "spdk/util.h" - -#include "utils.h" - -// RPC Command: bdev_lsvd_create - -struct rpc_bdev_lsvd_create { - char *pool_name; - char *name; -}; - -static const spdk_json_object_decoder rpc_create_lsvd_decoder[] = { - {"pool_name", offsetof(rpc_bdev_lsvd_create, pool_name), - spdk_json_decode_string, false}, - {"name", offsetof(rpc_bdev_lsvd_create, name), spdk_json_decode_string, - false}, -}; - -CEXTERN void rpc_bdev_lsvd_create(spdk_jsonrpc_request *request, - const spdk_json_val *params) -{ - struct rpc_bdev_lsvd_create req = {}; - auto ret = - spdk_json_decode_object(params, rpc_create_lsvd_decoder, - SPDK_COUNTOF(rpc_create_lsvd_decoder), &req); -} - -SPDK_RPC_REGISTER("bdev_lsvd_create", rpc_bdev_lsvd_create, SPDK_RPC_RUNTIME); - -// RPC Command: bdev_lsvd_delete - -struct rpc_bdev_lsvd_delete { - char *name; -}; - -static const spdk_json_object_decoder rpc_delete_lsvd_decoder[] = { - {"name", offsetof(rpc_bdev_lsvd_delete, name), spdk_json_decode_string, - false}, -}; - -CEXTERN void rpc_bdev_lsvd_delete(spdk_jsonrpc_request *request, - const spdk_json_val *params) -{ - struct rpc_bdev_lsvd_delete req = {}; - auto ret = - spdk_json_decode_object(params, rpc_delete_lsvd_decoder, - SPDK_COUNTOF(rpc_delete_lsvd_decoder), &req); -} - -SPDK_RPC_REGISTER("bdev_lsvd_delete", rpc_bdev_lsvd_delete, SPDK_RPC_RUNTIME); From 3f5c6e4bff409ab70fbcf055386c4d6cee2d62d6 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 14 May 2024 22:20:35 +0000 Subject: [PATCH 07/77] Remove spdk submodule --- .gitignore | 1 + .gitmodules | 3 --- subprojects/spdk | 1 - 3 files changed, 1 insertion(+), 4 deletions(-) delete mode 160000 subprojects/spdk diff --git a/.gitignore b/.gitignore index 3a812546..d4b8999b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ venv/ .ipynb_checkpoints/ builddir +build qemu/*.img qemu/*.iso qemu/bzImage diff --git a/.gitmodules b/.gitmodules index 895dfe48..e03182ad 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "spdk"] - path = subprojects/spdk - url = https://github.com/spdk/spdk [submodule "lsvd-atc24"] path = atc2024 url = git@github.com:CCI-MOC/lsvd-atc24.git diff --git a/subprojects/spdk b/subprojects/spdk deleted file mode 160000 index fb13eebf..00000000 --- a/subprojects/spdk +++ /dev/null @@ -1 +0,0 @@ -Subproject commit fb13eebf53d7f132baa2e39992a6ad79affdcdaa From d71c7ab487f2b2ba7405ec8ac38eed8886a0d4f7 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 14 May 2024 22:35:47 +0000 Subject: [PATCH 08/77] Fix spdk wrap --- .../packagefiles/spdk/configure-spdk.sh | 18 ++++++++++++++++++ subprojects/packagefiles/spdk/meson.build | 6 ++++-- 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 subprojects/packagefiles/spdk/configure-spdk.sh diff --git a/subprojects/packagefiles/spdk/configure-spdk.sh b/subprojects/packagefiles/spdk/configure-spdk.sh new file mode 100644 index 00000000..ea0153b1 --- /dev/null +++ b/subprojects/packagefiles/spdk/configure-spdk.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +debug() { + echo '===Building SPDK in debug mode...' + ./configure --enable-debug --without-shared +} + +release() { + echo '===Building SPDK in release mode...' + ./configure --without-shared +} + +if [ $# -lt 1 ]; then + echo "Usage: ./configure-spdk.sh [debug, release]" + exit +fi + +"$@" diff --git a/subprojects/packagefiles/spdk/meson.build b/subprojects/packagefiles/spdk/meson.build index b5265e31..058d2308 100644 --- a/subprojects/packagefiles/spdk/meson.build +++ b/subprojects/packagefiles/spdk/meson.build @@ -52,9 +52,9 @@ archive_dep = cc.find_library( if get_option('build_subprojects') and not fs.exists('build') message('Configuring ..') if get_option('buildtype') == 'debug' - run_command('spdk_configure_debug.sh', capture: true, check: true) + run_command(['configure-spdk.sh', 'debug'], capture: true, check: true) else - run_command('spdk_configure.sh', capture: true, check: true) + run_command(['configure-spdk.sh', 'release'], capture: true, check: true) endif endif if get_option('build_subprojects') and not fs.exists('build' / 'lib' / 'libspdk_nvme.a') @@ -124,6 +124,7 @@ custom_libnames = [ 'spdk_event_accel', 'spdk_accel', 'spdk_dma', + 'spdk_accel_error', 'spdk_accel_ioat', 'spdk_ioat', 'spdk_event_vmd', @@ -158,6 +159,7 @@ custom_libnames = [ 'rte_dmadev', 'rte_cryptodev', 'rte_hash', + 'rte_log', 'isal', 'isal_crypto', ] From f99fe41cd76bc19d25cd96a3162b6d06a43c07db Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 14 May 2024 22:40:40 +0000 Subject: [PATCH 09/77] Add spdk dependencies --- Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 1e0d3d76..c55b6256 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,8 @@ clean: cd build-dbg; meson compile --clean install-deps: - sudo apt install -y meson libfmt-dev libaio-dev librados-dev mold \ + sudo apt install -y meson libfmt-dev libaio-dev librados-dev \ libtcmalloc-minimal4 libboost-dev libradospp-dev \ - liburing-dev pkg-config uuid-dev + liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ + libibverbs-dev librdmacm-dev + From 8ca5e554d44620bfdc62705856c11fd5e0f196d4 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 14 May 2024 22:43:07 +0000 Subject: [PATCH 10/77] Fix github actions spdk config --- .github/workflows/test.yaml | 2 +- subprojects/packagefiles/spdk/configure-spdk.sh | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 58de99e4..99e1120f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -45,4 +45,4 @@ jobs: - name: Logs if: always() run: | - cat build-dbg/meson-logs/testlog.txt + cat build-rel/meson-logs/testlog.txt diff --git a/subprojects/packagefiles/spdk/configure-spdk.sh b/subprojects/packagefiles/spdk/configure-spdk.sh index ea0153b1..560af082 100644 --- a/subprojects/packagefiles/spdk/configure-spdk.sh +++ b/subprojects/packagefiles/spdk/configure-spdk.sh @@ -2,12 +2,14 @@ debug() { echo '===Building SPDK in debug mode...' - ./configure --enable-debug --without-shared + ./configure --enable-debug --without-fuse --without-nvme-cuse \ + --without-rbd --without-shared --without-xnvme } release() { echo '===Building SPDK in release mode...' - ./configure --without-shared + ./configure --without-fuse --without-nvme-cuse \ + --without-rbd --without-shared --without-xnvme } if [ $# -lt 1 ]; then From 89805819e9f82f6003213c042328eb69d34ae330 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 14 May 2024 22:45:36 +0000 Subject: [PATCH 11/77] Fix meson logs location --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 99e1120f..f24e80d6 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -45,4 +45,4 @@ jobs: - name: Logs if: always() run: | - cat build-rel/meson-logs/testlog.txt + cat build-rel/meson-logs/meson-log.txt From 5c27fc42f826bada5e140337745cffae9bcf98da Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 16 May 2024 00:06:40 +0000 Subject: [PATCH 12/77] Implement more of the bdev api --- src/bdev_lsvd.cc | 126 ++++++++++++++++++++++++++++++++++--------- src/image.cc | 2 +- src/spdk_frontend.cc | 24 ++++++++- 3 files changed, 124 insertions(+), 28 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index a579657c..0fa87499 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -1,9 +1,12 @@ #include "rados/librados.h" #include "spdk/bdev_module.h" +#include #include "bdev_lsvd.h" #include "image.h" -#include +#include "smartiov.h" +#include "spdk/thread.h" +#include "utils.h" static int bdev_lsvd_init(void); static void bdev_lsvd_finish(void); @@ -18,31 +21,22 @@ SPDK_BDEV_MODULE_REGISTER(ext_lsvd, &lsvd_if); static int bdev_lsvd_init(void) { return 0; } static void bdev_lsvd_finish(void) {} -static bool lsvd_bdev_io_type_supported(void *ctx, spdk_bdev_io_type io_type); +/** + * Function table for the LSVD bdev module. + */ + +static int lsvd_destroy_bdev(void *); +static void lsvd_submit_io(spdk_io_channel *c, spdk_bdev_io *io); +static bool lsvd_io_type_supported(void *ctx, spdk_bdev_io_type io_type); +static spdk_io_channel *lsvd_get_io_channel(void *ctx); static const spdk_bdev_fn_table lsvd_fn_table = { - .destruct = nullptr, - .submit_request = nullptr, - .io_type_supported = &lsvd_bdev_io_type_supported, - .get_io_channel = nullptr, - .dump_info_json = nullptr, + .destruct = lsvd_destroy_bdev, + .submit_request = lsvd_submit_io, + .io_type_supported = lsvd_io_type_supported, + .get_io_channel = lsvd_get_io_channel, }; -static bool lsvd_bdev_io_type_supported(void *ctx, spdk_bdev_io_type io_type) -{ - switch (io_type) { - case SPDK_BDEV_IO_TYPE_READ: - case SPDK_BDEV_IO_TYPE_WRITE: - case SPDK_BDEV_IO_TYPE_FLUSH: // we only use this to ensure ordering - case SPDK_BDEV_IO_TYPE_RESET: // block until all pending io aborts - case SPDK_BDEV_IO_TYPE_UNMAP: // trim - case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: // also just trim - return true; - default: - return false; - } -} - class lsvd_iodevice { public: @@ -63,8 +57,76 @@ class lsvd_iodevice ~lsvd_iodevice() {} }; -class lsvd_bdev_io_channel +static int lsvd_destroy_bdev(void *ctx) { + auto iodev = reinterpret_cast(ctx); + delete iodev; + return 0; +} + +static bool lsvd_io_type_supported(void *ctx, spdk_bdev_io_type io_type) +{ + switch (io_type) { + case SPDK_BDEV_IO_TYPE_READ: + case SPDK_BDEV_IO_TYPE_WRITE: + case SPDK_BDEV_IO_TYPE_FLUSH: // we only use this to ensure ordering + case SPDK_BDEV_IO_TYPE_UNMAP: // trim + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: // also just trim + return true; + case SPDK_BDEV_IO_TYPE_RESET: // block until all pending io aborts + default: + return false; + } +} + +static spdk_io_channel *lsvd_get_io_channel(void *ctx) +{ + lsvd_iodevice *iodev = reinterpret_cast(ctx); + // SPDK will pass this to the iodevice's registered create/destroy + // io_channel functions that were passed in when the device was registered. + // We don't need to do anything special here, so just return the iodevice. + return spdk_get_io_channel(iodev); +} + +static void lsvd_submit_io(spdk_io_channel *c, spdk_bdev_io *io) +{ + auto dev = static_cast(io->bdev->ctxt); + auto &img = dev->img; + + // io details + auto offset = io->u.bdev.offset_blocks * io->bdev->blocklen; + auto len = io->u.bdev.num_blocks * io->bdev->blocklen; + smartiov iov(io->u.bdev.iovs, io->u.bdev.iovcnt); + + request *r; + switch (io->type) { + case SPDK_BDEV_IO_TYPE_READ: + r = img->read(offset, iov, nullptr); + break; + case SPDK_BDEV_IO_TYPE_WRITE: + r = img->write(offset, iov, nullptr); + break; + case SPDK_BDEV_IO_TYPE_FLUSH: + r = img->flush(nullptr); + break; + case SPDK_BDEV_IO_TYPE_UNMAP: + r = img->trim(offset, len, nullptr); + break; + case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: + r = img->trim(offset, len, nullptr); + break; + default: + log_error("Unknown request type: {}", io->type); + return; + } + + r->run(nullptr); +} + +// Just copying from bdev_rbd, not sure where this is actually used +struct lsvd_bdev_io_channel { + lsvd_iodevice *lsvd_dev; + spdk_io_channel *io_channel; }; int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) @@ -74,12 +136,24 @@ int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) auto img = lsvd_image::open_image(img_name, ioctx); auto iodev = new lsvd_iodevice(std::move(img)); - spdk_io_device_register(iodev, nullptr, nullptr, - sizeof(lsvd_bdev_io_channel), img_name.c_str()); + spdk_io_device_register( + iodev, + [](void *iodev, void *ctx_buf) { + auto *ch = static_cast(ctx_buf); + ch->lsvd_dev = static_cast(iodev); + ch->io_channel = spdk_get_io_channel(&lsvd_if); + return 0; + }, + [](void *iodev, void *ctx_buf) { + auto *ch = static_cast(ctx_buf); + spdk_put_io_channel(ch->io_channel); + }, + sizeof(lsvd_bdev_io_channel), img_name.c_str()); auto err = spdk_bdev_register(&iodev->bdev); if (err) { log_error("Failed to register bdev: err {}", (err)); - spdk_io_device_unregister(iodev, nullptr); + spdk_io_device_unregister( + iodev, [](void *ctx) { delete (lsvd_iodevice *)ctx; }); return err; } diff --git a/src/image.cc b/src/image.cc index be8f7703..c9be7037 100644 --- a/src/image.cc +++ b/src/image.cc @@ -30,7 +30,7 @@ lsvd_image::~lsvd_image() uptr lsvd_image::open_image(std::string name, rados_ioctx_t io) { - uptr img; + uptr img(new lsvd_image()); try { img->try_open(name, io); return img; diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index e6d93f7f..2642ea7e 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -1,11 +1,33 @@ -#include "spdk/env.h" #include "spdk/event.h" +#include "bdev_lsvd.h" #include "utils.h" static void start_lsvd(void *arg) { log_info("Starting LSVD SPDK program ..."); + + setenv("LSVD_RCACHE_DIR", "/tmp/lsvd-read", 1); + setenv("LSVD_WCACHE_DIR", "/tmp/lsvd-write", 1); + setenv("LSVD_CACHE_SIZE", "2147483648", 1); + + std::string pool_name = "pone"; + + rados_t cluster; + int err = rados_create2(&cluster, "ceph", "client.admin", 0); + check_ret_neg(err, "Failed to create cluster handle"); + + err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); + check_ret_neg(err, "Failed to read config file"); + + err = rados_connect(cluster); + check_ret_neg(err, "Failed to connect to cluster"); + + rados_ioctx_t io_ctx; + err = rados_ioctx_create(cluster, pool_name.c_str(), &io_ctx); + check_ret_neg(err, "Failed to connect to pool {}", pool_name); + + err = bdev_lsvd_create("test-image", io_ctx); } int main(int argc, char **argv) From 418cf52954cd2d40cc58afd0d3ad152ea2b5beff Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 16 May 2024 17:08:18 +0000 Subject: [PATCH 13/77] Add pyelftools as a dep --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c55b6256..49574af1 100644 --- a/Makefile +++ b/Makefile @@ -20,5 +20,5 @@ install-deps: sudo apt install -y meson libfmt-dev libaio-dev librados-dev \ libtcmalloc-minimal4 libboost-dev libradospp-dev \ liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ - libibverbs-dev librdmacm-dev + libibverbs-dev librdmacm-dev python3-pyelftools From b204d2b0a28e3e66bf74aeac0dd8f2761c3ab2a0 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 16 May 2024 17:12:27 +0000 Subject: [PATCH 14/77] Add cunit to deps --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 49574af1..d0ae8abd 100644 --- a/Makefile +++ b/Makefile @@ -20,5 +20,5 @@ install-deps: sudo apt install -y meson libfmt-dev libaio-dev librados-dev \ libtcmalloc-minimal4 libboost-dev libradospp-dev \ liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ - libibverbs-dev librdmacm-dev python3-pyelftools + libibverbs-dev librdmacm-dev python3-pyelftools libcunit1-dev From 1257ce61cc962b85c340340502ea99d07fb9f9fe Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 16 May 2024 17:34:59 +0000 Subject: [PATCH 15/77] Clean up spdk meson --- subprojects/packagefiles/spdk/meson.build | 96 +++-------------------- 1 file changed, 11 insertions(+), 85 deletions(-) diff --git a/subprojects/packagefiles/spdk/meson.build b/subprojects/packagefiles/spdk/meson.build index 058d2308..0ec819e1 100644 --- a/subprojects/packagefiles/spdk/meson.build +++ b/subprojects/packagefiles/spdk/meson.build @@ -1,53 +1,20 @@ # copied from xnvme's spdk subproject configuration and cleaned up a bit -project( - 'spdk', - 'c', - version: '24.01', -) -fs = import('fs') +project('spdk', 'c', version: '24.01') -system_support = ['linux'] -system = host_machine.system() -message('host_machine.system:', system) -if not system_support.contains(system) +if not host_machine.system() == 'linux' error('Unsupported system type "@0@"'.format(exec_env)) endif -foreach sys : system_support - set_variable('is_' + sys, system == sys) -endforeach +fs = import('fs') cc = meson.get_compiler('c') -math_dep = cc.find_library( - 'm', - has_headers: ['math.h'], -) - -ssl_dep = dependency( - 'openssl', - version: '>=1.1.1', -) - -dlfcn_dep = cc.find_library( - 'dl', - has_headers: ['dlfcn.h'], -) - -uuid_dep = cc.find_library( - 'uuid', - dirs: [], - has_headers: ['uuid/uuid.h'], -) -numa_dep = cc.find_library( - 'numa', - has_headers: ['numaif.h'], - required: is_linux, -) -archive_dep = cc.find_library( - 'archive', - has_headers: ['archive.h'], -) +math_dep = cc.find_library('m', has_headers: ['math.h']) +ssl_dep = dependency('openssl', version: '>=1.1.1') +dlfcn_dep = cc.find_library('dl', has_headers: ['dlfcn.h']) +uuid_dep = cc.find_library('uuid', dirs: [], has_headers: ['uuid/uuid.h']) +numa_dep = cc.find_library('numa', has_headers: ['numaif.h']) +archive_dep = cc.find_library('archive', has_headers: ['archive.h']) if get_option('build_subprojects') and not fs.exists('build') message('Configuring ..') @@ -57,41 +24,15 @@ if get_option('build_subprojects') and not fs.exists('build') run_command(['configure-spdk.sh', 'release'], capture: true, check: true) endif endif -if get_option('build_subprojects') and not fs.exists('build' / 'lib' / 'libspdk_nvme.a') - cpu_count = run_command( - [ - import('python').find_installation('python3'), - '-c', 'from multiprocessing import cpu_count; print(cpu_count())', - ], - check: true, - ).stdout().strip() +if get_option('build_subprojects') and not fs.exists('build' / 'lib' / 'libspdk_nvme.a') message('Building ..') - run_command( - [find_program('make'), '-j', cpu_count], - capture: true, - check: true, - env: {}, - ) + run_command([find_program('make'), '-j', '20'], capture: true, check: true, env: {}) endif message('Setting up dependency ..') message('build_subprojects:', get_option('build_subprojects')) -spdk_libnames = get_option('build_subprojects') ? [ - 'spdk_nvme', - 'spdk_env_dpdk', - 'spdk_sock', - 'spdk_sock_posix', - 'spdk_rpc', - 'spdk_trace', - 'spdk_jsonrpc', - 'spdk_json', - 'spdk_util', - 'spdk_log', - 'spdk_vfio_user', -] : [] - custom_libnames = [ 'spdk_event', 'spdk_env_dpdk_rpc', @@ -164,20 +105,6 @@ custom_libnames = [ 'isal_crypto', ] -dpdk_libnames = get_option('build_subprojects') ? [ - 'rte_eal', - 'rte_telemetry', - 'rte_bus_pci', - 'rte_pci', - 'rte_ring', - 'rte_mempool', - 'rte_kvargs', -] : [] - -isal_libnames = get_option('build_subprojects') ? [ - 'isal', -] : [] - spdk_deps = [ dlfcn_dep, math_dep, @@ -189,7 +116,6 @@ spdk_deps = [ ] spdk_paths = [] -# foreach libname : spdk_libnames + dpdk_libnames + isal_libnames foreach libname : custom_libnames csd = meson.current_source_dir() lib_dep = cc.find_library( From cfdd9baad2a2655560bfcd1d3bd5e35cf15431c1 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 16 May 2024 17:43:48 +0000 Subject: [PATCH 16/77] Add sanitizers to debug build --- Makefile | 4 ++-- meson.build | 13 ++----------- subprojects/packagefiles/spdk/meson.build | 2 +- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index d0ae8abd..5689f096 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,8 @@ .PHONY: setup setup-debug release debug paper clean setup: - meson setup --native-file meson.ini build-rel --buildtype=release - meson setup --native-file meson.ini build-dbg --buildtype=debug + meson setup --native-file meson.ini build-rel --buildtype=release -Db_sanitize=none + meson setup --native-file meson.ini build-dbg --buildtype=debug ln -s build-dbg builddir debug: setup diff --git a/meson.build b/meson.build index f5d42882..cff1ae0a 100644 --- a/meson.build +++ b/meson.build @@ -6,6 +6,8 @@ project( 'cpp_std=c++2a', 'warning_level=2', 'b_colorout=always', + 'b_sanitize=address,undefined', + 'b_lto=true', ], ) @@ -15,17 +17,6 @@ add_project_arguments('-Wno-unused-parameter', language: 'cpp') if get_option('buildtype') == 'debug' add_project_arguments('-fno-inline', language: 'cpp') add_project_arguments('-DLOGLV=1', language: 'cpp') - - # add sanitizers for debug builds - # add_project_arguments( - # '-fsanitize=address,undefined,nullability,implicit-conversion', - # language: 'cpp', - # ) - # add_project_link_arguments( - # '-fsanitize=address,undefined', - # '-Wl,--unresolved-symbols=ignore-in-object-files', - # language: 'cpp', - # ) endif subdir('src') diff --git a/subprojects/packagefiles/spdk/meson.build b/subprojects/packagefiles/spdk/meson.build index 0ec819e1..1298dfb2 100644 --- a/subprojects/packagefiles/spdk/meson.build +++ b/subprojects/packagefiles/spdk/meson.build @@ -2,7 +2,7 @@ project('spdk', 'c', version: '24.01') -if not host_machine.system() == 'linux' +if host_machine.system() != 'linux' error('Unsupported system type "@0@"'.format(exec_env)) endif From b648757ad66b02f1e2c112d83ef234b05e66370b Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 04:45:48 +0000 Subject: [PATCH 17/77] Update to c++23 --- meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meson.build b/meson.build index cff1ae0a..1064a1e2 100644 --- a/meson.build +++ b/meson.build @@ -3,7 +3,7 @@ project( ['c', 'cpp'], version: '0.1', default_options: [ - 'cpp_std=c++2a', + 'cpp_std=c++23', 'warning_level=2', 'b_colorout=always', 'b_sanitize=address,undefined', From bd983b54701d59c68d7d29c83de9d63b12530471 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 23:00:32 +0000 Subject: [PATCH 18/77] Update bdev management --- src/bdev_lsvd.cc | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 0fa87499..1cc9c048 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -45,8 +45,8 @@ class lsvd_iodevice lsvd_iodevice(uptr img) : img(std::move(img)) { - bdev.product_name = (char *)"Log-structured Virtual Disk"; - bdev.name = (char *)img->image_name.c_str(); + bdev.product_name = strdup("Log-structured Virtual Disk"); + bdev.name = strdup(img->imgname.c_str()); bdev.blocklen = 4096; bdev.blockcnt = img->size / bdev.blocklen; bdev.ctxt = this; @@ -54,7 +54,11 @@ class lsvd_iodevice bdev.fn_table = &lsvd_fn_table; } - ~lsvd_iodevice() {} + ~lsvd_iodevice() + { + free(bdev.product_name); + free(bdev.name); + } }; static int lsvd_destroy_bdev(void *ctx) @@ -134,6 +138,11 @@ int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) assert(!img_name.empty()); auto img = lsvd_image::open_image(img_name, ioctx); + if (!img) { + log_error("Failed to open image '{}'.", img_name); + return -1; + } + auto iodev = new lsvd_iodevice(std::move(img)); spdk_io_device_register( @@ -160,16 +169,15 @@ int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) return 0; } -static void bdev_lsvd_delete_cb(void *arg, int rc) -{ - auto p = (std::promise *)arg; - p->set_value(rc); -} - -int bdev_lsvd_delete(std::string img_name, std::function *cb) +int bdev_lsvd_delete(std::string img_name) { auto p = std::promise(); - spdk_bdev_unregister_by_name(img_name.c_str(), &lsvd_if, - bdev_lsvd_delete_cb, &p); + spdk_bdev_unregister_by_name( + img_name.c_str(), &lsvd_if, + [](void *arg, int rc) { + auto p = (std::promise *)arg; + p->set_value(rc); + }, + &p); return p.get_future().get(); } From bbe3f457f1f00f58be44f96942533ee22fe3d076 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 23:01:26 +0000 Subject: [PATCH 19/77] Move wlog init code into write_cache --- src/mkcache.cc | 54 ------------------------ src/write_cache.cc | 103 ++++++++++++++++++++++++++++++++++++++------- src/write_cache.h | 6 +++ 3 files changed, 94 insertions(+), 69 deletions(-) delete mode 100644 src/mkcache.cc diff --git a/src/mkcache.cc b/src/mkcache.cc deleted file mode 100644 index e53bb7b2..00000000 --- a/src/mkcache.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* - * file: mkcache.cc - * description: create file containing write+read caches - * - * author: Peter Desnoyers, Northeastern University - * Copyright 2021, 2022 Peter Desnoyers - * license: GNU LGPL v2.1 or newer - * LGPL-2.1-or-later - */ - -#include -#include -#include -#include - -#include "journal.h" -#include "lsvd_types.h" - -int init_wcache(int fd, uuid_t &uuid, int n_pages) -{ - page_t w_pages = n_pages - 1; - page_t _map = div_round_up(w_pages, 256); - page_t _len = div_round_up(w_pages, 512); - page_t w_meta = 2 * (_map + _len); - char buf[4096]; - - w_pages -= w_meta; - - memset(buf, 0, sizeof(buf)); - auto w_super = (j_write_super *)buf; - *w_super = (j_write_super){LSVD_MAGIC, LSVD_J_W_SUPER, - 1, 1, - 1, 1, - 1 + w_meta, 1 + w_meta, - 1 + w_meta + w_pages, 1 + w_meta, - 0, 0, - 0, 0, - 0, 0, {0}}; - memcpy(w_super->vol_uuid, uuid, sizeof(uuid_t)); - if (!write(fd, buf, 4096)) { - perror("write cache write"); - return -1; - } - - memset(buf, 0, 4096); - for (int i = 1; i < 1 + w_pages + w_meta; i++) { - if (!write(fd, buf, 4096)) { - perror("write cache write"); - return -1; - } - } - - return 0; -} \ No newline at end of file diff --git a/src/write_cache.cc b/src/write_cache.cc index fb25e288..2bff46a1 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -13,6 +14,7 @@ #include "request.h" #include "smartiov.h" #include "translate.h" +#include "utils.h" #include "write_cache.h" /* ------------- Write cache structure ------------- */ @@ -22,7 +24,8 @@ class write_cache_impl : public write_cache { size_t dev_max; uint32_t super_blkno; - lsvd_config *cfg; + int fd = -1; + lsvd_config &cfg; std::atomic sequence = 1; // write sequence # @@ -55,7 +58,7 @@ class write_cache_impl : public write_cache */ friend class wcache_write_req; std::mutex m; - translate *be; + translate &be; j_hdr *mk_header(char *buf, uint32_t type, page_t blks, page_t prev); nvme *nvme_w = NULL; @@ -66,7 +69,7 @@ class write_cache_impl : public write_cache void release_room(sector_t sectors); void flush(void); - write_cache_impl(uint32_t blkno, int _fd, translate *_be, lsvd_config *cfg); + write_cache_impl(uint32_t blkno, int _fd, translate &_be, lsvd_config &cfg); ~write_cache_impl(); request *writev(sector_t lba, smartiov *iov); @@ -451,14 +454,13 @@ int write_cache_impl::roll_log_forward() #endif } -write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate *_be, - lsvd_config *cfg_) +write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate &be, + lsvd_config &cfg) + : fd(fd), cfg(cfg), be(be) { super_blkno = blkno; dev_max = getsize64(fd); - be = _be; - cfg = cfg_; _hdrbuf = (char *)aligned_alloc(4096, 4096); @@ -486,20 +488,15 @@ write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate *_be, int n_pages = super->limit - super->base; max_write_pages = n_pages / 2 + n_pages / 4; - write_batch = cfg->wcache_batch; + write_batch = cfg.wcache_batch; misc_threads = new thread_pool(&m); } -uptr make_write_cache(uint32_t blkno, int fd, translate *be, - lsvd_config *cfg) -{ - return std::make_unique(blkno, fd, be, cfg); -} - write_cache_impl::~write_cache_impl() { delete misc_threads; + close(fd); free(super); free(_hdrbuf); delete nvme_w; @@ -526,9 +523,85 @@ request *write_cache_impl::writev(sector_t lba, smartiov *iovs) lk.unlock(); // writing to in-memory buffer (translation layer) - be->writev(req->seq, lba * 512, iov, iovcnt); + be.writev(req->seq, lba * 512, iov, iovcnt); return req; } void write_cache_impl::do_write_checkpoint(void) { write_checkpoint(); } + +int init_wcache(int fd, uuid_t &uuid, int n_pages) +{ + page_t w_pages = n_pages - 1; + page_t _map = div_round_up(w_pages, 256); + page_t _len = div_round_up(w_pages, 512); + page_t w_meta = 2 * (_map + _len); + char buf[4096]; + + w_pages -= w_meta; + + memset(buf, 0, sizeof(buf)); + auto w_super = (j_write_super *)buf; + *w_super = (j_write_super){LSVD_MAGIC, + LSVD_J_W_SUPER, + 1, + 1, + 1, + 1, + 1 + w_meta, + 1 + w_meta, + 1 + w_meta + w_pages, + 1 + w_meta, + 0, + 0, + 0, + 0, + 0, + 0, + {0}}; + memcpy(w_super->vol_uuid, uuid, sizeof(uuid_t)); + + int ret = pwrite(fd, buf, 4096, 0); + PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to write wlog header"); + + // just truncate to right length, don't bother writing zeroes + ret = ftruncate(fd, 4096 * (1 + w_pages + w_meta)); + PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to truncate wlog file"); + + return 0; +} + +uptr open_wlog(fspath path, usize size, translate &xlate, + lsvd_config &cfg) +{ + int fd = 0; + if (!std::filesystem::exists(path)) { + log_info("Creating write cache file '{}'", path); + fd = open(path.c_str(), O_RDWR | O_CREAT, 0644); + PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to create cache file"); + + auto err = init_wcache(fd, xlate.uuid, size / 4096); + PR_ERR_RET_IF(err < 0, nullptr, -err, "Failed to init wlog"); + } + + fd = open(path.c_str(), O_RDWR); + PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to open wlog file"); + + char buf[4096]; + int err = pread(fd, buf, 4096, 0); + PR_ERR_RET_IF(err < 0, nullptr, errno, "Failed to read wlog header"); + + j_write_super *super = (j_write_super *)buf; + PR_RET_IF(super->magic != LSVD_MAGIC, nullptr, + "Invalid write cache magic number: {}", super->magic); + PR_RET_IF(super->type != LSVD_J_W_SUPER, nullptr, "Invalid cache type: {}", + super->type); + + try { + return std::make_unique(1, fd, xlate, cfg); + } catch (std::exception &e) { + log_error("Failed to open write cache: {}", e.what()); + close(fd); + return nullptr; + } +} diff --git a/src/write_cache.h b/src/write_cache.h index 04b46710..21ca0222 100644 --- a/src/write_cache.h +++ b/src/write_cache.h @@ -1,5 +1,6 @@ #pragma once +#include "config.h" #include "lsvd_types.h" #include "translate.h" #include "utils.h" @@ -22,3 +23,8 @@ class write_cache uptr make_write_cache(uint32_t blkno, int fd, translate *be, lsvd_config *cfg); + +int init_wcache(int fd, uuid_t &uuid, int n_pages); + +uptr open_wlog(fspath path, usize size, translate &xlate, + lsvd_config &cfg); From 5be595309996de9ef350f8d9e6fe9ff2463b895a Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 23:02:13 +0000 Subject: [PATCH 20/77] Add new backend functions --- src/backend.h | 6 ++++++ src/rados_backend.cc | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/backend.h b/src/backend.h index 1f6eb0d1..7abe207e 100644 --- a/src/backend.h +++ b/src/backend.h @@ -7,6 +7,7 @@ #include "config.h" #include "request.h" #include "smartiov.h" +#include "utils.h" class backend { @@ -45,6 +46,11 @@ class backend smartiov iov((char *)buf, len); return aio_read(name, offset, iov); } + + virtual opt get_size(std::string name); + virtual opt> read_whole_obj(std::string name); + + virtual bool exists(std::string name); }; extern std::shared_ptr make_file_backend(const char *prefix); diff --git a/src/rados_backend.cc b/src/rados_backend.cc index 6e62373d..0e6ffcd7 100644 --- a/src/rados_backend.cc +++ b/src/rados_backend.cc @@ -163,6 +163,35 @@ class rados_backend : public backend { return new rados_delete_req(ctx, name); } + + bool exists(std::string name) override + { + return ctx.stat(name, nullptr, nullptr) == 0; + } + + opt get_size(std::string name) override + { + u64 size; + time_t mtime; + int rv = ctx.stat(name, &size, &mtime); + if (rv < 0) + return std::nullopt; + return size; + } + + opt> read_whole_obj(std::string name) override + { + auto size = get_size(name); + PASSTHRU_NULLOPT(size); + + std::vector buf(size.value()); + smartiov iov((char *)buf.data(), buf.size()); + auto r = read(name, 0, iov); + if (r < 0) + return std::nullopt; + + return buf; + } }; std::shared_ptr make_rados_backend(rados_ioctx_t io) From a099ea30518c63ee08eea4b8e0c9b6f8bef8dd62 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 23:02:20 +0000 Subject: [PATCH 21/77] Update utils headers --- src/utils.h | 95 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/src/utils.h b/src/utils.h index eb722924..77991052 100644 --- a/src/utils.h +++ b/src/utils.h @@ -4,12 +4,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -26,9 +28,39 @@ template using sptr = std::shared_ptr; template using uptr = std::unique_ptr; +template using opt = std::optional; +template using vec = std::vector; #define CEXTERN extern "C" +using u64 = uint64_t; +using u32 = uint32_t; +using u16 = uint16_t; +using u8 = uint8_t; +using s64 = int64_t; +using s32 = int32_t; +using s16 = int16_t; +using s8 = int8_t; +using usize = size_t; +using ssize = ssize_t; +using byte = std::byte; +using str = std::string; +using fspath = std::filesystem::path; + +#define PASSTHRU_NULLOPT(opt) \ + do { \ + if (!opt) { \ + return std::nullopt; \ + } \ + } while (0) + +#define PASSTHRU_NULLPTR(ptr) \ + do { \ + if (!ptr) { \ + return nullptr; \ + } \ + } while (0) + #define trace(MSG, ...) \ do { \ if (LOGLV <= 0) \ @@ -53,13 +85,6 @@ template using uptr = std::unique_ptr; __func__, ##__VA_ARGS__); \ } while (0) -#define log_error(MSG, ...) \ - do { \ - fmt::print(stderr, fg(fmt::terminal_color::red) | fmt::emphasis::bold, \ - "[ERR {}:{} {}] " MSG "\n", __FILE__, __LINE__, __func__, \ - ##__VA_ARGS__); \ - } while (0) - #define log_warn(MSG, ...) \ do { \ if (LOGLV <= 3) \ @@ -69,11 +94,67 @@ template using uptr = std::unique_ptr; __func__, ##__VA_ARGS__); \ } while (0) +#define log_error(MSG, ...) \ + do { \ + fmt::print(stderr, fg(fmt::terminal_color::red) | fmt::emphasis::bold, \ + "[ERR {}:{} {}] " MSG "\n", __FILE__, __LINE__, __func__, \ + ##__VA_ARGS__); \ + } while (0) + #define trap_to_debugger() \ do { \ raise(SIGTRAP); \ } while (0) +#define RET_IF(cond, ret) \ + do { \ + if (cond) { \ + return ret; \ + } \ + } while (0) + +#define PR_RET_IF(cond, ret, MSG, ...) \ + do { \ + if (cond) { \ + log_error(MSG, ##__VA_ARGS__); \ + return ret; \ + } \ + } while (0) + +/** + * If `cond` is true, print an error message to stdout with MSG, then return + * `ret` + */ +#define PR_ERR_RET_IF(cond, ret, en, MSG, ...) \ + do { \ + if (cond) { \ + auto fs = fmt::format(MSG "\n", ##__VA_ARGS__); \ + auto s = fmt::format("[ERR {}:{} {} | errno {}/{}] {}", __FILE__, \ + __LINE__, __func__, en, strerror(en), fs); \ + fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, s); \ + return ret; \ + } \ + } while (0) + +#define THROW_MSG_ON(cond, MSG, ...) \ + do { \ + if (cond) { \ + auto s = fmt::format("[ERR {}:{} {}] " MSG "\n", __FILE__, \ + __LINE__, __func__, ##__VA_ARGS__); \ + fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, s); \ + throw std::runtime_error(s); \ + } \ + } while (0) + +#define THROW_ERRNO_ON(cond, en, MSG, ...) \ + do { \ + if (cond) { \ + auto m = \ + fmt::format("{}/{}: " MSG, en, strerr(en), ##__VA_ARGS__); \ + throw std::system_error(m); \ + } \ + } while (0) + /** * Check return values of libstdc functions. If it's -1, print the error and * throw an exception From d12b18a652596c49f6679303305affc7e7350d71 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 23:02:45 +0000 Subject: [PATCH 22/77] Add seqnum definition --- src/lsvd_types.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/lsvd_types.h b/src/lsvd_types.h index 7837dac2..b8f046b9 100644 --- a/src/lsvd_types.h +++ b/src/lsvd_types.h @@ -5,8 +5,9 @@ #include #include -typedef int64_t sector_t; -typedef int page_t; +using sector_t = int64_t; +using page_t = int32_t; +using seqnum_t = uint32_t; enum lsvd_op { OP_READ = 2, OP_WRITE = 4 }; @@ -62,3 +63,8 @@ class objname std::string str() { return name; } const char *c_str() { return name.c_str(); } }; + +static inline std::string oname(std::string prefix, uint32_t seq) +{ + return fmt::format("{}.{:08x}", prefix, seq); +} From b4b11f7629b5028443179d71d7404a89654e504c Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 23:05:44 +0000 Subject: [PATCH 23/77] Overhaul backend parsing code --- src/objects.cc | 228 +++++++++++++++++++++++++++---------------------- src/objects.h | 75 +++++++++++----- 2 files changed, 181 insertions(+), 122 deletions(-) diff --git a/src/objects.cc b/src/objects.cc index 70012db6..43342901 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -1,122 +1,150 @@ -#include +#include #include +#include #include -#include "lsvd_debug.h" #include "lsvd_types.h" #include "objects.h" +#include "utils.h" -char *object_reader::read_object_hdr(const char *name, bool fast) +void serialise_common_hdr(vec buf, obj_type t, seqnum_t s, u32 hdr, + u32 data, uuid_t &uuid) { - obj_hdr *h = (obj_hdr *)malloc(4096); - int rv; - if ((rv = objstore->read(name, 0, h, 4096)) < 0) - goto fail; - if (fast) - return (char *)h; - if (h->hdr_sectors > 8) { - size_t len = h->hdr_sectors * 512; - h = (obj_hdr *)realloc(h, len); - if (objstore->read(name, 0, h, len) < 0) - goto fail; - } - return (char *)h; -fail: - free((char *)h); - return NULL; + if (buf.size() < sizeof(common_obj_hdr)) + buf.resize(sizeof(common_obj_hdr)); + + auto h = (common_obj_hdr *)buf.data(); + *h = (common_obj_hdr){.magic = LSVD_MAGIC, + .version = 1, + .vol_uuid = {0}, + .type = t, + .seq = s, + .hdr_sectors = hdr, + .data_sectors = data, + .crc = 0}; + uuid_copy(h->vol_uuid, uuid); } -/* read all info from superblock, returns a vast number of things: - * [super, vol_size] = f(name, &ckpts, &clones, *&snaps): - * - super - pointer to buffer (must be freed) - * - vol_size - in bytes (-1 on failure) - * - ckpts, clones, snaps - what you'd expect - */ -std::pair -object_reader::read_super(const char *name, std::vector &ckpts, - std::vector &clones, - std::vector &snaps, uuid_t &uuid) +void serialise_superblock(vec buf, vec &checkpoints, + vec &clones, uuid_t &uuid) { - char *super_buf = read_object_hdr(name, false); - if (super_buf == NULL) - return std::make_pair((char *)NULL, -1); + usize req_size = sizeof(common_obj_hdr) + sizeof(super_hdr); + req_size += checkpoints.size() * sizeof(seqnum_t); + req_size = std::max(req_size, 8ul); // minimum of 4096 bytes + req_size = round_up(req_size, 512); // round to sector boundary + + if (buf.size() < req_size) + buf.resize(req_size); + + serialise_common_hdr(buf, OBJ_SUPERBLOCK, 0, req_size / 512, 0, uuid); - auto super_h = (obj_hdr *)super_buf; + auto h = (super_hdr *)(buf.data() + sizeof(common_obj_hdr)); + h->ckpts_offset = sizeof(common_obj_hdr) + sizeof(super_hdr); + h->ckpts_len = checkpoints.size() * sizeof(seqnum_t); - if (super_h->magic != LSVD_MAGIC || super_h->version != 1 || - super_h->type != LSVD_SUPER) - return std::make_pair((char *)NULL, -1); - memcpy(uuid, super_h->vol_uuid, sizeof(uuid_t)); + auto p = (seqnum_t *)(buf.data() + h->ckpts_offset); + for (auto &c : checkpoints) + *p++ = c; +} + +opt> object_reader::fetch_object_header(std::string objname) +{ + vec buf(4096); + auto err = objstore->read(objname, 0, buf.data(), 4096); + RET_IF(err != 4096, std::nullopt); - super_hdr *super_sh = (super_hdr *)(super_h + 1); + auto h = (common_obj_hdr *)buf.data(); - decode_offset_len(super_buf, super_sh->ckpts_offset, - super_sh->ckpts_len, ckpts); - decode_offset_len_ptr(super_buf, super_sh->clones_offset, - super_sh->clones_len, clones); - decode_offset_len_ptr(super_buf, super_sh->snaps_offset, - super_sh->snaps_len, snaps); + // Validate magic + PR_RET_IF(h->magic != LSVD_MAGIC || h->version != 1, std::nullopt, + "Invalid magic or version in object '{}'", objname); - return std::make_pair(super_buf, super_sh->vol_size * 512); + if (h->hdr_sectors <= 8) + return buf; + + // Header is longer than 4096, we have to fetch the rest + auto len = h->hdr_sectors * 512; + buf.reserve(len); + err = objstore->read(objname, 0, buf.data(), len); + PR_ERR_RET_IF(std::cmp_not_equal(err, len), std::nullopt, err, + "Failed to read object '{}' header", objname); + + return buf; } -/* read and decode the header of an object. Copies into arguments, - * frees all allocated memory - */ -ssize_t object_reader::read_data_hdr(const char *name, obj_hdr &h, - obj_data_hdr &dh, - std::vector &cleaned, - std::vector &dmap) +opt object_reader::read_superblock(std::string oname) { - char *buf = read_object_hdr(name, false); - if (buf == NULL) - return -1; - auto tmp_h = (obj_hdr *)buf; - auto tmp_dh = (obj_data_hdr *)(tmp_h + 1); - if (tmp_h->type != LSVD_DATA) { - free(buf); - return -1; - } - - h = *tmp_h; - dh = *tmp_dh; - - decode_offset_len(buf, tmp_dh->objs_cleaned_offset, - tmp_dh->objs_cleaned_len, cleaned); - decode_offset_len(buf, tmp_dh->data_map_offset, - tmp_dh->data_map_len, dmap); - - free(buf); - return 0; + auto buf = objstore->read_whole_obj(oname); + PASSTHRU_NULLOPT(buf); + auto hdr = (common_obj_hdr *)buf->data(); + + PR_RET_IF(hdr->magic != LSVD_MAGIC, std::nullopt, + "Corrupt object; invalid magic at '{}'", oname); + PR_RET_IF(hdr->version != 1, std::nullopt, + "Invalid version in object '{}', only 1 is supported", oname); + PR_RET_IF(hdr->type != OBJ_SUPERBLOCK, std::nullopt, + "Obj '{}' not a superblock", oname); + + parsed_superblock ret; + super_hdr *superblk = (super_hdr *)(hdr + 1); + + decode_offset_len((char *)hdr, superblk->ckpts_offset, + superblk->ckpts_len, ret.ckpts); + decode_offset_len_ptr((char *)hdr, superblk->clones_offset, + superblk->clones_len, ret.clones); + decode_offset_len_ptr((char *)hdr, superblk->snaps_offset, + superblk->snaps_len, ret.snaps); + + ret.superblock_buf = *buf; + uuid_copy(ret.uuid, hdr->vol_uuid); + ret.vol_size = superblk->vol_size * 512; + + return ret; } -/* read and decode a checkpoint object identified by sequence number - */ -ssize_t object_reader::read_checkpoint(const char *name, uint64_t &cache_seq, - std::vector &ckpts, - std::vector &objects, - std::vector &deletes, - std::vector &dmap) +opt object_reader::read_data_hdr(std::string oname) { - char *buf = read_object_hdr(name, false); - if (buf == NULL) { - do_log("buf == NULL\n"); - return -1; - } - auto h = (obj_hdr *)buf; - auto ch = (obj_ckpt_hdr *)(h + 1); - if (h->type != LSVD_CKPT) { - do_log("%s: WRONG TYPE %d\n", name, h->type); - free(buf); - return -1; - } - cache_seq = ch->cache_seq; - decode_offset_len(buf, ch->ckpts_offset, ch->ckpts_len, ckpts); - decode_offset_len(buf, ch->objs_offset, ch->objs_len, objects); - decode_offset_len(buf, ch->deletes_offset, ch->deletes_len, - deletes); - decode_offset_len(buf, ch->map_offset, ch->map_len, dmap); - - free(buf); - return 0; + auto hdr = fetch_object_header(oname); + PASSTHRU_NULLOPT(hdr); + + parsed_data_hdr h; + h.hdr = (common_obj_hdr *)h.buf.data(); + PR_RET_IF(h.hdr->type != OBJ_LOGDATA, std::nullopt, + "Invalid object type in '{}'", oname); + h.data_hdr = (obj_data_hdr *)(hdr->data() + sizeof(common_obj_hdr)); + + auto buf = (char *)hdr->data(); + decode_offset_len_ptr(buf, h.data_hdr->objs_cleaned_offset, + h.data_hdr->objs_cleaned_len, h.cleaned); + decode_offset_len_ptr(buf, h.data_hdr->data_map_offset, + h.data_hdr->data_map_len, h.data_map); + + h.buf = std::move(*hdr); + return h; } + +opt object_reader::read_checkpoint(std::string oname) +{ + auto hdr = fetch_object_header(oname); + PASSTHRU_NULLOPT(hdr); + + parsed_checkpoint ret; + ret.hdr = (common_obj_hdr *)hdr->data(); + PR_RET_IF(ret.hdr->type != OBJ_CHECKPOINT, std::nullopt, + "Invalid object type it '{}'", oname); + ret.ckpt_hdr = (obj_ckpt_hdr *)(&hdr->at(sizeof(common_obj_hdr))); + + auto buf = (char *)hdr->data(); + decode_offset_len(buf, ret.ckpt_hdr->ckpts_offset, + ret.ckpt_hdr->ckpts_len, ret.ckpts); + + decode_offset_len_ptr(buf, ret.ckpt_hdr->objs_offset, + ret.ckpt_hdr->objs_len, ret.objects); + decode_offset_len_ptr(buf, ret.ckpt_hdr->deletes_offset, + ret.ckpt_hdr->deletes_len, + ret.deletes); + decode_offset_len_ptr(buf, ret.ckpt_hdr->map_offset, + ret.ckpt_hdr->map_len, ret.dmap); + ret.buf = std::move(*hdr); + return ret; +} \ No newline at end of file diff --git a/src/objects.h b/src/objects.h index a5d7a0b0..c6a46093 100644 --- a/src/objects.h +++ b/src/objects.h @@ -1,26 +1,23 @@ #pragma once #include -#include #include -#include #include "backend.h" +#include "lsvd_types.h" +#include "utils.h" #if __BYTE_ORDER != __LITTLE_ENDIAN #error "this code is little-endian only" #endif -/* for now we'll use 32-bit object sequence numbers. So sue me... - */ - -enum obj_type { LSVD_SUPER = 1, LSVD_DATA = 2, LSVD_CKPT = 3 }; +enum obj_type { OBJ_SUPERBLOCK = 1, OBJ_LOGDATA = 2, OBJ_CHECKPOINT = 3 }; /* hdr - standard header for all backend objects * total length is hdr_sectors + data_sectors, in 512-byte units * name is for superblock, (.%08x % seq) otherwise */ -struct obj_hdr { +struct common_obj_hdr { uint32_t magic; uint32_t version; // 1 uuid_t vol_uuid; @@ -139,28 +136,62 @@ struct ckpt_mapentry { /* ------ helper functions -------- */ +struct parsed_superblock { + vec superblock_buf; // buffer containing superblock + usize vol_size; // size in bytes + vec ckpts; // checkpoint sequence numbers + vec clones; // ptrs are into the buffer + vec snaps; // ptrs are into the buffer + uuid_t uuid; +}; + +struct parsed_data_hdr { + vec buf; + common_obj_hdr *hdr; + obj_data_hdr *data_hdr; + vec cleaned; + vec data_map; +}; + +struct parsed_checkpoint { + vec buf; + common_obj_hdr *hdr; + obj_ckpt_hdr *ckpt_hdr; + vec ckpts; + vec objects; + vec deletes; + vec dmap; +}; + class object_reader { - std::shared_ptr objstore; + sptr objstore; public: object_reader(std::shared_ptr be) : objstore(be) {} - char *read_object_hdr(const char *name, bool fast); + opt> fetch_object_header(std::string oname); + opt read_superblock(std::string oname); + opt read_data_hdr(std::string oname); + opt read_checkpoint(std::string oname); +}; - std::pair read_super(const char *name, - std::vector &ckpts, - std::vector &clones, - std::vector &snaps, - uuid_t &uuid); +// ----- common image types, temporary(T&Cs apply) workaround ----- - ssize_t read_data_hdr(const char *name, obj_hdr &h, obj_data_hdr &dh, - std::vector &cleaned, - std::vector &dmap); +struct clone_base { + std::string name; + seqnum_t last_seq; + seqnum_t first_seq = 0; +}; - ssize_t read_checkpoint(const char *name, uint64_t &cache_seq, - std::vector &ckpts, - std::vector &objects, - std::vector &deletes, - std::vector &dmap); +struct data_obj_info { + sector_t hdr; + sector_t data; + sector_t live; }; + +void serialise_common_hdr(vec buf, obj_type t, seqnum_t s, u32 hdr, + u32 data, uuid_t &uuid); + +void serialise_superblock(vec buf, vec &checkpoints, + vec &clones, uuid_t &uuid); From 85340f329e863c07236c6fa75ac6c72451486cdc Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 20 May 2024 23:06:06 +0000 Subject: [PATCH 24/77] Change config option names --- src/config.cc | 12 ++++++------ src/config.h | 35 +++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/src/config.cc b/src/config.cc index 22662710..b874d26d 100644 --- a/src/config.cc +++ b/src/config.cc @@ -69,18 +69,18 @@ int lsvd_config::read() split(line, words); if (words.size() != 2) continue; - F_CONFIG_H_INT(words[0], words[1], batch_size); + F_CONFIG_H_INT(words[0], words[1], backend_obj_size); F_CONFIG_INT(words[0], words[1], wcache_batch); F_CONFIG_H_INT(words[0], words[1], wcache_chunk); F_CONFIG_STR(words[0], words[1], rcache_dir); F_CONFIG_STR(words[0], words[1], wcache_dir); - F_CONFIG_INT(words[0], words[1], xlate_window); + F_CONFIG_INT(words[0], words[1], num_parallel_writes); F_CONFIG_TABLE(words[0], words[1], backend, m); F_CONFIG_H_INT(words[0], words[1], cache_size); F_CONFIG_H_INT(words[0], words[1], wlog_size); F_CONFIG_INT(words[0], words[1], hard_sync); F_CONFIG_INT(words[0], words[1], ckpt_interval); - F_CONFIG_INT(words[0], words[1], flush_msec); + F_CONFIG_INT(words[0], words[1], flush_timeout_msec); F_CONFIG_INT(words[0], words[1], gc_threshold); F_CONFIG_INT(words[0], words[1], fetch_window); F_CONFIG_INT(words[0], words[1], fetch_ratio); @@ -91,18 +91,18 @@ int lsvd_config::read() break; } - ENV_CONFIG_H_INT(batch_size); + ENV_CONFIG_H_INT(backend_obj_size); ENV_CONFIG_INT(wcache_batch); ENV_CONFIG_H_INT(wcache_chunk); ENV_CONFIG_STR(rcache_dir); ENV_CONFIG_STR(wcache_dir); - ENV_CONFIG_INT(xlate_window); + ENV_CONFIG_INT(num_parallel_writes); ENV_CONFIG_TABLE(backend, m); ENV_CONFIG_H_INT(cache_size); ENV_CONFIG_H_INT(wlog_size); ENV_CONFIG_INT(hard_sync); ENV_CONFIG_INT(ckpt_interval); - ENV_CONFIG_INT(flush_msec); + ENV_CONFIG_INT(flush_timeout_msec); ENV_CONFIG_INT(gc_threshold); ENV_CONFIG_INT(fetch_window); ENV_CONFIG_INT(fetch_ratio); diff --git a/src/config.h b/src/config.h index 45857a20..47cc8749 100644 --- a/src/config.h +++ b/src/config.h @@ -1,3 +1,5 @@ +#pragma once + /* * file: config.h * description: quick and dirty config file parser @@ -9,12 +11,11 @@ * LGPL-2.1-or-later */ -#ifndef __CONFIG_H__ -#define __CONFIG_H__ - #include #include +#include "utils.h" + enum cfg_backend { BACKEND_FILE = 1, BACKEND_RADOS = 2 }; enum cfg_cache_type { LSVD_CFG_READ = 1, LSVD_CFG_WRITE = 2 }; @@ -22,28 +23,34 @@ enum cfg_cache_type { LSVD_CFG_READ = 1, LSVD_CFG_WRITE = 2 }; class lsvd_config { public: - int batch_size = 8 * 1024 * 1024; // in bytes - int wcache_batch = 8; // requests - int wcache_chunk = 2 * 1024 * 1024; // bytes + int backend_obj_size = 8 * 1024 * 1024; // in bytes + int wcache_batch = 8; // requests + int wcache_chunk = 2 * 1024 * 1024; // bytes std::string rcache_dir = "/tmp"; std::string wcache_dir = "/tmp"; - int xlate_window = 8; + u32 num_parallel_writes = 8; int hard_sync = 0; enum cfg_backend backend = BACKEND_RADOS; long cache_size = 500 * 1024 * 1024; // in bytes - long wlog_size = 500 * 1024 * 1024; // in bytes + long wlog_size = 500 * 1024 * 1024; // in bytes int ckpt_interval = 500; // objects - int flush_msec = 2000; // flush timeout + int flush_timeout_msec = 2000; // flush timeout + int flush_interval_msec = 1000; // flush interval int gc_threshold = 60; // GC threshold, percent - int gc_window = 4; // max GC writes outstanding + int gc_window = 4; // max GC writes outstanding int fetch_window = 12; // read cache fetches int fetch_ratio = 67; // anti-thrash served:backend ratio - int no_gc = 0; // turn off GC + int no_gc = 0; // turn off GC lsvd_config() {} ~lsvd_config() {} int read(); - std::string cache_filename(uuid_t &uuid, const char *name, cfg_cache_type type); -}; + std::string cache_filename(uuid_t &uuid, const char *name, + cfg_cache_type type); -#endif + inline fspath wlog_path(str imgname) + { + auto filename = imgname + ".wlog"; + return fspath(wcache_dir) / filename; + } +}; From 678430ffd45e7d65bb173b1bec4b696d6f4aeca9 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 03:00:30 +0000 Subject: [PATCH 25/77] Rename rbd wrapper to better reflect its usage --- src/liblsvd.cc | 40 ++++++++++++++++++++-------------------- src/spdk_wrap.cc | 39 ++++++++++++++++++++------------------- src/spdk_wrap.h | 13 ++++++++----- 3 files changed, 48 insertions(+), 44 deletions(-) diff --git a/src/liblsvd.cc b/src/liblsvd.cc index 5b4d8b7a..5ac575cf 100644 --- a/src/liblsvd.cc +++ b/src/liblsvd.cc @@ -22,7 +22,7 @@ extern "C" int rbd_open(rados_ioctx_t io, const char *name, rbd_image_t *image, const char *snap_name) { - auto img = lsvd_spdk::open_image(io, name); + auto img = lsvd_rbd::open_image(io, name); if (img == nullptr) return -1; @@ -33,8 +33,8 @@ extern "C" int rbd_open(rados_ioctx_t io, const char *name, rbd_image_t *image, extern "C" int rbd_close(rbd_image_t image) { - lsvd_spdk *img = (lsvd_spdk *)image; - log_info("Closing image {}", img->get_img().image_name); + lsvd_rbd *img = (lsvd_rbd *)image; + log_info("Closing image {}", img->get_img().imgname); // poor man's race prevention. wait for in-flight requests sleep(2); @@ -46,14 +46,14 @@ extern "C" int rbd_close(rbd_image_t image) extern "C" int rbd_poll_io_events(rbd_image_t image, rbd_completion_t *comps, int numcomp) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; return img->poll_io_events(reinterpret_cast(comps), numcomp); } extern "C" int rbd_set_image_notification(rbd_image_t image, int fd, int type) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; assert(type == EVENT_TYPE_EVENTFD); event_socket ev(fd, EVENT_TYPE_EVENTFD); @@ -64,19 +64,19 @@ extern "C" int rbd_aio_create_completion(void *cb_arg, rbd_callback_t complete_cb, rbd_completion_t *c) { - auto nc = lsvd_spdk::create_completion(complete_cb, cb_arg); + auto nc = lsvd_rbd::create_completion(complete_cb, cb_arg); *c = (rbd_completion_t)nc; return 0; } extern "C" void rbd_aio_release(rbd_completion_t c) { - lsvd_spdk::release_completion((spdk_completion *)c); + lsvd_rbd::release_completion((spdk_completion *)c); } extern "C" int rbd_discard(rbd_image_t image, uint64_t ofs, uint64_t len) { - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; auto req = img->trim(ofs, len, nullptr); req->run(nullptr); req->wait(); @@ -87,7 +87,7 @@ extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, rbd_completion_t c) { auto p = (spdk_completion *)c; - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; img->trim(off, len, p); p->run(); return 0; @@ -96,7 +96,7 @@ extern "C" int rbd_aio_discard(rbd_image_t image, uint64_t off, uint64_t len, extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c) { auto *p = (spdk_completion *)c; - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; img->flush(p); p->run(); return 0; @@ -104,7 +104,7 @@ extern "C" int rbd_aio_flush(rbd_image_t image, rbd_completion_t c) extern "C" int rbd_flush(rbd_image_t image) { - auto img = (lsvd_spdk *)image; + auto img = (lsvd_rbd *)image; auto req = img->flush(nullptr); req->run(nullptr); req->wait(); @@ -127,7 +127,7 @@ extern "C" ssize_t rbd_aio_get_return_value(rbd_completion_t c) extern "C" int rbd_aio_read(rbd_image_t image, uint64_t offset, size_t len, char *buf, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto p = (spdk_completion *)c; img->read(offset, smartiov(buf, len), p); p->run(); @@ -137,7 +137,7 @@ extern "C" int rbd_aio_read(rbd_image_t image, uint64_t offset, size_t len, extern "C" int rbd_aio_readv(rbd_image_t image, const iovec *iov, int iovcnt, uint64_t offset, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto p = (spdk_completion *)c; img->read(offset, smartiov(iov, iovcnt), p); p->run(); @@ -147,7 +147,7 @@ extern "C" int rbd_aio_readv(rbd_image_t image, const iovec *iov, int iovcnt, extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, int iovcnt, uint64_t offset, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto *p = (spdk_completion *)c; img->write(offset, smartiov(iov, iovcnt), p); p->run(); @@ -157,7 +157,7 @@ extern "C" int rbd_aio_writev(rbd_image_t image, const struct iovec *iov, extern "C" int rbd_aio_write(rbd_image_t image, uint64_t offset, size_t len, const char *buf, rbd_completion_t c) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto *p = (spdk_completion *)c; img->write(offset, smartiov((char *)buf, len), p); p->run(); @@ -168,7 +168,7 @@ extern "C" int rbd_aio_write(rbd_image_t image, uint64_t offset, size_t len, */ extern "C" int rbd_read(rbd_image_t image, uint64_t off, size_t len, char *buf) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto req = img->read(off, smartiov(buf, len), NULL); req->run(NULL); req->wait(); @@ -179,7 +179,7 @@ extern "C" int rbd_read(rbd_image_t image, uint64_t off, size_t len, char *buf) extern "C" int rbd_write(rbd_image_t image, uint64_t off, size_t len, const char *buf) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; auto req = img->write(off, smartiov((char *)buf, len), NULL); req->run(NULL); req->wait(); @@ -200,7 +200,7 @@ extern "C" int rbd_aio_wait_for_complete(rbd_completion_t c) extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info, size_t infosize) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; memset(info, 0, sizeof(*info)); info->size = img->get_img().size; info->obj_size = 1 << 22; // 2^21 bytes @@ -211,7 +211,7 @@ extern "C" int rbd_stat(rbd_image_t image, rbd_image_info_t *info, extern "C" int rbd_get_size(rbd_image_t image, uint64_t *size) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; *size = img->get_img().size; return 0; } @@ -274,7 +274,7 @@ extern "C" int rbd_remove(rados_ioctx_t io, const char *name) extern "C" void rbd_uuid(rbd_image_t image, uuid_t *uuid) { - lsvd_spdk *img = (lsvd_spdk *)image; + lsvd_rbd *img = (lsvd_rbd *)image; memcpy(uuid, img->get_img().xlate->uuid, sizeof(uuid_t)); } diff --git a/src/spdk_wrap.cc b/src/spdk_wrap.cc index e0103ca5..44960763 100644 --- a/src/spdk_wrap.cc +++ b/src/spdk_wrap.cc @@ -1,4 +1,6 @@ #include "spdk_wrap.h" +#include "config.h" +#include "src/utils.h" spdk_completion::spdk_completion(rbd_callback_t cb, void *cb_arg) : cb(cb), cb_arg(cb_arg) @@ -11,7 +13,7 @@ spdk_completion::~spdk_completion() req->release(); } -void spdk_completion::delayed_init(lsvd_spdk *img, request *req) +void spdk_completion::delayed_init(lsvd_rbd *img, request *req) { this->img = img; this->req = req; @@ -54,31 +56,30 @@ inline void spdk_completion::dec_and_free() delete this; } -lsvd_spdk *lsvd_spdk::open_image(rados_ioctx_t io, std::string name) +lsvd_rbd *lsvd_rbd::open_image(rados_ioctx_t io, std::string name) { - auto img = new lsvd_spdk(); - try { - img->img.try_open(name, io); + lsvd_config cfg; + auto err = cfg.read(); + PR_ERR_RET_IF(err < 0, nullptr, -err, "Failed to read config"); + + return new lsvd_rbd(name, io, cfg); } catch (std::runtime_error &e) { log_error("Failed to open image: {}", e.what()); - delete img; return nullptr; } - - return img; } -void lsvd_spdk::close_image() { delete this; } +void lsvd_rbd::close_image() { delete this; } -spdk_completion *lsvd_spdk::create_completion(rbd_callback_t cb, void *cb_arg) +spdk_completion *lsvd_rbd::create_completion(rbd_callback_t cb, void *cb_arg) { return new spdk_completion(cb, cb_arg); } -void lsvd_spdk::release_completion(spdk_completion *c) { c->release(); } +void lsvd_rbd::release_completion(spdk_completion *c) { c->release(); } -void lsvd_spdk::on_request_complete(spdk_completion *c) +void lsvd_rbd::on_request_complete(spdk_completion *c) { std::unique_lock lk(completions_mtx); if (ev.has_value()) { @@ -87,13 +88,13 @@ void lsvd_spdk::on_request_complete(spdk_completion *c) } } -int lsvd_spdk::switch_to_poll(event_socket &&ev) +int lsvd_rbd::switch_to_poll(event_socket &&ev) { this->ev = std::move(ev); return 0; } -int lsvd_spdk::poll_io_events(spdk_completion **comps, int numcomp) +int lsvd_rbd::poll_io_events(spdk_completion **comps, int numcomp) { assert(ev.has_value()); @@ -117,34 +118,34 @@ std::function make_cb(spdk_completion *c) return [c](int rv) { c->complete(rv); }; } -void init_completion(spdk_completion *c, lsvd_spdk *img, request *req) +void init_completion(spdk_completion *c, lsvd_rbd *img, request *req) { if (c != nullptr) c->delayed_init(img, req); } -request *lsvd_spdk::read(size_t offset, smartiov iov, spdk_completion *c) +request *lsvd_rbd::read(size_t offset, smartiov iov, spdk_completion *c) { auto req = img.read(offset, iov, make_cb(c)); init_completion(c, this, req); return req; } -request *lsvd_spdk::write(size_t offset, smartiov iov, spdk_completion *c) +request *lsvd_rbd::write(size_t offset, smartiov iov, spdk_completion *c) { auto req = img.write(offset, iov, make_cb(c)); init_completion(c, this, req); return req; } -request *lsvd_spdk::trim(size_t offset, size_t len, spdk_completion *c) +request *lsvd_rbd::trim(size_t offset, size_t len, spdk_completion *c) { auto req = img.trim(offset, len, make_cb(c)); init_completion(c, this, req); return req; } -request *lsvd_spdk::flush(spdk_completion *c) +request *lsvd_rbd::flush(spdk_completion *c) { auto req = img.flush(make_cb(c)); init_completion(c, this, req); diff --git a/src/spdk_wrap.h b/src/spdk_wrap.h index 5f85f01d..82b7ac09 100644 --- a/src/spdk_wrap.h +++ b/src/spdk_wrap.h @@ -5,7 +5,7 @@ #include "fake_rbd.h" #include "image.h" -class lsvd_spdk; +class lsvd_rbd; class lsvd_image; class spdk_completion @@ -19,7 +19,7 @@ class spdk_completion rbd_callback_t cb; - lsvd_spdk *img = nullptr; + lsvd_rbd *img = nullptr; int retval = -1; request *req = nullptr; @@ -31,7 +31,7 @@ class spdk_completion spdk_completion(rbd_callback_t cb, void *cb_arg); ~spdk_completion(); - void delayed_init(lsvd_spdk *img, request *req); + void delayed_init(lsvd_rbd *img, request *req); void run(); void wait(); @@ -74,13 +74,16 @@ struct event_socket { /** * Wrapper around lsvd_image for SPDK's RBD api */ -class lsvd_spdk +class lsvd_rbd { public: - static lsvd_spdk *open_image(rados_ioctx_t io, std::string name); + static lsvd_rbd *open_image(rados_ioctx_t io, std::string name); void close_image(); private: + lsvd_rbd(str name, rados_ioctx_t io, lsvd_config cfg); + ~lsvd_rbd(); + lsvd_image img; std::queue completions; From beffbb9c7c3b3ae51dafdc1b48f5dfc39471359d Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 03:01:05 +0000 Subject: [PATCH 26/77] Remove mkcache from meson --- src/meson.build | 1 - src/write_cache.cc | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/meson.build b/src/meson.build index 31a06e8a..c1b0b46e 100644 --- a/src/meson.build +++ b/src/meson.build @@ -5,7 +5,6 @@ lsvd_src = files( 'image.cc', 'liblsvd.cc', 'lsvd_debug.cc', - 'mkcache.cc', 'nvme.cc', 'objects.cc', 'rados_backend.cc', diff --git a/src/write_cache.cc b/src/write_cache.cc index 2bff46a1..2f9ac06d 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -576,7 +576,7 @@ uptr open_wlog(fspath path, usize size, translate &xlate, { int fd = 0; if (!std::filesystem::exists(path)) { - log_info("Creating write cache file '{}'", path); + log_info("Creating write cache file '{}'", path.string()); fd = open(path.c_str(), O_RDWR | O_CREAT, 0644); PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to create cache file"); From 255e478286c19dbf11905307cc722241d1956270 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 03:01:31 +0000 Subject: [PATCH 27/77] Update tools to use new definitions --- src/imgtool.cc | 4 ++-- src/thick-image.cc | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/imgtool.cc b/src/imgtool.cc index 68f85cd2..2f03da9c 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -140,10 +140,10 @@ void info(rados_ioctx_t io, const char *image_name) if (rv < 0) throw std::runtime_error("failed to read superblock"); - auto base_hdr = (obj_hdr *)base_buf; + auto base_hdr = (common_obj_hdr *)base_buf; auto base_super = (super_hdr *)(base_hdr + 1); - if (base_hdr->magic != LSVD_MAGIC || base_hdr->type != LSVD_SUPER) + if (base_hdr->magic != LSVD_MAGIC || base_hdr->type != OBJ_SUPERBLOCK) throw std::runtime_error("corrupt superblock"); char uuid_str[64]; diff --git a/src/thick-image.cc b/src/thick-image.cc index 5c575cb5..8b8cd473 100644 --- a/src/thick-image.cc +++ b/src/thick-image.cc @@ -114,8 +114,8 @@ void create_thick(char *name, long size) size_t data_bytes = data_sectors * 512; auto hdr_buf = (char *)calloc(data_bytes + 4096, 1); - auto h = (obj_hdr *)hdr_buf; - *h = {LSVD_MAGIC, 1, {0}, LSVD_DATA, 0, 8, data_sectors, 0}; + auto h = (common_obj_hdr *)hdr_buf; + *h = {LSVD_MAGIC, 1, {0}, OBJ_LOGDATA, 0, 8, data_sectors, 0}; memcpy(h->vol_uuid, uu, sizeof(uu)); auto dh = (obj_data_hdr *)(h + 1); @@ -160,13 +160,13 @@ void create_thick(char *name, long size) uint32_t objmap_bytes = n_objs * sizeof(ckpt_obj); uint32_t extmap_bytes = n_objs * sizeof(ckpt_mapentry); - int ckpt_bytes = - sizeof(obj_hdr) + sizeof(obj_ckpt_hdr) + objmap_bytes + extmap_bytes; + int ckpt_bytes = sizeof(common_obj_hdr) + sizeof(obj_ckpt_hdr) + + objmap_bytes + extmap_bytes; uint32_t ckpt_sectors = div_round_up(ckpt_bytes, 512); auto ckpt_buf = (char *)calloc(ckpt_sectors * 512, 1); - auto ch = (obj_hdr *)ckpt_buf; - *ch = {LSVD_MAGIC, 1, {0}, LSVD_CKPT, 0, ckpt_sectors, 0, 0}; + auto ch = (common_obj_hdr *)ckpt_buf; + *ch = {LSVD_MAGIC, 1, {0}, OBJ_CHECKPOINT, 0, ckpt_sectors, 0, 0}; memcpy(ch->vol_uuid, uu, sizeof(uu)); auto cph = (obj_ckpt_hdr *)(ch + 1); @@ -204,11 +204,11 @@ void create_thick(char *name, long size) /* now write the superblock, with a single checkpoint pointer */ auto sb_data = (char *)calloc(4096, 1); - auto h2 = (obj_hdr *)sb_data; + auto h2 = (common_obj_hdr *)sb_data; auto sh = (super_hdr *)(h2 + 1); int ckpt_offset = sizeof(*h2) + sizeof(*sh); - *h2 = {LSVD_MAGIC, 1, {0}, LSVD_SUPER, 0, 8, 0, 0}; + *h2 = {LSVD_MAGIC, 1, {0}, OBJ_SUPERBLOCK, 0, 8, 0, 0}; memcpy(h2->vol_uuid, uu, sizeof(uu)); sh->vol_size = img_size / 512; From 2d745732c39a084caa7c7524501dd116910cce60 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 03:27:09 +0000 Subject: [PATCH 28/77] Fully remove file backend --- src/backend.h | 17 +++-------------- src/imgtool.cc | 4 ++-- src/liblsvd.cc | 6 +++--- src/rados_backend.cc | 2 ++ 4 files changed, 10 insertions(+), 19 deletions(-) diff --git a/src/backend.h b/src/backend.h index 7abe207e..75f2feb1 100644 --- a/src/backend.h +++ b/src/backend.h @@ -4,7 +4,6 @@ #include #include -#include "config.h" #include "request.h" #include "smartiov.h" #include "utils.h" @@ -47,20 +46,10 @@ class backend return aio_read(name, offset, iov); } - virtual opt get_size(std::string name); - virtual opt> read_whole_obj(std::string name); - - virtual bool exists(std::string name); + virtual opt get_size(std::string name) = 0; + virtual opt> read_whole_obj(std::string name) = 0; + virtual bool exists(std::string name) = 0; }; extern std::shared_ptr make_file_backend(const char *prefix); extern std::shared_ptr make_rados_backend(rados_ioctx_t io); - -inline std::shared_ptr get_backend(lsvd_config *cfg, rados_ioctx_t io, - const char *name) -{ - if (cfg->backend == BACKEND_RADOS) - return make_rados_backend(io); - - throw std::runtime_error("Unknown backend"); -} diff --git a/src/imgtool.cc b/src/imgtool.cc index 2f03da9c..f6987ed1 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -123,7 +123,7 @@ void info(rados_ioctx_t io, const char *image_name) printf("error reading config: %d\n", rv); exit(1); } - auto objstore = get_backend(&cfg, io, NULL); + auto objstore = make_rados_backend(io); uuid_t uu; if ((rv = translate_get_uuid(objstore, image_name, uu)) < 0) { printf("error reading superblock: %d\n", rv); @@ -186,7 +186,7 @@ void mk_cache(rados_ioctx_t io, const char *image_name, const char *dev_name, printf("error reading config: %d\n", rv); exit(1); } - auto objstore = get_backend(&cfg, io, NULL); + auto objstore = make_rados_backend(io); uuid_t uu; if ((rv = translate_get_uuid(objstore, image_name, uu)) < 0) { printf("error reading superblock: %d\n", rv); diff --git a/src/liblsvd.cc b/src/liblsvd.cc index 5ac575cf..49e93812 100644 --- a/src/liblsvd.cc +++ b/src/liblsvd.cc @@ -229,7 +229,7 @@ extern "C" int rbd_create(rados_ioctx_t io, const char *name, uint64_t size, lsvd_config cfg; if (cfg.read() < 0) return -1; - auto objstore = get_backend(&cfg, io, NULL); + auto objstore = make_rados_backend(io); auto rv = translate_create_image(objstore, name, size); return rv; } @@ -243,7 +243,7 @@ extern "C" int rbd_clone(rados_ioctx_t io, const char *source_img, return -1; } - auto objstore = get_backend(&cfg, io, NULL); + auto objstore = make_rados_backend(io); auto rv = translate_clone_image(objstore, source_img, dest_img); return rv; @@ -260,7 +260,7 @@ extern "C" int rbd_remove(rados_ioctx_t io, const char *name) auto rv = cfg.read(); if (rv < 0) return rv; - auto objstore = get_backend(&cfg, io, NULL); + auto objstore = make_rados_backend(io); uuid_t uu; if ((rv = translate_get_uuid(objstore, name, uu)) < 0) return rv; diff --git a/src/rados_backend.cc b/src/rados_backend.cc index 0e6ffcd7..f4b2cdcb 100644 --- a/src/rados_backend.cc +++ b/src/rados_backend.cc @@ -131,6 +131,8 @@ class rados_backend : public backend librados::IoCtx::from_rados_ioctx_t(ctx_, this->ctx); } + ~rados_backend() override {} + int write(std::string name, smartiov &iov) override { auto req = dynamic_cast(aio_write(name, iov)); From 0a604608088480b3d2432bead4d623668266073c Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 03:27:46 +0000 Subject: [PATCH 29/77] Rewrite image open/recovery and initialisation path --- src/bdev_lsvd.cc | 10 +- src/image.cc | 212 ++++++---- src/image.h | 60 ++- src/objects.cc | 115 +++-- src/spdk_frontend.cc | 2 +- src/spdk_wrap.cc | 7 + src/translate.cc | 988 +++++++++++++++++++------------------------ src/translate.h | 19 +- 8 files changed, 735 insertions(+), 678 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 1cc9c048..7543bf0f 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -137,9 +137,12 @@ int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) { assert(!img_name.empty()); - auto img = lsvd_image::open_image(img_name, ioctx); - if (!img) { - log_error("Failed to open image '{}'.", img_name); + lsvd_config cfg; // TODO + uptr img; + try { + img = uptr(new lsvd_image(img_name, ioctx, cfg)); + } catch (std::runtime_error &e) { + log_error("Failed to create image '{}': {}", img_name, e.what()); return -1; } @@ -158,6 +161,7 @@ int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) spdk_put_io_channel(ch->io_channel); }, sizeof(lsvd_bdev_io_channel), img_name.c_str()); + auto err = spdk_bdev_register(&iodev->bdev); if (err) { log_error("Failed to register bdev: err {}", (err)); diff --git a/src/image.cc b/src/image.cc index c9be7037..0435e850 100644 --- a/src/image.cc +++ b/src/image.cc @@ -1,104 +1,165 @@ #include +#include #include #include #include #include "backend.h" #include "image.h" -#include "journal.h" #include "lsvd_types.h" +#include "objects.h" #include "shared_read_cache.h" +#include "utils.h" +#include "write_cache.h" -extern int init_wcache(int fd, uuid_t &uuid, int n_pages); const int block_sectors = CACHE_CHUNK_SIZE / 512; -lsvd_image::~lsvd_image() +lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) + : imgname(name), cfg(cfg) { - // TODO fix to the utterly cursed try_open function so that the object is - // always in valid state instaed of being partially constructed - if (wcache) { - wcache->flush(); - wcache->do_write_checkpoint(); - } - if (xlate && !cfg.no_gc) - xlate->stop_gc(); - if (xlate) - xlate->checkpoint(); - if (write_fd >= 0) - close(write_fd); + objstore = make_rados_backend(io); + rcache = get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); + + wlog = open_wlog(cfg.wlog_path(name), cfg.wlog_size / 4096, *xlate, cfg); + THROW_MSG_ON(!wlog, "Failed to open write log"); + + read_superblock(); + if (checkpoints.size() > 0) + read_from_checkpoint(checkpoints.back()); + + // Roll forward on the log + auto last_data_seq = roll_forward_from_last_checkpoint(); + + // TODO: actually recover from the write log, this is currently a no-op + recover_from_wlog(); + + // Successfully recovered everything, now we have enough information to + // init everything else } -uptr lsvd_image::open_image(std::string name, rados_ioctx_t io) +lsvd_image::~lsvd_image() { - uptr img(new lsvd_image()); - try { - img->try_open(name, io); - return img; - } catch (std::exception &e) { - log_error("Failed to open image {}: {}", name, e.what()); - return nullptr; - } + wlog->flush(); + wlog->do_write_checkpoint(); + xlate->shutdown(); } -int lsvd_image::try_open(std::string name, rados_ioctx_t io) +bool lsvd_image::apply_log(seqnum_t seq) { - this->image_name = name; + object_reader parser(objstore); + // TODO + auto data_hdr = parser.read_data_hdr(oname(imgname, seq)); + if (!data_hdr.has_value()) + return false; + trace("Recovering log with object at seq {}", seq); + + auto ohdr = data_hdr->hdr; + if (ohdr->type == OBJ_CHECKPOINT) { + log_warn("CORRUPTION: Found checkpoint at seq {} that was not " + "present in the superblock.", + seq); + checkpoints.push_back(seq); + return true; + } - if (cfg.read() < 0) - throw std::runtime_error("Failed to read config"); + obj_info[seq] = (data_obj_info){ + .hdr = ohdr->hdr_sectors, + .data = ohdr->data_sectors, + .live = ohdr->data_sectors, + }; + + // Consume log records + sector_t offset = 0; + vec deleted; + for (auto dmap : data_hdr->data_map) { + // Update the extent map + extmap::obj_offset oo = {seq, offset + ohdr->hdr_sectors}; + objmap.update(dmap->lba, dmap->lba + dmap->len, oo, &deleted); + offset += dmap->len; + } - objstore = make_rados_backend(io); - shared_cache = - get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); + // Manage deleted extents + for (auto d : deleted) { + auto [base, limit, ptr] = d.vals(); + obj_info[ptr.obj].live -= (limit - base); + THROW_MSG_ON(obj_info[ptr.obj].live >= 0, "Negative live sectors."); + } - /* read superblock and initialize translation layer - */ - xlate = make_translate(objstore, &cfg, &map, &bufmap, &map_lock, - &bufmap_lock, shared_cache); - size = xlate->init(name.c_str(), true); - check_cond(size < 0, "Failed to initialize translation layer err={}", size); + return true; +} - /* figure out cache file name, create it if necessary - */ +void lsvd_image::read_superblock() +{ + object_reader parser(objstore); + auto superblock = parser.read_superblock(oname(imgname, 0)); + THROW_MSG_ON(!superblock, "Failed to read superblock"); - /* - * TODO: Open 2 files. One for wcache and one for reader - */ - std::string wcache_name = - cfg.cache_filename(xlate->uuid, name.c_str(), LSVD_CFG_WRITE); + size = superblock->vol_size; + uuid_copy(uuid, superblock->uuid); - if (access(wcache_name.c_str(), R_OK | W_OK) < 0) { - log_info("Creating write cache file {}", wcache_name); - int cache_pages = cfg.wlog_size / 4096; + for (auto ckpt : superblock->ckpts) + checkpoints.push_back(ckpt); - int fd = open(wcache_name.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0777); - check_ret_errno(fd, "Can't open wcache file"); + for (auto ci : superblock->clones) { + clone_base c; + c.name = std::string(ci->name, ci->name_len); + c.last_seq = ci->last_seq; + c.first_seq = ci->last_seq + 1; - if (init_wcache(fd, xlate->uuid, cache_pages) < 0) - throw std::runtime_error("Failed to initialize write cache"); - close(fd); + debug("Using base image {} upto seq {}", c.name, c.last_seq); + clones.push_back(c); } +} - write_fd = open(wcache_name.c_str(), O_RDWR); - check_ret_errno(write_fd, "Can't open wcache file"); +void lsvd_image::read_from_checkpoint(seqnum_t seq) +{ + object_reader parser(objstore); + auto parsed = parser.read_checkpoint(oname(imgname, seq)); + THROW_MSG_ON(!parsed, "Failed to read checkpoint"); + + for (auto obj : parsed->objects) { + obj_info[obj->seq] = (data_obj_info){ + .hdr = obj->hdr_sectors, + .data = obj->data_sectors, + .live = obj->live_sectors, + }; + } - j_write_super *jws = (j_write_super *)aligned_alloc(512, 4096); + for (auto m : parsed->dmap) { + extmap::obj_offset oo = {m->obj, m->offset}; + objmap.update(m->lba, m->lba + m->len, oo); + } +} - check_ret_errno(pread(write_fd, (char *)jws, 4096, 0), - "Can't read wcache superblock"); - if (jws->magic != LSVD_MAGIC || jws->type != LSVD_J_W_SUPER) - throw std::runtime_error("bad magic/type in write cache superblock\n"); - if (memcmp(jws->vol_uuid, xlate->uuid, sizeof(uuid_t)) != 0) - throw std::runtime_error("object and cache UUIDs don't match"); +// Returns last processed checkpoint +seqnum_t lsvd_image::roll_forward_from_last_checkpoint() +{ + if (checkpoints.size() == 0) + return 0; - wcache = make_write_cache(0, write_fd, xlate.get(), &cfg); - free(jws); + object_reader parser(objstore); + auto last_ckpt = checkpoints.back(); + auto seq = last_ckpt + 1; - if (!cfg.no_gc) - xlate->start_gc(); - return 0; + for (;; seq++) { + auto ret = apply_log(seq); + if (!ret) + break; + } + + // Delete "dangling" objects if there are any in case they cause trouble + // with corruption + // This must be larger than the max backend batch size to avoid + // potential corruption if subsequent breaks overlap with current dangling + // objects and we get writes from two different "generations" + for (seqnum_t i = 1; i < cfg.num_parallel_writes * 4; i++) + objstore->delete_obj(oname(imgname, seq + i)); + + return seq; } +void lsvd_image::recover_from_wlog() { UNIMPLEMENTED(); } + /** * This is the base for aio read and write requests. It's copied from * the old rbd_aio_req omniclass, with the read and write paths split out and @@ -200,7 +261,7 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, auto bufmap_it = bufmap.end(); if (bufmap.size() > 0) bufmap_it = bufmap.lookup(start_sector); - auto backend_it = map.lookup(start_sector); + auto backend_it = objmap.lookup(start_sector); size_t _offset = 0; /* @@ -237,7 +298,7 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, sector_t base2 = end_sector, limit2 = end_sector; extmap::obj_offset objptr = {0, 0}; - if (backend_it != map.end()) + if (backend_it != objmap.end()) std::tie(base2, limit2, objptr) = backend_it->vals(start_sector, end_sector); @@ -268,14 +329,14 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, * it2: |----| |----| * |------| < but not this */ - while (backend_it != map.end() && backend_it->limit() <= limit1) + while (backend_it != objmap.end() && backend_it->limit() <= limit1) backend_it++; bufmap_it++; continue; } assert(base2 == start_sector); - assert(backend_it != map.end()); + assert(backend_it != objmap.end()); limit2 = std::min(limit2, base1); sector_t sectors = limit2 - start_sector; @@ -349,9 +410,8 @@ void lsvd_image::handle_reads(size_t offset, smartiov iovs, // same thing to the shared read cache auto prefix = xlate->prefix(key.obj); - auto req = - shared_cache->make_read_req(prefix, key.obj, key.offset * 512L, - sector_in_blk * 512L, slice); + auto req = rcache->make_read_req(prefix, key.obj, key.offset * 512L, + sector_in_blk * 512L, slice); if (req != nullptr) requests.push_back(req); @@ -391,7 +451,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request if (old > 1) return; - img->wcache->release_room(req_bytes / 512); + img->wlog->release_room(req_bytes / 512); complete_request(0); // TODO shouldn't we return bytes written? } @@ -399,7 +459,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request { assert(parent == nullptr); - img->wcache->get_room(req_bytes / 512); + img->wlog->get_room(req_bytes / 512); img->xlate->wait_for_room(); sector_t size_sectors = req_bytes / 512; @@ -419,7 +479,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request iovs.slice(s_offset * 512L, s_offset * 512L + _sectors * 512L); smartiov _iov(tmp.data(), tmp.size()); sub_iovs.push_back(_iov); - auto req = img->wcache->writev(cur_offset / 512, &_iov); + auto req = img->wlog->writev(cur_offset / 512, &_iov); requests.push_back(req); cur_offset += _sectors * 512L; diff --git a/src/image.h b/src/image.h index 54c1b077..023dcad7 100644 --- a/src/image.h +++ b/src/image.h @@ -7,6 +7,7 @@ #include "backend.h" #include "config.h" #include "extent.h" +#include "objects.h" #include "shared_read_cache.h" #include "translate.h" #include "write_cache.h" @@ -14,22 +15,54 @@ /** * Core LSVD image class. An LSVD image supports 4 operations: read, write, * trim, and flush. All are async to prevent function colour issues. + * + * Currently, a lot of core image functionality is in the `translate` class. + * The separation between what is here and what is there is not clear, and the + * two classes really should be consolidated, and the GC function splitted out + * into its own class. + * + * For now, all the core information about the image is owned by this class, + * and `translate` only takes references to it. Most of the translate code was + * from long ago, written by people who are no longer around. It's written like + * a C program, and the ownership structure of most resources is unclear, with + * sketchy concurrency control and C++ style. + * + * Eventually we'll have to rewrite the core translation class to clarify + * resource ownership and to overhaul the disastrous locking situation, but + * that's only a dream for now */ class lsvd_image { private: - // no copying + // no copying or moving lsvd_image(const lsvd_image &) = delete; lsvd_image operator=(const lsvd_image &) = delete; + lsvd_image(const lsvd_image &&) = delete; + lsvd_image operator=(const lsvd_image &&) = delete; + + // Log recovery + void read_superblock(); + void read_from_checkpoint(seqnum_t ckpt_id); + bool apply_log(seqnum_t seq); + + seqnum_t roll_forward_from_last_checkpoint(); + void recover_from_wlog(); public: - std::string image_name; + lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg); + ~lsvd_image(); + std::string imgname; + uuid_t uuid; + usize size; // bytes lsvd_config cfg; - ssize_t size; // bytes + + std::vector clones; // Base images on which we're built + std::vector checkpoints; // Checkpoints + std::map obj_info; // LBA -> object id, object offset - extmap::objmap map; + extmap::objmap objmap; std::shared_mutex map_lock; // LBA -> in-memory, higher priority than the object map @@ -46,22 +79,15 @@ class lsvd_image std::map buffers; std::shared_ptr objstore; - std::shared_ptr shared_cache; - std::unique_ptr wcache; + std::shared_ptr rcache; + std::unique_ptr wlog; std::unique_ptr xlate; - int write_fd; /* write cache file */ int refcount = 0; std::thread dbg; bool done = false; - lsvd_image() {} - ~lsvd_image(); - - int try_open(std::string name, rados_ioctx_t io); - static uptr open_image(std::string name, rados_ioctx_t io); - class aio_request; class trivial_request; class read_request; @@ -71,6 +97,14 @@ class lsvd_image request *trim(size_t offset, size_t len, std::function cb); request *flush(std::function cb); + // Image management + // They all return 0 on success, -errno on failure + static int create_new(std::string name, rados_ioctx_t io); + static int get_uuid(std::string name, rados_ioctx_t io); + static int delete_image(std::string name, rados_ioctx_t io); + static int clone_image(std::string oldname, std::string newname, + rados_ioctx_t io); + private: void handle_reads(size_t offset, smartiov iovs, std::vector &requests); diff --git a/src/objects.cc b/src/objects.cc index 43342901..739e6ad3 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -31,13 +31,27 @@ void serialise_superblock(vec buf, vec &checkpoints, usize req_size = sizeof(common_obj_hdr) + sizeof(super_hdr); req_size += checkpoints.size() * sizeof(seqnum_t); req_size = std::max(req_size, 8ul); // minimum of 4096 bytes - req_size = round_up(req_size, 512); // round to sector boundary + req_size = round_up(req_size, 512); // round to sector boundary (why??) if (buf.size() < req_size) buf.resize(req_size); serialise_common_hdr(buf, OBJ_SUPERBLOCK, 0, req_size / 512, 0, uuid); + // There are three variable-length arrays in the superblock: checkpoints, + // snapshots, and clones. The order doesn't matter, so we put checkpoints + // first, as each checkpoint is just a 32-bit sequence number. The rest are + // more complicated as they are variable-length due to them also containing + // names of the clones and snapshots, which need to be handled correctly + + // Also note that we should make sure that each clone/snapshot is 8-byte + // aligned in the buffer, as when we read them back to deserialise we end + // up with a bunch of pointers into the buffer and unaligned pointers will + // just make all of us sad. Fortunately we have c-style null-terminated + // strings so we can just pad with more null + + // Part 1: checkpoints + auto h = (super_hdr *)(buf.data() + sizeof(common_obj_hdr)); h->ckpts_offset = sizeof(common_obj_hdr) + sizeof(super_hdr); h->ckpts_len = checkpoints.size() * sizeof(seqnum_t); @@ -45,6 +59,11 @@ void serialise_superblock(vec buf, vec &checkpoints, auto p = (seqnum_t *)(buf.data() + h->ckpts_offset); for (auto &c : checkpoints) *p++ = c; + + // Part 2: clones + + // Part 3: snapshots + // TODO implement this when we get around to snapshots } opt> object_reader::fetch_object_header(std::string objname) @@ -72,11 +91,59 @@ opt> object_reader::fetch_object_header(std::string objname) return buf; } +/* buf[offset ... offset+len] contains array of type T, with variable + * length field name_len. + */ +template +void deserialise_offset_ptr(char *buf, size_t offset, size_t len, + std::vector &vals) +{ + T *p = (T *)(buf + offset), *end = (T *)(buf + offset + len); + for (; p < end;) { + vals.push_back(p); + p = (T *)((char *)p + sizeof(T) + p->name_len); + } +} + +template vec deserialise_cpy(byte *buf, usize offset, usize len) +{ + vec ret; + for (usize i = 0; i < len / sizeof(T); i++) { + T *p = (T *)(buf + offset + i * sizeof(T)); + ret.push_back(*p); + } + return ret; +} + +template +vec deserialise_ptrs(byte *buf, usize offset, usize len) +{ + vec ret; + for (usize i = 0; i < len / sizeof(T); i++) { + T *p = (T *)(buf + offset + i * sizeof(T)); + ret.push_back(p); + } + return ret; +} + +template +vec deserialise_ptrs_with_len(byte *buf, usize offset, usize len) +{ + vec ret; + byte *p = buf + offset; + for (; p < buf + offset + len;) { + ret.push_back((T *)p); + p += sizeof(T) + ((T *)p)->name_len; + } + return ret; +} + opt object_reader::read_superblock(std::string oname) { auto buf = objstore->read_whole_obj(oname); PASSTHRU_NULLOPT(buf); auto hdr = (common_obj_hdr *)buf->data(); + auto hbuf = buf->data(); PR_RET_IF(hdr->magic != LSVD_MAGIC, std::nullopt, "Corrupt object; invalid magic at '{}'", oname); @@ -86,18 +153,17 @@ opt object_reader::read_superblock(std::string oname) "Obj '{}' not a superblock", oname); parsed_superblock ret; - super_hdr *superblk = (super_hdr *)(hdr + 1); + super_hdr *shdr = (super_hdr *)(hdr + 1); - decode_offset_len((char *)hdr, superblk->ckpts_offset, - superblk->ckpts_len, ret.ckpts); - decode_offset_len_ptr((char *)hdr, superblk->clones_offset, - superblk->clones_len, ret.clones); - decode_offset_len_ptr((char *)hdr, superblk->snaps_offset, - superblk->snaps_len, ret.snaps); + ret.ckpts = deserialise_cpy(hbuf, shdr->ckpts_offset, shdr->ckpts_len); + ret.clones = deserialise_ptrs_with_len( + hbuf, shdr->clones_offset, shdr->clones_len); + ret.snaps = deserialise_ptrs_with_len(hbuf, shdr->snaps_offset, + shdr->snaps_len); ret.superblock_buf = *buf; uuid_copy(ret.uuid, hdr->vol_uuid); - ret.vol_size = superblk->vol_size * 512; + ret.vol_size = shdr->vol_size * 512; return ret; } @@ -113,12 +179,11 @@ opt object_reader::read_data_hdr(std::string oname) "Invalid object type in '{}'", oname); h.data_hdr = (obj_data_hdr *)(hdr->data() + sizeof(common_obj_hdr)); - auto buf = (char *)hdr->data(); - decode_offset_len_ptr(buf, h.data_hdr->objs_cleaned_offset, - h.data_hdr->objs_cleaned_len, h.cleaned); - decode_offset_len_ptr(buf, h.data_hdr->data_map_offset, - h.data_hdr->data_map_len, h.data_map); - + auto buf = hdr->data(); + h.cleaned = deserialise_ptrs( + buf, h.data_hdr->objs_cleaned_offset, h.data_hdr->objs_cleaned_len); + h.data_map = deserialise_ptrs(buf, h.data_hdr->data_map_offset, + h.data_hdr->data_map_len); h.buf = std::move(*hdr); return h; } @@ -134,17 +199,15 @@ opt object_reader::read_checkpoint(std::string oname) "Invalid object type it '{}'", oname); ret.ckpt_hdr = (obj_ckpt_hdr *)(&hdr->at(sizeof(common_obj_hdr))); - auto buf = (char *)hdr->data(); - decode_offset_len(buf, ret.ckpt_hdr->ckpts_offset, - ret.ckpt_hdr->ckpts_len, ret.ckpts); - - decode_offset_len_ptr(buf, ret.ckpt_hdr->objs_offset, - ret.ckpt_hdr->objs_len, ret.objects); - decode_offset_len_ptr(buf, ret.ckpt_hdr->deletes_offset, - ret.ckpt_hdr->deletes_len, - ret.deletes); - decode_offset_len_ptr(buf, ret.ckpt_hdr->map_offset, - ret.ckpt_hdr->map_len, ret.dmap); + auto buf = hdr->data(); + ret.ckpts = deserialise_cpy(buf, ret.ckpt_hdr->ckpts_offset, + ret.ckpt_hdr->ckpts_len); + ret.objects = deserialise_ptrs(buf, ret.ckpt_hdr->objs_offset, + ret.ckpt_hdr->objs_len); + ret.deletes = deserialise_ptrs( + buf, ret.ckpt_hdr->deletes_offset, ret.ckpt_hdr->deletes_len); + ret.dmap = deserialise_ptrs(buf, ret.ckpt_hdr->map_offset, + ret.ckpt_hdr->map_len); ret.buf = std::move(*hdr); return ret; } \ No newline at end of file diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index 2642ea7e..d204d9fb 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -27,7 +27,7 @@ static void start_lsvd(void *arg) err = rados_ioctx_create(cluster, pool_name.c_str(), &io_ctx); check_ret_neg(err, "Failed to connect to pool {}", pool_name); - err = bdev_lsvd_create("test-image", io_ctx); + err = bdev_lsvd_create("test", io_ctx); } int main(int argc, char **argv) diff --git a/src/spdk_wrap.cc b/src/spdk_wrap.cc index 44960763..d7d90941 100644 --- a/src/spdk_wrap.cc +++ b/src/spdk_wrap.cc @@ -72,6 +72,13 @@ lsvd_rbd *lsvd_rbd::open_image(rados_ioctx_t io, std::string name) void lsvd_rbd::close_image() { delete this; } +lsvd_rbd::lsvd_rbd(std::string name, rados_ioctx_t io, lsvd_config cfg) + : img(name, io, cfg) +{ +} + +lsvd_rbd::~lsvd_rbd() {} + spdk_completion *lsvd_rbd::create_completion(rbd_callback_t cb, void *cb_arg) { return new spdk_completion(cb, cb_arg); diff --git a/src/translate.cc b/src/translate.cc index ff5ab3fb..54db9fc5 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -1,9 +1,7 @@ #include #include -#include -#include +#include #include -#include #include #include #include @@ -13,12 +11,8 @@ #include #include "extent.h" -#include "lsvd_debug.h" -#include "lsvd_types.h" #include "misc_cache.h" -#include "objects.h" #include "request.h" -#include "src/utils.h" #include "translate.h" /* @@ -155,67 +149,50 @@ class translate_req : public request class translate_impl : public translate { - /* lock ordering: lock m before *map_lock - */ - std::mutex m; // for things in this instance - extmap::objmap *map; // shared object map - extmap::bufmap *bufmap; // shared object map - std::shared_mutex *map_lock; // locks the object map - std::mutex *bufmap_lock; + std::string name; + lsvd_config &cfg; + usize vol_size; + uuid_t &vol_uuid; - lsvd_config *cfg; + std::shared_ptr objstore; + std::shared_ptr rcache; - std::atomic seq; + // lock ordering: lock m before *map_lock + std::mutex m; // for things in this instance + std::condition_variable cv; + + extmap::objmap &objmap; // shared object map + std::shared_mutex &omap_mtx; // locks the object map + extmap::bufmap &bufmap; // shared object map + std::mutex &bufmap_lock; + + std::atomic cur_seq; uint64_t ckpt_cache_seq = 0; // from last data object friend class translate_req; translate_req *current = NULL; - /* info on live data objects - all sizes in sectors - * checkpoints are tracked in @checkpoints, and in the superblock - */ - struct obj_info { - int hdr; // sectors - int data; // sectors - int live; // sectors - }; - std::map object_info; + std::vector &clones; + std::map &object_info; + vec &checkpoints; - std::vector checkpoints; + std::atomic outstanding_writes = 0; - std::atomic outstanding_writes = 0; + // GC can't delete an object if the read logic has a + // request outstanding to it - skip, and dead object reaping + // will get it on the next pass. + std::map obj_read_refcount; - std::condition_variable cv; - bool stopped = false; // stop GC from writing - - /* various constant state - */ - struct clone { - char prefix[128]; - int last_seq; - int first_seq = 0; - }; - std::vector clone_list; - char super_name[128]; - - /* superblock has two sections: [obj_hdr] [super_hdr] - */ - char *super_buf = NULL; - obj_hdr *super_h = NULL; - super_hdr *super_sh = NULL; - size_t super_len; - - /* GC can't delete an object if the read logic has a - * request outstanding to it - skip, and dead object reaping - * will get it on the next pass. - */ - std::map reading_objects; + // Used for updating the superblock when writing out new checkpoints + // Reserve it up-front to avoid repeated allocations each time we serialise + vec superblock_buf; thread_pool *workers; - thread_pool *misc_threads; // so we can stop ckpt, gc first - /* for triggering GC - */ + opt flush_worker; + opt gc_worker; + + // for triggering GC sector_t total_sectors = 0; sector_t total_live_sectors = 0; int gc_cycles = 0; @@ -223,19 +200,9 @@ class translate_impl : public translate int gc_sectors_written = 0; int gc_deleted = 0; - /* for shutdown - */ - bool gc_running = false; - std::condition_variable gc_cv; - void stop_gc(void); - - object_reader *parser; - - std::shared_ptr rcache; - - void write_checkpoint(int seq, translate_req *req); - void process_batch(int seq, translate_req *req); - void write_gc(int _seq, translate_req *req); + void write_checkpoint(seqnum_t seq, translate_req *req); + void process_batch(seqnum_t seq, translate_req *req); + void write_gc(seqnum_t _seq, translate_req *req); void worker_thread(thread_pool *p); @@ -243,264 +210,106 @@ class translate_impl : public translate sector_t data_sectors, data_map *extents, int n_extents, bool is_gc); - void do_gc(bool *running); - void gc_thread(thread_pool *p); - void flush_thread(thread_pool *p); - - std::shared_ptr objstore; + void flush_thread(std::stop_token st); + void gc_thread(std::stop_token st); + void do_gc(std::stop_token &st); public: - translate_impl(std::shared_ptr _io, lsvd_config *cfg_, - extmap::objmap *map, extmap::bufmap *bufmap, - std::shared_mutex *m, std::mutex *buf_m, - sptr rcache); - ~translate_impl(); - - ssize_t init(const char *name, bool timedflush); - void shutdown(void); - - void flush(void); /* write out current batch */ - void checkpoint(void); /* flush, then write checkpoint */ - - ssize_t writev(uint64_t cache_seq, size_t offset, iovec *iov, int iovcnt); - ssize_t trim(size_t offset, size_t len); - void wait_for_room(void); - - void object_read_start(int obj); // mark object as busy - can't delete - void object_read_end(int obj); - - void start_gc(void); - - const char *prefix(int seq); -}; - -const char *translate_impl::prefix(int seq) -{ - if (clone_list.size() == 0 || seq > clone_list.front().last_seq) - return super_name; - for (auto const &c : clone_list) - if (seq >= c.first_seq) - return c.prefix; - assert(false); -} - -translate_impl::translate_impl(std::shared_ptr _io, lsvd_config *cfg_, - extmap::objmap *map_, extmap::bufmap *bufmap_, - std::shared_mutex *m_, std::mutex *buf_m, - sptr rcache) - : rcache(rcache) -{ - misc_threads = new thread_pool(&m); - workers = new thread_pool(&m); - objstore = _io; - parser = new object_reader(objstore); - map = map_; - bufmap = bufmap_; - map_lock = m_; - bufmap_lock = buf_m; - cfg = cfg_; -} - -uptr make_translate(std::shared_ptr _io, lsvd_config *cfg, - extmap::objmap *map, extmap::bufmap *bufmap, - std::shared_mutex *m, std::mutex *buf_m, - sptr rcache) -{ - return std::make_unique(_io, cfg, map, bufmap, m, buf_m, - rcache); -} - -translate_impl::~translate_impl() -{ - stopped = true; - cv.notify_all(); - if (current) - delete current; - delete parser; - if (super_buf) - free(super_buf); -} - -ssize_t translate_impl::init(const char *prefix_, bool timedflush) -{ - std::vector ckpts; - std::vector clones; - std::vector snaps; - - /* note prefix = superblock name - */ - strcpy(super_name, prefix_); - - auto [_buf, bytes] = - parser->read_super(super_name, ckpts, clones, snaps, uuid); - - check_cond(bytes < 0, "read_super failed for obj {}", super_name); - check_cond(_buf == NULL, "no superblock"); - - int n_ckpts = ckpts.size(); - - super_buf = _buf; - super_h = (obj_hdr *)super_buf; - super_len = super_h->hdr_sectors * 512; - super_sh = (super_hdr *)(super_h + 1); - - memcpy(&uuid, super_h->vol_uuid, sizeof(uuid)); - - current = new translate_req(REQ_PUT, cfg->batch_size, this); - seq = 1; // empty volume case - - /* is this a clone? - */ - if (super_sh->clones_len > 0) { - debug("Image is a clone, parsing cloneinfo headers"); - - char buf[4096]; - auto ci = (clone_info *)(_buf + super_sh->clones_offset); - auto obj_name = (char *)(ci + 1); - while (true) { - if (has_poolname_prefix(obj_name)) { - log_warn("Found poolname prefix in baseimg name: {}; stripping " - "it out. Cross-pool clones are not supported.", - obj_name); - obj_name = strip_poolname_prefix(obj_name); - log_info("Using base name: {}", obj_name); - } + translate_impl(str name, lsvd_config &cfg, usize vol_size, uuid_t &vol_uuid, + sptr be, sptr rcache, + extmap::objmap &objmap, std::shared_mutex &omap_mtx, + extmap::bufmap &bmap, std::mutex &bmap_lck, + seqnum_t last_seq, vec &clones, + std::map &objinfo, + vec &checkpoints) + : name(name), cfg(cfg), vol_size(vol_size), vol_uuid(vol_uuid), + objstore(be), rcache(rcache), objmap(objmap), omap_mtx(omap_mtx), + bufmap(bmap), bufmap_lock(bmap_lck), cur_seq(last_seq), + clones(clones), object_info(objinfo), checkpoints(checkpoints), + superblock_buf(4096) + { + // Calculate GC data + for (auto const &[_, oi] : objinfo) { + total_sectors += oi.data; + total_live_sectors += oi.live; + } - auto rv = objstore->read(obj_name, 0, buf, sizeof(buf)); - check_cond(rv < 0, "Failed to read {}", obj_name); + // start worker, flush, and GC threads + // if (cfg.flush_interval_msec > 0) + // flush_worker = std::jthread(&translate_impl::flush_thread, this); - auto _h = (obj_hdr *)buf; - auto _sh = (super_hdr *)(_h + 1); + // if (!cfg.no_gc) + // gc_worker = std::jthread(&translate_impl::gc_thread, this); - check_cond(_h->magic != LSVD_MAGIC || _h->type != LSVD_SUPER, - "Corrupted superblock in {}", obj_name); - check_cond(memcmp(_h->vol_uuid, ci->vol_uuid, sizeof(uuid_t)) != 0, - "UUID mismatch in {}", obj_name); - clone c; - strcpy(c.prefix, obj_name); - c.last_seq = ci->last_seq; - if (clone_list.size() > 0) - clone_list.back().first_seq = ci->last_seq + 1; - clone_list.push_back(c); - debug("Using base image {} upto seq {}", obj_name, c.last_seq); + // // honestly have no idea how this works + // workers = new thread_pool(&m); + // workers->pool.push( + // std::thread(&translate_impl::worker_thread, this, workers)); - if (_sh->clones_len == 0) - break; - ci = (clone_info *)(buf + _sh->clones_offset); - obj_name = (char *)(ci + 1); - } + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } - /* read in the last checkpoint, then roll forward from there; - */ - int last_ckpt = -1; - if (ckpts.size() > 0) { - std::vector objects; - std::vector deletes; - std::vector entries; - - /* hmm, we should never have checkpoints listed in the - * super that aren't persisted on the backend, should we? - */ - while (n_ckpts > 0) { - int c = ckpts[n_ckpts - 1]; - objname name(prefix(c), c); - do_log("reading ckpt %s\n", name.c_str()); - if (parser->read_checkpoint(name.c_str(), max_cache_seq, ckpts, - objects, deletes, entries) >= 0) { - last_ckpt = c; - break; - } - do_log("chkpt skip %d\n", c); - n_ckpts--; - } - if (last_ckpt == -1) - return -1; + ~translate_impl() + { + cv.notify_all(); + if (workers) + delete workers; + if (current) + delete current; + } - for (int i = 0; i < n_ckpts; i++) { - do_log("chkpt from super: %d\n", ckpts[i]); - checkpoints.push_back(ckpts[i]); // so we can delete them later - } + void flush(void) override; /* write out current batch */ + void checkpoint(void) override; /* flush, then write checkpoint */ - for (auto o : objects) { - object_info[o.seq] = (obj_info){.hdr = (int)o.hdr_sectors, - .data = (int)o.data_sectors, - .live = (int)o.live_sectors}; - total_sectors += o.data_sectors; - total_live_sectors += o.live_sectors; - } - for (auto m : entries) { - map->update(m.lba, m.lba + m.len, - (extmap::obj_offset){.obj = m.obj, .offset = m.offset}); - } - seq = last_ckpt + 1; - } + ssize_t writev(uint64_t cache_seq, size_t offset, iovec *iov, + int iovcnt) override; + ssize_t trim(size_t offset, size_t len) override; + void wait_for_room(void) override; - /* roll forward - */ - for (;; seq++) { - std::vector cleaned; - std::vector entries; - obj_hdr h; - obj_data_hdr dh; + // mark object as busy - can't delete + void object_read_start(int obj) override; + void object_read_end(int obj) override; - objname name(prefix(seq), seq); - if (parser->read_data_hdr(name.c_str(), h, dh, cleaned, entries) < 0) - break; - if (h.type == LSVD_CKPT) { - do_log("ckpt from roll-forward: %d\n", seq.load()); - checkpoints.push_back(seq); - continue; + void shutdown() override + { + if (gc_worker) { + gc_worker->request_stop(); + gc_worker->join(); } - do_log("roll %d\n", seq.load()); - assert(h.type == LSVD_DATA); - object_info[seq] = (obj_info){.hdr = (int)h.hdr_sectors, - .data = (int)h.data_sectors, - .live = (int)h.data_sectors}; - total_sectors += h.data_sectors; - total_live_sectors += h.data_sectors; - if (dh.cache_seq) // skip GC writes - max_cache_seq = dh.cache_seq; + checkpoint(); - int offset = 0, hdr_len = h.hdr_sectors; - std::vector deleted; - for (auto m : entries) { - extmap::obj_offset oo = {seq, offset + hdr_len}; - map->update(m.lba, m.lba + m.len, oo, &deleted); - offset += m.len; - } - for (auto d : deleted) { - auto [base, limit, ptr] = d.vals(); - object_info[ptr.obj].live -= (limit - base); - assert(object_info[ptr.obj].live >= 0); - total_live_sectors -= (limit - base); + if (flush_worker) { + flush_worker->request_stop(); + flush_worker->join(); } } - /* delete any potential "dangling" objects. - */ - for (int i = 1; i < 32; i++) { - objname name(prefix(i + seq), i + seq); - objstore->delete_obj(name.str()); + str prefix(seqnum_t seq) override + { + if (clones.size() == 0 || seq > clones.front().last_seq) + return name; + for (auto const &c : clones) + if (seq >= c.first_seq) + return c.name; + assert(false); // unreachable } +}; - workers->pool.push( - std::thread(&translate_impl::worker_thread, this, workers)); - if (timedflush) - misc_threads->pool.push( - std::thread(&translate_impl::flush_thread, this, misc_threads)); - return bytes; -} - -void translate_impl::start_gc(void) +uptr make_translate(str name, lsvd_config &cfg, usize vol_size, + uuid_t &vol_uuid, sptr be, + sptr rcache, extmap::objmap &objmap, + std::shared_mutex &omap_mtx, + extmap::bufmap &bmap, std::mutex &bmap_lck, + seqnum_t last_seq, vec &clones, + std::map &objinfo, + vec &checkpoints) { - misc_threads->pool.push( - std::thread(&translate_impl::gc_thread, this, misc_threads)); + return std::unique_ptr(new translate_impl( + name, cfg, vol_size, vol_uuid, be, rcache, objmap, omap_mtx, bmap, + bmap_lck, last_seq, clones, objinfo, checkpoints)); } -void translate_impl::shutdown(void) {} - /* ----------- parsing and serializing various objects -------------*/ /* read object header @@ -514,21 +323,21 @@ void translate_impl::make_obj_hdr(char *buf, uint32_t _seq, sector_t hdr_sectors, sector_t data_sectors, data_map *extents, int n_extents, bool is_gc) { - auto h = (obj_hdr *)buf; + auto h = (common_obj_hdr *)buf; auto dh = (obj_data_hdr *)(h + 1); uint32_t map_offset = sizeof(*h) + sizeof(*dh), map_len = n_extents * sizeof(data_map); uint32_t hdr_bytes = map_offset + map_len; assert(hdr_bytes <= hdr_sectors * 512); - *h = (obj_hdr){.magic = LSVD_MAGIC, - .version = 1, - .vol_uuid = {0}, - .type = LSVD_DATA, - .seq = _seq, - .hdr_sectors = (uint32_t)hdr_sectors, - .data_sectors = (uint32_t)data_sectors, - .crc = 0}; + *h = (common_obj_hdr){.magic = LSVD_MAGIC, + .version = 1, + .vol_uuid = {0}, + .type = OBJ_LOGDATA, + .seq = _seq, + .hdr_sectors = (uint32_t)hdr_sectors, + .data_sectors = (uint32_t)data_sectors, + .crc = 0}; memcpy(h->vol_uuid, &uuid, sizeof(uuid_t)); *dh = (obj_data_hdr){.cache_seq = 0, @@ -580,18 +389,18 @@ ssize_t translate_impl::writev(uint64_t cache_seq, size_t offset, iovec *iov, std::unique_lock lk(m); if (!current->room(bytes)) { workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } // write the data into the in-memory log auto ptr = current->append(base, &siov); // update the bufmap (lba -> in-memory buffer) with the extent - std::unique_lock obj_w_lock(*bufmap_lock); + std::unique_lock obj_w_lock(bufmap_lock); assert(ptr >= current->local_buf_base && ptr + (limit - base) * 512 <= current->local_buf_limit); assert(ptr != NULL); - bufmap->update(base, limit, ptr); + bufmap.update(base, limit, ptr); return 0; } @@ -612,11 +421,11 @@ ssize_t translate_impl::writev(uint64_t cache_seq, size_t offset, iovec *iov, ssize_t translate_impl::trim(size_t offset, size_t len) { std::unique_lock lk(m); - std::unique_lock obj_w_lock(*map_lock); + std::unique_lock obj_w_lock(omap_mtx); // trim the map std::vector deleted; - map->trim(offset / 512, (offset + len) / 512, &deleted); + objmap.trim(offset / 512, (offset + len) / 512, &deleted); // and then update the GC accounting for (auto d : deleted) { @@ -639,7 +448,7 @@ ssize_t translate_impl::trim(size_t offset, size_t len) void translate_impl::wait_for_room(void) { std::unique_lock lk(m); - while (outstanding_writes > cfg->xlate_window) + while (outstanding_writes > cfg.num_parallel_writes) cv.wait(lk); } @@ -649,20 +458,20 @@ void translate_impl::wait_for_room(void) void translate_impl::object_read_start(int obj) { std::unique_lock lk(m); - if (reading_objects.find(obj) == reading_objects.end()) - reading_objects[obj] = 0; + if (obj_read_refcount.find(obj) == obj_read_refcount.end()) + obj_read_refcount[obj] = 0; else - reading_objects[obj] = reading_objects[obj] + 1; + obj_read_refcount[obj] = obj_read_refcount[obj] + 1; } void translate_impl::object_read_end(int obj) { std::unique_lock lk(m); - auto i = reading_objects[obj]; + auto i = obj_read_refcount[obj]; if (i == 1) - reading_objects.erase(obj); + obj_read_refcount.erase(obj); else - reading_objects[obj] = i - 1; + obj_read_refcount[obj] = i - 1; } /* NOTE - currently not called for REQ_CKPT, which @@ -678,7 +487,7 @@ void translate_req::notify(request *child) * -> lock tx->m before tx->map_lock */ std::unique_lock lk(tx->m); - // if (--tx->outstanding_writes < tx->cfg->xlate_window) + // if (--tx->outstanding_writes < tx->cfg.xlate_window) tx->outstanding_writes--; tx->cv.notify_all(); } @@ -687,19 +496,19 @@ void translate_req::notify(request *child) /* remove extents from tx->bufmap, but only if they still * point to this buffer */ - std::unique_lock obj_w_lock(*tx->bufmap_lock); + std::unique_lock obj_w_lock(tx->bufmap_lock); std::vector> extents; for (auto const &e : entries) { auto limit = e.lba + e.len; - for (auto it2 = tx->bufmap->lookup(e.lba); - it2 != tx->bufmap->end() && it2->base() < limit; it2++) { + for (auto it2 = tx->bufmap.lookup(e.lba); + it2 != tx->bufmap.end() && it2->base() < limit; it2++) { auto [_base, _limit, ptr] = it2->vals(e.lba, limit); if (ptr.buf >= local_buf_base && ptr.buf < local_buf_limit) extents.push_back(std::pair(_base, _limit)); } } for (auto [base, limit] : extents) { - tx->bufmap->trim(base, limit); + tx->bufmap.trim(base, limit); } } @@ -725,17 +534,17 @@ void translate_req::notify(request *child) * - wait for preceding writes to complete before writing? * - write async rather than sync? (not really compatible with prev) */ -void translate_impl::write_checkpoint(int _seq, translate_req *req) +void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) { std::vector entries; std::vector objects; - for (auto it = map->begin(); it != map->end(); it++) { + for (auto it = objmap.begin(); it != objmap.end(); it++) { auto [base, limit, ptr] = it->vals(); entries.push_back((ckpt_mapentry){.lba = base, .len = limit - base, - .obj = (int32_t)ptr.obj, - .offset = (int32_t)ptr.offset}); + .obj = (s32)ptr.obj, + .offset = (s32)ptr.offset}); } size_t map_bytes = entries.size() * sizeof(ckpt_mapentry); @@ -749,22 +558,15 @@ void translate_impl::write_checkpoint(int _seq, translate_req *req) } size_t objs_bytes = objects.size() * sizeof(ckpt_obj); - size_t hdr_bytes = sizeof(obj_hdr) + sizeof(obj_ckpt_hdr); - int sectors = div_round_up(hdr_bytes + map_bytes + objs_bytes, 512); - - auto buf = (char *)calloc(sectors * 512, 1); - auto h = (obj_hdr *)buf; - *h = (obj_hdr){.magic = LSVD_MAGIC, - .version = 1, - .vol_uuid = {0}, - .type = LSVD_CKPT, - .seq = (uint32_t)_seq, - .hdr_sectors = (uint32_t)sectors, - .data_sectors = 0}; - memcpy(h->vol_uuid, uuid, sizeof(uuid_t)); - auto ch = (obj_ckpt_hdr *)(h + 1); - - uint32_t o1 = sizeof(obj_hdr) + sizeof(obj_ckpt_hdr), o2 = o1 + objs_bytes; + size_t hdr_bytes = sizeof(common_obj_hdr) + sizeof(obj_ckpt_hdr); + sector_t sectors = div_round_up(hdr_bytes + map_bytes + objs_bytes, 512); + + vec cp_buf(sectors * 512); + serialise_common_hdr(cp_buf, OBJ_CHECKPOINT, cp_seq, sectors, 0, uuid); + auto ch = (obj_ckpt_hdr *)(cp_buf.data() + sizeof(common_obj_hdr)); + + uint32_t o1 = sizeof(common_obj_hdr) + sizeof(obj_ckpt_hdr), + o2 = o1 + objs_bytes; *ch = (obj_ckpt_hdr){.cache_seq = ckpt_cache_seq, .ckpts_offset = 0, .ckpts_len = 0, @@ -780,32 +582,23 @@ void translate_impl::write_checkpoint(int _seq, translate_req *req) auto maps = objs + objs_bytes; memcpy(maps, (char *)entries.data(), map_bytes); - /* and write it - */ - objname name(prefix(_seq), _seq); - objstore->write(name.str(), buf, sectors * 512); - free(buf); + // Write out the checkpoint + objstore->write(oname(name, cp_seq), cp_buf.data(), cp_buf.size()); - checkpoints.push_back(_seq); - size_t offset = sizeof(*super_h) + sizeof(*super_sh); - std::vector ckpts_to_delete; + // Update superblock with new checkpoint, and keep only the last 3 + // around both in the backend and the superblock + checkpoints.push_back(cp_seq); + std::vector ckpts_to_delete; while (checkpoints.size() > 3) { ckpts_to_delete.push_back(checkpoints.front()); checkpoints.erase(checkpoints.begin()); } - super_sh->ckpts_offset = offset; - super_sh->ckpts_len = checkpoints.size() * sizeof(uint32_t); - auto pc = (uint32_t *)(super_buf + offset); - for (size_t i = 0; i < checkpoints.size(); i++) - *pc++ = checkpoints[i]; - - objstore->write(super_name, super_buf, 4096); + serialise_superblock(superblock_buf, checkpoints, clones, uuid); + objstore->write(name, superblock_buf.data(), superblock_buf.size()); - for (auto c : ckpts_to_delete) { - objname name(prefix(c), c); - objstore->delete_obj(name.str()); - } + for (auto c : ckpts_to_delete) + objstore->delete_obj(oname(name, c)); req->done = true; req->cv.notify_all(); @@ -826,7 +619,7 @@ void translate_impl::write_checkpoint(int _seq, translate_req *req) * this guarantees that the contents reflect the map state after all * previous seq#s and before all following ones. */ -void translate_impl::write_gc(int _seq, translate_req *req) +void translate_impl::write_gc(seqnum_t _seq, translate_req *req) { req->_seq = _seq; @@ -834,8 +627,8 @@ void translate_impl::write_gc(int _seq, translate_req *req) for (const auto &e : req->entries) data_sectors += e.len; - int max_hdr_bytes = sizeof(obj_hdr) + sizeof(obj_data_hdr) + - (cfg->batch_size / 2048) * sizeof(data_map); + int max_hdr_bytes = sizeof(common_obj_hdr) + sizeof(obj_data_hdr) + + (cfg.backend_obj_size / 2048) * sizeof(data_map); int max_hdr_sectors = div_round_up(max_hdr_bytes, 512); auto buf = req->gc_buf = @@ -851,8 +644,8 @@ void translate_impl::write_gc(int _seq, translate_req *req) req->local_buf_base = data_ptr; for (auto const &[base, len, obj, offset] : req->entries) { auto limit = base + len; - for (auto it2 = map->lookup(base); - it2 != map->end() && it2->base() < limit; it2++) { + for (auto it2 = objmap.lookup(base); + it2 != objmap.end() && it2->base() < limit; it2++) { /* [_base,_limit] is a piece of the extent * obj_base is where that piece starts in the object */ @@ -875,7 +668,7 @@ void translate_impl::write_gc(int _seq, translate_req *req) req->local_buf_limit = data_ptr; data_sectors = (data_ptr - data_ptr0) / 512; - int hdr_bytes = sizeof(obj_hdr) + sizeof(obj_data_hdr) + + int hdr_bytes = sizeof(common_obj_hdr) + sizeof(obj_data_hdr) + obj_extents.size() * sizeof(data_map); int hdr_pages = div_round_up(hdr_bytes, 4096); int hdr_sectors = hdr_pages * 8; @@ -885,10 +678,10 @@ void translate_impl::write_gc(int _seq, translate_req *req) std::vector deleted; req->entries.clear(); // replace with actual extents written - std::unique_lock obj_w_lock(*map_lock); // protect the readers + std::unique_lock obj_w_lock(omap_mtx); // protect the readers for (auto const &e : obj_extents) { extmap::obj_offset oo = {_seq, offset}; - map->update(e.lba, e.lba + e.len, oo, &deleted); + objmap.update(e.lba, e.lba + e.len, oo, &deleted); offset += e.len; req->entries.push_back( (ckpt_mapentry){(int64_t)e.lba, (int64_t)e.len, 0, 0}); @@ -909,10 +702,10 @@ void translate_impl::write_gc(int _seq, translate_req *req) make_obj_hdr(hdr, _seq, hdr_sectors, data_sectors, obj_extents.data(), obj_extents.size(), true); - auto h = (obj_hdr *)hdr; + auto h = (common_obj_hdr *)hdr; assert((int)h->hdr_sectors == hdr_sectors); - obj_info oi = { + data_obj_info oi = { .hdr = hdr_sectors, .data = data_sectors, .live = data_sectors}; object_info[_seq] = oi; @@ -923,11 +716,11 @@ void translate_impl::write_gc(int _seq, translate_req *req) req2->run(req); } -void translate_impl::process_batch(int _seq, translate_req *req) +void translate_impl::process_batch(seqnum_t _seq, translate_req *req) { req->_seq = _seq; - int offset = sizeof(obj_hdr) + sizeof(obj_data_hdr), + int offset = sizeof(common_obj_hdr) + sizeof(obj_data_hdr), len = req->entries.size() * sizeof(data_map); int hdr_bytes = offset + len; int hdr_pages = div_round_up(hdr_bytes, 4096); @@ -937,11 +730,10 @@ void translate_impl::process_batch(int _seq, translate_req *req) /* update the object info table */ - std::unique_lock obj_w_lock(*map_lock); + std::unique_lock obj_w_lock(omap_mtx); - obj_info oi = { + object_info[_seq] = (data_obj_info){ .hdr = hdr_sectors, .data = data_sectors, .live = data_sectors}; - object_info[_seq] = oi; /* and the object map (copy entries to right format at same time) */ @@ -953,7 +745,7 @@ void translate_impl::process_batch(int _seq, translate_req *req) for (auto e : req->entries) { extmap::obj_offset oo = {_seq, sector_offset}; - map->update(e.lba, e.lba + e.len, oo, &deleted); + objmap.update(e.lba, e.lba + e.len, oo, &deleted); sector_offset += e.len; dm_entries.push_back((data_map){(uint64_t)e.lba, (uint64_t)e.len}); } @@ -1013,21 +805,21 @@ void translate_impl::worker_thread(thread_pool *p) * map is updated before any following requests are processed */ else if (req->op == REQ_PUT) { - auto _seq = seq++; + auto _seq = cur_seq++; lk.unlock(); process_batch(_seq, req); } // generate a checkpoint before any following requests processed else if (req->op == REQ_CKPT) { - auto _seq = seq++; + auto _seq = cur_seq++; lk.unlock(); write_checkpoint(_seq, req); } // handle output of GC thread else if (req->op == REQ_GC) { - auto _seq = seq++; + auto _seq = cur_seq++; lk.unlock(); write_gc(_seq, req); } @@ -1046,7 +838,7 @@ void translate_impl::flush(void) if (current->len > 0) { workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } auto flush_req = new translate_req(REQ_FLUSH, this); @@ -1061,7 +853,7 @@ void translate_impl::checkpoint(void) if (current->len > 0) { workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } auto ckpt_req = new translate_req(REQ_CKPT, this); @@ -1074,31 +866,31 @@ void translate_impl::checkpoint(void) * for @timeout then submit it for writing to the backend. * Unlike flush() we don't bother waiting until it completes. */ -void translate_impl::flush_thread(thread_pool *p) +void translate_impl::flush_thread(std::stop_token st) { pthread_setname_np(pthread_self(), "flush_thread"); - auto wait_time = std::chrono::milliseconds(500); - auto timeout = std::chrono::milliseconds(cfg->flush_msec); + auto interval = std::chrono::milliseconds(cfg.flush_interval_msec); + auto timeout = std::chrono::milliseconds(cfg.flush_timeout_msec); auto t0 = std::chrono::system_clock::now(); - auto seq0 = seq.load(); + auto seq0 = cur_seq.load(); - std::unique_lock lk(*p->m); - while (p->running) { - p->cv.wait_for(lk, wait_time); - if (!p->running) + while (true) { + std::this_thread::sleep_for(interval); + if (st.stop_requested()) break; - if (p->running && seq0 == seq.load() && current->len > 0) { + if (seq0 == cur_seq.load() && current->len > 0) { if (std::chrono::system_clock::now() - t0 < timeout) continue; workers->put_locked(current); - current = new translate_req(REQ_PUT, cfg->batch_size, this); + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); } else { - seq0 = seq.load(); + seq0 = cur_seq.load(); t0 = std::chrono::system_clock::now(); } } - printf("flush thread (%lx) exiting\n", pthread_self()); + + log_info("Flush thread {} exiting", pthread_self()); } /* -------------- Garbage collection ---------------- */ @@ -1111,13 +903,13 @@ struct _extent { /* [describe GC algorithm here] */ -void translate_impl::do_gc(bool *running) +void translate_impl::do_gc(std::stop_token &st) { gc_cycles++; // trace("Start GC cycle {}", gc_cycles); - int max_obj = seq.load(); + int max_obj = cur_seq.load(); - std::shared_lock obj_r_lock(*map_lock); + std::shared_lock obj_r_lock(omap_mtx); std::vector dead_objects; for (auto const &p : object_info) { auto [hdrlen, datalen, live] = p.second; @@ -1136,7 +928,7 @@ void translate_impl::do_gc(bool *running) */ { std::unique_lock lk(m); - if (reading_objects.find(o) != reading_objects.end()) + if (obj_read_refcount.find(o) != obj_read_refcount.end()) continue; } objname name(prefix(o), o); @@ -1153,13 +945,14 @@ void translate_impl::do_gc(bool *running) deletes.pop(); } - std::unique_lock obj_w_lock(*map_lock); + std::unique_lock obj_w_lock(omap_mtx); for (auto const &o : dead_objects) object_info.erase(o); obj_w_lock.unlock(); std::unique_lock lk(m); - int last_ckpt = (checkpoints.size() > 0) ? checkpoints.back() : seq.load(); + auto last_ckpt = + (checkpoints.size() > 0) ? checkpoints.back() : cur_seq.load(); lk.unlock(); /* create list of object info in increasing order of @@ -1181,7 +974,7 @@ void translate_impl::do_gc(bool *running) /* gather list of objects needing cleaning, return if none */ - const double threshold = cfg->gc_threshold / 100.0; + const double threshold = cfg.gc_threshold / 100.0; std::vector> objs_to_clean; for (auto [u, o, n] : utilization) { if (u > threshold) @@ -1201,7 +994,7 @@ void translate_impl::do_gc(bool *running) for (auto it = objs_to_clean.begin(); it != objs_to_clean.end(); it++) objects.insert(it->first); - int max_obj_sectors = 0; + sector_t max_obj_sectors = 0; for (auto o : objects) { auto _sectors = object_info[o].hdr + object_info[o].data; max_obj_sectors = std::max(_sectors, max_obj_sectors); @@ -1209,11 +1002,11 @@ void translate_impl::do_gc(bool *running) obj_r_lock.lock(); extmap::objmap live_extents; - for (auto it = map->begin(); it != map->end(); it++) { + for (auto it = objmap.begin(); it != objmap.end(); it++) { auto [base, limit, ptr] = it->vals(); if (ptr.obj <= max_obj && objects.find(ptr.obj) != objects.end()) live_extents.update(base, limit, ptr); - if (!*running) // forced exit + if (st.stop_requested()) // forced exit return; } obj_r_lock.unlock(); @@ -1226,10 +1019,11 @@ void translate_impl::do_gc(bool *running) if (live_extents.size() > 0) { /* temporary file, delete on close. */ - char temp[cfg->rcache_dir.size() + 20]; - sprintf(temp, "%s/gc.XXXXXX", cfg->rcache_dir.c_str()); - int fd = mkstemp(temp); - unlink(temp); + auto temp = fmt::format("{}/gc.XXXXXX", cfg.rcache_dir); + auto t1 = strdup(temp.c_str()); + int fd = mkstemp(t1); + free(t1); + unlink(temp.c_str()); /* read all objects in completely */ @@ -1245,7 +1039,7 @@ void translate_impl::do_gc(bool *running) if (write(fd, buf, sectors * 512) < 0) throw("no space"); offset += sectors; - if (!*running) + if (st.stop_requested()) return; } free(buf); @@ -1263,7 +1057,7 @@ void translate_impl::do_gc(bool *running) std::queue requests; while (all_extents.size() > 0) { - sector_t sectors = 0, max = cfg->batch_size / 512; + sector_t sectors = 0, max = cfg.backend_obj_size / 512; std::vector<_extent> extents; auto it = all_extents.begin(); @@ -1298,20 +1092,17 @@ void translate_impl::do_gc(bool *running) workers->put_locked(req); lk.unlock(); - while ((int)requests.size() > cfg->gc_window && *running) { - if (stopped) - return; + while ((int)requests.size() > cfg.gc_window && + !st.stop_requested()) { auto t = requests.front(); t->wait(); requests.pop(); } - if (!*running) + if (st.stop_requested()) return; } - while (requests.size() > 0 && *running) { - if (stopped) - return; + while (requests.size() > 0 && !st.stop_requested()) { auto t = requests.front(); t->wait(); requests.pop(); @@ -1329,7 +1120,7 @@ void translate_impl::do_gc(bool *running) } obj_w_lock.unlock(); - if (stopped) + if (st.stop_requested()) return; /* write checkpoint *before* deleting any objects. @@ -1345,7 +1136,7 @@ void translate_impl::do_gc(bool *running) auto obj = it->first; { std::unique_lock lk(m); - if (reading_objects.find(obj) != reading_objects.end()) + if (obj_read_refcount.find(obj) != obj_read_refcount.end()) continue; } objname name(prefix(obj), obj); @@ -1355,91 +1146,48 @@ void translate_impl::do_gc(bool *running) } } -void translate_impl::stop_gc(void) -{ - stopped = true; - delete misc_threads; - std::unique_lock lk(m); - while (gc_running) - gc_cv.wait(lk); -} - -void translate_impl::gc_thread(thread_pool *p) +void translate_impl::gc_thread(std::stop_token st) { - debug("starting gc thread"); - auto interval = std::chrono::milliseconds(100); + debug("Starting GC"); + auto interval = std::chrono::milliseconds(200); // sector_t trigger = 128 * 1024 * 2; // 128 MB - const char *name = "gc_thread"; - pthread_setname_np(pthread_self(), name); + pthread_setname_np(pthread_self(), "gc_thread"); - while (p->running) { - std::unique_lock lk(m); - p->cv.wait_for(lk, interval); - if (!p->running) + while (!st.stop_requested()) { + std::this_thread::sleep_for(interval); + if (st.stop_requested()) return; /* check to see if we should run a GC cycle */ // if (total_sectors - total_live_sectors < trigger) // continue; - // if ((total_live_sectors / (double)total_sectors) > (cfg->gc_threshold + // if ((total_live_sectors / (double)total_sectors) > (cfg.gc_threshold // / 100.0)) continue; - gc_running = true; - lk.unlock(); - - do_gc(&p->running); - - lk.lock(); - gc_running = false; - gc_cv.notify_all(); + do_gc(st); } + + log_info("Stopping GC"); } /* ---------------- Debug ---------------- */ -/** - * Given a buffer of len at len 4096, create the image header in the buffer - */ -void set_image_header(char *buf, size_t vol_size) -{ - memset(buf, 0, 4096); - - auto hdr = (obj_hdr *)buf; - hdr->magic = LSVD_MAGIC; - hdr->version = 1; - hdr->type = LSVD_SUPER; - hdr->seq = 0; - hdr->hdr_sectors = 8; - hdr->data_sectors = 0; - uuid_generate_random(hdr->vol_uuid); - - auto superblock = (super_hdr *)(hdr + 1); - memset(superblock, 0, sizeof(*superblock)); - superblock->vol_size = vol_size / 512; - // superblock->ckpts_offset = 0; - // superblock->ckpts_len = 0; - // superblock->clones_offset = 0; - // superblock->clones_len = 0; - // superblock->snaps_offset = 0; - // superblock->snaps_len = 0; -} - int translate_create_image(sptr objstore, const char *name, uint64_t size) { char buf[4096]; memset(buf, 0, 4096); - auto _hdr = (obj_hdr *)buf; - *_hdr = (obj_hdr){LSVD_MAGIC, - 1, // version - {0}, // UUID - LSVD_SUPER, // type - 0, // seq - 8, // hdr_sectors - 0, // data_sectors - 0}; + auto _hdr = (common_obj_hdr *)buf; + *_hdr = (common_obj_hdr){LSVD_MAGIC, + 1, // version + {0}, // UUID + OBJ_SUPERBLOCK, // type + 0, // seq + 8, // hdr_sectors + 0, // data_sectors + 0}; uuid_generate_random(_hdr->vol_uuid); auto _super = (super_hdr *)(_hdr + 1); @@ -1459,95 +1207,229 @@ int translate_get_uuid(sptr objstore, const char *name, uuid_t &uu) int rv = objstore->read(name, 0, buf, sizeof(buf)); if (rv < 0) return rv; - auto hdr = (obj_hdr *)buf; + auto hdr = (common_obj_hdr *)buf; memcpy(uu, hdr->vol_uuid, sizeof(uuid_t)); return 0; } int translate_remove_image(sptr objstore, const char *name) { + // read the superblock to get the list of checkpoints and objects + object_reader parser(objstore); + auto super = parser.read_superblock(name); + PR_RET_IF(!super, -1, "Could not read superblock for {}", name); - /* read the superblock to get the list of checkpoints - */ - char buf[4096]; - int rv = objstore->read(name, 0, buf, sizeof(buf)); - if (rv < 0) - return rv; - auto hdr = (obj_hdr *)buf; - auto sh = (super_hdr *)(hdr + 1); + seqnum_t seq = 0; - if (hdr->magic != LSVD_MAGIC || hdr->type != LSVD_SUPER) - return -1; + if (super->ckpts.size() > 0) { + seq = super->ckpts.back(); + auto cpoint = parser.read_checkpoint(oname(name, seq)); + PR_RET_IF(!cpoint, -1, "Could not read checkpoint {}.{}", name, seq); - int seq = 1; + // Delete objects + for (auto const &o : cpoint->objects) { + auto r = objstore->delete_obj(oname(name, o->seq)); + if (r < 0) + log_warn("Failed to delete obj {}, r={}", o->seq, r); + } + } + + // Delete checkpoints + for (auto const &c : super->ckpts) { + auto r = objstore->delete_obj(oname(name, c)); + if (r < 0) + log_warn("Failed to delete obj {}, r={}", c, r); + } + + // delete any objects after the last checkpoint, up to the first run of + // 32 missing sequence numbers + for (int n = 0; n < 16; seq++, n++) { + if (objstore->delete_obj(oname(name, seq)) >= 0) + n = 0; + } + + // delete the superblock last so we can recover from partial deletion + objstore->delete_obj(name); + return 0; +} + +int translate_clone_image(sptr objstore, const char *source, + const char *dest) +{ + UNIMPLEMENTED(); +} + +#if 0 +ssize_t translate_impl::init(const char *prefix_, bool timedflush) +{ std::vector ckpts; - decode_offset_len(buf, sh->ckpts_offset, sh->ckpts_len, ckpts); + std::vector clones; + std::vector snaps; + + /* note prefix = superblock name + */ + strcpy(super_name, prefix_); - /* read the most recent checkpoint and get its object map + auto [_buf, bytes] = + parser->read_super(super_name, ckpts, clones, snaps, uuid); + + check_cond(bytes < 0, "read_super failed for obj {}", super_name); + check_cond(_buf == NULL, "no superblock"); + + int n_ckpts = ckpts.size(); + + super_buf = _buf; + super_h = (common_obj_hdr *)super_buf; + super_len = super_h->hdr_sectors * 512; + super_sh = (super_hdr *)(super_h + 1); + + memcpy(&uuid, super_h->vol_uuid, sizeof(uuid)); + + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); + seq = 1; // empty volume case + + /* is this a clone? */ + if (super_sh->clones_len > 0) { + debug("Image is a clone, parsing cloneinfo headers"); + + char buf[4096]; + auto ci = (clone_info *)(_buf + super_sh->clones_offset); + auto obj_name = (char *)(ci + 1); + while (true) { + if (has_poolname_prefix(obj_name)) { + log_warn("Found poolname prefix in baseimg name: {}; stripping " + "it out. Cross-pool clones are not supported.", + obj_name); + obj_name = strip_poolname_prefix(obj_name); + log_info("Using base name: {}", obj_name); + } + + auto rv = objstore->read(obj_name, 0, buf, sizeof(buf)); + check_cond(rv < 0, "Failed to read {}", obj_name); + + auto _h = (common_obj_hdr *)buf; + auto _sh = (super_hdr *)(_h + 1); + + check_cond(_h->magic != LSVD_MAGIC || _h->type != LSVD_SUPER, + "Corrupted superblock in {}", obj_name); + check_cond(memcmp(_h->vol_uuid, ci->vol_uuid, sizeof(uuid_t)) != 0, + "UUID mismatch in {}", obj_name); + clone c; + strcpy(c.prefix, obj_name); + c.last_seq = ci->last_seq; + if (clones.size() > 0) + clones.back().first_seq = ci->last_seq + 1; + clone_list.push_back(c); + debug("Using base image {} upto seq {}", obj_name, c.last_seq); + + if (_sh->clones_len == 0) + break; + ci = (clone_info *)(buf + _sh->clones_offset); + obj_name = (char *)(ci + 1); + } + } + + /* read in the last checkpoint, then roll forward from there; + */ + int last_ckpt = -1; if (ckpts.size() > 0) { - object_reader r(objstore); - seq = ckpts.back(); - objname obj(name, seq); - auto ckpt_buf = r.read_object_hdr(obj.c_str(), false); - auto c_hdr = (obj_hdr *)ckpt_buf; - auto c_data = (obj_ckpt_hdr *)(c_hdr + 1); - if (c_hdr->magic != LSVD_MAGIC || c_hdr->type != LSVD_CKPT) - return -1; std::vector objects; - decode_offset_len(ckpt_buf, c_data->objs_offset, - c_data->objs_len, objects); + std::vector deletes; + std::vector entries; - /* delete all the objects in the objmap + /* hmm, we should never have checkpoints listed in the + * super that aren't persisted on the backend, should we? */ - for (auto const &o : objects) { - objname obj(name, o.seq); - auto r = objstore->delete_obj(obj.str()); - if (r < 0) - log_warn("Failed to delete obj {}, r={}", obj.str(), r); + while (n_ckpts > 0) { + int c = ckpts[n_ckpts - 1]; + objname name(name_for_seq(c), c); + do_log("reading ckpt %s\n", name.c_str()); + if (parser->read_checkpoint(name.c_str(), max_cache_seq, ckpts, + objects, deletes, entries) >= 0) { + last_ckpt = c; + break; + } + do_log("chkpt skip %d\n", c); + n_ckpts--; } + if (last_ckpt == -1) + return -1; - /* delete all the checkpoints - */ - for (auto const &c : ckpts) { - objname obj(name, c); - objstore->delete_obj(obj.str()); + for (int i = 0; i < n_ckpts; i++) { + do_log("chkpt from super: %d\n", ckpts[i]); + checkpoints.push_back(ckpts[i]); // so we can delete them later } - free(ckpt_buf); - } - /* delete any objects after the last checkpoint, up to the first run of - * 32 missing sequence numbers - */ - for (int n = 0; n < 16; seq++, n++) { - objname obj(name, seq); - if (objstore->delete_obj(obj.str()) >= 0) - n = 0; + + for (auto o : objects) { + object_info[o.seq] = (obj_info){.hdr = (int)o.hdr_sectors, + .data = (int)o.data_sectors, + .live = (int)o.live_sectors}; + total_sectors += o.data_sectors; + total_live_sectors += o.live_sectors; + } + for (auto m : entries) { + objmap.update( + m.lba, m.lba + m.len, + (extmap::obj_offset){.obj = m.obj, .offset = m.offset}); + } + seq = last_ckpt + 1; } - /* and delete the superblock + /* roll forward */ - objstore->delete_obj(name); - return 0; -} + for (;; seq++) { + std::vector cleaned; + std::vector entries; + common_obj_hdr h; + obj_data_hdr dh; -using ckpt_id = uint32_t; + objname name(name_for_seq(seq), seq); + if (parser->read_data_hdr(name.c_str(), h, dh, cleaned, entries) < 0) + break; + if (h.type == LSVD_CKPT) { + do_log("ckpt from roll-forward: %d\n", seq.load()); + checkpoints.push_back(seq); + continue; + } -inline std::vector deserialise_checkpoint_ids(char *buf) -{ - auto obj = (obj_hdr *)buf; - auto super = (super_hdr *)(buf + 1); + do_log("roll %d\n", seq.load()); + assert(h.type == LSVD_DATA); + object_info[seq] = (obj_info){.hdr = (int)h.hdr_sectors, + .data = (int)h.data_sectors, + .live = (int)h.data_sectors}; + total_sectors += h.data_sectors; + total_live_sectors += h.data_sectors; + if (dh.cache_seq) // skip GC writes + max_cache_seq = dh.cache_seq; - assert(obj->magic == LSVD_MAGIC); - assert(obj->type == LSVD_SUPER); + int offset = 0, hdr_len = h.hdr_sectors; + std::vector deleted; + for (auto m : entries) { + extmap::obj_offset oo = {seq, offset + hdr_len}; + objmap.update(m.lba, m.lba + m.len, oo, &deleted); + offset += m.len; + } + for (auto d : deleted) { + auto [base, limit, ptr] = d.vals(); + object_info[ptr.obj].live -= (limit - base); + assert(object_info[ptr.obj].live >= 0); + total_live_sectors -= (limit - base); + } + } - std::vector checkpoint_ids; - decode_offset_len(buf, super->ckpts_offset, super->ckpts_len, - checkpoint_ids); - return checkpoint_ids; -} + /* delete any potential "dangling" objects. + */ + for (int i = 1; i < 32; i++) { + objname name(name_for_seq(i + seq), i + seq); + objstore->delete_obj(name.str()); + } -int translate_clone_image(sptr objstore, const char *source, - const char *dest) -{ - throw std::runtime_error("unimplemented"); + workers->pool.push( + std::thread(&translate_impl::worker_thread, this, workers)); + if (timedflush) + misc_threads->pool.push( + std::thread(&translate_impl::flush_thread, this, misc_threads)); + return bytes; } +#endif \ No newline at end of file diff --git a/src/translate.h b/src/translate.h index b746624e..08e12496 100644 --- a/src/translate.h +++ b/src/translate.h @@ -1,11 +1,14 @@ #pragma once +#include #include #include #include "backend.h" #include "config.h" #include "extent.h" +#include "lsvd_types.h" +#include "objects.h" #include "shared_read_cache.h" #include "utils.h" @@ -18,9 +21,7 @@ class translate translate() {} virtual ~translate() {} - virtual ssize_t init(const char *name, bool timedflush) = 0; virtual void shutdown(void) = 0; - virtual void flush(void) = 0; /* write out current batch */ virtual void checkpoint(void) = 0; /* flush, then write checkpoint */ @@ -32,12 +33,18 @@ class translate virtual void object_read_start(int obj) = 0; virtual void object_read_end(int obj) = 0; - virtual const char *prefix(int seq) = 0; /* for read cache */ - - virtual void stop_gc(void) = 0; /* do this before shutdown */ - virtual void start_gc(void) = 0; + virtual str prefix(seqnum_t seq) = 0; /* for read cache */ }; +uptr make_translate(str name, lsvd_config &cfg, usize vol_size, + uuid_t &vol_uuid, sptr be, + sptr rcache, extmap::objmap &objmap, + std::shared_mutex &omap_mtx, + extmap::bufmap &bmap, std::mutex &bmap_lck, + seqnum_t last_seq, vec &clones, + std::map &objinfo, + vec &checkpoints); + uptr make_translate(std::shared_ptr _io, lsvd_config *cfg, extmap::objmap *map, extmap::bufmap *bufmap, std::shared_mutex *m, std::mutex *buf_m, From 3ecf3cb04e8b152dc5200d7838e3e0be9ad80d49 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 04:40:34 +0000 Subject: [PATCH 30/77] Update names --- src/bdev_lsvd.cc | 2 +- src/image.cc | 15 ++++++++------- src/objects.cc | 1 + src/spdk_wrap.cc | 2 +- src/translate.cc | 8 +++++--- src/translate.h | 2 +- src/write_cache.cc | 2 +- 7 files changed, 18 insertions(+), 14 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 7543bf0f..6ecd00f4 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -142,7 +142,7 @@ int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) try { img = uptr(new lsvd_image(img_name, ioctx, cfg)); } catch (std::runtime_error &e) { - log_error("Failed to create image '{}': {}", img_name, e.what()); + log_error("Failed to create image '{}':\n{}", img_name, e.what()); return -1; } diff --git a/src/image.cc b/src/image.cc index 0435e850..79dbf8a0 100644 --- a/src/image.cc +++ b/src/image.cc @@ -20,9 +20,6 @@ lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) objstore = make_rados_backend(io); rcache = get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); - wlog = open_wlog(cfg.wlog_path(name), cfg.wlog_size / 4096, *xlate, cfg); - THROW_MSG_ON(!wlog, "Failed to open write log"); - read_superblock(); if (checkpoints.size() > 0) read_from_checkpoint(checkpoints.back()); @@ -30,11 +27,15 @@ lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) // Roll forward on the log auto last_data_seq = roll_forward_from_last_checkpoint(); - // TODO: actually recover from the write log, this is currently a no-op - recover_from_wlog(); - // Successfully recovered everything, now we have enough information to // init everything else + xlate = make_translate(name, cfg, size, uuid, objstore, rcache, objmap, + map_lock, bufmap, bufmap_lock, last_data_seq, clones, + obj_info, checkpoints); + + wlog = open_wlog(cfg.wlog_path(name), cfg.wlog_size / 4096, *xlate, cfg); + THROW_MSG_ON(!wlog, "Failed to open write log"); + // recover_from_wlog(); } lsvd_image::~lsvd_image() @@ -460,7 +461,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request assert(parent == nullptr); img->wlog->get_room(req_bytes / 512); - img->xlate->wait_for_room(); + img->xlate->backend_backpressure(); sector_t size_sectors = req_bytes / 512; diff --git a/src/objects.cc b/src/objects.cc index 739e6ad3..3e918464 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -61,6 +61,7 @@ void serialise_superblock(vec buf, vec &checkpoints, *p++ = c; // Part 2: clones + UNIMPLEMENTED(); // Part 3: snapshots // TODO implement this when we get around to snapshots diff --git a/src/spdk_wrap.cc b/src/spdk_wrap.cc index d7d90941..dfa74c3f 100644 --- a/src/spdk_wrap.cc +++ b/src/spdk_wrap.cc @@ -65,7 +65,7 @@ lsvd_rbd *lsvd_rbd::open_image(rados_ioctx_t io, std::string name) return new lsvd_rbd(name, io, cfg); } catch (std::runtime_error &e) { - log_error("Failed to open image: {}", e.what()); + log_error("Failed to open image:\n{}", e.what()); return nullptr; } } diff --git a/src/translate.cc b/src/translate.cc index 54db9fc5..c2a70397 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -13,6 +13,7 @@ #include "extent.h" #include "misc_cache.h" #include "request.h" +#include "src/utils.h" #include "translate.h" /* @@ -224,7 +225,7 @@ class translate_impl : public translate vec &checkpoints) : name(name), cfg(cfg), vol_size(vol_size), vol_uuid(vol_uuid), objstore(be), rcache(rcache), objmap(objmap), omap_mtx(omap_mtx), - bufmap(bmap), bufmap_lock(bmap_lck), cur_seq(last_seq), + bufmap(bmap), bufmap_lock(bmap_lck), cur_seq(last_seq + 1), clones(clones), object_info(objinfo), checkpoints(checkpoints), superblock_buf(4096) { @@ -234,6 +235,7 @@ class translate_impl : public translate total_live_sectors += oi.live; } + UNIMPLEMENTED(); // start worker, flush, and GC threads // if (cfg.flush_interval_msec > 0) // flush_worker = std::jthread(&translate_impl::flush_thread, this); @@ -264,7 +266,7 @@ class translate_impl : public translate ssize_t writev(uint64_t cache_seq, size_t offset, iovec *iov, int iovcnt) override; ssize_t trim(size_t offset, size_t len) override; - void wait_for_room(void) override; + void backend_backpressure(void) override; // mark object as busy - can't delete void object_read_start(int obj) override; @@ -445,7 +447,7 @@ ssize_t translate_impl::trim(size_t offset, size_t len) * TODO measure how long this takes us, likely to be bottleneck on high * write throughput scenarios */ -void translate_impl::wait_for_room(void) +void translate_impl::backend_backpressure(void) { std::unique_lock lk(m); while (outstanding_writes > cfg.num_parallel_writes) diff --git a/src/translate.h b/src/translate.h index 08e12496..e757181f 100644 --- a/src/translate.h +++ b/src/translate.h @@ -28,7 +28,7 @@ class translate virtual ssize_t writev(uint64_t cache_seq, size_t offset, iovec *iov, int iovcnt) = 0; virtual ssize_t trim(size_t offset, size_t len) = 0; - virtual void wait_for_room(void) = 0; + virtual void backend_backpressure(void) = 0; virtual void object_read_start(int obj) = 0; virtual void object_read_end(int obj) = 0; diff --git a/src/write_cache.cc b/src/write_cache.cc index 2f9ac06d..ae05e8e6 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -600,7 +600,7 @@ uptr open_wlog(fspath path, usize size, translate &xlate, try { return std::make_unique(1, fd, xlate, cfg); } catch (std::exception &e) { - log_error("Failed to open write cache: {}", e.what()); + log_error("Failed to open write cache:\n{}", e.what()); close(fd); return nullptr; } From c102f0d389e8e1545b2c69eb6ee92a4060f7b5bc Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 04:45:36 +0000 Subject: [PATCH 31/77] Update error messages --- src/bdev_lsvd.cc | 2 +- src/spdk_wrap.cc | 2 +- src/utils.h | 7 ++++--- src/write_cache.cc | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 6ecd00f4..7543bf0f 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -142,7 +142,7 @@ int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) try { img = uptr(new lsvd_image(img_name, ioctx, cfg)); } catch (std::runtime_error &e) { - log_error("Failed to create image '{}':\n{}", img_name, e.what()); + log_error("Failed to create image '{}': {}", img_name, e.what()); return -1; } diff --git a/src/spdk_wrap.cc b/src/spdk_wrap.cc index dfa74c3f..d7d90941 100644 --- a/src/spdk_wrap.cc +++ b/src/spdk_wrap.cc @@ -65,7 +65,7 @@ lsvd_rbd *lsvd_rbd::open_image(rados_ioctx_t io, std::string name) return new lsvd_rbd(name, io, cfg); } catch (std::runtime_error &e) { - log_error("Failed to open image:\n{}", e.what()); + log_error("Failed to open image: {}", e.what()); return nullptr; } } diff --git a/src/utils.h b/src/utils.h index 77991052..61e0b68f 100644 --- a/src/utils.h +++ b/src/utils.h @@ -139,10 +139,11 @@ using fspath = std::filesystem::path; #define THROW_MSG_ON(cond, MSG, ...) \ do { \ if (cond) { \ - auto s = fmt::format("[ERR {}:{} {}] " MSG "\n", __FILE__, \ - __LINE__, __func__, ##__VA_ARGS__); \ + auto m = fmt::format(MSG, ##__VA_ARGS__); \ + auto s = fmt::format("[ERR {}:{} {}] {}\n", __FILE__, __LINE__, \ + __func__, m); \ fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, s); \ - throw std::runtime_error(s); \ + throw std::runtime_error(m); \ } \ } while (0) diff --git a/src/write_cache.cc b/src/write_cache.cc index ae05e8e6..2f9ac06d 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -600,7 +600,7 @@ uptr open_wlog(fspath path, usize size, translate &xlate, try { return std::make_unique(1, fd, xlate, cfg); } catch (std::exception &e) { - log_error("Failed to open write cache:\n{}", e.what()); + log_error("Failed to open write cache: {}", e.what()); close(fd); return nullptr; } From ccae1dcb4976123b2a6f5556ff51e897ac442dc4 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 21 May 2024 21:03:03 +0000 Subject: [PATCH 32/77] Improve serialisation and fix xlate threads --- src/objects.cc | 48 ++++++++++++++++++++++++++++++---------- src/objects.h | 1 + src/shared_read_cache.cc | 1 + src/translate.cc | 26 +++++++++++++--------- 4 files changed, 54 insertions(+), 22 deletions(-) diff --git a/src/objects.cc b/src/objects.cc index 3e918464..288d0c23 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -28,21 +29,28 @@ void serialise_common_hdr(vec buf, obj_type t, seqnum_t s, u32 hdr, void serialise_superblock(vec buf, vec &checkpoints, vec &clones, uuid_t &uuid) { + // Reserve required space ahead of time usize req_size = sizeof(common_obj_hdr) + sizeof(super_hdr); req_size += checkpoints.size() * sizeof(seqnum_t); + for (auto &c : clones) + req_size += sizeof(clone_info) + round_up(c.name.size(), 8); + // TODO snapshots req_size = std::max(req_size, 8ul); // minimum of 4096 bytes req_size = round_up(req_size, 512); // round to sector boundary (why??) if (buf.size() < req_size) buf.resize(req_size); + auto bufp = buf.data(); // start of buffer + auto hdrp = (super_hdr *)(bufp + sizeof(common_obj_hdr)); + serialise_common_hdr(buf, OBJ_SUPERBLOCK, 0, req_size / 512, 0, uuid); // There are three variable-length arrays in the superblock: checkpoints, - // snapshots, and clones. The order doesn't matter, so we put checkpoints - // first, as each checkpoint is just a 32-bit sequence number. The rest are - // more complicated as they are variable-length due to them also containing - // names of the clones and snapshots, which need to be handled correctly + // snapshots, and clones. The order doesn't matter, but we put clones first + // since that's effectively immutable. This means that the offset into + // everything else will not change over the lifetime of an image + // The checkpoints and snapshots are appended after that // Also note that we should make sure that each clone/snapshot is 8-byte // aligned in the buffer, as when we read them back to deserialise we end @@ -50,21 +58,37 @@ void serialise_superblock(vec buf, vec &checkpoints, // just make all of us sad. Fortunately we have c-style null-terminated // strings so we can just pad with more null - // Part 1: checkpoints + // Part 1: clones. TODO skip on partial serialise + byte *clonep; + clonep = bufp + sizeof(common_obj_hdr) + sizeof(super_hdr); + hdrp->clones_offset = clonep - bufp; + for (auto &c : clones) { + auto padded_namelen = round_up(c.name.size(), 8); + auto record_len = sizeof(clone_info) + padded_namelen; + auto cip = (clone_info *)clonep; + + cip->last_seq = c.last_seq; + uuid_copy(cip->vol_uuid, nullptr); // TODO + cip->name_len = padded_namelen; - auto h = (super_hdr *)(buf.data() + sizeof(common_obj_hdr)); - h->ckpts_offset = sizeof(common_obj_hdr) + sizeof(super_hdr); - h->ckpts_len = checkpoints.size() * sizeof(seqnum_t); + std::memset(cip->name, 0, padded_namelen); + std::memcpy(cip->name, c.name.c_str(), c.name.size()); - auto p = (seqnum_t *)(buf.data() + h->ckpts_offset); + clonep += record_len; + hdrp->clones_len += record_len; + } + + // Part 2: checkpoints + hdrp->ckpts_offset = clonep - bufp; + hdrp->ckpts_len = checkpoints.size() * sizeof(seqnum_t); + auto p = (seqnum_t *)(bufp + hdrp->ckpts_offset); for (auto &c : checkpoints) *p++ = c; - // Part 2: clones - UNIMPLEMENTED(); - // Part 3: snapshots // TODO implement this when we get around to snapshots + hdrp->snaps_offset = 0; + hdrp->snaps_len = 0; } opt> object_reader::fetch_object_header(std::string objname) diff --git a/src/objects.h b/src/objects.h index c6a46093..eb46c2e8 100644 --- a/src/objects.h +++ b/src/objects.h @@ -193,5 +193,6 @@ struct data_obj_info { void serialise_common_hdr(vec buf, obj_type t, seqnum_t s, u32 hdr, u32 data, uuid_t &uuid); +// Serialise a superblock object. void serialise_superblock(vec buf, vec &checkpoints, vec &clones, uuid_t &uuid); diff --git a/src/shared_read_cache.cc b/src/shared_read_cache.cc index ee0b148f..d9609316 100644 --- a/src/shared_read_cache.cc +++ b/src/shared_read_cache.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "lsvd_types.h" #include "nvme.h" diff --git a/src/translate.cc b/src/translate.cc index c2a70397..7543c330 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -235,20 +235,26 @@ class translate_impl : public translate total_live_sectors += oi.live; } - UNIMPLEMENTED(); // start worker, flush, and GC threads - // if (cfg.flush_interval_msec > 0) - // flush_worker = std::jthread(&translate_impl::flush_thread, this); + if (cfg.flush_interval_msec > 0) + flush_worker = + std::jthread([this](std::stop_token st) { flush_thread(st); }); - // if (!cfg.no_gc) - // gc_worker = std::jthread(&translate_impl::gc_thread, this); + if (!cfg.no_gc) + gc_worker = + std::jthread([this](std::stop_token st) { gc_thread(st); }); - // // honestly have no idea how this works - // workers = new thread_pool(&m); - // workers->pool.push( - // std::thread(&translate_impl::worker_thread, this, workers)); + // honestly have no idea how this works + workers = new thread_pool(&m); + workers->pool.push( + std::thread(&translate_impl::worker_thread, this, workers)); current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); + + // Fully serialise superblock once, so we can do partial serialisations + // later on and skip the checkpoint stuff every time + // currently unimplemented + // serialise_superblock(superblock_buf, checkpoints, clones, vol_uuid); } ~translate_impl() @@ -1258,7 +1264,7 @@ int translate_remove_image(sptr objstore, const char *name) int translate_clone_image(sptr objstore, const char *source, const char *dest) { - UNIMPLEMENTED(); + TODO(); } #if 0 From d794a727067bbc7a17ae14836c165ea8045dcbb4 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 04:56:08 +0000 Subject: [PATCH 33/77] Add libfolly and switch to jemalloc --- Makefile | 2 +- src/meson.build | 8 +++- src/utils.h | 86 +++++------------------------------------- subprojects/folly.wrap | 8 ++++ 4 files changed, 25 insertions(+), 79 deletions(-) create mode 100644 subprojects/folly.wrap diff --git a/Makefile b/Makefile index 5689f096..a15d6e71 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ clean: install-deps: sudo apt install -y meson libfmt-dev libaio-dev librados-dev \ - libtcmalloc-minimal4 libboost-dev libradospp-dev \ + libjemalloc-dev libboost-dev libradospp-dev \ liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ libibverbs-dev librdmacm-dev python3-pyelftools libcunit1-dev diff --git a/src/meson.build b/src/meson.build index c1b0b46e..4fbb3693 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,5 +1,9 @@ cxx = meson.get_compiler('cpp') +cmake = import('cmake') +libfolly_subproject = cmake.subproject('folly') +libfolly = libfolly_subproject.dependency('folly') + lsvd_src = files( 'config.cc', 'image.cc', @@ -16,6 +20,7 @@ lsvd_src = files( lsvd_inc = include_directories('.') lsvd_deps = [ + libfolly, dependency('threads'), dependency('zlib'), dependency('fmt'), @@ -23,7 +28,8 @@ lsvd_deps = [ dependency('liburing'), dependency('uuid'), cxx.find_library('rados', required: true), - cxx.find_library('tcmalloc', required: false), + # cxx.find_library('tcmalloc', required: false), + cxx.find_library('jemalloc', required: false), ] spdk_fe = lsvd_src + files( diff --git a/src/utils.h b/src/utils.h index 61e0b68f..09abf0ec 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -9,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -198,6 +200,13 @@ using fspath = std::filesystem::path; } \ } while (0) +#define TODO() \ + do { \ + fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, \ + "[ERR {}:{} {}] TODO\n", __FILE__, __LINE__, __func__); \ + throw std::runtime_error("TODO stub"); \ + } while (0) + #define UNIMPLEMENTED() \ do { \ fmt::print(stderr, fg(fmt::color::red) | fmt::emphasis::bold, \ @@ -236,25 +245,6 @@ inline std::string string_join(const std::vector &strings, return result; } -inline std::chrono::time_point tnow() -{ - return std::chrono::high_resolution_clock::now(); -} - -constexpr std::chrono::microseconds -tus(std::chrono::time_point start, - std::chrono::time_point end) -{ - return std::chrono::duration_cast(end - start); -} - -constexpr int64_t tdus(std::chrono::time_point start, - std::chrono::time_point end) -{ - return std::chrono::duration_cast(end - start) - .count(); -} - template std::shared_ptr to_shared(std::unique_ptr ptr) { return std::shared_ptr(std::move(ptr)); @@ -288,61 +278,3 @@ inline size_t getsize64(int fd) size = sb.st_size; return size; } - -/** - * This is a thread safe, bounded, blocking, multi-producer multi-consumer, - * single-ended, FIFO queue. Push operations block until there's space, and pop - * blocks until there are entries in the queue to pop. - * - * It uses an underlying std::queue for the actual queue, and then just adds a - * single global lock to both pop and push. Readers are notified when there are - * entries via condition vars, same for writers. - * - * Based on CPython's queue implementation found at - * https://github.com/python/cpython/blob/main/Lib/queue.py - * - * This queue is neither movable nor copyable. Use smart pointers instead. - */ -template class BlockingMPMC -{ - public: - BlockingMPMC(size_t size) : _buffer(), _max_capacity(size) {} - ~BlockingMPMC(); - - // TODO Change to take an rvalue to default move instead of copy - void push(T t) - { - { - std::unique_lock lck(_mutex); - _can_push.wait(lck, - [&]() { return _buffer.size() < _max_capacity; }); - _buffer.push(std::move(t)); - } - _can_pop.notify_one(); - } - - T pop() - { - T x; - { - std::unique_lock lck(_mutex); - _can_pop.wait(lck, [&]() { return !_buffer.empty(); }); - x = std::move(_buffer.front()); - _buffer.pop(); - } - _can_push.notify_one(); - return x; - } - - private: - BlockingMPMC(BlockingMPMC &src) = delete; - BlockingMPMC(BlockingMPMC &&src) = delete; - BlockingMPMC &operator=(BlockingMPMC &src) = delete; - BlockingMPMC &operator=(BlockingMPMC &&src) = delete; - - std::queue _buffer; - size_t _max_capacity; - std::mutex _mutex; - std::condition_variable _can_pop; - std::condition_variable _can_push; -}; diff --git a/subprojects/folly.wrap b/subprojects/folly.wrap new file mode 100644 index 00000000..042c9e81 --- /dev/null +++ b/subprojects/folly.wrap @@ -0,0 +1,8 @@ +[wrap-git] +url = https://github.com/facebook/folly.git +revision = v2024.05.20.00 +patch_directory = folly +clone-recursive = true + +[provide] +_folly = folly_dep \ No newline at end of file From 9fc3078b8b37b4fcae6ccfff1edcc37c1ae4c1cb Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 05:06:55 +0000 Subject: [PATCH 34/77] Reshuffle headers --- src/nvme.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/nvme.cc b/src/nvme.cc index 07dd0b90..1c44a1ec 100644 --- a/src/nvme.cc +++ b/src/nvme.cc @@ -1,18 +1,16 @@ +#include +#include #include +#include #include #include #include -#include -#include - #include "lsvd_types.h" +#include "nvme.h" #include "request.h" #include "smartiov.h" #include "utils.h" -#include - -#include "nvme.h" void do_log(const char *, ...); From f867619bab7510b78c9164c280a55d16124784ac Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 05:07:20 +0000 Subject: [PATCH 35/77] Add stacktrace on exit --- src/imgtool.cc | 8 ++++++++ src/spdk_frontend.cc | 23 ++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/imgtool.cc b/src/imgtool.cc index f6987ed1..11377984 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -208,6 +208,14 @@ void mk_cache(rados_ioctx_t io, const char *image_name, const char *dev_name, int main(int argc, char **argv) { + std::set_terminate([]() { + try { + std::cerr << boost::stacktrace::stacktrace(); + } catch (...) { + } + std::abort(); + }); + argp_parse(&argp, argc, argv, 0, 0, 0); setenv("LSVD_BACKEND", backend, 1); diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index d204d9fb..c535ec2b 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -1,4 +1,6 @@ #include "spdk/event.h" +#include +#include #include "bdev_lsvd.h" #include "utils.h" @@ -28,11 +30,30 @@ static void start_lsvd(void *arg) check_ret_neg(err, "Failed to connect to pool {}", pool_name); err = bdev_lsvd_create("test", io_ctx); + + spdk_app_stop(err); } int main(int argc, char **argv) { - spdk_app_opts opts = {}; + std::set_terminate([]() { + try { + std::cerr << boost::stacktrace::stacktrace(); + } catch (...) { + } + std::abort(); + }); + + std::signal(SIGINT, [](int) { + log_info("Received SIGINT, shutting down LSVD SPDK program ..."); + spdk_app_stop(0); + }); + + spdk_app_opts opts = {.shutdown_cb = []() { + log_info("Shutting down LSVD SPDK program ..."); + spdk_app_stop(0); + }}; + spdk_app_opts_init(&opts, sizeof(opts)); opts.name = "spdk_frontend"; From 2d301ba6fa17cb84885af69eb537ea22b87d8e38 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 05:42:18 +0000 Subject: [PATCH 36/77] Get fully working spdk build --- meson.build | 8 +++++--- src/bdev_lsvd.cc | 2 +- src/image.cc | 2 +- src/meson.build | 2 +- src/objects.cc | 3 ++- src/rados_backend.cc | 9 +++++++-- src/spdk_frontend.cc | 6 ++++-- src/utils.h | 7 +++---- 8 files changed, 24 insertions(+), 15 deletions(-) diff --git a/meson.build b/meson.build index 1064a1e2..c06f9ea2 100644 --- a/meson.build +++ b/meson.build @@ -20,7 +20,9 @@ if get_option('buildtype') == 'debug' endif subdir('src') -liblsvd = library('lsvd', lsvd_src, dependencies: lsvd_deps, install: false) + +liblsvd = shared_library('lsvd', lsvd_src, dependencies: lsvd_deps) +lsvd_ar = static_library('lsvd', lsvd_src, dependencies: lsvd_deps) executable( 'lsvd', @@ -31,13 +33,13 @@ executable( executable( 'imgtool', 'src/imgtool.cc', - link_with: liblsvd, + link_whole: lsvd_ar, dependencies: lsvd_deps, ) executable( 'thick-image', 'src/thick-image.cc', - link_with: liblsvd, + link_whole: lsvd_ar, dependencies: lsvd_deps, ) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 7543bf0f..2df80358 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -43,7 +43,7 @@ class lsvd_iodevice spdk_bdev bdev; uptr img; - lsvd_iodevice(uptr img) : img(std::move(img)) + lsvd_iodevice(uptr img_) : img(std::move(img_)) { bdev.product_name = strdup("Log-structured Virtual Disk"); bdev.name = strdup(img->imgname.c_str()); diff --git a/src/image.cc b/src/image.cc index 79dbf8a0..9556a434 100644 --- a/src/image.cc +++ b/src/image.cc @@ -92,7 +92,7 @@ bool lsvd_image::apply_log(seqnum_t seq) void lsvd_image::read_superblock() { object_reader parser(objstore); - auto superblock = parser.read_superblock(oname(imgname, 0)); + auto superblock = parser.read_superblock(imgname); THROW_MSG_ON(!superblock, "Failed to read superblock"); size = superblock->vol_size; diff --git a/src/meson.build b/src/meson.build index 4fbb3693..1ef9c1d9 100644 --- a/src/meson.build +++ b/src/meson.build @@ -20,7 +20,7 @@ lsvd_src = files( lsvd_inc = include_directories('.') lsvd_deps = [ - libfolly, + # libfolly, dependency('threads'), dependency('zlib'), dependency('fmt'), diff --git a/src/objects.cc b/src/objects.cc index 288d0c23..cf4e7fdd 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -171,7 +171,8 @@ opt object_reader::read_superblock(std::string oname) auto hbuf = buf->data(); PR_RET_IF(hdr->magic != LSVD_MAGIC, std::nullopt, - "Corrupt object; invalid magic at '{}'", oname); + "Corrupt object; invalid magic at '{}', found {:x}", oname, + hdr->magic); PR_RET_IF(hdr->version != 1, std::nullopt, "Invalid version in object '{}', only 1 is supported", oname); PR_RET_IF(hdr->type != OBJ_SUPERBLOCK, std::nullopt, diff --git a/src/rados_backend.cc b/src/rados_backend.cc index f4b2cdcb..f144a8c8 100644 --- a/src/rados_backend.cc +++ b/src/rados_backend.cc @@ -176,9 +176,14 @@ class rados_backend : public backend u64 size; time_t mtime; int rv = ctx.stat(name, &size, &mtime); - if (rv < 0) + switch (rv) { + case 0: + return size; + case -ENOENT: return std::nullopt; - return size; + default: + THROW_ERRNO_ON(true, -rv, "Failed to stat object '{}'", name); + } } opt> read_whole_obj(std::string name) override diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index c535ec2b..0abd9159 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -30,8 +30,10 @@ static void start_lsvd(void *arg) check_ret_neg(err, "Failed to connect to pool {}", pool_name); err = bdev_lsvd_create("test", io_ctx); - - spdk_app_stop(err); + if (err) { + log_error("Failed to create bdev"); + spdk_app_stop(err); + } } int main(int argc, char **argv) diff --git a/src/utils.h b/src/utils.h index 09abf0ec..7a6654cf 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,10 +1,9 @@ #pragma once #include -#include +#include #include #include -#include #include #include #include @@ -153,8 +152,8 @@ using fspath = std::filesystem::path; do { \ if (cond) { \ auto m = \ - fmt::format("{}/{}: " MSG, en, strerr(en), ##__VA_ARGS__); \ - throw std::system_error(m); \ + fmt::format("{}/{}: " MSG, en, strerror(en), ##__VA_ARGS__); \ + throw std::system_error(en, std::generic_category(), m); \ } \ } while (0) From 970936519dfab56c3d8067dc66f5eda1ed2b43ce Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 05:58:57 +0000 Subject: [PATCH 37/77] Pass in pool from cmdline --- src/spdk_frontend.cc | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index 0abd9159..c7039406 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -5,6 +5,11 @@ #include "bdev_lsvd.h" #include "utils.h" +struct start_lsvd_args { + const char *pool_name; + const char *image_name; +}; + static void start_lsvd(void *arg) { log_info("Starting LSVD SPDK program ..."); @@ -13,7 +18,7 @@ static void start_lsvd(void *arg) setenv("LSVD_WCACHE_DIR", "/tmp/lsvd-write", 1); setenv("LSVD_CACHE_SIZE", "2147483648", 1); - std::string pool_name = "pone"; + auto args = (start_lsvd_args *)arg; rados_t cluster; int err = rados_create2(&cluster, "ceph", "client.admin", 0); @@ -26,17 +31,17 @@ static void start_lsvd(void *arg) check_ret_neg(err, "Failed to connect to cluster"); rados_ioctx_t io_ctx; - err = rados_ioctx_create(cluster, pool_name.c_str(), &io_ctx); - check_ret_neg(err, "Failed to connect to pool {}", pool_name); + err = rados_ioctx_create(cluster, args->pool_name, &io_ctx); + check_ret_neg(err, "Failed to connect to pool {}", args->pool_name); - err = bdev_lsvd_create("test", io_ctx); + err = bdev_lsvd_create(args->image_name, io_ctx); if (err) { log_error("Failed to create bdev"); spdk_app_stop(err); } } -int main(int argc, char **argv) +int main(int argc, const char **argv) { std::set_terminate([]() { try { @@ -46,6 +51,17 @@ int main(int argc, char **argv) std::abort(); }); + if (argc < 3) { + log_error("Usage: {} ", argv[0]); + return 1; + } + + auto args = (start_lsvd_args){ + .pool_name = argv[1], + .image_name = argv[2], + }; + log_info("Args: pool={}, image={}", args.pool_name, args.image_name); + std::signal(SIGINT, [](int) { log_info("Received SIGINT, shutting down LSVD SPDK program ..."); spdk_app_stop(0); @@ -59,7 +75,7 @@ int main(int argc, char **argv) spdk_app_opts_init(&opts, sizeof(opts)); opts.name = "spdk_frontend"; - int rc = spdk_app_start(&opts, start_lsvd, NULL); + int rc = spdk_app_start(&opts, start_lsvd, &args); spdk_app_fini(); return rc; } From 8e56da96b38894c189e8e25a563f2ed381abe081 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 06:04:06 +0000 Subject: [PATCH 38/77] Make CI happy --- .github/workflows/test.yaml | 12 +++++++----- Makefile | 2 +- meson.ini | 8 ++++---- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f24e80d6..a2786e1f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -13,16 +13,18 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Setup Clang 18 - run: | - wget https://apt.llvm.org/llvm.sh - chmod +x llvm.sh - sudo ./llvm.sh 18 + # Temporarily revert to GCC to make GH actions happy + # - name: Setup Clang 18 + # run: | + # wget https://apt.llvm.org/llvm.sh + # chmod +x llvm.sh + # sudo ./llvm.sh 18 - name: Install dependencies run: | sudo apt update make install-deps + pip3 install meson - name: Build run: | diff --git a/Makefile b/Makefile index a15d6e71..bf6caad8 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ clean: cd build-dbg; meson compile --clean install-deps: - sudo apt install -y meson libfmt-dev libaio-dev librados-dev \ + sudo apt install -y meson g++-14 mold libfmt-dev libaio-dev librados-dev \ libjemalloc-dev libboost-dev libradospp-dev \ liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ libibverbs-dev librdmacm-dev python3-pyelftools libcunit1-dev diff --git a/meson.ini b/meson.ini index a1be4df4..e5a789c3 100644 --- a/meson.ini +++ b/meson.ini @@ -1,5 +1,5 @@ [binaries] -c = 'clang-18' -c_ld = 'lld-18' -cpp = 'clang++-18' -cpp_ld = 'lld-18' \ No newline at end of file +c = 'gcc-14' +c_ld = 'mold' +cpp = 'g++-14' +cpp_ld = 'mold' \ No newline at end of file From e90edfa56e0d7f6d577b8fb78a7d59cb0b6c5afe Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 06:08:04 +0000 Subject: [PATCH 39/77] Revert to clang for CI --- .github/workflows/test.yaml | 11 +++++------ Makefile | 2 +- meson.ini | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a2786e1f..f676106c 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -13,12 +13,11 @@ jobs: steps: - uses: actions/checkout@v3 - # Temporarily revert to GCC to make GH actions happy - # - name: Setup Clang 18 - # run: | - # wget https://apt.llvm.org/llvm.sh - # chmod +x llvm.sh - # sudo ./llvm.sh 18 + - name: Setup Clang 18 + run: | + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + sudo ./llvm.sh 18 - name: Install dependencies run: | diff --git a/Makefile b/Makefile index bf6caad8..141b05d2 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ clean: cd build-dbg; meson compile --clean install-deps: - sudo apt install -y meson g++-14 mold libfmt-dev libaio-dev librados-dev \ + sudo apt install -y meson mold libfmt-dev libaio-dev librados-dev \ libjemalloc-dev libboost-dev libradospp-dev \ liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ libibverbs-dev librdmacm-dev python3-pyelftools libcunit1-dev diff --git a/meson.ini b/meson.ini index e5a789c3..0e8514bf 100644 --- a/meson.ini +++ b/meson.ini @@ -1,5 +1,5 @@ [binaries] -c = 'gcc-14' +c = 'clang-18' c_ld = 'mold' -cpp = 'g++-14' +cpp = 'clang++-18' cpp_ld = 'mold' \ No newline at end of file From f15a03bddd02253330fbe9ff8aaebd038e0785e6 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 19:28:29 +0000 Subject: [PATCH 40/77] Fix offbyone in seqnums and add logging statements --- src/bdev_lsvd.cc | 3 +-- src/bdev_lsvd.h | 6 +++--- src/image.cc | 10 +++++++++- src/objects.cc | 26 ++++++++++++++------------ src/objects.h | 9 +++++---- src/shared_read_cache.cc | 2 +- src/spdk_frontend.cc | 18 +++++++----------- src/translate.cc | 14 +++++++++----- src/translate.h | 4 ++-- 9 files changed, 51 insertions(+), 41 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index 2df80358..c22a7c0c 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -133,11 +133,10 @@ struct lsvd_bdev_io_channel { spdk_io_channel *io_channel; }; -int bdev_lsvd_create(std::string img_name, rados_ioctx_t ioctx) +int bdev_lsvd_create(str img_name, rados_ioctx_t ioctx, lsvd_config cfg) { assert(!img_name.empty()); - lsvd_config cfg; // TODO uptr img; try { img = uptr(new lsvd_image(img_name, ioctx, cfg)); diff --git a/src/bdev_lsvd.h b/src/bdev_lsvd.h index ff961d0f..294298ce 100644 --- a/src/bdev_lsvd.h +++ b/src/bdev_lsvd.h @@ -1,8 +1,8 @@ #pragma once #include -#include -int bdev_lsvd_create(std::string img_name, rados_ioctx_t io_ctx); +#include "config.h" -int bdev_lsvd_delete(std::string img_name); +int bdev_lsvd_create(str img_name, rados_ioctx_t io_ctx, lsvd_config cfg); +int bdev_lsvd_delete(str img_name); diff --git a/src/image.cc b/src/image.cc index 9556a434..7e5098ef 100644 --- a/src/image.cc +++ b/src/image.cc @@ -21,11 +21,13 @@ lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) rcache = get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); read_superblock(); + debug("Found checkpoints: {}", checkpoints); if (checkpoints.size() > 0) read_from_checkpoint(checkpoints.back()); // Roll forward on the log auto last_data_seq = roll_forward_from_last_checkpoint(); + debug("Last data seq: {}", last_data_seq); // Successfully recovered everything, now we have enough information to // init everything else @@ -36,6 +38,8 @@ lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) wlog = open_wlog(cfg.wlog_path(name), cfg.wlog_size / 4096, *xlate, cfg); THROW_MSG_ON(!wlog, "Failed to open write log"); // recover_from_wlog(); + + log_info("Image '{}' opened successfully", name); } lsvd_image::~lsvd_image() @@ -43,6 +47,8 @@ lsvd_image::~lsvd_image() wlog->flush(); wlog->do_write_checkpoint(); xlate->shutdown(); + + log_info("Image '{}' closed", imgname); } bool lsvd_image::apply_log(seqnum_t seq) @@ -132,7 +138,7 @@ void lsvd_image::read_from_checkpoint(seqnum_t seq) } } -// Returns last processed checkpoint +// Returns last processed object's seqnum seqnum_t lsvd_image::roll_forward_from_last_checkpoint() { if (checkpoints.size() == 0) @@ -148,6 +154,8 @@ seqnum_t lsvd_image::roll_forward_from_last_checkpoint() break; } + seq -= 1; + // Delete "dangling" objects if there are any in case they cause trouble // with corruption // This must be larger than the max backend batch size to avoid diff --git a/src/objects.cc b/src/objects.cc index cf4e7fdd..18dae9af 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -8,26 +8,25 @@ #include "objects.h" #include "utils.h" -void serialise_common_hdr(vec buf, obj_type t, seqnum_t s, u32 hdr, +void serialise_common_hdr(vec &buf, obj_type t, seqnum_t s, u32 hdr, u32 data, uuid_t &uuid) { if (buf.size() < sizeof(common_obj_hdr)) buf.resize(sizeof(common_obj_hdr)); auto h = (common_obj_hdr *)buf.data(); - *h = (common_obj_hdr){.magic = LSVD_MAGIC, - .version = 1, - .vol_uuid = {0}, - .type = t, - .seq = s, - .hdr_sectors = hdr, - .data_sectors = data, - .crc = 0}; + h->magic = LSVD_MAGIC; + h->version = 1; + h->type = t; + h->seq = s; + h->hdr_sectors = hdr; + h->data_sectors = data; + h->crc = 0; uuid_copy(h->vol_uuid, uuid); } -void serialise_superblock(vec buf, vec &checkpoints, - vec &clones, uuid_t &uuid) +void serialise_superblock(vec &buf, vec &checkpoints, + vec &clones, uuid_t &uuid, usize vol_size) { // Reserve required space ahead of time usize req_size = sizeof(common_obj_hdr) + sizeof(super_hdr); @@ -43,6 +42,7 @@ void serialise_superblock(vec buf, vec &checkpoints, auto bufp = buf.data(); // start of buffer auto hdrp = (super_hdr *)(bufp + sizeof(common_obj_hdr)); + hdrp->vol_size = vol_size / 512; serialise_common_hdr(buf, OBJ_SUPERBLOCK, 0, req_size / 512, 0, uuid); @@ -95,7 +95,9 @@ opt> object_reader::fetch_object_header(std::string objname) { vec buf(4096); auto err = objstore->read(objname, 0, buf.data(), 4096); - RET_IF(err != 4096, std::nullopt); + RET_IF(err == -ENOENT, std::nullopt); + THROW_ERRNO_ON(err < 0, -err, "Failed to read object '{}' header", objname); + THROW_MSG_ON(err < 512, "Short read {}/512 on obj '{}'", err, objname); auto h = (common_obj_hdr *)buf.data(); diff --git a/src/objects.h b/src/objects.h index eb46c2e8..a49204de 100644 --- a/src/objects.h +++ b/src/objects.h @@ -40,7 +40,7 @@ struct common_obj_hdr { * snaps : TBD */ struct super_hdr { - uint64_t vol_size; + uint64_t vol_size; // in 512 byte sectors uint32_t ckpts_offset; uint32_t ckpts_len; uint32_t clones_offset; // array of struct clone @@ -190,9 +190,10 @@ struct data_obj_info { sector_t live; }; -void serialise_common_hdr(vec buf, obj_type t, seqnum_t s, u32 hdr, +void serialise_common_hdr(vec &buf, obj_type t, seqnum_t s, u32 hdr, u32 data, uuid_t &uuid); // Serialise a superblock object. -void serialise_superblock(vec buf, vec &checkpoints, - vec &clones, uuid_t &uuid); +void serialise_superblock(vec &buf, vec &checkpoints, + vec &clones, uuid_t &uuid, + usize vol_size); diff --git a/src/shared_read_cache.cc b/src/shared_read_cache.cc index d9609316..b4de2b91 100644 --- a/src/shared_read_cache.cc +++ b/src/shared_read_cache.cc @@ -440,7 +440,7 @@ shared_read_cache::shared_read_cache(std::string cache_path, user_bytes(tag::rolling_window::window_size = CACHE_STATS_WINDOW), backend_bytes(tag::rolling_window::window_size = CACHE_STATS_WINDOW) { - debug("Opening {} for the read cache", cache_path); + trace("Opening {} for the read cache", cache_path); fd = open(cache_path.c_str(), O_RDWR | O_CREAT | O_TRUNC, 0777); check_ret_errno(fd, "failed to open cache file"); diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index c7039406..3b0904bb 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -13,11 +13,6 @@ struct start_lsvd_args { static void start_lsvd(void *arg) { log_info("Starting LSVD SPDK program ..."); - - setenv("LSVD_RCACHE_DIR", "/tmp/lsvd-read", 1); - setenv("LSVD_WCACHE_DIR", "/tmp/lsvd-write", 1); - setenv("LSVD_CACHE_SIZE", "2147483648", 1); - auto args = (start_lsvd_args *)arg; rados_t cluster; @@ -34,11 +29,15 @@ static void start_lsvd(void *arg) err = rados_ioctx_create(cluster, args->pool_name, &io_ctx); check_ret_neg(err, "Failed to connect to pool {}", args->pool_name); - err = bdev_lsvd_create(args->image_name, io_ctx); + lsvd_config cfg; // TODO get this from somewhere reasonable + cfg.cache_size = 160 * 1024 * 1024; + err = bdev_lsvd_create(args->image_name, io_ctx, cfg); if (err) { log_error("Failed to create bdev"); spdk_app_stop(err); } + + // TODO setup nvmf subsystems and all that nonsense } int main(int argc, const char **argv) @@ -62,11 +61,6 @@ int main(int argc, const char **argv) }; log_info("Args: pool={}, image={}", args.pool_name, args.image_name); - std::signal(SIGINT, [](int) { - log_info("Received SIGINT, shutting down LSVD SPDK program ..."); - spdk_app_stop(0); - }); - spdk_app_opts opts = {.shutdown_cb = []() { log_info("Shutting down LSVD SPDK program ..."); spdk_app_stop(0); @@ -77,5 +71,7 @@ int main(int argc, const char **argv) int rc = spdk_app_start(&opts, start_lsvd, &args); spdk_app_fini(); + + log_info("Exiting ..."); return rc; } diff --git a/src/translate.cc b/src/translate.cc index 7543c330..f9db503b 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -127,7 +127,6 @@ class translate_req : public request local_buf_base = data_ptr; local_buf_limit = data_ptr + bytes; } - translate_req(work_type op_, translate_impl *tx_) { op = op_; @@ -153,7 +152,6 @@ class translate_impl : public translate std::string name; lsvd_config &cfg; usize vol_size; - uuid_t &vol_uuid; std::shared_ptr objstore; std::shared_ptr rcache; @@ -223,7 +221,7 @@ class translate_impl : public translate seqnum_t last_seq, vec &clones, std::map &objinfo, vec &checkpoints) - : name(name), cfg(cfg), vol_size(vol_size), vol_uuid(vol_uuid), + : translate(vol_uuid), name(name), cfg(cfg), vol_size(vol_size), objstore(be), rcache(rcache), objmap(objmap), omap_mtx(omap_mtx), bufmap(bmap), bufmap_lock(bmap_lck), cur_seq(last_seq + 1), clones(clones), object_info(objinfo), checkpoints(checkpoints), @@ -346,7 +344,7 @@ void translate_impl::make_obj_hdr(char *buf, uint32_t _seq, .hdr_sectors = (uint32_t)hdr_sectors, .data_sectors = (uint32_t)data_sectors, .crc = 0}; - memcpy(h->vol_uuid, &uuid, sizeof(uuid_t)); + uuid_copy(h->vol_uuid, uuid); *dh = (obj_data_hdr){.cache_seq = 0, .objs_cleaned_offset = 0, @@ -544,6 +542,8 @@ void translate_req::notify(request *child) */ void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) { + debug("Writing checkpoint {}", cp_seq); + std::vector entries; std::vector objects; @@ -602,9 +602,11 @@ void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) checkpoints.erase(checkpoints.begin()); } - serialise_superblock(superblock_buf, checkpoints, clones, uuid); + serialise_superblock(superblock_buf, checkpoints, clones, uuid, vol_size); + debug("Updating superblock with new checkpoint"); objstore->write(name, superblock_buf.data(), superblock_buf.size()); + debug("Deleting old checkpoints {}", ckpts_to_delete); for (auto c : ckpts_to_delete) objstore->delete_obj(oname(name, c)); @@ -882,6 +884,8 @@ void translate_impl::flush_thread(std::stop_token st) auto t0 = std::chrono::system_clock::now(); auto seq0 = cur_seq.load(); + debug("Flush thread {} starting", pthread_self()); + while (true) { std::this_thread::sleep_for(interval); if (st.stop_requested()) diff --git a/src/translate.h b/src/translate.h index e757181f..0aa85791 100644 --- a/src/translate.h +++ b/src/translate.h @@ -15,10 +15,10 @@ class translate { public: - uuid_t uuid; uint64_t max_cache_seq; + uuid_t &uuid; - translate() {} + translate(uuid_t &uuid) : uuid(uuid) {} virtual ~translate() {} virtual void shutdown(void) = 0; From 05e512e660b75c9e6b481b969bb835cd0aba6dd8 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 19:30:10 +0000 Subject: [PATCH 41/77] Fix CI meson version --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f676106c..742297a2 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: run: | sudo apt update make install-deps - pip3 install meson + pip3 install --upgrade meson - name: Build run: | From 074cfcefa3a27ca498a8719222a5593672c5e3c2 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 19:31:44 +0000 Subject: [PATCH 42/77] Fix CI folly build --- subprojects/folly.wrap | 1 - 1 file changed, 1 deletion(-) diff --git a/subprojects/folly.wrap b/subprojects/folly.wrap index 042c9e81..32ad06f7 100644 --- a/subprojects/folly.wrap +++ b/subprojects/folly.wrap @@ -1,7 +1,6 @@ [wrap-git] url = https://github.com/facebook/folly.git revision = v2024.05.20.00 -patch_directory = folly clone-recursive = true [provide] From 924e2a4679bd957637fcbbdcc5a57648908a5bdf Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 19:33:14 +0000 Subject: [PATCH 43/77] Add folly boost deps --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 141b05d2..05fdb931 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ clean: install-deps: sudo apt install -y meson mold libfmt-dev libaio-dev librados-dev \ - libjemalloc-dev libboost-dev libradospp-dev \ + libjemalloc-dev libboost-all-dev libradospp-dev \ liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ libibverbs-dev librdmacm-dev python3-pyelftools libcunit1-dev From 079b96e61a563de517e1787033ce99d7872c7c9d Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 19:38:21 +0000 Subject: [PATCH 44/77] Add more folly deps --- Makefile | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 05fdb931..dcd77d50 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,14 @@ clean: cd build-dbg; meson compile --clean install-deps: - sudo apt install -y meson mold libfmt-dev libaio-dev librados-dev \ - libjemalloc-dev libboost-all-dev libradospp-dev \ - liburing-dev pkg-config uuid-dev libnuma-dev libarchive-dev \ - libibverbs-dev librdmacm-dev python3-pyelftools libcunit1-dev - + # Folly deps + sudo apt install libboost-all-dev libdouble-conversion-dev libevent-dev \ + libflags-dev libgmock-dev libgoogle-glog-dev libgtest-dev \ + liblz4-dev liblzma-dev libsnappy-dev libsodium-dev libunwind-dev \ + libzstd-dev ninja-build zlib1g-dev + # SPDK deps + sudo apt install libnuma-dev libarchive-dev libibverbs-dev librdmacm-dev \ + python3-pyelftools libcunit1-dev + # LSVD deps + sudo apt install -y meson mold libfmt-dev librados-dev \ + libjemalloc-dev libradospp-dev liburing-dev pkg-config uuid-dev From cf139d1053b6379183f2850387e12f9c9a5e3042 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 19:39:30 +0000 Subject: [PATCH 45/77] Fix deps typo --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index dcd77d50..9decf6ac 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ clean: install-deps: # Folly deps sudo apt install libboost-all-dev libdouble-conversion-dev libevent-dev \ - libflags-dev libgmock-dev libgoogle-glog-dev libgtest-dev \ + libgflags-dev libgmock-dev libgoogle-glog-dev libgtest-dev \ liblz4-dev liblzma-dev libsnappy-dev libsodium-dev libunwind-dev \ libzstd-dev ninja-build zlib1g-dev # SPDK deps From 302269357d3a4228f2650a03c3eaad71714c6dc0 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 19:44:56 +0000 Subject: [PATCH 46/77] Add libaio back --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9decf6ac..4c257a70 100644 --- a/Makefile +++ b/Makefile @@ -24,7 +24,7 @@ install-deps: libzstd-dev ninja-build zlib1g-dev # SPDK deps sudo apt install libnuma-dev libarchive-dev libibverbs-dev librdmacm-dev \ - python3-pyelftools libcunit1-dev + python3-pyelftools libcunit1-dev libaio-dev # LSVD deps sudo apt install -y meson mold libfmt-dev librados-dev \ libjemalloc-dev libradospp-dev liburing-dev pkg-config uuid-dev From 3a90482f04b97069c28964c082dc1efd3dc3baf7 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 21:33:49 +0000 Subject: [PATCH 47/77] Fix wlog header init issues --- src/image.cc | 4 +- src/imgtool.cc | 80 +------------------- src/write_cache.cc | 179 +++++++++++++++++++++++---------------------- src/write_cache.h | 7 +- 4 files changed, 95 insertions(+), 175 deletions(-) diff --git a/src/image.cc b/src/image.cc index 7e5098ef..c2ba0d25 100644 --- a/src/image.cc +++ b/src/image.cc @@ -35,7 +35,7 @@ lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) map_lock, bufmap, bufmap_lock, last_data_seq, clones, obj_info, checkpoints); - wlog = open_wlog(cfg.wlog_path(name), cfg.wlog_size / 4096, *xlate, cfg); + wlog = open_wlog(cfg.wlog_path(name), *xlate, cfg); THROW_MSG_ON(!wlog, "Failed to open write log"); // recover_from_wlog(); @@ -468,7 +468,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request { assert(parent == nullptr); - img->wlog->get_room(req_bytes / 512); + img->wlog->reserve_room(req_bytes / 512); img->xlate->backend_backpressure(); sector_t size_sectors = req_bytes / 512; diff --git a/src/imgtool.cc b/src/imgtool.cc index 11377984..b3517884 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -15,18 +15,10 @@ #include "translate.h" #include "utils.h" -enum tool_operation { - OP_CREATE = 1, - OP_DELETE = 2, - OP_INFO = 3, - OP_MKCACHE = 4, - OP_CLONE = 5 -}; +enum tool_operation { OP_CREATE = 1, OP_DELETE = 2, OP_INFO = 3, OP_CLONE = 5 }; const char *backend = "rados"; const char *image_name; -const char *cache_dir; -const char *cache_dev; cfg_cache_type cache_type = LSVD_CFG_READ; enum tool_operation op; const char *pool_name = "lsvd"; @@ -46,11 +38,7 @@ static long parseint(const char *_s) } static struct argp_option options[] = { - {"cache-dir", 'd', "DIR", 0, "cache directory", 0}, {"create", 'C', 0, 0, "create image", 0}, - {"mkcache", 'k', "DEV", 0, "use DEV as cache", 0}, - {"cache-type", 't', "R/W", 0, - "R for read cache, W for write cache (default: R)", 0}, {"size", 'z', "SIZE", 0, "size in bytes (M/G=2^20,2^30)", 0}, {"delete", 'D', 0, 0, "delete image", 0}, {"info", 'I', 0, 0, "show image information", 0}, @@ -61,18 +49,12 @@ static struct argp_option options[] = { static char args_doc[] = "IMAGE"; -extern int init_wcache(int fd, uuid_t &uuid, int n_pages); -int (*make_cache)(int fd, uuid_t &uuid, int n_pages) = init_wcache; - static error_t parse_opt(int key, char *arg, struct argp_state *state) { switch (key) { case ARGP_KEY_ARG: image_name = arg; break; - case 'd': - cache_dir = arg; - break; case 'C': op = OP_CREATE; break; @@ -85,21 +67,6 @@ static error_t parse_opt(int key, char *arg, struct argp_state *state) case 'I': op = OP_INFO; break; - case 't': - if (arg[0] == 'R') { - cache_type = LSVD_CFG_READ; - log_error("read cache no longer supported"); - exit(1); - } else if (arg[0] == 'W') { - cache_type = LSVD_CFG_WRITE; - make_cache = init_wcache; - } else - argp_usage(state); - break; - case 'k': - op = OP_MKCACHE; - cache_dev = arg; - break; case 'c': op = OP_CLONE; break; @@ -171,41 +138,6 @@ void info(rados_ioctx_t io, const char *image_name) } } -void mk_cache(rados_ioctx_t io, const char *image_name, const char *dev_name, - cfg_cache_type type) -{ - int rv, fd = open(dev_name, O_RDWR); - if (fd < 0) { - perror("device file open"); - exit(1); - } - auto sz = getsize64(fd); - - lsvd_config cfg; - if ((rv = cfg.read()) < 0) { - printf("error reading config: %d\n", rv); - exit(1); - } - auto objstore = make_rados_backend(io); - uuid_t uu; - if ((rv = translate_get_uuid(objstore, image_name, uu)) < 0) { - printf("error reading superblock: %d\n", rv); - exit(1); - } - auto cache_file = cfg.cache_filename(uu, image_name, type); - - auto n_pages = sz / 4096; - if (make_cache(fd, uu, n_pages) < 0) { - printf("make_cache failed\n"); - exit(1); - } - if ((rv = symlink(dev_name, cache_file.c_str())) < 0) { - perror("symbolic link"); - exit(1); - } - close(fd); -} - int main(int argc, char **argv) { std::set_terminate([]() { @@ -218,14 +150,6 @@ int main(int argc, char **argv) argp_parse(&argp, argc, argv, 0, 0, 0); - setenv("LSVD_BACKEND", backend, 1); - if (cache_dir != NULL) { - if (cache_type == LSVD_CFG_READ) - setenv("LSVD_RCACHE_DIR", cache_dir, 1); - else - setenv("LSVD_WCACHE_DIR", cache_dir, 1); - } - rados_t cluster; int err = rados_create2(&cluster, "ceph", "client.admin", 0); check_ret_neg(err, "Failed to create cluster handle"); @@ -246,8 +170,6 @@ int main(int argc, char **argv) rbd_remove(io_ctx, image_name); else if (op == OP_INFO) info(io_ctx, image_name); - else if (op == OP_MKCACHE) - mk_cache(io_ctx, image_name, cache_dev, cache_type); else if (op == OP_CLONE) { auto src_img = image_name; auto dst_img = argv[argc - 1]; diff --git a/src/write_cache.cc b/src/write_cache.cc index 2f9ac06d..de1e9724 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -17,15 +17,15 @@ #include "utils.h" #include "write_cache.h" +const usize SUPER_BLOCKNO = 1; + /* ------------- Write cache structure ------------- */ class wcache_write_req; class write_cache_impl : public write_cache { size_t dev_max; - uint32_t super_blkno; int fd = -1; - lsvd_config &cfg; std::atomic sequence = 1; // write sequence # @@ -38,24 +38,17 @@ class write_cache_impl : public write_cache size_t write_batch = 0; std::condition_variable write_cv; - /* initialization stuff - */ + // initialization stuff int roll_log_forward(); - char *_hdrbuf; // for reading at startup - - thread_pool *misc_threads; - void write_checkpoint(void); - /* allocate journal entry, create a header - */ + // allocate journal entry, create a header uint32_t allocate(page_t n, page_t &pad, page_t &n_pad, page_t &prev); j_write_super *super; page_t previous_hdr = 0; page_t next_alloc = 0; - /* these are used by wcache_write_req - */ + // these are used by wcache_write_req friend class wcache_write_req; std::mutex m; translate &be; @@ -63,13 +56,12 @@ class write_cache_impl : public write_cache nvme *nvme_w = NULL; public: - /* throttle writes with window of max_write_pages - */ - void get_room(sector_t sectors); + // throttle writes with window of max_write_pages + void reserve_room(sector_t sectors); void release_room(sector_t sectors); void flush(void); - write_cache_impl(uint32_t blkno, int _fd, translate &_be, lsvd_config &cfg); + write_cache_impl(int fd, translate &_be, lsvd_config &cfg); ~write_cache_impl(); request *writev(sector_t lba, smartiov *iov); @@ -219,7 +211,7 @@ void wcache_write_req::run(request *parent_) * * TODO record how long this takes per request, unlikely to be bottleneck though */ -void write_cache_impl::get_room(sector_t sectors) +void write_cache_impl::reserve_room(sector_t sectors) { int pages = sectors / 8; std::unique_lock lk(m2); @@ -308,11 +300,11 @@ j_hdr *write_cache_impl::mk_header(char *buf, uint32_t type, page_t blks, */ void write_cache_impl::write_checkpoint(void) { - /* shouldn't really need the copy, since it's only called on - * shutdown, except that some unit tests call this and expect things - * to work afterwards - */ - j_write_super *super_copy = (j_write_super *)aligned_alloc(4096, 4096); + // shouldn't really need the copy, since it's only called on + // shutdown, except that some unit tests call this and expect things + // to work afterwards + vec buf(4096); + auto super_copy = (j_write_super *)buf.data(); memcpy(super_copy, super, 4096); super_copy->seq = sequence; @@ -328,10 +320,8 @@ void write_cache_impl::write_checkpoint(void) super_copy->clean = true; - if (nvme_w->write((char *)super_copy, 4096, 4096L * super_blkno) < 0) - throw_fs_error("wckpt_s"); - - free(super_copy); + auto res = nvme_w->write(buf.data(), buf.size(), 4096); + THROW_ERRNO_ON(res < 0, errno, "Failed to write wlog header"); } /* needs to set the following variables: @@ -454,27 +444,23 @@ int write_cache_impl::roll_log_forward() #endif } -write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate &be, - lsvd_config &cfg) - : fd(fd), cfg(cfg), be(be) +write_cache_impl::write_cache_impl(int fd, translate &be, lsvd_config &cfg) + : fd(fd), be(be) { - - super_blkno = blkno; dev_max = getsize64(fd); - _hdrbuf = (char *)aligned_alloc(4096, 4096); - - const char *name = "wlog_uring"; - nvme_w = make_nvme_uring(fd, name); + nvme_w = make_nvme_uring(fd, "wlog_uring"); char *buf = (char *)aligned_alloc(4096, 4096); - if (nvme_w->read(buf, 4096, 4096L * super_blkno) < 4096) - throw_fs_error("wcache"); + auto res = nvme_w->read(buf, 4096, 4096L * SUPER_BLOCKNO); + THROW_ERRNO_ON(res < 0, -res, "Failed to read wlog header"); + THROW_MSG_ON(res < 4096, "Short read {}/4096 on wlog header", res); + super = (j_write_super *)buf; + THROW_MSG_ON(super->magic != LSVD_MAGIC, "Invalid magic in wlog sub-hdr"); - /* if it's clean we can read in the map and lengths, otherwise - * do crash recovery. Then set the dirty flag - */ + // if it's clean we can read in the map and lengths, otherwise + // do crash recovery. Then set the dirty flag if (super->clean) { sequence = super->seq; next_alloc = super->base; @@ -483,22 +469,18 @@ write_cache_impl::write_cache_impl(uint32_t blkno, int fd, translate &be, next_alloc = super->base; super->clean = false; - if (nvme_w->write(buf, 4096, 4096L * super_blkno) < 4096) - throw_fs_error("wcache"); + res = nvme_w->write(buf, 4096, 4096L * SUPER_BLOCKNO); + THROW_ERRNO_ON(res < 0, -res, "Failed to write wlog subhdr"); int n_pages = super->limit - super->base; - max_write_pages = n_pages / 2 + n_pages / 4; + max_write_pages = n_pages / 2 + n_pages / 4; // no idea why this is 3/4ths write_batch = cfg.wcache_batch; - - misc_threads = new thread_pool(&m); } write_cache_impl::~write_cache_impl() { - delete misc_threads; close(fd); free(super); - free(_hdrbuf); delete nvme_w; } @@ -530,75 +512,94 @@ request *write_cache_impl::writev(sector_t lba, smartiov *iovs) void write_cache_impl::do_write_checkpoint(void) { write_checkpoint(); } -int init_wcache(int fd, uuid_t &uuid, int n_pages) +int init_wcache(int fd, uuid_t &uuid, usize cache_size) { - page_t w_pages = n_pages - 1; - page_t _map = div_round_up(w_pages, 256); - page_t _len = div_round_up(w_pages, 512); - page_t w_meta = 2 * (_map + _len); - char buf[4096]; - - w_pages -= w_meta; - - memset(buf, 0, sizeof(buf)); - auto w_super = (j_write_super *)buf; - *w_super = (j_write_super){LSVD_MAGIC, - LSVD_J_W_SUPER, - 1, - 1, - 1, - 1, - 1 + w_meta, - 1 + w_meta, - 1 + w_meta + w_pages, - 1 + w_meta, - 0, - 0, - 0, - 0, - 0, - 0, - {0}}; - memcpy(w_super->vol_uuid, uuid, sizeof(uuid_t)); - - int ret = pwrite(fd, buf, 4096, 0); + // write log file has 2 header blocks: the first 4k block is the j_hdr, + // the second 4k block is the j_write_super + // not entirely sure why they are separate, but I'm leaving it for now + + page_t total_pages = cache_size / 4096; + page_t content_pages = total_pages - 2; + page_t _map = div_round_up(content_pages, 256); + page_t _len = div_round_up(content_pages, 512); + page_t meta_pages = 2 * (_map + _len); + page_t data_pages = content_pages - meta_pages; + + vec buf(4096 * 2); + auto hdr = (j_hdr *)buf.data(); + *hdr = { + .magic = LSVD_MAGIC, + .type = LSVD_J_DATA, + .version = 1, + .len = total_pages, + .seq = 0, + .crc32 = 0, + .extent_offset = 0, + .extent_len = 0, + .prev = 0, + }; + + auto sup = (j_write_super *)(buf.data() + 4096); + *sup = { + .magic = LSVD_MAGIC, + .type = LSVD_J_W_SUPER, + .version = 1, + .clean = 1, + .seq = 1, + .meta_base = 1, + .meta_limit = 1 + meta_pages, + .base = 1 + meta_pages, + .limit = 1 + meta_pages + data_pages, + .next = 1 + meta_pages, + .map_start = 0, + .map_blocks = 0, + .map_entries = 0, + .len_start = 0, + .len_blocks = 0, + .len_entries = 0, + .vol_uuid = {0}, + }; + uuid_copy(sup->vol_uuid, uuid); + + int ret = pwrite(fd, buf.data(), buf.size(), 0); PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to write wlog header"); // just truncate to right length, don't bother writing zeroes - ret = ftruncate(fd, 4096 * (1 + w_pages + w_meta)); + ret = ftruncate(fd, 4096 * total_pages); PR_ERR_RET_IF(ret < 0, -errno, errno, "Failed to truncate wlog file"); return 0; } -uptr open_wlog(fspath path, usize size, translate &xlate, - lsvd_config &cfg) +uptr open_wlog(fspath path, translate &xlate, lsvd_config &cfg) { + log_info("Opening write log at '{}'", path.string()); + int fd = 0; if (!std::filesystem::exists(path)) { - log_info("Creating write cache file '{}'", path.string()); + log_info("Creating write log file '{}'", path.string()); fd = open(path.c_str(), O_RDWR | O_CREAT, 0644); PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to create cache file"); - auto err = init_wcache(fd, xlate.uuid, size / 4096); + auto err = init_wcache(fd, xlate.uuid, cfg.wlog_size); PR_ERR_RET_IF(err < 0, nullptr, -err, "Failed to init wlog"); + } else { + fd = open(path.c_str(), O_RDWR); + PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to open wlog file"); } - fd = open(path.c_str(), O_RDWR); - PR_ERR_RET_IF(fd < 0, nullptr, errno, "Failed to open wlog file"); - char buf[4096]; int err = pread(fd, buf, 4096, 0); PR_ERR_RET_IF(err < 0, nullptr, errno, "Failed to read wlog header"); - j_write_super *super = (j_write_super *)buf; + auto super = (j_hdr *)buf; PR_RET_IF(super->magic != LSVD_MAGIC, nullptr, "Invalid write cache magic number: {}", super->magic); - PR_RET_IF(super->type != LSVD_J_W_SUPER, nullptr, "Invalid cache type: {}", + PR_RET_IF(super->type != LSVD_J_DATA, nullptr, "Invalid cache type: {}", super->type); try { - return std::make_unique(1, fd, xlate, cfg); + return std::make_unique(fd, xlate, cfg); } catch (std::exception &e) { log_error("Failed to open write cache: {}", e.what()); close(fd); diff --git a/src/write_cache.h b/src/write_cache.h index 21ca0222..42955a70 100644 --- a/src/write_cache.h +++ b/src/write_cache.h @@ -10,7 +10,7 @@ class write_cache { public: - virtual void get_room(sector_t sectors) = 0; + virtual void reserve_room(sector_t sectors) = 0; virtual void release_room(sector_t sectors) = 0; virtual void flush(void) = 0; @@ -24,7 +24,4 @@ class write_cache uptr make_write_cache(uint32_t blkno, int fd, translate *be, lsvd_config *cfg); -int init_wcache(int fd, uuid_t &uuid, int n_pages); - -uptr open_wlog(fspath path, usize size, translate &xlate, - lsvd_config &cfg); +uptr open_wlog(fspath path, translate &xlate, lsvd_config &cfg); From 6b1442dbc9f6698486d26c7d01d2a68e884dc15b Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 22:34:03 +0000 Subject: [PATCH 48/77] Link uring statically --- src/meson.build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/meson.build b/src/meson.build index 1ef9c1d9..35b5a5dc 100644 --- a/src/meson.build +++ b/src/meson.build @@ -25,7 +25,7 @@ lsvd_deps = [ dependency('zlib'), dependency('fmt'), dependency('boost'), - dependency('liburing'), + dependency('liburing', static: true), dependency('uuid'), cxx.find_library('rados', required: true), # cxx.find_library('tcmalloc', required: false), From ec946771abe580c6a25dac82932c6b79e580bdb8 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 23:31:56 +0000 Subject: [PATCH 49/77] Link folly and prepare to switch to fbvector --- meson.build | 6 ++--- meson.ini | 4 +-- src/config.cc | 6 ++--- src/config.h | 4 +-- src/extent.h | 17 +++++++------ src/image.cc | 8 +++--- src/image.h | 6 ++--- src/lsvd_types.h | 8 +++--- src/meson.build | 9 ++++--- src/misc_cache.h | 6 ++--- src/objects.cc | 2 +- src/rados_backend.cc | 2 +- src/shared_read_cache.cc | 10 ++++---- src/smartiov.h | 2 +- src/translate.cc | 53 ++++++++++++++++++++-------------------- src/utils.h | 8 +++--- src/write_cache.cc | 5 ++-- test/meson.build | 4 +-- 18 files changed, 80 insertions(+), 80 deletions(-) diff --git a/meson.build b/meson.build index c06f9ea2..29dcbe68 100644 --- a/meson.build +++ b/meson.build @@ -20,8 +20,6 @@ if get_option('buildtype') == 'debug' endif subdir('src') - -liblsvd = shared_library('lsvd', lsvd_src, dependencies: lsvd_deps) lsvd_ar = static_library('lsvd', lsvd_src, dependencies: lsvd_deps) executable( @@ -32,13 +30,13 @@ executable( executable( 'imgtool', - 'src/imgtool.cc', + ['src/imgtool.cc'], link_whole: lsvd_ar, dependencies: lsvd_deps, ) executable( 'thick-image', - 'src/thick-image.cc', + ['src/thick-image.cc'], link_whole: lsvd_ar, dependencies: lsvd_deps, ) diff --git a/meson.ini b/meson.ini index 0e8514bf..a1be4df4 100644 --- a/meson.ini +++ b/meson.ini @@ -1,5 +1,5 @@ [binaries] c = 'clang-18' -c_ld = 'mold' +c_ld = 'lld-18' cpp = 'clang++-18' -cpp_ld = 'mold' \ No newline at end of file +cpp_ld = 'lld-18' \ No newline at end of file diff --git a/src/config.cc b/src/config.cc index b874d26d..04dfde18 100644 --- a/src/config.cc +++ b/src/config.cc @@ -24,9 +24,9 @@ namespace fs = std::filesystem; #include "config.h" #include "config_macros.h" -std::vector cfg_path({"lsvd.conf", "/usr/local/etc/lsvd.conf"}); +vec cfg_path({"lsvd.conf", "/usr/local/etc/lsvd.conf"}); -static void split(std::string s, std::vector &words) +static void split(std::string s, vec &words) { std::string w = ""; for (auto c : s) { @@ -65,7 +65,7 @@ int lsvd_config::read() while (getline(fp, line)) { if (line[0] == '#') continue; - std::vector words; + vec words; split(line, words); if (words.size() != 2) continue; diff --git a/src/config.h b/src/config.h index 47cc8749..446f64cf 100644 --- a/src/config.h +++ b/src/config.h @@ -26,8 +26,8 @@ class lsvd_config int backend_obj_size = 8 * 1024 * 1024; // in bytes int wcache_batch = 8; // requests int wcache_chunk = 2 * 1024 * 1024; // bytes - std::string rcache_dir = "/tmp"; - std::string wcache_dir = "/tmp"; + std::string rcache_dir = "/tmp/lsvd/"; + std::string wcache_dir = "/tmp/lsvd/"; u32 num_parallel_writes = 8; int hard_sync = 0; enum cfg_backend backend = BACKEND_RADOS; diff --git a/src/extent.h b/src/extent.h index 17f63e47..2d23d73a 100644 --- a/src/extent.h +++ b/src/extent.h @@ -29,7 +29,8 @@ #include #include #include -#include + +#include "utils.h" namespace extmap { @@ -227,9 +228,9 @@ template struct extmap { static const int _load = load; public: - typedef std::vector extent_vector; - std::vector lists; - std::vector maxes; + typedef vec extent_vector; + vec lists; + vec maxes; int count; extmap() { count = 0; } @@ -410,9 +411,9 @@ template struct extmap { // Python-style list slicing - remove [len]..[end] and return it // - static std::vector *_slice(std::vector *A, int len) + static vec *_slice(vec *A, int len) { - auto half = new std::vector(); + auto half = new vec(); half->reserve(_load); for (auto it = A->begin() + len; it != A->end(); it++) half->push_back(*it); @@ -660,7 +661,7 @@ template struct extmap { // various ways of calling _update... // - void update(T_in base, T_in limit, T_out e, std::vector *del) + void update(T_in base, T_in limit, T_out e, vec *del) { _update(base, limit, e, false, del); } @@ -669,7 +670,7 @@ template struct extmap { _update(base, limit, e, false, nullptr); } - void trim(T_in base, T_in limit, std::vector *del) + void trim(T_in base, T_in limit, vec *del) { static T_out unused; if (count > 0) diff --git a/src/image.cc b/src/image.cc index c2ba0d25..e179f58d 100644 --- a/src/image.cc +++ b/src/image.cc @@ -227,7 +227,7 @@ class lsvd_image::read_request : public lsvd_image::aio_request { assert(parent == nullptr); - std::vector requests; + vec requests; img->handle_reads(req_offset, iovs, requests); num_subreqs = requests.size(); @@ -255,7 +255,7 @@ class lsvd_image::read_request : public lsvd_image::aio_request }; void lsvd_image::handle_reads(size_t offset, smartiov iovs, - std::vector &requests) + vec &requests) { sector_t start_sector = offset / 512; sector_t end_sector = start_sector + iovs.bytes() / 512; @@ -444,7 +444,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request * and be done with it. The old code had these as pointers, but changed * them to be in the vectwor. */ - std::vector sub_iovs; + vec sub_iovs; public: write_request(lsvd_image *img, size_t offset, smartiov iovs, @@ -478,7 +478,7 @@ class lsvd_image::write_request : public lsvd_image::aio_request n_req += div_round_up(req_bytes / 512, max_sectors); // TODO: this is horribly ugly - std::vector requests; + vec requests; auto cur_offset = req_offset; for (sector_t s_offset = 0; s_offset < size_sectors; diff --git a/src/image.h b/src/image.h index 023dcad7..fea02c27 100644 --- a/src/image.h +++ b/src/image.h @@ -57,8 +57,8 @@ class lsvd_image usize size; // bytes lsvd_config cfg; - std::vector clones; // Base images on which we're built - std::vector checkpoints; // Checkpoints + vec clones; // Base images on which we're built + vec checkpoints; // Checkpoints std::map obj_info; // LBA -> object id, object offset @@ -107,5 +107,5 @@ class lsvd_image private: void handle_reads(size_t offset, smartiov iovs, - std::vector &requests); + vec &requests); }; diff --git a/src/lsvd_types.h b/src/lsvd_types.h index b8f046b9..0ac51707 100644 --- a/src/lsvd_types.h +++ b/src/lsvd_types.h @@ -1,9 +1,9 @@ #pragma once +#include "utils.h" #include #include #include -#include using sector_t = int64_t; using page_t = int32_t; @@ -17,8 +17,7 @@ enum { LSVD_MAGIC = 0x4456534c }; * copy them into the provided output vector */ template -void decode_offset_len(char *buf, size_t offset, size_t len, - std::vector &vals) +void decode_offset_len(char *buf, size_t offset, size_t len, vec &vals) { T *p = (T *)(buf + offset), *end = (T *)(buf + offset + len); for (; p < end; p++) @@ -29,8 +28,7 @@ void decode_offset_len(char *buf, size_t offset, size_t len, * length field name_len. */ template -void decode_offset_len_ptr(char *buf, size_t offset, size_t len, - std::vector &vals) +void decode_offset_len_ptr(char *buf, size_t offset, size_t len, vec &vals) { T *p = (T *)(buf + offset), *end = (T *)(buf + offset + len); for (; p < end;) { diff --git a/src/meson.build b/src/meson.build index 35b5a5dc..4ede957e 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,8 +1,10 @@ cxx = meson.get_compiler('cpp') cmake = import('cmake') -libfolly_subproject = cmake.subproject('folly') -libfolly = libfolly_subproject.dependency('folly') +# cmvars = cmake.subproject_options() +# cmvars.add_cmake_defines({'POSITION_INDEPENDENT_CODE': true}) +folly_cm = cmake.subproject('folly') +libfolly = folly_cm.dependency('folly') lsvd_src = files( 'config.cc', @@ -20,7 +22,7 @@ lsvd_src = files( lsvd_inc = include_directories('.') lsvd_deps = [ - # libfolly, + libfolly, dependency('threads'), dependency('zlib'), dependency('fmt'), @@ -28,7 +30,6 @@ lsvd_deps = [ dependency('liburing', static: true), dependency('uuid'), cxx.find_library('rados', required: true), - # cxx.find_library('tcmalloc', required: false), cxx.find_library('jemalloc', required: false), ] diff --git a/src/misc_cache.h b/src/misc_cache.h index 633e3332..53f3c637 100644 --- a/src/misc_cache.h +++ b/src/misc_cache.h @@ -101,12 +101,12 @@ static inline void throw_fs_error(std::string msg) */ template class sized_vector { - std::vector *elements; + vec *elements; public: ~sized_vector() { delete elements; } - void init(int n) { elements = new std::vector(n); } - void init(int n, T val) { elements = new std::vector(n, val); } + void init(int n) { elements = new vec(n); } + void init(int n, T val) { elements = new vec(n, val); } T &operator[](int index) { return (*elements)[index]; } }; diff --git a/src/objects.cc b/src/objects.cc index 18dae9af..c217d868 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -123,7 +123,7 @@ opt> object_reader::fetch_object_header(std::string objname) */ template void deserialise_offset_ptr(char *buf, size_t offset, size_t len, - std::vector &vals) + vec &vals) { T *p = (T *)(buf + offset), *end = (T *)(buf + offset + len); for (; p < end;) { diff --git a/src/rados_backend.cc b/src/rados_backend.cc index f144a8c8..b63e2904 100644 --- a/src/rados_backend.cc +++ b/src/rados_backend.cc @@ -191,7 +191,7 @@ class rados_backend : public backend auto size = get_size(name); PASSTHRU_NULLOPT(size); - std::vector buf(size.value()); + vec buf(size.value()); smartiov iov((char *)buf.data(), buf.size()); auto r = read(name, 0, iov); if (r < 0) diff --git a/src/shared_read_cache.cc b/src/shared_read_cache.cc index b4de2b91..0a74887a 100644 --- a/src/shared_read_cache.cc +++ b/src/shared_read_cache.cc @@ -86,13 +86,13 @@ class shared_read_cache void *pending_fill_data = nullptr; // Keep track of pending reads - std::vector pending_reads; + vec pending_reads; // Keep track of the reverse map so we can evict this entry chunk_key key; }; - std::vector cache_state; + vec cache_state; std::mutex global_cache_lock; std::mutex cache_stats_lock; @@ -352,7 +352,7 @@ class shared_read_cache::cache_miss_request : public self_refcount_request { is_backend_done = true; - std::vector reqs; + vec reqs; { std::unique_lock lock(cache.global_cache_lock); @@ -459,7 +459,7 @@ shared_read_cache::shared_read_cache(std::string cache_path, // CACHE_CHUNK_SIZE); cache_store = std::unique_ptr(make_nvme_uring(fd, "rcache_uring")); - cache_state = std::vector(num_cache_blocks); + cache_state = vec(num_cache_blocks); } shared_read_cache::~shared_read_cache() {} @@ -686,7 +686,7 @@ class sharded_cache : public read_cache // TODO in-place obj instead of uptrs, don't understand why we can't have // just a vector of the plain object - std::vector> shards; + vec> shards; // centralise the reporter std::thread cache_stats_reporter; diff --git a/src/smartiov.h b/src/smartiov.h index efc720b7..f32a34a6 100644 --- a/src/smartiov.h +++ b/src/smartiov.h @@ -9,7 +9,7 @@ */ class smartiov { - std::vector iovs; + vec iovs; public: smartiov() {} diff --git a/src/translate.cc b/src/translate.cc index f9db503b..3eff4b54 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -84,7 +84,7 @@ class translate_req : public request char *gc_data = NULL; // passed in by GC thread /* lba/len/obj/offset (ignore obj/offset for REQ_PUT) */ - std::vector entries; + vec entries; /* used for removing from map */ char *local_buf_base = NULL; @@ -171,7 +171,7 @@ class translate_impl : public translate friend class translate_req; translate_req *current = NULL; - std::vector &clones; + vec &clones; std::map &object_info; vec &checkpoints; @@ -233,6 +233,9 @@ class translate_impl : public translate total_live_sectors += oi.live; } + current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); + assert(current->batch_buf != nullptr); + // start worker, flush, and GC threads if (cfg.flush_interval_msec > 0) flush_worker = @@ -247,8 +250,6 @@ class translate_impl : public translate workers->pool.push( std::thread(&translate_impl::worker_thread, this, workers)); - current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); - // Fully serialise superblock once, so we can do partial serialisations // later on and skip the checkpoint stuff every time // currently unimplemented @@ -430,7 +431,7 @@ ssize_t translate_impl::trim(size_t offset, size_t len) std::unique_lock obj_w_lock(omap_mtx); // trim the map - std::vector deleted; + vec deleted; objmap.trim(offset / 512, (offset + len) / 512, &deleted); // and then update the GC accounting @@ -503,7 +504,7 @@ void translate_req::notify(request *child) * point to this buffer */ std::unique_lock obj_w_lock(tx->bufmap_lock); - std::vector> extents; + vec> extents; for (auto const &e : entries) { auto limit = e.lba + e.len; for (auto it2 = tx->bufmap.lookup(e.lba); @@ -544,8 +545,8 @@ void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) { debug("Writing checkpoint {}", cp_seq); - std::vector entries; - std::vector objects; + vec entries; + vec objects; for (auto it = objmap.begin(); it != objmap.end(); it++) { auto [base, limit, ptr] = it->vals(); @@ -596,7 +597,7 @@ void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) // Update superblock with new checkpoint, and keep only the last 3 // around both in the backend and the superblock checkpoints.push_back(cp_seq); - std::vector ckpts_to_delete; + vec ckpts_to_delete; while (checkpoints.size() > 3) { ckpts_to_delete.push_back(checkpoints.front()); checkpoints.erase(checkpoints.begin()); @@ -649,7 +650,7 @@ void translate_impl::write_gc(seqnum_t _seq, translate_req *req) auto in_ptr = req->gc_data; // int _data_sectors = 0; // actual sectors in GC write - std::vector obj_extents; + vec obj_extents; req->local_buf_base = data_ptr; for (auto const &[base, len, obj, offset] : req->entries) { @@ -685,7 +686,7 @@ void translate_impl::write_gc(seqnum_t _seq, translate_req *req) sector_t offset = hdr_sectors; data_ptr = data_ptr0; - std::vector deleted; + vec deleted; req->entries.clear(); // replace with actual extents written std::unique_lock obj_w_lock(omap_mtx); // protect the readers @@ -748,9 +749,9 @@ void translate_impl::process_batch(seqnum_t _seq, translate_req *req) /* and the object map (copy entries to right format at same time) */ sector_t sector_offset = hdr_sectors; - std::vector deleted; + vec deleted; deleted.reserve(req->entries.size()); - std::vector dm_entries; + vec dm_entries; dm_entries.reserve(req->entries.size()); for (auto e : req->entries) { @@ -922,7 +923,7 @@ void translate_impl::do_gc(std::stop_token &st) int max_obj = cur_seq.load(); std::shared_lock obj_r_lock(omap_mtx); - std::vector dead_objects; + vec dead_objects; for (auto const &p : object_info) { auto [hdrlen, datalen, live] = p.second; if (live == 0) { @@ -987,7 +988,7 @@ void translate_impl::do_gc(std::stop_token &st) /* gather list of objects needing cleaning, return if none */ const double threshold = cfg.gc_threshold / 100.0; - std::vector> objs_to_clean; + vec> objs_to_clean; for (auto [u, o, n] : utilization) { if (u > threshold) continue; @@ -1058,7 +1059,7 @@ void translate_impl::do_gc(std::stop_token &st) auto file_end = offset; - std::vector<_extent> all_extents; + vec<_extent> all_extents; for (auto it = live_extents.begin(); it != live_extents.end(); it++) { auto [base, limit, ptr] = it->vals(); all_extents.push_back((_extent){base, limit, ptr}); @@ -1070,7 +1071,7 @@ void translate_impl::do_gc(std::stop_token &st) while (all_extents.size() > 0) { sector_t sectors = 0, max = cfg.backend_obj_size / 512; - std::vector<_extent> extents; + vec<_extent> extents; auto it = all_extents.begin(); while (it != all_extents.end() && sectors < max) { @@ -1274,9 +1275,9 @@ int translate_clone_image(sptr objstore, const char *source, #if 0 ssize_t translate_impl::init(const char *prefix_, bool timedflush) { - std::vector ckpts; - std::vector clones; - std::vector snaps; + vec ckpts; + vec clones; + vec snaps; /* note prefix = superblock name */ @@ -1346,9 +1347,9 @@ ssize_t translate_impl::init(const char *prefix_, bool timedflush) */ int last_ckpt = -1; if (ckpts.size() > 0) { - std::vector objects; - std::vector deletes; - std::vector entries; + vec objects; + vec deletes; + vec entries; /* hmm, we should never have checkpoints listed in the * super that aren't persisted on the backend, should we? @@ -1391,8 +1392,8 @@ ssize_t translate_impl::init(const char *prefix_, bool timedflush) /* roll forward */ for (;; seq++) { - std::vector cleaned; - std::vector entries; + vec cleaned; + vec entries; common_obj_hdr h; obj_data_hdr dh; @@ -1416,7 +1417,7 @@ ssize_t translate_impl::init(const char *prefix_, bool timedflush) max_cache_seq = dh.cache_seq; int offset = 0, hdr_len = h.hdr_sectors; - std::vector deleted; + vec deleted; for (auto m : entries) { extmap::obj_offset oo = {seq, offset + hdr_len}; objmap.update(m.lba, m.lba + m.len, oo, &deleted); diff --git a/src/utils.h b/src/utils.h index 7a6654cf..9ac8757e 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,5 +1,6 @@ #pragma once +#include "folly/FBVector.h" #include #include #include @@ -31,6 +32,7 @@ template using sptr = std::shared_ptr; template using uptr = std::unique_ptr; template using opt = std::optional; template using vec = std::vector; +template using fvec = folly::fbvector; #define CEXTERN extern "C" @@ -218,10 +220,10 @@ template struct overloaded : Ts... { using Ts::operator()...; }; -inline std::vector split_string_on_char(const std::string &s, +inline vec split_string_on_char(const std::string &s, char delim) { - std::vector result; + vec result; std::stringstream ss(s); std::string item; @@ -232,7 +234,7 @@ inline std::vector split_string_on_char(const std::string &s, return result; } -inline std::string string_join(const std::vector &strings, +inline std::string string_join(const vec &strings, const std::string &delim) { std::string result; diff --git a/src/write_cache.cc b/src/write_cache.cc index de1e9724..a4d22aa0 100644 --- a/src/write_cache.cc +++ b/src/write_cache.cc @@ -143,7 +143,7 @@ wcache_write_req::wcache_write_req(sector_t lba, smartiov *iovs, page_t n_pages, r_pad = wcache->nvme_w->make_write_request(&pad_iov, pad * 4096L); } - std::vector extents; + vec extents; extents.push_back((j_extent){(uint64_t)lba, iovs->bytes() / 512}); /* TODO: don't assign seq# in mk_header @@ -399,7 +399,7 @@ int write_cache_impl::roll_log_forward() * - put mappings into cache map * - write data to backend */ - std::vector entries; + vec entries; decode_offset_len(_hdrbuf, h->extent_offset, h->extent_len, entries); @@ -486,7 +486,6 @@ write_cache_impl::~write_cache_impl() request *write_cache_impl::writev(sector_t lba, smartiov *iovs) { - size_t bytes = iovs->bytes(); page_t pages = div_round_up(bytes, 4096); page_t pad, n_pad, prev = 0; diff --git a/test/meson.build b/test/meson.build index 61ef5e0b..4c849fae 100644 --- a/test/meson.build +++ b/test/meson.build @@ -2,7 +2,7 @@ seq = executable( 'test-seq', 'test-seq.cc', include_directories: lsvd_inc, - link_with: liblsvd, + link_with: lsvd_ar, dependencies: lsvd_deps, ) @@ -10,7 +10,7 @@ extentmap = executable( 'test-extentmap', 'test-extentmap.cc', include_directories: lsvd_inc, - link_with: liblsvd, + link_with: lsvd_ar, dependencies: lsvd_deps, ) From a133a720eb980ae1ff500876cbbe4273c5d62617 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 23:34:05 +0000 Subject: [PATCH 50/77] Fix config UAF --- src/image.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/image.cc b/src/image.cc index e179f58d..7e2c8b92 100644 --- a/src/image.cc +++ b/src/image.cc @@ -14,8 +14,8 @@ const int block_sectors = CACHE_CHUNK_SIZE / 512; -lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg) - : imgname(name), cfg(cfg) +lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg_) + : imgname(name), cfg(cfg_) { objstore = make_rados_backend(io); rcache = get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); From f35560f7301992a7bdaae0a283a9fbeef1c81b73 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 22 May 2024 23:44:50 +0000 Subject: [PATCH 51/77] Fix memory leaks --- src/image.cc | 6 +++++- src/image.h | 5 +++-- src/translate.cc | 14 +++++++++----- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/image.cc b/src/image.cc index 7e2c8b92..02100043 100644 --- a/src/image.cc +++ b/src/image.cc @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -15,7 +16,7 @@ const int block_sectors = CACHE_CHUNK_SIZE / 512; lsvd_image::lsvd_image(std::string name, rados_ioctx_t io, lsvd_config cfg_) - : imgname(name), cfg(cfg_) + : imgname(name), cfg(cfg_), io(io) { objstore = make_rados_backend(io); rcache = get_read_cache_instance(cfg.rcache_dir, cfg.cache_size, objstore); @@ -48,6 +49,9 @@ lsvd_image::~lsvd_image() wlog->do_write_checkpoint(); xlate->shutdown(); + // TODO figure out who owns the rados connection + rados_ioctx_destroy(io); + log_info("Image '{}' closed", imgname); } diff --git a/src/image.h b/src/image.h index fea02c27..4f2d3870 100644 --- a/src/image.h +++ b/src/image.h @@ -57,6 +57,8 @@ class lsvd_image usize size; // bytes lsvd_config cfg; + rados_ioctx_t io; + vec clones; // Base images on which we're built vec checkpoints; // Checkpoints std::map obj_info; @@ -106,6 +108,5 @@ class lsvd_image rados_ioctx_t io); private: - void handle_reads(size_t offset, smartiov iovs, - vec &requests); + void handle_reads(size_t offset, smartiov iovs, vec &requests); }; diff --git a/src/translate.cc b/src/translate.cc index 3eff4b54..3c08c9c3 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -110,7 +110,11 @@ class translate_req : public request return len + bytes <= max; } - ~translate_req() {} + ~translate_req() + { + if (batch_buf) + free(batch_buf); + } /* NOTE - this assumes the only significant header entry is the map */ @@ -519,8 +523,6 @@ void translate_req::notify(request *child) } } - if (batch_buf != NULL) // allocated in constructor - free(batch_buf); if (gc_buf != NULL) // allocated in write_gc free(gc_buf); if (gc_data != NULL) // allocated in gc threqad @@ -587,9 +589,11 @@ void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) .map_len = (uint32_t)map_bytes}; auto objs = (char *)(ch + 1); - memcpy(objs, (char *)objects.data(), objs_bytes); auto maps = objs + objs_bytes; - memcpy(maps, (char *)entries.data(), map_bytes); + if (objs_bytes > 0) + memcpy(objs, (char *)objects.data(), objs_bytes); + if (map_bytes > 0) + memcpy(maps, (char *)entries.data(), map_bytes); // Write out the checkpoint objstore->write(oname(name, cp_seq), cp_buf.data(), cp_buf.size()); From 74bda81b2d6a6e8dc6025d623c0255212c3a977c Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 23 May 2024 00:22:38 +0000 Subject: [PATCH 52/77] Add vendored liburing --- .gitignore | 2 ++ subprojects/liburing.wrap | 13 +++++++++++++ 2 files changed, 15 insertions(+) create mode 100644 subprojects/liburing.wrap diff --git a/.gitignore b/.gitignore index d4b8999b..8b84efbf 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ qemu/*.img qemu/*.iso qemu/bzImage +subprojects/liburing-* +subprojects/packagecache diff --git a/subprojects/liburing.wrap b/subprojects/liburing.wrap new file mode 100644 index 00000000..00de3a76 --- /dev/null +++ b/subprojects/liburing.wrap @@ -0,0 +1,13 @@ +[wrap-file] +directory = liburing-liburing-2.5 +source_url = https://github.com/axboe/liburing/archive/refs/tags/liburing-2.5.tar.gz +source_filename = liburing-2.5.tar.gz +source_hash = 456f5f882165630f0dc7b75e8fd53bd01a955d5d4720729b4323097e6e9f2a98 +patch_filename = liburing_2.5-1_patch.zip +patch_url = https://wrapdb.mesonbuild.com/v2/liburing_2.5-1/get_patch +patch_hash = d72f651e0edd8102535af575d682ce86c3fc2fdabb40b8faa2659d0f7d437f44 +source_fallback_url = https://github.com/mesonbuild/wrapdb/releases/download/liburing_2.5-1/liburing-2.5.tar.gz +wrapdb_version = 2.5-1 + +[provide] +dependency_names = liburing From 29b9ef1f58c52f9e196c8c8382f528d3623ea7c5 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 23 May 2024 00:28:09 +0000 Subject: [PATCH 53/77] Change test-seq cfg options --- subprojects/folly | 1 + subprojects/spdk | 1 + test/test-seq.cc | 16 ++++++++++------ 3 files changed, 12 insertions(+), 6 deletions(-) create mode 160000 subprojects/folly create mode 160000 subprojects/spdk diff --git a/subprojects/folly b/subprojects/folly new file mode 160000 index 00000000..89ac8453 --- /dev/null +++ b/subprojects/folly @@ -0,0 +1 @@ +Subproject commit 89ac8453fc7825e566b46a7ebb9e0348b1c09338 diff --git a/subprojects/spdk b/subprojects/spdk new file mode 160000 index 00000000..0786843e --- /dev/null +++ b/subprojects/spdk @@ -0,0 +1 @@ +Subproject commit 0786843e99550cedb6de26c25641d9c294ddcb85 diff --git a/test/test-seq.cc b/test/test-seq.cc index d94bfef8..66aae582 100644 --- a/test/test-seq.cc +++ b/test/test-seq.cc @@ -1,5 +1,5 @@ -#include #include +#include #include #include @@ -16,8 +16,11 @@ using comp_buf = std::array; #ifdef __cplusplus extern "C" #endif -const char* __asan_default_options() { return "detect_leaks=0"; } - + const char * + __asan_default_options() +{ + return "detect_leaks=0"; +} /** * Usage: @@ -33,7 +36,8 @@ void hexdump(std::string desc, const void *addr, const int len, int perLine = 16) { int i; - unsigned char buff[perLine + 1]; + vec buf(perLine + 1); + auto buff = (unsigned char *)buf.data(); const unsigned char *pc = (const unsigned char *)addr; check_cond(len <= 0, "Invalid length {}", len); @@ -172,8 +176,8 @@ void run_test(rados_ioctx_t ctx) int main(int argc, char *argv[]) { // config options - setenv("LSVD_RCACHE_DIR", "/tmp/lsvd-read", 1); - setenv("LSVD_WCACHE_DIR", "/tmp/lsvd-write", 1); + setenv("LSVD_RCACHE_DIR", "/tmp/lsvd", 1); + setenv("LSVD_WCACHE_DIR", "/tmp/lsvd", 1); setenv("LSVD_CACHE_SIZE", "2147483648", 1); std::string pool_name = "pone"; From c621a4d9a667413807de78d59e2808266546a6f6 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 23 May 2024 01:08:04 +0000 Subject: [PATCH 54/77] Add some nvmf config to spdk fe --- src/spdk_frontend.cc | 58 +++++++++++++++++++++++ subprojects/packagefiles/spdk/meson.build | 1 + 2 files changed, 59 insertions(+) diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index 3b0904bb..111d35de 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -1,8 +1,11 @@ #include "spdk/event.h" +#include "spdk/nvmf.h" #include +#include #include #include "bdev_lsvd.h" +#include "spdk/nvmf_spec.h" #include "utils.h" struct start_lsvd_args { @@ -38,6 +41,61 @@ static void start_lsvd(void *arg) } // TODO setup nvmf subsystems and all that nonsense + // we can worry about refactoring it into functions later + + // Step 1: create nvmf target + log_info("Creating NVMF target"); + auto nvmf_opts = (spdk_nvmf_target_opts){ + .name = "lsvd_nvmf_tgt", + }; + auto tgt = spdk_nvmf_tgt_create(&nvmf_opts); + assert(tgt != nullptr); + + // Step 1.5: add discovery subsystem so we can probe for it + log_info("Creating NVMF discovery subsystem"); + auto disc_ss = spdk_nvmf_subsystem_create( + tgt, SPDK_NVMF_DISCOVERY_NQN, SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT, 0); + assert(disc_ss != nullptr); + spdk_nvmf_subsystem_set_allow_any_host(disc_ss, true); + + // Step 2: create TCP transport + spdk_nvmf_transport_opts opts; + auto succ = spdk_nvmf_transport_opts_init("TCP", &opts, sizeof(opts)); + assert(succ == true); + // opts.io_unit_size = 131072; + // opts.max_qpairs_per_ctrlr = 8; + opts.in_capsule_data_size = 8192; + debug("TCP transport opts: io_unit_size={}, max_qpairs_per_ctrlr={}, " + "in_capsule_data_size={}", + opts.io_unit_size, opts.max_qpairs_per_ctrlr, + opts.in_capsule_data_size); + + log_info("Creating TCP transport"); + auto transport_p = std::promise(); + spdk_nvmf_transport_create_async( + "TCP", &opts, + [](auto p, auto b) { + auto pr = (std::promise *)p; + pr->set_value(b); + }, + &transport_p); + auto transport = transport_p.get_future().get(); + assert(transport != nullptr); + + log_info("Adding TCP transport to target"); + auto stat_p = std::promise(); + spdk_nvmf_tgt_add_transport( + tgt, transport, + [](auto p, auto stat) { + auto pr = (std::promise *)p; + pr->set_value(stat); + }, + nullptr); + auto status = stat_p.get_future().get(); + assert(status == 0); + + // Step 3: create subsystem for our bdev + log_info("Creating SPDK controller"); } int main(int argc, const char **argv) diff --git a/subprojects/packagefiles/spdk/meson.build b/subprojects/packagefiles/spdk/meson.build index 1298dfb2..c798d74c 100644 --- a/subprojects/packagefiles/spdk/meson.build +++ b/subprojects/packagefiles/spdk/meson.build @@ -56,6 +56,7 @@ custom_libnames = [ 'spdk_lvol', 'spdk_blob', 'spdk_nvme', + 'spdk_nvmf', # 'spdk_bdev_aio', 'spdk_bdev_ftl', 'spdk_ftl', From e7029516136c6720b079d8692e4a706b0cac530f Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 28 May 2024 03:48:39 +0000 Subject: [PATCH 55/77] Transcribe rpc fns --- src/backend.h | 2 +- src/rados_backend.cc | 19 +++++ src/spdk_frontend.cc | 199 +++++++++++++++++++++++++++++-------------- 3 files changed, 157 insertions(+), 63 deletions(-) diff --git a/src/backend.h b/src/backend.h index 75f2feb1..5a6f65d1 100644 --- a/src/backend.h +++ b/src/backend.h @@ -51,5 +51,5 @@ class backend virtual bool exists(std::string name) = 0; }; -extern std::shared_ptr make_file_backend(const char *prefix); extern std::shared_ptr make_rados_backend(rados_ioctx_t io); +rados_ioctx_t connect_to_pool(str pool_name); diff --git a/src/rados_backend.cc b/src/rados_backend.cc index b63e2904..74ab05a0 100644 --- a/src/rados_backend.cc +++ b/src/rados_backend.cc @@ -205,3 +205,22 @@ std::shared_ptr make_rados_backend(rados_ioctx_t io) { return std::make_shared(io); } + +rados_ioctx_t connect_to_pool(str pool_name) +{ + rados_t cluster; + int err = rados_create2(&cluster, "ceph", "client.admin", 0); + check_ret_neg(err, "Failed to create cluster handle"); + + err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); + check_ret_neg(err, "Failed to read config file"); + + err = rados_connect(cluster); + check_ret_neg(err, "Failed to connect to cluster"); + + rados_ioctx_t io_ctx; + err = rados_ioctx_create(cluster, pool_name.c_str(), &io_ctx); + check_ret_neg(err, "Failed to connect to pool {}", pool_name); + + return io_ctx; +} diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index 111d35de..7ccd95a2 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -1,101 +1,176 @@ #include "spdk/event.h" +#include "spdk/nvme.h" #include "spdk/nvmf.h" +#include #include #include #include +#include "backend.h" #include "bdev_lsvd.h" #include "spdk/nvmf_spec.h" #include "utils.h" +const char *NVME_SS_NQN = "nqn.2019-05.io.lsvd:cnode1"; +const char *HOSTNAME = "127.0.0.1"; +const char *PORT = "4420"; + +using IntCallbackFn = std::function; +IntCallbackFn *alloc_cb(std::function cb) +{ + return new IntCallbackFn(cb); +} + +void invoke_and_free_cb(void *ctx, int status) +{ + auto cb = static_cast *>(ctx); + (*cb)(status); + delete cb; +} + struct start_lsvd_args { const char *pool_name; const char *image_name; }; -static void start_lsvd(void *arg) +spdk_nvmf_tgt *create_target() { - log_info("Starting LSVD SPDK program ..."); - auto args = (start_lsvd_args *)arg; - - rados_t cluster; - int err = rados_create2(&cluster, "ceph", "client.admin", 0); - check_ret_neg(err, "Failed to create cluster handle"); - - err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); - check_ret_neg(err, "Failed to read config file"); - - err = rados_connect(cluster); - check_ret_neg(err, "Failed to connect to cluster"); - - rados_ioctx_t io_ctx; - err = rados_ioctx_create(cluster, args->pool_name, &io_ctx); - check_ret_neg(err, "Failed to connect to pool {}", args->pool_name); - - lsvd_config cfg; // TODO get this from somewhere reasonable - cfg.cache_size = 160 * 1024 * 1024; - err = bdev_lsvd_create(args->image_name, io_ctx, cfg); - if (err) { - log_error("Failed to create bdev"); - spdk_app_stop(err); - } - - // TODO setup nvmf subsystems and all that nonsense - // we can worry about refactoring it into functions later - - // Step 1: create nvmf target log_info("Creating NVMF target"); - auto nvmf_opts = (spdk_nvmf_target_opts){ + spdk_nvmf_target_opts opts = { .name = "lsvd_nvmf_tgt", }; - auto tgt = spdk_nvmf_tgt_create(&nvmf_opts); + auto tgt = spdk_nvmf_tgt_create(&opts); assert(tgt != nullptr); + return tgt; +} - // Step 1.5: add discovery subsystem so we can probe for it +spdk_nvmf_subsystem *add_discovery_ss(spdk_nvmf_tgt *tgt) +{ log_info("Creating NVMF discovery subsystem"); - auto disc_ss = spdk_nvmf_subsystem_create( + auto ss = spdk_nvmf_subsystem_create( tgt, SPDK_NVMF_DISCOVERY_NQN, SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT, 0); - assert(disc_ss != nullptr); - spdk_nvmf_subsystem_set_allow_any_host(disc_ss, true); + assert(ss != nullptr); + spdk_nvmf_subsystem_set_allow_any_host(ss, true); + return ss; +} - // Step 2: create TCP transport +spdk_nvmf_subsystem *add_nvme_ss(spdk_nvmf_tgt *tgt) +{ + log_info("Creating SPDK controller subsystem"); + auto ss = + spdk_nvmf_subsystem_create(tgt, NVME_SS_NQN, SPDK_NVMF_SUBTYPE_NVME, 1); + assert(ss != nullptr); + spdk_nvmf_subsystem_set_allow_any_host(ss, true); + spdk_nvmf_subsystem_set_sn(ss, "SPDK_000001"); + spdk_nvmf_subsystem_set_mn(ss, "LSVD NVMe controller"); + spdk_nvmf_subsystem_set_ana_reporting(ss, true); + return ss; +} + +using TranspCb = std::function; +void create_tcp_transport(TranspCb *cb) +{ + log_info("Creating TCP transport"); spdk_nvmf_transport_opts opts; auto succ = spdk_nvmf_transport_opts_init("TCP", &opts, sizeof(opts)); assert(succ == true); - // opts.io_unit_size = 131072; - // opts.max_qpairs_per_ctrlr = 8; + opts.io_unit_size = 131072; + opts.max_qpairs_per_ctrlr = 8; opts.in_capsule_data_size = 8192; debug("TCP transport opts: io_unit_size={}, max_qpairs_per_ctrlr={}, " "in_capsule_data_size={}", opts.io_unit_size, opts.max_qpairs_per_ctrlr, opts.in_capsule_data_size); - log_info("Creating TCP transport"); - auto transport_p = std::promise(); - spdk_nvmf_transport_create_async( + auto rc = spdk_nvmf_transport_create_async( "TCP", &opts, - [](auto p, auto b) { - auto pr = (std::promise *)p; - pr->set_value(b); - }, - &transport_p); - auto transport = transport_p.get_future().get(); - assert(transport != nullptr); - - log_info("Adding TCP transport to target"); - auto stat_p = std::promise(); - spdk_nvmf_tgt_add_transport( - tgt, transport, - [](auto p, auto stat) { - auto pr = (std::promise *)p; - pr->set_value(stat); + [](auto ctx, auto r) { + auto cb = static_cast(ctx); + (*cb)(r); + delete cb; }, - nullptr); - auto status = stat_p.get_future().get(); - assert(status == 0); + cb); + assert(rc == 0); +} + +void add_transport(spdk_nvmf_tgt *tgt, spdk_nvmf_transport *tr, + std::function *cb) +{ + log_info("Adding transport to target"); + spdk_nvmf_tgt_add_transport(tgt, tr, invoke_and_free_cb, cb); +} + +void add_ss_listener(spdk_nvmf_tgt *tgt, spdk_nvmf_subsystem *ss, str host, + str port, std::function *cb) +{ + log_info("Adding listener to subsystem"); + + spdk_nvme_transport_id trid; + // They're fixed-size char[] bufs in the struct, so make sure we have space + assert(host.size() < sizeof(trid.traddr)); + assert(port.size() < sizeof(trid.trsvcid)); + trid.trtype = SPDK_NVME_TRANSPORT_TCP; + trid.adrfam = SPDK_NVMF_ADRFAM_IPV4; + std::copy(host.begin(), host.end(), trid.traddr); + std::copy(port.begin(), port.end(), trid.trsvcid); + + spdk_nvmf_listen_opts lopts1; + spdk_nvmf_listen_opts_init(&lopts1, sizeof(lopts1)); + auto rc = spdk_nvmf_tgt_listen_ext(tgt, &trid, &lopts1); + assert(rc == 0); + + spdk_nvmf_listener_opts lopts; + spdk_nvmf_subsystem_listener_opts_init(&lopts, sizeof(lopts)); + lopts.secure_channel = false; + + spdk_nvmf_subsystem_add_listener_ext(ss, &trid, invoke_and_free_cb, cb, + &lopts); +} + +void add_bdev_ns(spdk_nvmf_subsystem *ss, str bdev_name) +{ + log_info("Adding bdev namespace to subsystem"); + spdk_nvmf_ns_opts nopts; + spdk_nvmf_ns_opts_get_defaults(&nopts, sizeof(nopts)); + auto err = spdk_nvmf_subsystem_add_ns_ext(ss, bdev_name.c_str(), &nopts, + sizeof(nopts), nullptr); + assert(err == 0); +} + +static void start_lsvd(void *arg) +{ + log_info("Starting LSVD SPDK program ..."); + auto args = (start_lsvd_args *)arg; - // Step 3: create subsystem for our bdev - log_info("Creating SPDK controller"); + auto io_ctx = connect_to_pool(args->pool_name); + + // Setup spdk nvmf + auto tgt = create_target(); + auto disc_ss = add_discovery_ss(tgt); + auto nvme_ss = add_nvme_ss(tgt); + + // Add lsvd bdev + lsvd_config cfg; // TODO read this in from a config file + cfg.cache_size = 160 * 1024 * 1024; // small 160mb cache for testing + auto err = bdev_lsvd_create(args->image_name, io_ctx, cfg); + assert(err == 0); + add_bdev_ns(nvme_ss, args->image_name); + + // some stupid formatting decisions up ahead due to tower-of-callback + // clang-format off + create_tcp_transport(new TranspCb([=](auto *tr) { + assert(tr != nullptr); + add_transport(tgt, tr, alloc_cb([=](int status) { + assert(status == 0); + add_ss_listener(tgt, nvme_ss, HOSTNAME, PORT, alloc_cb([=](int status) { + assert(status == 0); + // Start both subsystems + spdk_nvmf_subsystem_start(nvme_ss, nullptr, nullptr); + spdk_nvmf_subsystem_start(disc_ss, nullptr, nullptr); + })); + })); + })); + // clang-format on } int main(int argc, const char **argv) From f73950884e29df5c832bd80e1a7dfa2cbc560308 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 02:30:10 +0000 Subject: [PATCH 56/77] Fix missing pollgroup --- src/spdk_frontend.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index 7ccd95a2..ac62a3e0 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -38,9 +38,14 @@ spdk_nvmf_tgt *create_target() log_info("Creating NVMF target"); spdk_nvmf_target_opts opts = { .name = "lsvd_nvmf_tgt", + .discovery_filter = SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY, }; auto tgt = spdk_nvmf_tgt_create(&opts); assert(tgt != nullptr); + + auto pg = spdk_nvmf_poll_group_create(tgt); + assert(pg != nullptr); + return tgt; } @@ -109,10 +114,13 @@ void add_ss_listener(spdk_nvmf_tgt *tgt, spdk_nvmf_subsystem *ss, str host, // They're fixed-size char[] bufs in the struct, so make sure we have space assert(host.size() < sizeof(trid.traddr)); assert(port.size() < sizeof(trid.trsvcid)); - trid.trtype = SPDK_NVME_TRANSPORT_TCP; - trid.adrfam = SPDK_NVMF_ADRFAM_IPV4; std::copy(host.begin(), host.end(), trid.traddr); std::copy(port.begin(), port.end(), trid.trsvcid); + trid.trtype = SPDK_NVME_TRANSPORT_TCP; + trid.adrfam = SPDK_NVMF_ADRFAM_IPV4; + // This is required because spdk looks at trstring, not the trtype + spdk_nvme_transport_id_populate_trstring( + &trid, spdk_nvme_transport_id_trtype_str(trid.trtype)); spdk_nvmf_listen_opts lopts1; spdk_nvmf_listen_opts_init(&lopts1, sizeof(lopts1)); @@ -160,13 +168,15 @@ static void start_lsvd(void *arg) // clang-format off create_tcp_transport(new TranspCb([=](auto *tr) { assert(tr != nullptr); - add_transport(tgt, tr, alloc_cb([=](int status) { - assert(status == 0); - add_ss_listener(tgt, nvme_ss, HOSTNAME, PORT, alloc_cb([=](int status) { - assert(status == 0); + add_transport(tgt, tr, alloc_cb([=](int rc) { + assert(rc == 0); + add_ss_listener(tgt, nvme_ss, HOSTNAME, PORT, alloc_cb([=](int rc) { + assert(rc == 0); // Start both subsystems spdk_nvmf_subsystem_start(nvme_ss, nullptr, nullptr); spdk_nvmf_subsystem_start(disc_ss, nullptr, nullptr); + + log_info("LSVD SPDK program started successfully"); })); })); })); From e62035ed9b96c6b35b32c00413f23baa4b484011 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 03:06:54 +0000 Subject: [PATCH 57/77] Fix partially-initialised spdk bdev config --- src/bdev_lsvd.cc | 1 + src/spdk_frontend.cc | 76 +++++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index c22a7c0c..f140f700 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -45,6 +45,7 @@ class lsvd_iodevice lsvd_iodevice(uptr img_) : img(std::move(img_)) { + std::memset(&bdev, 0, sizeof(bdev)); bdev.product_name = strdup("Log-structured Virtual Disk"); bdev.name = strdup(img->imgname.c_str()); bdev.blocklen = 4096; diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index ac62a3e0..b33b19a2 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -15,6 +15,22 @@ const char *NVME_SS_NQN = "nqn.2019-05.io.lsvd:cnode1"; const char *HOSTNAME = "127.0.0.1"; const char *PORT = "4420"; +spdk_nvme_transport_id get_trid(const char *host, const char *port) +{ + spdk_nvme_transport_id trid; + // They're fixed-size char[] bufs in the struct, so make sure we have space + assert(strlen(host) < sizeof(trid.traddr)); + assert(strlen(port) < sizeof(trid.trsvcid)); + std::copy(host, host + strlen(host), trid.traddr); + std::copy(port, port + strlen(port), trid.trsvcid); + trid.trtype = SPDK_NVME_TRANSPORT_TCP; + trid.adrfam = SPDK_NVMF_ADRFAM_IPV4; + // This is required because spdk looks at trstring, not the trtype + spdk_nvme_transport_id_populate_trstring( + &trid, spdk_nvme_transport_id_trtype_str(trid.trtype)); + return trid; +} + using IntCallbackFn = std::function; IntCallbackFn *alloc_cb(std::function cb) { @@ -98,39 +114,29 @@ void create_tcp_transport(TranspCb *cb) assert(rc == 0); } -void add_transport(spdk_nvmf_tgt *tgt, spdk_nvmf_transport *tr, - std::function *cb) +void add_tgt_transport(spdk_nvmf_tgt *tgt, spdk_nvmf_transport *tr, + std::function *cb) { log_info("Adding transport to target"); spdk_nvmf_tgt_add_transport(tgt, tr, invoke_and_free_cb, cb); } -void add_ss_listener(spdk_nvmf_tgt *tgt, spdk_nvmf_subsystem *ss, str host, - str port, std::function *cb) +void start_tgt_listen(spdk_nvmf_tgt *tgt, spdk_nvme_transport_id trid) { - log_info("Adding listener to subsystem"); - - spdk_nvme_transport_id trid; - // They're fixed-size char[] bufs in the struct, so make sure we have space - assert(host.size() < sizeof(trid.traddr)); - assert(port.size() < sizeof(trid.trsvcid)); - std::copy(host.begin(), host.end(), trid.traddr); - std::copy(port.begin(), port.end(), trid.trsvcid); - trid.trtype = SPDK_NVME_TRANSPORT_TCP; - trid.adrfam = SPDK_NVMF_ADRFAM_IPV4; - // This is required because spdk looks at trstring, not the trtype - spdk_nvme_transport_id_populate_trstring( - &trid, spdk_nvme_transport_id_trtype_str(trid.trtype)); - - spdk_nvmf_listen_opts lopts1; - spdk_nvmf_listen_opts_init(&lopts1, sizeof(lopts1)); - auto rc = spdk_nvmf_tgt_listen_ext(tgt, &trid, &lopts1); + spdk_nvmf_listen_opts lopts; + spdk_nvmf_listen_opts_init(&lopts, sizeof(lopts)); + auto rc = spdk_nvmf_tgt_listen_ext(tgt, &trid, &lopts); assert(rc == 0); +} + +void add_ss_listener(spdk_nvmf_tgt *tgt, spdk_nvmf_subsystem *ss, + spdk_nvme_transport_id trid, std::function *cb) +{ + log_info("Adding listener to subsystem"); spdk_nvmf_listener_opts lopts; spdk_nvmf_subsystem_listener_opts_init(&lopts, sizeof(lopts)); lopts.secure_channel = false; - spdk_nvmf_subsystem_add_listener_ext(ss, &trid, invoke_and_free_cb, cb, &lopts); } @@ -140,9 +146,9 @@ void add_bdev_ns(spdk_nvmf_subsystem *ss, str bdev_name) log_info("Adding bdev namespace to subsystem"); spdk_nvmf_ns_opts nopts; spdk_nvmf_ns_opts_get_defaults(&nopts, sizeof(nopts)); - auto err = spdk_nvmf_subsystem_add_ns_ext(ss, bdev_name.c_str(), &nopts, - sizeof(nopts), nullptr); - assert(err == 0); + auto nsid = spdk_nvmf_subsystem_add_ns_ext(ss, bdev_name.c_str(), &nopts, + sizeof(nopts), nullptr); + assert(nsid != 0); } static void start_lsvd(void *arg) @@ -156,6 +162,7 @@ static void start_lsvd(void *arg) auto tgt = create_target(); auto disc_ss = add_discovery_ss(tgt); auto nvme_ss = add_nvme_ss(tgt); + auto trid = get_trid(HOSTNAME, PORT); // Add lsvd bdev lsvd_config cfg; // TODO read this in from a config file @@ -168,15 +175,18 @@ static void start_lsvd(void *arg) // clang-format off create_tcp_transport(new TranspCb([=](auto *tr) { assert(tr != nullptr); - add_transport(tgt, tr, alloc_cb([=](int rc) { + add_tgt_transport(tgt, tr, alloc_cb([=](int rc) { assert(rc == 0); - add_ss_listener(tgt, nvme_ss, HOSTNAME, PORT, alloc_cb([=](int rc) { - assert(rc == 0); - // Start both subsystems - spdk_nvmf_subsystem_start(nvme_ss, nullptr, nullptr); - spdk_nvmf_subsystem_start(disc_ss, nullptr, nullptr); - - log_info("LSVD SPDK program started successfully"); + start_tgt_listen(tgt, trid); + add_ss_listener(tgt, disc_ss, trid, alloc_cb([=](int) { + add_ss_listener(tgt, nvme_ss, trid, alloc_cb([=](int rc) { + assert(rc == 0); + // Start both subsystems + spdk_nvmf_subsystem_start(nvme_ss, nullptr, nullptr); + spdk_nvmf_subsystem_start(disc_ss, nullptr, nullptr); + + log_info("LSVD SPDK program started successfully"); + })); })); })); })); From b5855d2e25a91d066e83f5b63ca87958c94361c9 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 03:28:59 +0000 Subject: [PATCH 58/77] Downgrade logging lvs and wait for ss start --- src/spdk_frontend.cc | 63 +++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/src/spdk_frontend.cc b/src/spdk_frontend.cc index b33b19a2..e22681be 100644 --- a/src/spdk_frontend.cc +++ b/src/spdk_frontend.cc @@ -51,7 +51,7 @@ struct start_lsvd_args { spdk_nvmf_tgt *create_target() { - log_info("Creating NVMF target"); + debug("Creating NVMF target"); spdk_nvmf_target_opts opts = { .name = "lsvd_nvmf_tgt", .discovery_filter = SPDK_NVMF_TGT_DISCOVERY_MATCH_ANY, @@ -67,7 +67,7 @@ spdk_nvmf_tgt *create_target() spdk_nvmf_subsystem *add_discovery_ss(spdk_nvmf_tgt *tgt) { - log_info("Creating NVMF discovery subsystem"); + debug("Creating NVMF discovery subsystem"); auto ss = spdk_nvmf_subsystem_create( tgt, SPDK_NVMF_DISCOVERY_NQN, SPDK_NVMF_SUBTYPE_DISCOVERY_CURRENT, 0); assert(ss != nullptr); @@ -77,7 +77,7 @@ spdk_nvmf_subsystem *add_discovery_ss(spdk_nvmf_tgt *tgt) spdk_nvmf_subsystem *add_nvme_ss(spdk_nvmf_tgt *tgt) { - log_info("Creating SPDK controller subsystem"); + debug("Creating SPDK controller subsystem"); auto ss = spdk_nvmf_subsystem_create(tgt, NVME_SS_NQN, SPDK_NVMF_SUBTYPE_NVME, 1); assert(ss != nullptr); @@ -91,7 +91,7 @@ spdk_nvmf_subsystem *add_nvme_ss(spdk_nvmf_tgt *tgt) using TranspCb = std::function; void create_tcp_transport(TranspCb *cb) { - log_info("Creating TCP transport"); + debug("Creating TCP transport"); spdk_nvmf_transport_opts opts; auto succ = spdk_nvmf_transport_opts_init("TCP", &opts, sizeof(opts)); assert(succ == true); @@ -117,7 +117,7 @@ void create_tcp_transport(TranspCb *cb) void add_tgt_transport(spdk_nvmf_tgt *tgt, spdk_nvmf_transport *tr, std::function *cb) { - log_info("Adding transport to target"); + debug("Adding transport to target"); spdk_nvmf_tgt_add_transport(tgt, tr, invoke_and_free_cb, cb); } @@ -132,7 +132,7 @@ void start_tgt_listen(spdk_nvmf_tgt *tgt, spdk_nvme_transport_id trid) void add_ss_listener(spdk_nvmf_tgt *tgt, spdk_nvmf_subsystem *ss, spdk_nvme_transport_id trid, std::function *cb) { - log_info("Adding listener to subsystem"); + debug("Adding listener to subsystem"); spdk_nvmf_listener_opts lopts; spdk_nvmf_subsystem_listener_opts_init(&lopts, sizeof(lopts)); @@ -143,7 +143,7 @@ void add_ss_listener(spdk_nvmf_tgt *tgt, spdk_nvmf_subsystem *ss, void add_bdev_ns(spdk_nvmf_subsystem *ss, str bdev_name) { - log_info("Adding bdev namespace to subsystem"); + debug("Adding bdev namespace to subsystem"); spdk_nvmf_ns_opts nopts; spdk_nvmf_ns_opts_get_defaults(&nopts, sizeof(nopts)); auto nsid = spdk_nvmf_subsystem_add_ns_ext(ss, bdev_name.c_str(), &nopts, @@ -151,6 +151,19 @@ void add_bdev_ns(spdk_nvmf_subsystem *ss, str bdev_name) assert(nsid != 0); } +void start_ss(spdk_nvmf_subsystem *ss, std::function *cb) +{ + // debug("Starting subsystem"); + spdk_nvmf_subsystem_start( + ss, + [](auto ss, auto arg, auto rc) { + auto cb = static_cast *>(arg); + (*cb)(rc); + delete cb; + }, + cb); +} + static void start_lsvd(void *arg) { log_info("Starting LSVD SPDK program ..."); @@ -172,24 +185,26 @@ static void start_lsvd(void *arg) add_bdev_ns(nvme_ss, args->image_name); // some stupid formatting decisions up ahead due to tower-of-callback + // it also looks cleaner without indents // clang-format off create_tcp_transport(new TranspCb([=](auto *tr) { - assert(tr != nullptr); - add_tgt_transport(tgt, tr, alloc_cb([=](int rc) { - assert(rc == 0); - start_tgt_listen(tgt, trid); - add_ss_listener(tgt, disc_ss, trid, alloc_cb([=](int) { - add_ss_listener(tgt, nvme_ss, trid, alloc_cb([=](int rc) { - assert(rc == 0); - // Start both subsystems - spdk_nvmf_subsystem_start(nvme_ss, nullptr, nullptr); - spdk_nvmf_subsystem_start(disc_ss, nullptr, nullptr); - - log_info("LSVD SPDK program started successfully"); - })); - })); - })); - })); + assert(tr != nullptr); + + add_tgt_transport(tgt, tr, alloc_cb([=](int rc) { + assert(rc == 0); + + start_tgt_listen(tgt, trid); + add_ss_listener(tgt, disc_ss, trid, alloc_cb([=](int) { + add_ss_listener(tgt, nvme_ss, trid, alloc_cb([=](int rc) { + assert(rc == 0); + + // Start both subsystems + start_ss(nvme_ss, alloc_cb([=](int) { + start_ss(disc_ss, alloc_cb([=](int) { + + log_info("LSVD SPDK program started successfully"); + + })); })); })); })); })); })); // clang-format on } @@ -212,7 +227,7 @@ int main(int argc, const char **argv) .pool_name = argv[1], .image_name = argv[2], }; - log_info("Args: pool={}, image={}", args.pool_name, args.image_name); + debug("Args: pool={}, image={}", args.pool_name, args.image_name); spdk_app_opts opts = {.shutdown_cb = []() { log_info("Shutting down LSVD SPDK program ..."); From 5ca6a8f33e261fd724de626d8f51f50c344c7573 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 03:41:07 +0000 Subject: [PATCH 59/77] Fix wraps to depth1 and use cmake for folly --- subprojects/folly.wrap | 2 ++ subprojects/spdk.wrap | 1 + 2 files changed, 3 insertions(+) diff --git a/subprojects/folly.wrap b/subprojects/folly.wrap index 32ad06f7..90426749 100644 --- a/subprojects/folly.wrap +++ b/subprojects/folly.wrap @@ -2,6 +2,8 @@ url = https://github.com/facebook/folly.git revision = v2024.05.20.00 clone-recursive = true +method = cmake +depth = 1 [provide] _folly = folly_dep \ No newline at end of file diff --git a/subprojects/spdk.wrap b/subprojects/spdk.wrap index 8890bc98..1336d4c0 100644 --- a/subprojects/spdk.wrap +++ b/subprojects/spdk.wrap @@ -3,6 +3,7 @@ url = https://github.com/spdk/spdk.git revision = v24.01 patch_directory = spdk clone-recursive = true +depth = 1 [provide] _spdk = spdk_dep \ No newline at end of file From d773251e74c9d71152f6b3e0f3f24d803bae8615 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 03:45:40 +0000 Subject: [PATCH 60/77] Add thinlto cache --- meson.build | 1 + 1 file changed, 1 insertion(+) diff --git a/meson.build b/meson.build index 29dcbe68..a3601f9d 100644 --- a/meson.build +++ b/meson.build @@ -8,6 +8,7 @@ project( 'b_colorout=always', 'b_sanitize=address,undefined', 'b_lto=true', + 'b_thinlto_cache=true', ], ) From f733fc9bcb2e38b8de0f2d93652c8e22a98b15e7 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 19:09:24 +0000 Subject: [PATCH 61/77] Fix incorrect thread for completion path --- src/bdev_lsvd.cc | 64 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/src/bdev_lsvd.cc b/src/bdev_lsvd.cc index f140f700..8890edc6 100644 --- a/src/bdev_lsvd.cc +++ b/src/bdev_lsvd.cc @@ -4,22 +4,35 @@ #include "bdev_lsvd.h" #include "image.h" +#include "request.h" #include "smartiov.h" #include "spdk/thread.h" #include "utils.h" static int bdev_lsvd_init(void); static void bdev_lsvd_finish(void); +static int bdev_lsvd_io_ctx_size(void); static spdk_bdev_module lsvd_if = { .module_init = bdev_lsvd_init, .module_fini = bdev_lsvd_finish, .name = "LSVD bdev module", + .get_ctx_size = bdev_lsvd_io_ctx_size, }; SPDK_BDEV_MODULE_REGISTER(ext_lsvd, &lsvd_if); -static int bdev_lsvd_init(void) { return 0; } -static void bdev_lsvd_finish(void) {} +static int bdev_lsvd_init(void) +{ + spdk_io_device_register( + &lsvd_if, [](auto iod, auto buf) { return 0; }, + [](auto iod, auto buf) { return; }, 0, "lsvd_poll_groups"); + return 0; +} + +static void bdev_lsvd_finish(void) +{ + spdk_io_device_unregister(&lsvd_if, nullptr); +} /** * Function table for the LSVD bdev module. @@ -90,42 +103,73 @@ static spdk_io_channel *lsvd_get_io_channel(void *ctx) // SPDK will pass this to the iodevice's registered create/destroy // io_channel functions that were passed in when the device was registered. // We don't need to do anything special here, so just return the iodevice. - return spdk_get_io_channel(iodev); + auto ch = spdk_get_io_channel(iodev); + assert(ch != nullptr); + return ch; +} + +struct lsvd_bdev_io { + spdk_thread *submit_td; + spdk_bdev_io_status status; + request *r; +}; + +static int bdev_lsvd_io_ctx_size(void) { return sizeof(lsvd_bdev_io); } + +static void lsvd_io_done(lsvd_bdev_io *io, int rc) +{ + auto sth = io->submit_td; + assert(sth != nullptr); + + // error is -errno, succ is 0 or bytes read/written + io->status = + rc >= 0 ? SPDK_BDEV_IO_STATUS_SUCCESS : SPDK_BDEV_IO_STATUS_FAILED; + + spdk_thread_send_msg( + sth, + [](void *ctx) { + auto io = (lsvd_bdev_io *)ctx; + spdk_bdev_io_complete(spdk_bdev_io_from_ctx(io), io->status); + }, + io); } static void lsvd_submit_io(spdk_io_channel *c, spdk_bdev_io *io) { auto dev = static_cast(io->bdev->ctxt); auto &img = dev->img; + auto lio = (lsvd_bdev_io *)(io->driver_ctx); + lio->submit_td = spdk_io_channel_get_thread(c); // io details auto offset = io->u.bdev.offset_blocks * io->bdev->blocklen; auto len = io->u.bdev.num_blocks * io->bdev->blocklen; smartiov iov(io->u.bdev.iovs, io->u.bdev.iovcnt); - request *r; + auto comp = [lio](int rc) { lsvd_io_done(lio, rc); }; + switch (io->type) { case SPDK_BDEV_IO_TYPE_READ: - r = img->read(offset, iov, nullptr); + lio->r = img->read(offset, iov, comp); break; case SPDK_BDEV_IO_TYPE_WRITE: - r = img->write(offset, iov, nullptr); + lio->r = img->write(offset, iov, comp); break; case SPDK_BDEV_IO_TYPE_FLUSH: - r = img->flush(nullptr); + lio->r = img->flush(comp); break; case SPDK_BDEV_IO_TYPE_UNMAP: - r = img->trim(offset, len, nullptr); + lio->r = img->trim(offset, len, comp); break; case SPDK_BDEV_IO_TYPE_WRITE_ZEROES: - r = img->trim(offset, len, nullptr); + lio->r = img->trim(offset, len, comp); break; default: log_error("Unknown request type: {}", io->type); return; } - r->run(nullptr); + lio->r->run(nullptr); } // Just copying from bdev_rbd, not sure where this is actually used From ea29fe3f3948930c41f5da89062dce3d8525e185 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 19:12:17 +0000 Subject: [PATCH 62/77] Build debug only for CI --- Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4c257a70..24f790f1 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,8 @@ setup: meson setup --native-file meson.ini build-dbg --buildtype=debug ln -s build-dbg builddir -debug: setup +debug: + meson setup --native-file meson.ini build-dbg --buildtype=debug cd build-dbg; meson compile paper: From 8b4dd7db03db861fe8a01afffc231a186daf2a75 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 29 May 2024 19:14:44 +0000 Subject: [PATCH 63/77] Remove subproject dirs --- subprojects/folly | 1 - subprojects/spdk | 1 - 2 files changed, 2 deletions(-) delete mode 160000 subprojects/folly delete mode 160000 subprojects/spdk diff --git a/subprojects/folly b/subprojects/folly deleted file mode 160000 index 89ac8453..00000000 --- a/subprojects/folly +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 89ac8453fc7825e566b46a7ebb9e0348b1c09338 diff --git a/subprojects/spdk b/subprojects/spdk deleted file mode 160000 index 0786843e..00000000 --- a/subprojects/spdk +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0786843e99550cedb6de26c25641d9c294ddcb85 From 461abe50a723de0c36b1e93bdf7a526b3d6fdb34 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 30 May 2024 00:00:21 +0000 Subject: [PATCH 64/77] Rewrite imgtool to use new apis --- src/imgtool.cc | 242 ++++++++++++++++++++----------------------------- src/objects.cc | 6 +- src/utils.h | 3 +- 3 files changed, 104 insertions(+), 147 deletions(-) diff --git a/src/imgtool.cc b/src/imgtool.cc index b3517884..0e6e52d6 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -1,141 +1,79 @@ #include +#include +#include +#include #include #include #include -#include +#include #include #include #include #include "backend.h" -#include "config.h" -#include "fake_rbd.h" -#include "lsvd_types.h" +#include "image.h" #include "objects.h" -#include "translate.h" #include "utils.h" -enum tool_operation { OP_CREATE = 1, OP_DELETE = 2, OP_INFO = 3, OP_CLONE = 5 }; - -const char *backend = "rados"; -const char *image_name; -cfg_cache_type cache_type = LSVD_CFG_READ; -enum tool_operation op; -const char *pool_name = "lsvd"; -size_t size = 0; - -static long parseint(const char *_s) +static usize parseint(str i) { - char *s = (char *)_s; - long val = strtol(s, &s, 0); - if (toupper(*s) == 'G') + usize processed; + auto val = std::stoll(i, &processed); + char *postfix = (char *)i.c_str() + processed; + + if (toupper(*postfix) == 'G') val *= (1024 * 1024 * 1024); - if (toupper(*s) == 'M') + if (toupper(*postfix) == 'M') val *= (1024 * 1024); - if (toupper(*s) == 'K') + if (toupper(*postfix) == 'K') val *= 1024; + return val; } -static struct argp_option options[] = { - {"create", 'C', 0, 0, "create image", 0}, - {"size", 'z', "SIZE", 0, "size in bytes (M/G=2^20,2^30)", 0}, - {"delete", 'D', 0, 0, "delete image", 0}, - {"info", 'I', 0, 0, "show image information", 0}, - {"clone", 'c', "IMAGE", 0, "clone image", 0}, - {"pool", 'p', "POOL", 0, "pool name", 0}, - {0, 0, 0, 0, 0, 0}, -}; - -static char args_doc[] = "IMAGE"; +static void create(rados_ioctx_t io, str name, usize size) +{ + auto rc = lsvd_image::create_new(name, size, io); + THROW_MSG_ON(rc != 0, "Failed to create new image '{}'", name); +} -static error_t parse_opt(int key, char *arg, struct argp_state *state) +static void remove(rados_ioctx_t io, str name) { - switch (key) { - case ARGP_KEY_ARG: - image_name = arg; - break; - case 'C': - op = OP_CREATE; - break; - case 'z': - size = parseint(arg); - break; - case 'D': - op = OP_DELETE; - break; - case 'I': - op = OP_INFO; - break; - case 'c': - op = OP_CLONE; - break; - case 'p': - pool_name = arg; - case ARGP_KEY_END: - if (op == 0 || (op == OP_CREATE && size == 0)) - argp_usage(state); - break; - } - return 0; + auto rc = lsvd_image::delete_image(name, io); + THROW_MSG_ON(rc != 0, "Failed to delete image '{}'", name); } -static struct argp argp = {options, parse_opt, NULL, args_doc, 0, 0, 0}; +static void clone(rados_ioctx_t io, str src, str dst) +{ + auto rc = lsvd_image::clone_image(src, dst, io); + THROW_MSG_ON(rc != 0, "Failed to clone image '{}' to '{}'", src, dst); +} -void info(rados_ioctx_t io, const char *image_name) +static void info(rados_ioctx_t io, str name) { - lsvd_config cfg; - int rv; - if ((rv = cfg.read()) < 0) { - printf("error reading config: %d\n", rv); - exit(1); - } - auto objstore = make_rados_backend(io); - uuid_t uu; - if ((rv = translate_get_uuid(objstore, image_name, uu)) < 0) { - printf("error reading superblock: %d\n", rv); - exit(1); - } - auto rcache_file = cfg.cache_filename(uu, image_name, LSVD_CFG_READ); - auto wcache_file = cfg.cache_filename(uu, image_name, LSVD_CFG_WRITE); - printf("image: %s\n", image_name); - printf("read cache: %s\n", rcache_file.c_str()); - printf("write cache: %s\n", wcache_file.c_str()); - - char base_buf[4096]; - rv = objstore->read(image_name, 0, base_buf, sizeof(base_buf)); - if (rv < 0) - throw std::runtime_error("failed to read superblock"); - - auto base_hdr = (common_obj_hdr *)base_buf; - auto base_super = (super_hdr *)(base_hdr + 1); - - if (base_hdr->magic != LSVD_MAGIC || base_hdr->type != OBJ_SUPERBLOCK) - throw std::runtime_error("corrupt superblock"); - - char uuid_str[64]; - uuid_unparse_lower(base_hdr->vol_uuid, uuid_str); - fmt::print("UUID: {}\n", uuid_str); - fmt::print("Size: {} bytes", base_super->vol_size * 512); - fmt::print(" / {} GiB\n", - (double)base_super->vol_size * 512. / 1024. / 1024. / 1024.); - fmt::print("Checkpoints: {}\n", base_super->ckpts_len / 4.); - fmt::print("Snapshots: {}\n", base_super->snaps_len / 4.); - fmt::print("Is a clone: {}\n", base_super->clones_len == 0 ? "no" : "yes"); - - // parse clones - if (base_super->clones_len == 0) - return; - - uint32_t consumed = 0; - while (consumed < base_super->clones_len) { - auto ci = - (clone_info *)(base_buf + base_super->clones_offset + consumed); - auto objname = (char *)(ci + 1); - auto upto_seq = ci->last_seq; - fmt::print("Base: {}, upto seq {}\n", objname, upto_seq); - consumed += sizeof(clone_info) + strlen(objname) + 1; - } + auto be = make_rados_backend(io); + auto parser = object_reader(be); + + auto sb = parser.read_superblock(name); + THROW_MSG_ON(!sb, "Superblock not found"); + + auto i = *sb; + char uuid_str[37]; + uuid_unparse_lower(i.uuid, uuid_str); + + using namespace fmt; + print("=== Image info ===\n"); + print("Name: {}\n", name); + print("UUID: {}\n", uuid_str); + print("Size: {} bytes / {} GiB\n", i.vol_size, + (double)i.vol_size / 1024 / 1024 / 1024); + print("Checkpoints: {}\n", i.ckpts); + + for (auto &c : i.clones) + print("Base: '{}' upto seq {}\n", c->name, c->last_seq); + + for (auto &c : i.snaps) + print("Snapshot: '{}' at seq {}\n", c->name, c->seq); } int main(int argc, char **argv) @@ -148,35 +86,55 @@ int main(int argc, char **argv) std::abort(); }); - argp_parse(&argp, argc, argv, 0, 0, 0); - - rados_t cluster; - int err = rados_create2(&cluster, "ceph", "client.admin", 0); - check_ret_neg(err, "Failed to create cluster handle"); - - err = rados_conf_read_file(cluster, "/etc/ceph/ceph.conf"); - check_ret_neg(err, "Failed to read config file"); - - err = rados_connect(cluster); - check_ret_neg(err, "Failed to connect to cluster"); - - rados_ioctx_t io_ctx; - err = rados_ioctx_create(cluster, pool_name, &io_ctx); - check_ret_neg(err, "Failed to connect to pool {}", pool_name); - - if (op == OP_CREATE && size > 0) - rbd_create(io_ctx, image_name, size, NULL); - else if (op == OP_DELETE) - rbd_remove(io_ctx, image_name); - else if (op == OP_INFO) - info(io_ctx, image_name); - else if (op == OP_CLONE) { - auto src_img = image_name; - auto dst_img = argv[argc - 1]; - fmt::print("cloning from {} to {}\n", src_img, dst_img); - rbd_clone(io_ctx, src_img, dst_img); + namespace po = boost::program_options; + po::options_description desc("Allowed options"); + + // clang-format off + desc.add_options() + ("help", "produce help message") + ("cmd", po::value(), "subcommand: create, clone, delete, info") + ("img", po::value(), "name of the iname") + ("pool", po::value(), "pool where the image resides") + ("size", po::value()->default_value("1G"), + "size in bytes (M=2^20,G=2^30)") + ("dest", po::value(), "destination (for clone)"); + // clang-format on + + po::positional_options_description p; + p.add("cmd", 1).add("pool", 1).add("img", 1); + + po::variables_map vm; + po::store( + po::command_line_parser(argc, argv).options(desc).positional(p).run(), + vm); + po::notify(vm); + + if (vm.count("help") || !vm.count("cmd") || !vm.count("pool") || + !vm.count("img")) { + std::cout << desc << "\n"; + return 1; } - rados_ioctx_destroy(io_ctx); - rados_shutdown(cluster); + auto cmd = vm["cmd"].as(); + auto pool = vm["pool"].as(); + auto img = vm["img"].as(); + + auto io = connect_to_pool(pool); + THROW_MSG_ON(io == nullptr, "Failed to connect to pool '{}'", pool); + + if (cmd == "create") { + auto size = parseint(vm["size"].as()); + create(io, img, size); + } else if (cmd == "delete") + remove(io, img); + else if (cmd == "clone") { + THROW_MSG_ON(!vm.count("dest"), "Destination image not specified"); + auto dst = vm["dest"].as(); + clone(io, img, dst); + } else if (cmd == "info") + info(io, img); + else + THROW_MSG_ON(true, "Unknown command '{}'", cmd); + + rados_ioctx_destroy(io); } diff --git a/src/objects.cc b/src/objects.cc index c217d868..98b80c28 100644 --- a/src/objects.cc +++ b/src/objects.cc @@ -202,17 +202,17 @@ opt object_reader::read_data_hdr(std::string oname) PASSTHRU_NULLOPT(hdr); parsed_data_hdr h; + h.buf = std::move(*hdr); h.hdr = (common_obj_hdr *)h.buf.data(); PR_RET_IF(h.hdr->type != OBJ_LOGDATA, std::nullopt, "Invalid object type in '{}'", oname); - h.data_hdr = (obj_data_hdr *)(hdr->data() + sizeof(common_obj_hdr)); + h.data_hdr = (obj_data_hdr *)(h.buf.data() + sizeof(common_obj_hdr)); - auto buf = hdr->data(); + auto buf = h.buf.data(); h.cleaned = deserialise_ptrs( buf, h.data_hdr->objs_cleaned_offset, h.data_hdr->objs_cleaned_len); h.data_map = deserialise_ptrs(buf, h.data_hdr->data_map_offset, h.data_hdr->data_map_len); - h.buf = std::move(*hdr); return h; } diff --git a/src/utils.h b/src/utils.h index 9ac8757e..ee5009af 100644 --- a/src/utils.h +++ b/src/utils.h @@ -220,8 +220,7 @@ template struct overloaded : Ts... { using Ts::operator()...; }; -inline vec split_string_on_char(const std::string &s, - char delim) +inline vec split_string_on_char(const std::string &s, char delim) { vec result; std::stringstream ss(s); From 8c117bc34b5b3fc7292b9e74acef854ea0aa4a73 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 30 May 2024 00:00:47 +0000 Subject: [PATCH 65/77] Remove trace prints --- src/translate.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/translate.cc b/src/translate.cc index 3c08c9c3..46d4e55c 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -608,10 +608,10 @@ void translate_impl::write_checkpoint(seqnum_t cp_seq, translate_req *req) } serialise_superblock(superblock_buf, checkpoints, clones, uuid, vol_size); - debug("Updating superblock with new checkpoint"); + // debug("Updating superblock with new checkpoint"); objstore->write(name, superblock_buf.data(), superblock_buf.size()); - debug("Deleting old checkpoints {}", ckpts_to_delete); + // debug("Deleting old checkpoints {}", ckpts_to_delete); for (auto c : ckpts_to_delete) objstore->delete_obj(oname(name, c)); @@ -790,6 +790,8 @@ void translate_impl::process_batch(seqnum_t _seq, translate_req *req) auto obj_size = (hdr_sectors + data_sectors) * 512; auto obj_ptr = hdr_ptr; + trace("Writing data obj seq {}", _seq); + rcache->insert_object(pf, _seq, obj_size, obj_ptr); auto req2 = objstore->aio_write(name.str(), obj_ptr, obj_size); From f0b9d6d3b7ff58cc1d98b2ea67534aac1bb98651 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 30 May 2024 00:30:09 +0000 Subject: [PATCH 66/77] Migrate image manip fns to image.cc --- meson.build | 2 +- src/image.cc | 57 ++++++++++ src/image.h | 4 +- src/liblsvd.cc | 33 +----- src/translate.cc | 263 ----------------------------------------------- src/translate.h | 7 -- 6 files changed, 63 insertions(+), 303 deletions(-) diff --git a/meson.build b/meson.build index a3601f9d..2244811c 100644 --- a/meson.build +++ b/meson.build @@ -17,7 +17,7 @@ add_project_arguments('-Wno-unused-parameter', language: 'cpp') if get_option('buildtype') == 'debug' add_project_arguments('-fno-inline', language: 'cpp') - add_project_arguments('-DLOGLV=1', language: 'cpp') + add_project_arguments('-DLOGLV=0', language: 'cpp') endif subdir('src') diff --git a/src/image.cc b/src/image.cc index 02100043..0db401ab 100644 --- a/src/image.cc +++ b/src/image.cc @@ -563,3 +563,60 @@ request *lsvd_image::flush(std::function cb) { return new flush_request(this, cb); } + +int lsvd_image::create_new(std::string name, usize size, rados_ioctx_t io) +{ + auto be = make_rados_backend(io); + auto parser = object_reader(be); + + uuid_t uuid; + uuid_generate_random(uuid); + + vec buf(4096); + vec ckpts; + vec clones; + serialise_superblock(buf, ckpts, clones, uuid, size); + + return be->write(name, buf.data(), buf.size()); +} + +int lsvd_image::get_uuid(str name, uuid_t &uuid, rados_ioctx_t io) +{ + auto be = make_rados_backend(io); + auto parser = object_reader(be); + auto osb = parser.read_superblock(name); + PR_RET_IF(!osb, -EEXIST, "Could not read superblock '{}'", name); + + uuid_copy(uuid, osb->uuid); + return 0; +} + +int lsvd_image::delete_image(std::string name, rados_ioctx_t io) +{ + auto be = make_rados_backend(io); + auto parser = object_reader(be); + auto osb = parser.read_superblock(name); + PR_RET_IF(!osb, -EEXIST, "Could not read superblock '{}'", name); + auto sb = *osb; + + seqnum_t seq; + for (auto ckpt : sb.ckpts) { + auto rc = be->delete_obj(oname(name, ckpt)); + PR_RET_IF(rc < 0, rc, "Failed to delete checkpoint '{}'", ckpt); + seq = ckpt; + } + + for (int n = 0; n < 16; seq++, n++) + if (be->delete_obj(oname(name, seq)) >= 0) + n = 0; + + // delete the superblock last so we can recover from partial deletion + return be->delete_obj(name); +} + +int lsvd_image::clone_image(std::string oldname, std::string newname, + rados_ioctx_t io) +{ + UNIMPLEMENTED(); + return -1; +} \ No newline at end of file diff --git a/src/image.h b/src/image.h index 4f2d3870..7a1f0a9e 100644 --- a/src/image.h +++ b/src/image.h @@ -101,8 +101,8 @@ class lsvd_image // Image management // They all return 0 on success, -errno on failure - static int create_new(std::string name, rados_ioctx_t io); - static int get_uuid(std::string name, rados_ioctx_t io); + static int create_new(std::string name, usize size, rados_ioctx_t io); + static int get_uuid(std::string name, uuid_t &uuid, rados_ioctx_t io); static int delete_image(std::string name, rados_ioctx_t io); static int clone_image(std::string oldname, std::string newname, rados_ioctx_t io); diff --git a/src/liblsvd.cc b/src/liblsvd.cc index 49e93812..6c9bd2c1 100644 --- a/src/liblsvd.cc +++ b/src/liblsvd.cc @@ -226,27 +226,13 @@ std::pair split_string(std::string s, extern "C" int rbd_create(rados_ioctx_t io, const char *name, uint64_t size, int *order) { - lsvd_config cfg; - if (cfg.read() < 0) - return -1; - auto objstore = make_rados_backend(io); - auto rv = translate_create_image(objstore, name, size); - return rv; + return lsvd_image::create_new(name, size, io); } extern "C" int rbd_clone(rados_ioctx_t io, const char *source_img, const char *dest_img) { - lsvd_config cfg; - if (cfg.read() < 0) { - throw std::runtime_error("Failed to read config"); - return -1; - } - - auto objstore = make_rados_backend(io); - auto rv = translate_clone_image(objstore, source_img, dest_img); - - return rv; + return lsvd_image::clone_image(source_img, dest_img, io); } /* remove all objects and cache file. @@ -256,20 +242,7 @@ extern "C" int rbd_clone(rados_ioctx_t io, const char *source_img, */ extern "C" int rbd_remove(rados_ioctx_t io, const char *name) { - lsvd_config cfg; - auto rv = cfg.read(); - if (rv < 0) - return rv; - auto objstore = make_rados_backend(io); - uuid_t uu; - if ((rv = translate_get_uuid(objstore, name, uu)) < 0) - return rv; - auto rcache_file = cfg.cache_filename(uu, name, LSVD_CFG_READ); - unlink(rcache_file.c_str()); - auto wcache_file = cfg.cache_filename(uu, name, LSVD_CFG_WRITE); - unlink(wcache_file.c_str()); - rv = translate_remove_image(objstore, name); - return rv; + return lsvd_image::delete_image(name, io); } extern "C" void rbd_uuid(rbd_image_t image, uuid_t *uuid) diff --git a/src/translate.cc b/src/translate.cc index 46d4e55c..0806980c 100644 --- a/src/translate.cc +++ b/src/translate.cc @@ -1189,266 +1189,3 @@ void translate_impl::gc_thread(std::stop_token st) log_info("Stopping GC"); } - -/* ---------------- Debug ---------------- */ - -int translate_create_image(sptr objstore, const char *name, - uint64_t size) -{ - char buf[4096]; - memset(buf, 0, 4096); - - auto _hdr = (common_obj_hdr *)buf; - *_hdr = (common_obj_hdr){LSVD_MAGIC, - 1, // version - {0}, // UUID - OBJ_SUPERBLOCK, // type - 0, // seq - 8, // hdr_sectors - 0, // data_sectors - 0}; - uuid_generate_random(_hdr->vol_uuid); - - auto _super = (super_hdr *)(_hdr + 1); - uint64_t sectors = size / 512; - *_super = (super_hdr){sectors, // vol_size - 0, 0, // checkpoint offset, len - 0, 0, // clone offset, len - 0, 0}; // snap offset, len - - auto rv = objstore->write(name, buf, 4096); - return rv; -} - -int translate_get_uuid(sptr objstore, const char *name, uuid_t &uu) -{ - char buf[4096]; - int rv = objstore->read(name, 0, buf, sizeof(buf)); - if (rv < 0) - return rv; - auto hdr = (common_obj_hdr *)buf; - memcpy(uu, hdr->vol_uuid, sizeof(uuid_t)); - return 0; -} - -int translate_remove_image(sptr objstore, const char *name) -{ - // read the superblock to get the list of checkpoints and objects - object_reader parser(objstore); - auto super = parser.read_superblock(name); - PR_RET_IF(!super, -1, "Could not read superblock for {}", name); - - seqnum_t seq = 0; - - if (super->ckpts.size() > 0) { - seq = super->ckpts.back(); - auto cpoint = parser.read_checkpoint(oname(name, seq)); - PR_RET_IF(!cpoint, -1, "Could not read checkpoint {}.{}", name, seq); - - // Delete objects - for (auto const &o : cpoint->objects) { - auto r = objstore->delete_obj(oname(name, o->seq)); - if (r < 0) - log_warn("Failed to delete obj {}, r={}", o->seq, r); - } - } - - // Delete checkpoints - for (auto const &c : super->ckpts) { - auto r = objstore->delete_obj(oname(name, c)); - if (r < 0) - log_warn("Failed to delete obj {}, r={}", c, r); - } - - // delete any objects after the last checkpoint, up to the first run of - // 32 missing sequence numbers - for (int n = 0; n < 16; seq++, n++) { - if (objstore->delete_obj(oname(name, seq)) >= 0) - n = 0; - } - - // delete the superblock last so we can recover from partial deletion - objstore->delete_obj(name); - return 0; -} - -int translate_clone_image(sptr objstore, const char *source, - const char *dest) -{ - TODO(); -} - -#if 0 -ssize_t translate_impl::init(const char *prefix_, bool timedflush) -{ - vec ckpts; - vec clones; - vec snaps; - - /* note prefix = superblock name - */ - strcpy(super_name, prefix_); - - auto [_buf, bytes] = - parser->read_super(super_name, ckpts, clones, snaps, uuid); - - check_cond(bytes < 0, "read_super failed for obj {}", super_name); - check_cond(_buf == NULL, "no superblock"); - - int n_ckpts = ckpts.size(); - - super_buf = _buf; - super_h = (common_obj_hdr *)super_buf; - super_len = super_h->hdr_sectors * 512; - super_sh = (super_hdr *)(super_h + 1); - - memcpy(&uuid, super_h->vol_uuid, sizeof(uuid)); - - current = new translate_req(REQ_PUT, cfg.backend_obj_size, this); - seq = 1; // empty volume case - - /* is this a clone? - */ - if (super_sh->clones_len > 0) { - debug("Image is a clone, parsing cloneinfo headers"); - - char buf[4096]; - auto ci = (clone_info *)(_buf + super_sh->clones_offset); - auto obj_name = (char *)(ci + 1); - while (true) { - if (has_poolname_prefix(obj_name)) { - log_warn("Found poolname prefix in baseimg name: {}; stripping " - "it out. Cross-pool clones are not supported.", - obj_name); - obj_name = strip_poolname_prefix(obj_name); - log_info("Using base name: {}", obj_name); - } - - auto rv = objstore->read(obj_name, 0, buf, sizeof(buf)); - check_cond(rv < 0, "Failed to read {}", obj_name); - - auto _h = (common_obj_hdr *)buf; - auto _sh = (super_hdr *)(_h + 1); - - check_cond(_h->magic != LSVD_MAGIC || _h->type != LSVD_SUPER, - "Corrupted superblock in {}", obj_name); - check_cond(memcmp(_h->vol_uuid, ci->vol_uuid, sizeof(uuid_t)) != 0, - "UUID mismatch in {}", obj_name); - clone c; - strcpy(c.prefix, obj_name); - c.last_seq = ci->last_seq; - if (clones.size() > 0) - clones.back().first_seq = ci->last_seq + 1; - clone_list.push_back(c); - debug("Using base image {} upto seq {}", obj_name, c.last_seq); - - if (_sh->clones_len == 0) - break; - ci = (clone_info *)(buf + _sh->clones_offset); - obj_name = (char *)(ci + 1); - } - } - - /* read in the last checkpoint, then roll forward from there; - */ - int last_ckpt = -1; - if (ckpts.size() > 0) { - vec objects; - vec deletes; - vec entries; - - /* hmm, we should never have checkpoints listed in the - * super that aren't persisted on the backend, should we? - */ - while (n_ckpts > 0) { - int c = ckpts[n_ckpts - 1]; - objname name(name_for_seq(c), c); - do_log("reading ckpt %s\n", name.c_str()); - if (parser->read_checkpoint(name.c_str(), max_cache_seq, ckpts, - objects, deletes, entries) >= 0) { - last_ckpt = c; - break; - } - do_log("chkpt skip %d\n", c); - n_ckpts--; - } - if (last_ckpt == -1) - return -1; - - for (int i = 0; i < n_ckpts; i++) { - do_log("chkpt from super: %d\n", ckpts[i]); - checkpoints.push_back(ckpts[i]); // so we can delete them later - } - - for (auto o : objects) { - object_info[o.seq] = (obj_info){.hdr = (int)o.hdr_sectors, - .data = (int)o.data_sectors, - .live = (int)o.live_sectors}; - total_sectors += o.data_sectors; - total_live_sectors += o.live_sectors; - } - for (auto m : entries) { - objmap.update( - m.lba, m.lba + m.len, - (extmap::obj_offset){.obj = m.obj, .offset = m.offset}); - } - seq = last_ckpt + 1; - } - - /* roll forward - */ - for (;; seq++) { - vec cleaned; - vec entries; - common_obj_hdr h; - obj_data_hdr dh; - - objname name(name_for_seq(seq), seq); - if (parser->read_data_hdr(name.c_str(), h, dh, cleaned, entries) < 0) - break; - if (h.type == LSVD_CKPT) { - do_log("ckpt from roll-forward: %d\n", seq.load()); - checkpoints.push_back(seq); - continue; - } - - do_log("roll %d\n", seq.load()); - assert(h.type == LSVD_DATA); - object_info[seq] = (obj_info){.hdr = (int)h.hdr_sectors, - .data = (int)h.data_sectors, - .live = (int)h.data_sectors}; - total_sectors += h.data_sectors; - total_live_sectors += h.data_sectors; - if (dh.cache_seq) // skip GC writes - max_cache_seq = dh.cache_seq; - - int offset = 0, hdr_len = h.hdr_sectors; - vec deleted; - for (auto m : entries) { - extmap::obj_offset oo = {seq, offset + hdr_len}; - objmap.update(m.lba, m.lba + m.len, oo, &deleted); - offset += m.len; - } - for (auto d : deleted) { - auto [base, limit, ptr] = d.vals(); - object_info[ptr.obj].live -= (limit - base); - assert(object_info[ptr.obj].live >= 0); - total_live_sectors -= (limit - base); - } - } - - /* delete any potential "dangling" objects. - */ - for (int i = 1; i < 32; i++) { - objname name(name_for_seq(i + seq), i + seq); - objstore->delete_obj(name.str()); - } - - workers->pool.push( - std::thread(&translate_impl::worker_thread, this, workers)); - if (timedflush) - misc_threads->pool.push( - std::thread(&translate_impl::flush_thread, this, misc_threads)); - return bytes; -} -#endif \ No newline at end of file diff --git a/src/translate.h b/src/translate.h index 0aa85791..dd05987f 100644 --- a/src/translate.h +++ b/src/translate.h @@ -49,10 +49,3 @@ uptr make_translate(std::shared_ptr _io, lsvd_config *cfg, extmap::objmap *map, extmap::bufmap *bufmap, std::shared_mutex *m, std::mutex *buf_m, sptr rcache); - -int translate_create_image(sptr objstore, const char *name, - uint64_t size); -int translate_clone_image(sptr objstore, const char *source, - const char *dest); -int translate_remove_image(sptr objstore, const char *name); -int translate_get_uuid(sptr objstore, const char *name, uuid_t &uu); From 86c45f17ca4c46d36fb82d3bc4c49a97e5279d8a Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 30 May 2024 00:38:53 +0000 Subject: [PATCH 67/77] Add option for thick to imgtool (not implemented) --- README.md | 28 ++++++++++++---------------- docs/qemu-launch.md | 2 +- src/imgtool.cc | 7 +++++-- tools/capture-fio-perf-trace.bash | 2 +- tools/utils.bash | 2 +- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 057eb940..be3218f7 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ Parameters are: - `batch_size`, `LSVD_BATCH_SIZE`: size of objects written to the backend, in bytes (K/M recognized as 1024, 1024\*1024). Default: 8MiB - `wcache_batch`: write cache batching (see below) - `wcache_chunk': maximum size of atomic write, in bytes - larger writes will be split and may be non-atomic. -- `rcache_dir` - directory used for read cache file and GC temporary files. Note that `lsvd_imgtool` can format a partition for cache and symlink it into this directory, although the performance improvement seems limited. +- `rcache_dir` - directory used for read cache file and GC temporary files. Note that `imgtool` can format a partition for cache and symlink it into this directory, although the performance improvement seems limited. - `wcache_dir` - directory used for write cache file - `xlate_window`: max writes (i.e. objects) in flight to the backend. Note that this value is coupled to the size of the write cache, which must be big enough to hold all outstanding writes in case of a crash. - `hard_sync` (untested): "flush" forces all batched writes to the backend. @@ -70,7 +70,7 @@ figure out how to optimize at runtime instead of bothering the user for a value. First create a volume: ``` -build$ sudo bin/lsvd_imgtool --create --rados --size=20g pool/imgname +build$ sudo imgtool create poolname imgname --size=20g ``` Then you can start a SPDK NVMe-oF gateway: @@ -158,21 +158,17 @@ The read cache typically fetches 64K blocks, so there may be a bit of extra load Most of the testing to date has been with an 8,3 code with 64K stripe size. ## Tools -`lsvd_imgtool` mostly just calls the LSVD versions of `rbd_create` and `rbd_remove`, although it can also format a cache file (e.g. if you're using a raw partition) -``` -build$ bin/lsvd_imgtool --help -Usage: lsvd_imgtool [OPTION...] -IMAGE - -C, --create create image - -d, --cache-dir=DIR cache directory - -D, --delete delete image - -I, --info show image information - -k, --mkcache=DEV use DEV as cache - -O, --rados use RADOS - -z, --size=SIZE size in bytes (M/G=2^20,2^30) - -?, --help Give this help list - --usage Give a short usage message +``` +build$ ./imgtool --help +❯ ./imgtool --help +Allowed options: + --help produce help message + --cmd arg subcommand: create, clone, delete, info + --img arg name of the iname + --pool arg pool where the image resides + --size arg (=1G) size in bytes (M=2^20,G=2^30) + --dest arg destination (for clone) ``` Other tools live in the `tools` subdirectory - see the README there for more details. diff --git a/docs/qemu-launch.md b/docs/qemu-launch.md index 13995164..0f7dfd4a 100644 --- a/docs/qemu-launch.md +++ b/docs/qemu-launch.md @@ -5,7 +5,7 @@ install config, run QEMU with `-drive format=raw,file=seed.iso,cache=none,if=virtio`. 1. Create a lsvd image if you don't already have one: - `./imgtool --create --rados --size 10g $pool_name/$img_name` + `./imgtool create --size 10g $pool_name $img_name` 2. Launch LSVD as a NVMF target `qemu-gateway.bash $pool_name $img_name` 3. Lanuch QEMU with the NVMF target `qemu-client.bash`. This does the following: - `nvme connect` to the nvmf target on the gateway diff --git a/src/imgtool.cc b/src/imgtool.cc index 0e6e52d6..00c030df 100644 --- a/src/imgtool.cc +++ b/src/imgtool.cc @@ -31,7 +31,7 @@ static usize parseint(str i) return val; } -static void create(rados_ioctx_t io, str name, usize size) +static void create(rados_ioctx_t io, str name, usize size, bool thick) { auto rc = lsvd_image::create_new(name, size, io); THROW_MSG_ON(rc != 0, "Failed to create new image '{}'", name); @@ -97,6 +97,8 @@ int main(int argc, char **argv) ("pool", po::value(), "pool where the image resides") ("size", po::value()->default_value("1G"), "size in bytes (M=2^20,G=2^30)") + ("thick", po::value()->default_value(false), + "thick provision when creating an image (not currently supported)") ("dest", po::value(), "destination (for clone)"); // clang-format on @@ -124,7 +126,8 @@ int main(int argc, char **argv) if (cmd == "create") { auto size = parseint(vm["size"].as()); - create(io, img, size); + auto thick = vm["thick"].as(); + create(io, img, size, thick); } else if (cmd == "delete") remove(io, img); else if (cmd == "clone") { diff --git a/tools/capture-fio-perf-trace.bash b/tools/capture-fio-perf-trace.bash index ed744cc4..a40b16f7 100755 --- a/tools/capture-fio-perf-trace.bash +++ b/tools/capture-fio-perf-trace.bash @@ -9,7 +9,7 @@ make clean make -j$(nproc) release ./tools/remove_objs.py pone perf-fio -# ./imgtool --rados --create --size=1G pone/perf-fio +# ./imgtool create --size 1G pone perf-fio ./thick-image --size=10G pone/perf-fio cd test/ diff --git a/tools/utils.bash b/tools/utils.bash index 0aaee13a..d925844a 100644 --- a/tools/utils.bash +++ b/tools/utils.bash @@ -101,7 +101,7 @@ function create_lsvd_thin { # ./builddir/imgtool --delete --rados $pool/$img || true ./tools/remove_objs.py $pool $img - ./builddir/imgtool --create --rados --size=$size $pool/$img + ./builddir/imgtool create --size $size $pool $img # make sure image exists rados -p $pool stat $img From 1ebcb6e4a6428abbae71d9a2c12c74e5c7d71dfb Mon Sep 17 00:00:00 2001 From: Kristi Nikolla Date: Tue, 4 Jun 2024 16:31:16 +0000 Subject: [PATCH 68/77] Add nasm to dependencies --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 24f790f1..050f9857 100644 --- a/Makefile +++ b/Makefile @@ -19,13 +19,13 @@ clean: install-deps: # Folly deps - sudo apt install libboost-all-dev libdouble-conversion-dev libevent-dev \ + sudo apt install -y libboost-all-dev libdouble-conversion-dev libevent-dev \ libgflags-dev libgmock-dev libgoogle-glog-dev libgtest-dev \ liblz4-dev liblzma-dev libsnappy-dev libsodium-dev libunwind-dev \ libzstd-dev ninja-build zlib1g-dev # SPDK deps - sudo apt install libnuma-dev libarchive-dev libibverbs-dev librdmacm-dev \ - python3-pyelftools libcunit1-dev libaio-dev + sudo apt install -y libnuma-dev libarchive-dev libibverbs-dev librdmacm-dev \ + python3-pyelftools libcunit1-dev libaio-dev nasm # LSVD deps sudo apt install -y meson mold libfmt-dev librados-dev \ libjemalloc-dev libradospp-dev liburing-dev pkg-config uuid-dev From 124cd5f0c792ac20fb3429cfa2a3804e76e761a9 Mon Sep 17 00:00:00 2001 From: Kristi Nikolla Date: Tue, 4 Jun 2024 17:11:56 +0000 Subject: [PATCH 69/77] Remove liburing from installed system packages --- .github/workflows/test.yaml | 2 +- Makefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 742297a2..4cb58059 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,4 +46,4 @@ jobs: - name: Logs if: always() run: | - cat build-rel/meson-logs/meson-log.txt + cat build-dbg/meson-logs/meson-log.txt diff --git a/Makefile b/Makefile index 050f9857..a067ad93 100644 --- a/Makefile +++ b/Makefile @@ -28,4 +28,4 @@ install-deps: python3-pyelftools libcunit1-dev libaio-dev nasm # LSVD deps sudo apt install -y meson mold libfmt-dev librados-dev \ - libjemalloc-dev libradospp-dev liburing-dev pkg-config uuid-dev + libjemalloc-dev libradospp-dev pkg-config uuid-dev From fb4340035f8253c921e77d417a53f69def549ac8 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Tue, 4 Jun 2024 22:58:58 +0000 Subject: [PATCH 70/77] Temporarily remove folly dep --- src/meson.build | 10 +++++----- src/utils.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/meson.build b/src/meson.build index 4ede957e..5af34b55 100644 --- a/src/meson.build +++ b/src/meson.build @@ -1,10 +1,10 @@ cxx = meson.get_compiler('cpp') -cmake = import('cmake') +# cmake = import('cmake') # cmvars = cmake.subproject_options() # cmvars.add_cmake_defines({'POSITION_INDEPENDENT_CODE': true}) -folly_cm = cmake.subproject('folly') -libfolly = folly_cm.dependency('folly') +# folly_cm = cmake.subproject('folly') +# libfolly = folly_cm.dependency('folly') lsvd_src = files( 'config.cc', @@ -22,11 +22,11 @@ lsvd_src = files( lsvd_inc = include_directories('.') lsvd_deps = [ - libfolly, + # libfolly, dependency('threads'), dependency('zlib'), dependency('fmt'), - dependency('boost'), + dependency('boost', modules: ['system', 'filesystem', 'program_options', 'thread', 'regex']), dependency('liburing', static: true), dependency('uuid'), cxx.find_library('rados', required: true), diff --git a/src/utils.h b/src/utils.h index ee5009af..66733c59 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,6 +1,6 @@ #pragma once -#include "folly/FBVector.h" +// #include "folly/FBVector.h" #include #include #include @@ -32,7 +32,7 @@ template using sptr = std::shared_ptr; template using uptr = std::unique_ptr; template using opt = std::optional; template using vec = std::vector; -template using fvec = folly::fbvector; +// template using fvec = folly::fbvector; #define CEXTERN extern "C" From 549a25b417ad08c26a4f96fbb4163c1e670b8698 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 5 Jun 2024 00:18:43 +0000 Subject: [PATCH 71/77] Update to ubuntu 2404 and use system meson --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4cb58059..3c4edc0f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -8,7 +8,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - uses: actions/checkout@v3 From e90406c037bcac1cd29e6ec772e97b7601303bad Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 5 Jun 2024 02:34:23 +0000 Subject: [PATCH 72/77] Forcibly break system apt meson in CI --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 3c4edc0f..f026ba4a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: run: | sudo apt update make install-deps - pip3 install --upgrade meson + sudo pip3 install --upgrade meson --break-system-packages - name: Build run: | From d41fc4139350c805833eb473500fe6af84d1973e Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 5 Jun 2024 02:40:46 +0000 Subject: [PATCH 73/77] Revert to ubuntu22 in CI --- .github/workflows/test.yaml | 2 +- subprojects/folly | 1 + subprojects/spdk | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) create mode 160000 subprojects/folly create mode 160000 subprojects/spdk diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index f026ba4a..27554865 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -8,7 +8,7 @@ on: jobs: build: - runs-on: ubuntu-24.04 + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 diff --git a/subprojects/folly b/subprojects/folly new file mode 160000 index 00000000..89ac8453 --- /dev/null +++ b/subprojects/folly @@ -0,0 +1 @@ +Subproject commit 89ac8453fc7825e566b46a7ebb9e0348b1c09338 diff --git a/subprojects/spdk b/subprojects/spdk new file mode 160000 index 00000000..0786843e --- /dev/null +++ b/subprojects/spdk @@ -0,0 +1 @@ +Subproject commit 0786843e99550cedb6de26c25641d9c294ddcb85 From 9a47947e1c08aaed4433ea395b72ac249bf12a83 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 5 Jun 2024 03:00:28 +0000 Subject: [PATCH 74/77] Revert pip break system packages --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 27554865..a5d0b688 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,7 +23,7 @@ jobs: run: | sudo apt update make install-deps - sudo pip3 install --upgrade meson --break-system-packages + sudo pip3 install --upgrade meson - name: Build run: | From 741b44d5f8aee95fc9668b768cce5931d42beaa5 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Wed, 5 Jun 2024 04:59:59 +0000 Subject: [PATCH 75/77] Remove empty subproject folders --- subprojects/folly | 1 - subprojects/spdk | 1 - 2 files changed, 2 deletions(-) delete mode 160000 subprojects/folly delete mode 160000 subprojects/spdk diff --git a/subprojects/folly b/subprojects/folly deleted file mode 160000 index 89ac8453..00000000 --- a/subprojects/folly +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 89ac8453fc7825e566b46a7ebb9e0348b1c09338 diff --git a/subprojects/spdk b/subprojects/spdk deleted file mode 160000 index 0786843e..00000000 --- a/subprojects/spdk +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0786843e99550cedb6de26c25641d9c294ddcb85 From a7a72c358f3cf88115de64677b66d783fc61162c Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Mon, 10 Jun 2024 21:17:09 +0000 Subject: [PATCH 76/77] Output all logs on fail --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index a5d0b688..4011a16a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,4 +46,4 @@ jobs: - name: Logs if: always() run: | - cat build-dbg/meson-logs/meson-log.txt + cat build-dbg/meson-logs/*.txt From 697fe4b25e0645c12f473a245a68e81e23db7eb9 Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Thu, 13 Jun 2024 02:12:56 +0000 Subject: [PATCH 77/77] Create cache directories --- .github/workflows/test.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4011a16a..d3b0a332 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -38,10 +38,11 @@ jobs: run: | mkdir -p /tmp/lsvd-read mkdir -p /tmp/lsvd-write + mkdir -p /tmp/lsvd cd build-dbg sudo meson test - sudo ./imgtool --create --size 1g --pool pone test-img + sudo ./imgtool create --size 1g --pool pone --img test-img - name: Logs if: always()