From 87bf28a1157258a05183bf3b1425348323a180af Mon Sep 17 00:00:00 2001 From: Isaac Khor Date: Fri, 6 Sep 2024 01:06:44 +0000 Subject: [PATCH] Fix nvmf module and clean up some docs --- README.md | 117 +++++---- docs/configuration.md | 14 -- docs/install.md | 11 - docs/meeting_notes.md | 294 ---------------------- docs/testing.md | 12 - src/meson.build | 2 + subprojects/packagefiles/spdk/meson.build | 123 ++++----- 7 files changed, 136 insertions(+), 437 deletions(-) delete mode 100644 docs/configuration.md delete mode 100644 docs/install.md delete mode 100644 docs/meeting_notes.md delete mode 100644 docs/testing.md diff --git a/README.md b/README.md index be3218f7..6f02677a 100644 --- a/README.md +++ b/README.md @@ -30,66 +30,93 @@ It is able to install and boot Ubuntu 22.04 (see `qemu/`) and is stable under most of our tests, but there are likely regressions around crash recovery and other less well-trodden paths. -## Build +## How to run -This project uses `meson` to manage the build system. Run `make setup` to -generate the build files, then run `meson compile` in either `build-rel` or -`build-dbg` to build the release or debug versions of the code. +``` +echo 4096 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages +docker run --net host -v /dev/hugepages:/dev/hugepages -v /etc/ceph:/etc/ceph -v /var/tmp:/var/tmp -v /dev/shm:/dev/shm -i -t --privileged --entrypoint /bin/bash ghcr.io/cci-moc/lsvd-rbd:main +``` -A makefile is also offered for convenience; `make` builds the debug version -by default. +If the cpu is too old, you might have to rebuild the image: -## Configuration +``` +git clone https://github.com/cci-moc/lsvd-rbd.git +cd lsvd-rbd +docker build -t lsvd-rbd . 
+docker run --net host -v /dev/hugepages:/dev/hugepages -v /etc/ceph:/etc/ceph -v /var/tmp:/var/tmp -v /dev/shm:/dev/shm -i -t --privileged --entrypoint /bin/bash lsvd-rbd +``` + +To set up LSVD images: + +``` +#./imgtool create --size 100g +./imgtool create lsvd-ssd benchtest1 --size 100g +``` + +To configure nvmf: + +``` +export gateway_ip=0.0.0.0 +./rpc.py nvmf_create_transport -t TCP -u 16384 -m 8 -c 8192 +./rpc.py nvmf_create_subsystem nqn.2016-06.io.spdk:cnode1 -a -s SPDK00000000000001 -d SPDK_Controller1 +./rpc.py nvmf_subsystem_add_listener nqn.2016-06.io.spdk:cnode1 -t tcp -a $gateway_ip -s 9922 +``` + +To mount images on the gateway: -LSVD is not yet merged into the Ceph configuration framework, and uses its own -system. It reads from a configuration file (`lsvd.conf` or -`/usr/local/etc/lsvd.conf`) or from environment variables of the form -`LSVD_`, where NAME is the upper-case version of the config file variable. -Default values can be found in `config.h` - -Parameters are: - -- `batch_size`, `LSVD_BATCH_SIZE`: size of objects written to the backend, in bytes (K/M recognized as 1024, 1024\*1024). Default: 8MiB -- `wcache_batch`: write cache batching (see below) -- `wcache_chunk': maximum size of atomic write, in bytes - larger writes will be split and may be non-atomic. -- `rcache_dir` - directory used for read cache file and GC temporary files. Note that `imgtool` can format a partition for cache and symlink it into this directory, although the performance improvement seems limited. -- `wcache_dir` - directory used for write cache file -- `xlate_window`: max writes (i.e. objects) in flight to the backend. Note that this value is coupled to the size of the write cache, which must be big enough to hold all outstanding writes in case of a crash. -- `hard_sync` (untested): "flush" forces all batched writes to the backend. -- `backend`: "file" or "rados" (default rados). 
The "file" backend is for testing only -- `cache_size` (bytes, K/M/G): total size of the cache file. Currently split 1/3 write, 2/3 read. Ignored if the cache file already exists. -- `ckpt_interval` N: limits the number of objects to be examined during crash recovery by flushing metadata every N objects. -- `flush_msec`: timeout for flushing batched writes -- `gc_threshold` (percent): described below - -Typically the only parameters that need to be set are `cache_dir` and -`cache_size`. Parameters may be added or removed as we tune things and/or -figure out how to optimize at runtime instead of bothering the user for a value. - -## Using LSVD with fio and QEMU - -First create a volume: ``` -build$ sudo imgtool create poolname imgname --size=20g +export PYTHONPATH=/app/src/ +./rpc.py --plugin rpc_plugin bdev_lsvd_create lsvd-ssd benchtest1 -c '{"rcache_dir":"/var/tmp/lsvd","wlog_dir":"/var/tmp/lsvd"}' +./rpc.py nvmf_subsystem_add_ns nqn.2016-06.io.spdk:cnode1 benchtest1 ``` -Then you can start a SPDK NVMe-oF gateway: +To gracefully shut down the gateway: + ``` -./qemu/qemu-gateway.sh pool imgname +./rpc.py --plugin rpc_plugin bdev_lsvd_delete benchtest1 +./rpc.py spdk_kill_instance SIGTERM +docker kill ``` -Then connect to the NVMe-oF gateway: +## Mount a client ``` -nvme connect -t tcp -n nqn.2016-06.io.spdk:cnode1 -a +modprobe nvme-fabrics +nvme disconnect -n nqn.2016-06.io.spdk:cnode1 +gw_ip=${gw_ip:-10.1.0.5} +nvme connect -t tcp --traddr $gw_ip -s 9922 -n nqn.2016-06.io.spdk:cnode1 -o normal +sleep 2 +nvme list +dev_name=$(nvme list | perl -lane 'print @F[0] if /SPDK/') +printf "Using device $dev_name\n" ``` -You should now have just a plain old NVMe device, with which you can use just -like any other NVMe device. -Do not use multiple fio jobs on the same image - currently there's no protection -and they'll stomp all over each other. RBD performs horribly in that case, but -AFAIK it doesn't compromise correctness. 
+## Build + +This project uses `meson` to manage the build system. Run `make setup` to +generate the build files, then run `meson compile` in either `build-rel` or +`build-dbg` to build the release or debug versions of the code. + +A makefile is also offered for convenience; `make` builds the debug version +by default. + +## Configuration + +LSVD is configured using a JSON file. When creating an image, we will +try to read the following paths and parse them for configuration options: + +- Default built-in configuration +- `/usr/local/etc/lsvd.json` +- `./lsvd.json` +- user supplied path + +The file read last has highest priority. + +We will also first try to parse the user-supplied path as a JSON object, and if +that fails, try to treat it as a path and read it from a file. + +An example configuration file is provided in `docs/example_config.json`. ## Image and object names diff --git a/docs/configuration.md b/docs/configuration.md deleted file mode 100644 index 4b3c2a9d..00000000 --- a/docs/configuration.md +++ /dev/null @@ -1,14 +0,0 @@ -# Configuring LSVD - -LSVD is configured using a JSON file. When creating an image, we will -try to read the following paths and parse them for configuration options: - -- Default built-in configuration -- `/usr/local/etc/lsvd.json` -- `./lsvd.json` -- user supplied path - -The file read last has highest priority. - -We will also first try to parse the user-supplied path as a JSON object, and if -that fails try treat it as a path and read it from a file. 
diff --git a/docs/install.md b/docs/install.md deleted file mode 100644 index 4468e659..00000000 --- a/docs/install.md +++ /dev/null @@ -1,11 +0,0 @@ -# SPDK setup - -- Clone the entire repo including submodules -- Build SPDK with RBD support - -``` -./configure --with-rbd -make -``` -- Build LSVD: `make release` -- Run scripts are available in `qemu/` or `experiments/` directories \ No newline at end of file diff --git a/docs/meeting_notes.md b/docs/meeting_notes.md deleted file mode 100644 index 74d13fc8..00000000 --- a/docs/meeting_notes.md +++ /dev/null @@ -1,294 +0,0 @@ -## W7-4 - -- Kristi - - get CI working, try to build a single nvmeof gateway artefact - - move clone.py back into translate.cc -- Sumatra - - readahead in read cache -- Isaac - - image striping -- Timothy - -## Todos W6-1 - -Current todos: - -- Refactor top-lv requests into lsvd class -- Image striping - - Make sure ordering is preserved between stripes -- SPDK fork for new LSVD backend -- Potential switch of backend to RBD instead of s3/rados -- General production ready testing - -Question: how to prioritise the above? who works on what? 
- -## Todos W49-2 - -- RDB and LSVD on both HDD and SSDs, all 4 configurations -- Working set (80g) < nvme cache (500g) -- Fio synthetic workloads, vary blocksize, io depth, workload -- Filebench (varmail, oltp, fileserver, fileserver-fsync?), maybe YCSB -- Shared cache VM boot times - -Notes on write log: - -- Remote malloc -- Remote nvme -- 2 remote nvme -- Measure all of this with a single write workload, reads don't matter here - -## Todos W49-1 - -- By W49-2, run benchmarks for: - - SSD backend - - HDD backend - - blocksize 4K, 8K, 16K -- By w49-3, set up VM boot for shared cache - -Notes on VMWare proposal: - -- IOPs guarantee for certain users - -## W48-5: to get done week Dec 4-7 for presentation dec 12th - -- filesystem benchamrks - fix OLTP - this weekend - same parametser as eurosys Logical disk and cache both -- working set for disk benchmark < size disk - same as eurosys -- all above with Ceph disk backend - run both filesystem and disk bencharks - this will require much larger read cache block size -- trace-driven optimization - from start to adding a print statement in init - VM boot -- shared cache - boot multiple clones demnstrate: 1) easest - boot clone after ahving booked another one becomes warm show both miss rate and boot time, 2) boot time for a fleet of similar images 10 VMs simultaneously -- test larger block sizes for read cache and impact on trace optimization - i.e., prefetching - especially for disk backend -- presentation drafts by the 6th - -### stretch goals -- write-ahead log should be memory-only - want to show it with 1- malloc store, 2- versus with 1 NVME log, and 3- 2 NVME log -- NVME rather than ramdisk -- iscsi target - -## ATC 24 todos -- benchmark against nvme drive instead of ramdisk -- trace-driven optimisation of VM boot image - timo -- ycsb benchmark? 
-- iscsi target, effort should be minimal - - re-attach to the same local cache -- make sure shared cache and cloned images work -- per-commit benchmarks - sumatra -- write-ahead log should be memory-only instead of going to nvme - - with UPS, failure mode we address is failure of gateway server - -Paper thesis: - -- storage gateway that's fast, scales, deploys as nvmf target -- GC performs well (?) -- shared gateway, read sharing allows for better performance? - - shared cache - - shared images between VMs - - physical machines with hardware we don't have (NVMF) -- shared gateway and cloned images - - since they're derivations, they share prefixes - - thus share cache - - fleet deployments -- most machines are mostly the same and derived - - faster startup due to warmed cache from other machines booting? - - container-like sharing of base images, potential performance advantage -- disaggregation of backend - -2023-12-12 talk with vincent from IBM: - -Hey guys, I think we should have a series of 20 minute presentations. This is a -good deadline 12/12 to have results on the various efforts. 
Perhaps Peter and I -giving an overview of the group vision, the larger opportunity that pulls -together LSVD and D4N, how this ties into data center architecture/MOC/AI, and -then a series of targeted talks on: - -- Performance of LSVD for block and file system benchmarks -- LSVD re-organizing images for fast boot -- LSVD sharing of cache state -- D4N implementation and initial performance results -- D4N locality integrated into k8s - - -## old todo list - -`lsvd.cc`: -- implement `rbd_aio_discard` -- implement `rbd_aio_flush` -- `rbd_aio_req` - merge with lsvd_completion -- `rbd_remove` - find and remove cache file - -`read_cache.cc`: -- implement CLOCK replacement -- lots of notes that documentation needs to be added -- **code cleanup** - stop using saved superblock for runtime variables - -`translate.cc`: -- coalesce writes -- improved GC - -`write_cache.cc`: -- **batching** - account for write size (not just number of writes) in batching -- **code cleanup** - stop using saved superblock for runtime variables - -**unused fields in superblock** - `total_sectors`, `live_sectors`, `next_object` - either use these or delete them. - -**putting data into read cache** - do this from translate when it's writing a batch. Data will get written twice, but there won't be a read operation. Make the write cache a fixed size, dependent on backend write window and batch size, and read cache takes all the rest of the space - -**improved GC** - interface to get data from read cache, also decide whether to do partial or full read of an object. - -**clone volumes** -- create script for clone. note that we - -**snapshots** - -# all old stuff below here - -[DONE]**GET FIO WORKING** -- read seems to have regressed totally -- write seems to hang on completion when compiled with -O3 - -## config file -[DONE] -things to go here: -- write batch size -- write window outstanding -- directory for cache files -- number of write cache threads -- which backend to use? 
- -Can probably have sections that override settings on a per-volume basis - - -## cache file handling - -[NO] **split read/write** - if we're going to use files, there's no reason why the two caches can't go in different files. -(yes there is - it makes it harder to use a partition) - -[DONE] **naming** - default name is volume UUID, or "UUID.rcache", "UUID.wcache" - -[DONE] **creation** - volume startup should be able to create cache if none exists -note that we still don't handle malformed cache files - -## other stuff - -[DONE] **write cache sequence number** - need to record this in the backend so that we can implement write cache roll forward properly. - -[DONE] **read blocking** - need to block interfering reads during GC. (note - this can be done just by blocking access to any objects which haven't finished being written out, and this works for misses in the write cache, too) - -[DONE] **garbage collection** - need to make it work properly, then test it - -[DONE] **write pacing** - implement pacing for the backend. - -Note - I had been thinking about having the RBD level (`lsvd.cc`) pass data to the translation layer after write cache completion, but this won't work, as it won't preserve the write ordering in the cache. It will result in a *legal* ordering, but if the backend and cache differ, volume could change after crash recovery. - -[DONE] **top-level structure** - `fake_rbd_image` was just an -afterthought. Need to re-architect it properly. - -**merged caches** - Is there any way we can move stuff from the write cache to the read cache? - -**race conditions** - scrub notification methods to look for race conditions like the read cache one. - -[DONE] Checkpoint list weirdness: -``` - ckpts: 68 : 4294967295 -``` - -translate threads - do we actually need multiple threads, since we're -using async calls? probably not. - -any other parameters that should go in the config file? 
- -## list of TODO comments in code - -`io.cc`: -- [DONE] `e_iocb` instance is self-deleting, fix this - -requests in general: -- `request->run` - should this return success/error? -- [YES] is there any use for `req->wait` method? - -`rados_backend.cc`: -- should `rados_backend` take an ioctx rather than using pool in the prefix? -- conversely - handle multiple pools in rados backend -- [DONE] shut down RADOS state on rados backend delete - -`lsvd.cc`: -- implement `rbd_aio_discard` -- implement `rbd_aio_flush` -- implement `rbd_aio_readv`, `rbd_aio_writev` -- [DONE] `rbd_aio_req` - clean up the state machine -- `rbd_aio_req` - merge with lsvd_completion - -`read_cache.cc`: -- implement CLOCK replacement -- lots of notes that documentation needs to be added - -`translate.cc`: -- [DONE] get rid of global UUID -- [DONE] initialize `last_ckpt` - done? need to check -- `translate_impl::worker_thread` - coalesce writes -- GC in general, also something about the objmap lock... - -`write_cache.cc`: -- [DONE] it looks like we might not be freeing `wcache_write_req` properly? -- [DONE] switch metadata regions when writing checkpoint -- something about `super_copy->next` -- [DONE] `roll_log_forward` -- [DONE] **write throttling** - need to (a) bound number of outstanding NVMe - writes, (b) avoid "hanging" writes due to batching -- [DONE] **shutdown** - flush all writes and checkpoint before closing -- **batching** - account for write size (not just number of writes) in batching -- **code cleanup** - stop using saved superblock for runtime variables - -## performance - -`io_uring` - should shift from libaio to `io_uring` - currently taking -as much CPU for `io_submit` as for the rest of LSVD. Or can I just get -libaio to work correctly? Currently 5% of CPU (out of total 38.5% used -by LSVD) is going to `usleep`. - -**locking** - can we use shared lock for `get_room`? any other locking -fixes to get rid of overhead? - -## write cache CRC - -should I do it? 
code is: -``` - #include - uint32_t crc2 = ~crc32(-1, (unsigned char*)h, 4096); - -``` -check the CRC by saving a copy, zeroing it out again, and recomputing. - -Standard Linux zlib CRC32 isn't all that fast - 750MB/s on the old E5-2660 v2 machines, 2.1GB/s on the new Ryzen. Cloudflare zlib (https://github.com/cloudflare/zlib) is **way** faster - 21GB/s on the Ryzen and 1.9GB/s on the old HP machines. - -## write recovery - -Right now write cache log recovery is a mess. Ways to fix it: - -[DONE] **clean shutdown** - add a clean shutdown flag; if it's set, we read the lengths and map from the metadata section and we're done. - -[DONE] **brute force** - on startup seach the entire cache to find the beginning of the journal, then roll it forward, updating the map and sending all the data to the backend. - -**translation layer assist** - translation layer tracks sequence numbers (in write cache) of each write, and provides an interface to get the highest sequence number s.t. all writes before that have committed to the backend. Log replay is: -- read all headers to find start of log -- read headers starting at start of log to update cache map -- for any newer than recorded max confirmed (minus one), send to the backend again. - -filtering out spurious journal entries with brute force - basically we go through the cache looking for blocks where the magic number is ok, and the starting sequence number is the lowest one we encounter. - -But... there might be a spurious one. We can handle this by looking at it as the search for the start of the sequence after the gap, starting with the oldest sequence number. If the we find a block *b* with sequence number lower than all seen so far, it's a tentative start to the log. Check *b+len* etc. all the way to the end of the log, checking magic number and consecutive sequence numbers the whole way. 
If we stop partway, then this was a false start - throw away the sequence number information and start scanning for the magic number at *b+1*. - -Note that it can only stop before the end if it begins at *b=0*. - -**current status (10/21)** - has dirty/clean flag, checkpoints only on clean shutdown, does brute force to recover cache state and write it all to backend. - -Remaining crash recovery optimizations: -- periodic checkpointing when using large caches so we don't need to go through entire cache. (is this necessary?) -- translation layer assist to avoid replaying entire cache to backend on crash recovery - -## translation layer startup issues - -after several crash/restart cycles, getting assertion failure due to object overwrite - -## gc ideas - -add a GC generation field to objects - diff --git a/docs/testing.md b/docs/testing.md deleted file mode 100644 index fa054c3e..00000000 --- a/docs/testing.md +++ /dev/null @@ -1,12 +0,0 @@ -# Tests - -The tests are all in the `test/` directory. Many of them haven't been run -in many releases and may not work. 
- -Compile them with `make debug` - -Known working tests: - -- `test/test-seq.cc` - functional test, will create an image, write to it -sequentially, and read it back and check that the contents are the same as what -was written diff --git a/src/meson.build b/src/meson.build index d43a21d3..cecd0c57 100644 --- a/src/meson.build +++ b/src/meson.build @@ -31,6 +31,8 @@ lsvd_deps = [ dependency('uuid'), dependency('nlohmann_json'), cxx.find_library('rados', required: true), + cxx.find_library('rbd', required: true), + cxx.find_library('aio', required: true), cxx.find_library('jemalloc', required: false), ] diff --git a/subprojects/packagefiles/spdk/meson.build b/subprojects/packagefiles/spdk/meson.build index 8c57dbfe..b67d3d38 100644 --- a/subprojects/packagefiles/spdk/meson.build +++ b/subprojects/packagefiles/spdk/meson.build @@ -34,77 +34,78 @@ message('Setting up dependency ..') message('build_subprojects:', get_option('build_subprojects')) custom_libnames = [ - 'spdk_event', - 'spdk_env_dpdk_rpc', - 'spdk_event_bdev', + 'isal', + 'isal_crypto', + 'rte_bus_pci', + 'rte_cryptodev', + 'rte_dmadev', + 'rte_eal', + 'rte_ethdev', + 'rte_hash', + 'rte_kvargs', + 'rte_log', + 'rte_mbuf', + 'rte_mempool', + 'rte_mempool_ring', + 'rte_net', + 'rte_pci', + 'rte_power', + 'rte_rcu', + 'rte_ring', + 'rte_telemetry', + 'rte_vhost', + 'spdk_accel', + 'spdk_accel_error', + 'spdk_accel_ioat', 'spdk_bdev', - 'spdk_notify', - # 'spdk_bdev_malloc', - # 'spdk_bdev_null', - # 'spdk_bdev_nvme', - # 'spdk_bdev_passthru', - # 'spdk_bdev_lvol', - # 'spdk_bdev_raid', - # 'spdk_bdev_error', - # 'spdk_bdev_gpt', - # 'spdk_bdev_split', - # 'spdk_bdev_delay', - # 'spdk_bdev_zone_block', - # 'spdk_bdev_rbd', - 'spdk_blobfs_bdev', - 'spdk_blobfs', - 'spdk_blob_bdev', - 'spdk_lvol', - 'spdk_blob', - 'spdk_nvme', - 'spdk_nvmf', - # 'spdk_bdev_aio', + 'spdk_bdev_aio', + 'spdk_bdev_delay', + 'spdk_bdev_error', 'spdk_bdev_ftl', - 'spdk_ftl', + 'spdk_bdev_gpt', + 'spdk_bdev_lvol', + 
'spdk_bdev_malloc', + 'spdk_bdev_null', + 'spdk_bdev_nvme', + 'spdk_bdev_passthru', + 'spdk_bdev_raid', + 'spdk_bdev_rbd', + 'spdk_bdev_split', 'spdk_bdev_virtio', - 'spdk_virtio', - 'spdk_vfio_user', - 'spdk_event_accel', - 'spdk_accel', + 'spdk_bdev_zone_block', + 'spdk_blob', + 'spdk_blob_bdev', + 'spdk_blobfs', + 'spdk_blobfs_bdev', 'spdk_dma', - 'spdk_accel_error', - 'spdk_accel_ioat', - 'spdk_ioat', - 'spdk_event_vmd', - 'spdk_vmd', + 'spdk_env_dpdk', + 'spdk_env_dpdk_rpc', + 'spdk_event', + 'spdk_event_accel', + 'spdk_event_bdev', + 'spdk_event_iobuf', + 'spdk_event_nvmf', 'spdk_event_sock', + 'spdk_event_vmd', + 'spdk_ftl', + 'spdk_init', + 'spdk_ioat', + 'spdk_json', + 'spdk_jsonrpc', + 'spdk_log', + 'spdk_lvol', + 'spdk_notify', + 'spdk_nvme', + 'spdk_nvmf', + 'spdk_rpc', 'spdk_sock', 'spdk_sock_posix', - 'spdk_event_iobuf', - 'spdk_init', 'spdk_thread', 'spdk_trace', - 'spdk_rpc', - 'spdk_jsonrpc', - 'spdk_json', - 'spdk_env_dpdk', 'spdk_util', - 'spdk_log', - 'rte_eal', - 'rte_mempool', - 'rte_ring', - 'rte_mbuf', - 'rte_bus_pci', - 'rte_pci', - 'rte_mempool_ring', - 'rte_telemetry', - 'rte_kvargs', - 'rte_rcu', - 'rte_power', - 'rte_ethdev', - 'rte_vhost', - 'rte_net', - 'rte_dmadev', - 'rte_cryptodev', - 'rte_hash', - 'rte_log', - 'isal', - 'isal_crypto', + 'spdk_vfio_user', + 'spdk_virtio', + 'spdk_vmd', ] spdk_deps = [