From 78326efb4638018231fd9b60891112235d54d089 Mon Sep 17 00:00:00 2001 From: Vladislav Grubov Date: Thu, 8 Dec 2022 18:05:30 +0300 Subject: [PATCH] auto step down when ETCD is not reachable * This patch introduces background fiber config._fencing_f which auto enables on rw nodes and watches ETCD * It is executed after on_after_cfg callback and watches //clusters//master path in ETCD during etcd.fencing_timeout (default: 10s) * Fiber is enabled only in topology etcd.cluster.master and only if etcd.fencing_enabled flag is specified (in conf.lua or in common config) * Since network drops are indistinguishable from HTTP timeouts after each :wait() time out, fencing rechecks ETCD via :list() method * If nothing changed (99% chance) fiber will try again after fencing_timeout * If another node is specified in ETCD, node will automatically steps down (executes box.cfg{read_only=true}) and will wait to become master * fencing fiber never returns node to be rw (it never calls box.cfg{read_only=false} or package.reload()) * to bring instance back rw you should manually call package.reload() or use switchover. --- README.md | 544 +++++++++++++++++++++++++++++++++++++--- config.lua | 197 +++++++++++++++ test/Dockerfile | 6 + test/app/conf.lua | 19 ++ test/app/init.lua | 45 ++++ test/docker-compose.yml | 60 +++++ test/instance.etcd.yaml | 26 ++ test/net/Makefile | 22 ++ test/net/README.md | 81 ++++++ 9 files changed, 967 insertions(+), 33 deletions(-) create mode 100644 test/Dockerfile create mode 100644 test/app/conf.lua create mode 100644 test/app/init.lua create mode 100644 test/docker-compose.yml create mode 100644 test/instance.etcd.yaml create mode 100644 test/net/Makefile create mode 100644 test/net/README.md diff --git a/README.md b/README.md index 66ebaef..e5befb9 100644 --- a/README.md +++ b/README.md @@ -1,61 +1,539 @@ -Having conf.lua +# Config + +Module to make proper initialization and configuration of tarantool instance. + +It can be used with or without ETCD. + +Only ETCD APIv2 now supported. + +## Status + +Ready for production use. + +Latest stable release: `config 0.6.0`. + +## Installation + +```bash +tarantoolctl rocks --server=https://moonlibs.org install config 0.6.0 +``` + +Starting with Tarantool 2.10.0 you may add configuration of moonlibs.org into `config-5.1.lua` + +```bash +$ cat .rocks/config-5.1.lua +rocks_servers = { + "https://moonlibs.org", + "http://moonlibs.github.io/rocks", + "http://rocks.tarantool.org/", + "http://luarocks.org/repositories/rocks" +} +``` + +## Configuration + +To configure tarantool instance you must deploy `conf.lua` file. + +### Example of `conf.lua` + +Typically conf.lua should be located in `/etc//conf.lua`. ```lua +assert(instance_name, "instance_name must be defined") +etcd = { + instance_name = instance_name, + prefix = '/etcd/path/to/application/etcd', + endpoints = { + "https://etcd1:2379", + "https://etcd2:2379", + "https://etcd3:2379", + }, + timeout = 3, + boolean_auto = true, + print_config = true, + login = 'etcd-username', + password = 'etcd-password', +} + +-- This options will be passed as is to box.cfg box = { - work_dir = '.'; - pid_file = 'box.pid'; - custom_proc_title = 'm1'; - background = false; - slab_alloc_arena = 0.1; - --- Networking. Dynamic --- - listen = '127.0.0.1:3013', - readahead = 65536, + pid_file = '/var/run/tarantool/'..instance_name..'.pid', + memtx_dir = '/var/lib/tarantool/snaps/' .. instance_name, + wal_dir = '/var/lib/tarantool/xlogs/' .. 
instance_name,
+    log_nonblock = false,
 }
-console = {
-    listen = '127.0.0.1:3016'
+
+--- You may hardcode options for your application in the `app` section
+app = {
+
 }
-include 'app.lua'
 ```
-and app.lua:
+### Usage in `init.lua`

 ```lua
-app = {
-    pool = {
-        { uri = '127.0.0.1:3013', zone = '1' };
-        { uri = '127.0.0.2:3013', zone = '2' };
-        { uri = '127.0.0.3:3013', zone = '3' };
-    }
+
+local instance_name = os.getenv('TT_INSTANCE_NAME')
+
+require 'config' {
+    mkdir = true,
+    instance_name = instance_name,
+    file = '/etc//conf.lua',
+    master_selection_policy = 'etcd.cluster.master',
+}
+
+print("Tarantool bootstrapped")
+```
+
+## Usage

+The config module is used both to bootstrap and to configure your Tarantool application.
+
+Inside the application you can access config options with the following syntax:
+
+```lua
+local DEFAULT_TIMEOUT = 3
+
+--- If app/http/timeout is defined in the config (ETCD or conf.lua) it is returned,
+--- otherwise the value of DEFAULT_TIMEOUT is returned.
+local http_timeout = config.get('app.http.timeout', DEFAULT_TIMEOUT)
+
+--- If app/is_enabled is not defined then `nil` is returned.
+local is_enabled = config.get('app.is_enabled')
+```
+
+## Topologies
+
+`moonlibs/config` supports different types of Tarantool topologies.
+
+All of them make sense when the application is configured via ETCD.
+
+The option `master_selection_policy` is used to distinguish the application topology.
+
+### Single-shard topology
+
+In most cases you need a single-shard topology: your application has a single master and many replicas.
+
+The shard is configured with a full-mesh topology. Read more about full-mesh replication on the [Tarantool website](https://www.tarantool.io/en/doc/latest/concepts/replication/repl_architecture/).
+
+Each instance of the application must have a unique name. For example:
+
+- `userdb_001`
+- `userdb_002`
+- `userdb_003`
+
+Typically the instance name **should not** contain the word `master` or `replica`.
+
+#### Example of `init.lua`
+
+```lua
+--- The variable instance_name must be derived somehow for each tarantool instance,
+--- for example from the name of the file or from an environment variable.
+require 'config' {
+    mkdir = true,
+    instance_name = instance_name,
+    file = '/etc/userdb/conf.lua',
+    master_selection_policy = 'etcd.cluster.master',
+}
+```
+
+#### Example of `/etc/userdb/conf.lua`
+
+```lua
+assert(instance_name, "instance_name must be defined")
+etcd = {
+    instance_name = instance_name,
+    prefix = '/tarantool/userdb',
+    endpoints = {
+        "https://etcd1:2379",
+        "https://etcd2:2379",
+        "https://etcd3:2379",
+    },
+    timeout = 3,
+    boolean_auto = true,
+    print_config = true,
 }
+-- These options are passed as-is to box.cfg
+box = {
+    pid_file = '/var/run/tarantool/'..instance_name..'.pid',
+    memtx_dir = '/var/lib/tarantool/snaps/' .. instance_name,
+    wal_dir = '/var/lib/tarantool/xlogs/' .. instance_name,
+    log_nonblock = false,
+}
+```
+
+#### Example of ETCD configuration (`etcd.cluster.master`)
+
+```yaml
+tarantool:
+  userdb:
+    clusters:
+      userdb:
+        master: userdb_001
+        replicaset_uuid: 045e12d8-0001-0000-0000-000000000000
+    common:
+      box:
+        log_level: 5
+        memtx_memory: 268435456
+    instances:
+      userdb_001:
+        cluster: userdb
+        box:
+          instance_uuid: 045e12d8-0000-0001-0000-000000000000
+          listen: 10.0.1.11:3301
+      userdb_002:
+        cluster: userdb
+        box:
+          instance_uuid: 045e12d8-0000-0002-0000-000000000000
+          listen: 10.0.1.12:3302
+      userdb_003:
+        cluster: userdb
+        box:
+          instance_uuid: 045e12d8-0000-0003-0000-000000000000
+          listen: 10.0.1.13:3303
+```
+
+`/tarantool/userdb` -- the root path of the application configuration.
+
+`/tarantool/userdb/common` -- configuration shared by every instance of the application.
+
+`/tarantool/userdb/common/box` -- the section with box.cfg parameters. See more on the [Tarantool website](https://www.tarantool.io/en/doc/latest/reference/configuration).
+
+`/tarantool/userdb/clusters` -- the section with the list of shards. For a single-shard application it is convenient to name the single shard after the application itself.
+
+`/tarantool/userdb/instances` -- the section with instance-specific configuration. It must contain the `/box/{listen,instance_uuid}` and `cluster` options.
+
+##### Configuration precedence
+
+- /etc/app-name/conf.lua
+- ETCD:/instances/
+- ETCD:/common/
+- config.get default value
+
+#### Fencing configuration
+
+The `etcd.cluster.master` topology supports an auto-fencing mechanism.
+
+Auto fencing is implemented via a background fiber which waits for changes on the `/clusters/` directory.
+
+There are 4 parameters to configure:
+
+| Parameter                        | Description                            | Default Value       |
+|----------------------------------|----------------------------------------|---------------------|
+| `etcd/fencing_enabled`           | Trigger to enable/disable fencing      | `false`             |
+| `etcd/fencing_timeout`           | Fencing timeout                        | `10` (seconds)      |
+| `etcd/fencing_pause`             | Fencing pause                          | `fencing_timeout/2` |
+| `etcd/fencing_check_replication` | Respect replication when ETCD is down  | `false`             |
+
+Example of enabled fencing:
+
+```yaml
+tarantool:
+  userdb:
+    common:
+      etcd:
+        fencing_enabled: true
 ```
-in init.lua
+Fencing can also be enabled in `conf.lua`:

 ```lua
-local conf = require 'config' ('conf.lua') -- call to conf loads config
+etcd = {
+    endpoints = {"http://etcd1:2379", "http://etcd2:2379", "http://etcd3:2379"},
+    prefix = "/tarantool/userdb",
+    timeout = 3,
+    fencing_enabled = true,
+}
+```
+
+#### Fencing algorithm
+
+Fencing can be enabled only for the `etcd.cluster.master` topology and only if `etcd/fencing_enabled` is `true` (default: `false`).
-local pool = conf.get('app.pool',{})
+
+The fencing algorithm is as follows (see the condensed sketch below):
+
+0. Wait until the instance becomes `rw`.
+1. Wait a randomized `fencing_pause` (fencing_pause ± 500ms).
+2. Recheck ETCD `/clusters/` within `fencing_timeout`.
+3. Depending on the response:
+   1. [ETCD is ok] => consider self `rw` for the next `fencing_timeout` seconds. Go to `1.`
+   2. [ETCD is down] => execute `box.cfg{read_only=true}` if `etcd/fencing_check_replication` is disabled. Go to `0.`
+   3. [ETCD has another master, and switching is in progress] => do nothing. Go to `1.`
+   4. [ETCD has another master, and switching is not in progress] => execute `box.cfg{read_only=true}`. Go to `0.`
+
+**Note:** quorum reads are used for ETCD requests, so the check is safe even during a split brain.
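+
+A condensed sketch of the loop (simplified; the real implementation in `config.lua`
+below additionally tracks ETCD watch indexes, fiber generations and the optional
+replication fallback; `fencing_check()` stands for the ETCD recheck described in
+step `3.`):
+
+```lua
+while true do
+    -- 0. wait until this instance becomes rw
+    while box.info.ro do pcall(box.ctl.wait_rw, 3) end
+
+    local deadline = fiber.time() + fencing_timeout
+    repeat
+        -- 1. randomized pause so that replicas do not query ETCD in lockstep
+        fiber.sleep(math.random(math.max(0.5, fencing_pause - 0.5), fencing_pause + 0.5))
+        if box.info.ro then break end
+        -- 2./3. a successful ETCD check extends the leadership lease
+        if fencing_check(deadline) then
+            deadline = fiber.time() + fencing_timeout
+        end
+    until box.info.ro or fiber.time() > deadline
+
+    -- the lease has expired or ETCD names another master: step down
+    if not box.info.ro then
+        box.cfg{ read_only = true }
+    end
+end
+```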
+
+### Multi-proxy topology (etcd.instance.single)
+
+`moonlibs/config` also supports a multi-proxy topology.
This topology is usefull when you need to have many stateless tarantool proxies or totally independent masters. + +Each instance **should** have unique name. For example: + +- proxy_001 +- proxy_002 +- proxy_003 +- proxy_004 +- proxy_005 + +#### Example of proxy `init.lua` + +```lua +--- variable instance_name must be derived somehow for each tarantool instance +--- For example from name of the file. or from environment variable +require 'config' { + mkdir = true, + instance_name = instance_name, + file = '/etc/proxy/conf.lua', + master_selection_policy = 'etcd.instance.single', +} ``` -or anywhere in application module +#### Example of `/etc/proxy/conf.lua` ```lua -local conf = require 'config' +assert(instance_name, "instance_name must be defined") +etcd = { + instance_name = instance_name, + prefix = '/tarantool/proxy', + endpoints = { + "https://etcd1:2379", + "https://etcd2:2379", + "https://etcd3:2379", + }, + timeout = 3, + boolean_auto = true, + print_config = true, +} + +-- This options will be passed as is to box.cfg +box = { + pid_file = '/var/run/tarantool/'..instance_name..'.pid', + memtx_dir = '/var/lib/tarantool/snaps/' .. instance_name, + wal_dir = '/var/lib/tarantool/xlogs/' .. instance_name, + log_nonblock = false, +} +``` -local pool = conf.get('app.pool',{}) +#### Example of ETCD configuration (`etcd.instance.single`) + +```yaml +tarantool: + proxy: + common: + box: + log_level: 5 + memtx_memory: 33554432 + instances: + proxy_001: + box: + instance_uuid: 01712087-0000-0001-0000-000000000000 + listen: 10.0.2.12:7101 + proxy_002: + box: + instance_uuid: 01712087-0000-0002-0000-000000000000 + listen: 10.0.2.13:7102 + proxy_003: + box: + instance_uuid: 01712087-0000-0003-0000-000000000000 + listen: 10.0.2.11:7103 ``` -then we could run +The etcd configuration is the same as `etcd.cluster.master` except that `/tarantool/proxy/clusters` is not defined. + +Also `/tarantool/proxy/instances//cluster` **must not** be defined. + +### Multi-shard topology for custom sharding (`etcd.cluster.master`) + +`etcd.cluster.master` can be used for multi-shard topologies as well. -```sh -tarantool init.lua -# runs tarantool with conf.lua +Multi-shard means that application consists of several replicasets. Each replicaset has single master and several replicas. + +`conf.lua` and `init.lua` files remains exactly the same. But configuration of ETCD slightly changes: + +```yaml +tarantool: + notifications: + clusters: + notifications_002: + master: notifications_002_01 + replicaset_uuid: 11079f9c-0002-0000-0000-000000000000 + notifications_001: + master: notifications_001_01 + replicaset_uuid: 11079f9c-0001-0000-0000-000000000000 + common: + box: + log_level: 5 + memtx_memory: 268435456 + instances: + notifications_001_01: + cluster: notifications_001 + box: + instance_uuid: 11079f9c-0001-0001-0000-000000000000 + listen: 10.0.3.11:4011 + notifications_001_02: + cluster: notifications_001 + box: + instance_uuid: 11079f9c-0001-0002-0000-000000000000 + listen: 10.0.3.12:4012 + notifications_002_01: + cluster: notifications_002 + box: + instance_uuid: 11079f9c-0002-0001-0000-000000000000 + listen: 10.0.3.11:4021 + notifications_002_02: + cluster: notifications_002 + box: + instance_uuid: 11079f9c-0002-0002-0000-000000000000 + listen: 10.0.3.12:4022 ``` -or +This configuration describes configuration of application `notifications` with 2 replicasets `notifications_001` and `notifications_002`. 
+ +Shard `notifications_001` contains 2 nodes: + +- `notifications_001_01` - described as master +- `notifications_001_02` -```sh -tarantool -c cf1.lua init.lua -# runs tarantool with cf1.lua +Shard `notifications_002` contains 2 nodes: + +- `notifications_002_01` - described as master +- `notifications_002_02` + +### Multi-shard topology for vshard-based applications (`etcd.cluster.vshard`) + +In most cases for multi-shard applications it is better to use module [tarantool/vshard](https://www.tarantool.io/en/doc/latest/concepts/sharding). + +vshard required to be properly configured. Each instance of the cluster must contain the same view of cluster topology. + +vshard application has 2 groups of instances: storages (data nodes) and routers (stateless proxy nodes). + +#### Example of ETCD configuration for vshard-based applications (`etcd.cluster.vshard`) + +```yaml +tarantool: + profile: + common: + vshard: + bucket_count: 30000 + box: + log_level: 5 + replication_connect_quorum: 2 + clusters: + profile_001: + master: profile_001_01 + replicaset_uuid: 17120f91-0001-0000-0000-000000000000 + profile_002: + master: profile_002_01 + replicaset_uuid: 17120f91-0002-0000-0000-000000000000 + instances: + profile_001_01: + cluster: profile_001 + box: + instance_uuid: 17120f91-0001-0001-0000-000000000000 + listen: 10.0.4.11:4011 + profile_001_02: + cluster: profile_001 + box: + instance_uuid: 17120f91-0001-0002-0000-000000000000 + listen: 10.0.4.12:4012 + profile_002_01: + cluster: profile_002 + box: + instance_uuid: 17120f91-0002-0001-0000-000000000000 + listen: 10.0.4.11:4021 + profile_002_02: + cluster: profile_002 + box: + instance_uuid: 17120f91-0002-0002-0000-000000000000 + listen: 10.0.4.12:4022 + router_001: + router: true + box: + instance_uuid: 12047e12-0000-0001-0000-000000000000 + listen: 10.0.5.12:7001 + router_002: + router: true + box: + instance_uuid: 12047e12-0000-0002-0000-000000000000 + listen: 10.0.5.13:7002 + router_003: + router: true + box: + instance_uuid: 12047e12-0000-0003-0000-000000000000 + listen: 10.0.5.11:7003 ``` + +#### Example of vshard-based init.lua (`etcd.cluster.vshard`) + +The code of simultanious bootstrap is tricky, and short safe version of it listed below + +```lua +local fun = require 'fun' +--- variable instance_name must be derived somehow for each tarantool instance +--- For example from name of the file. 
or from an environment variable.
+require 'config' {
+    mkdir = true,
+    instance_name = instance_name,
+    file = '/etc/profile/conf.lua',
+    master_selection_policy = 'etcd.cluster.vshard',
+    on_load = function(conf, cfg)
+        -- on_load is called each time, right after the data is fetched from ETCD
+        local all_cfg = conf.etcd:get_all()
+
+        -- Construct the vshard `sharding` table from ETCD
+        cfg.sharding = fun.iter(all_cfg.clusters)
+            :map(function(shard_name, shard_info)
+                return shard_info.replicaset_uuid, {
+                    replicas = fun.iter(all_cfg.instances)
+                        :grep(function(instance_name, instance_info)
+                            return instance_info.cluster == shard_name
+                        end)
+                        :map(function(instance_name, instance_info)
+                            return instance_info.box.instance_uuid, {
+                                name = instance_name,
+                                uri = 'guest:@'..instance_info.box.listen,
+                                master = instance_name == shard_info.master,
+                            }
+                        end)
+                        :tomap()
+                }
+            end)
+            :tomap()
+    end,
+    on_after_cfg = function(conf, cfg)
+        -- on_after_cfg is called once, after box.cfg returns (Tarantool is already online)
+        if cfg.cluster then
+            vshard.storage.cfg({
+                sharding = cfg.sharding,
+                bucket_count = config.get('vshard.bucket_count'),
+            }, box.info.uuid)
+        end
+        if cfg.router then
+            vshard.router.cfg({
+                sharding = cfg.sharding,
+                bucket_count = config.get('vshard.bucket_count'),
+            })
+        end
+    end,
+}
+```
+
+#### VShard Maintenance
+
+By default vshard does not provide master auto-discovery: if you switch the master in any replicaset, you have to reconfigure the routers as well (see the `package.reload` example below).
+
+With a vshard topology it is strongly recommended to use [package.reload](https://github.com/moonlibs/package-reload). The module must be required before the first require of `config`.
+
+```lua
+require 'package.reload'
+-- ....
+require 'config' {
+    -- ...
+}
+-- ...
+```
+
+It is good to use [switchover](https://gitlab.com/ochaton/switchover) to maintain sharded applications.
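+
+With the callbacks from the example above, re-reading the configuration is enough
+to pick up a changed master: `on_load` rebuilds `cfg.sharding` from ETCD and
+`on_after_cfg` re-applies the vshard configuration. A minimal sketch (assuming
+`package.reload` is in use, as recommended above):
+
+```lua
+-- on a router (or storage), after the master has been changed in ETCD:
+package.reload() -- re-reads ETCD and re-runs on_load/on_after_cfg
+```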
+ +To get used to vshard please read getting started of it [Sharding with Vshard](https://www.tarantool.io/en/doc/latest/book/admin/vshard_admin/#vshard-install) diff --git a/config.lua b/config.lua index 9577f0d..07b62c3 100644 --- a/config.lua +++ b/config.lua @@ -3,6 +3,7 @@ local fio = require 'fio' local json = require 'json' local yaml = require 'yaml' local digest = require 'digest' +local fiber = require 'fiber' json.cfg{ encode_invalid_as_nil = true } local function lookaround(fun) @@ -922,6 +923,202 @@ local M end -- print(string.format("Box configured")) + local msp = config.get('sys.master_selection_policy') + if type(cfg.etcd) == 'table' + and config.get('etcd.fencing_enabled') + and msp == 'etcd.cluster.master' + and type(cfg.cluster) == 'string' and cfg.cluster ~= '' + then + M._fencing_f = fiber.create(function() + fiber.name('config/fencing') + fiber.yield() -- yield execution + local function in_my_gen() fiber.testcancel() return config._fencing_f == fiber.self() end + assert(cfg.cluster, "cfg.cluster must be defined") + + local watch_path = fio.pathjoin( + config.get('etcd.prefix'), + 'clusters', + cfg.cluster + ) + + local my_name = assert(config.get('sys.instance_name'), "instance_name is not defined") + local fencing_timeout = config.get('etcd.fencing_timeout', 10) + local fencing_pause = config.get('etcd.fencing_pause', fencing_timeout/2) + local fencing_check_replication = config.get('etcd.fencing_check_replication') + if type(fencing_check_replication) == 'string' then + fencing_check_replication = fencing_check_replication == 'true' + else + fencing_check_replication = fencing_check_replication == true + end + + local etcd_cluster, watch_index + + local function refresh_list() + local result, resp = config.etcd:list(watch_path) + if resp.status == 200 then + etcd_cluster = result + if type(resp.headers) == 'table' + and tonumber(resp.headers['x-etcd-index']) + and tonumber(resp.headers['x-etcd-index']) > (tonumber(watch_index) or 0) + then + watch_index = tonumber(resp.headers['x-etcd-index']) + end + end + return etcd_cluster, watch_index + end + + local function fencing_check(deadline) + local timeout = math.min(deadline-fiber.time(), fencing_timeout) + local check_started = fiber.time() + local pcall_ok, err_or_resolution, new_cluster = pcall(function() + local not_timed_out, response = config.etcd:wait(watch_path, { + index = watch_index, + timeout = timeout, + }) + + -- http timed out / our network drop - we'll never know + if not not_timed_out then return 'timeout' end + local res = json.decode(response.body) + + if type(response.headers) == 'table' + and tonumber(response.headers['x-etcd-index']) + and tonumber(response.headers['x-etcd-index']) > watch_index + then + watch_index = tonumber(response.headers['x-etcd-index']) + end + + if res.node then + return 'changed', config.etcd:recursive_extract(watch_path, res.node) + end + end) + + if not pcall_ok then + log.warn("ETCD watch failed: %s", err_or_resolution) + end + + if err_or_resolution ~= 'changed' then + new_cluster = nil + end + + if not new_cluster then + deadline = deadline+fencing_timeout + while fiber.time() < deadline and in_my_gen() do + local ok, e_cluster = pcall(refresh_list) + if ok and e_cluster then + new_cluster = e_cluster + break + end + if not in_my_gen() then return end + fiber.sleep(fencing_pause / 10) + end + end + + if not in_my_gen() then return end + + if type(new_cluster) ~= 'table' then -- ETCD is down + log.warn('[fencing] ETCD %s is not discovered in etcd during %s 
seconds',
+						watch_path, fiber.time()-check_started)
+
+					if not fencing_check_replication then
+						return false
+					end
+
+					-- With strict fencing we would have to step down as soon as we
+					-- discover that the coordinator is down. But in the real world the
+					-- coordinator may be unreachable for several seconds because of a
+					-- network hiccup or a problem in ETCD itself.
+					-- We assume it is safe not to step down as long as we are connected
+					-- to all replicas in the replicaset (etcd.cluster.master is a
+					-- full-mesh topology).
+					-- We do not check downstreams here, because downstreams cannot lead
+					-- to collisions.
+					-- If at least 1 upstream is not in the 'follow' state
+					-- (Tarantool replication verifies it with tcp healthchecks once per
+					-- box.cfg.replication_timeout), we step down immediately.
+					for _, ru in pairs(box.info.replication) do
+						if ru.id ~= box.info.id and ru.upstream then
+							if ru.upstream.status ~= "follow" then
+								log.warn("[fencing] upstream %s is not followed by me %s:%s (idle: %s, lag:%s)",
+									ru.upstream.peer, ru.upstream.status, ru.upstream.message,
+									ru.upstream.idle, ru.upstream.lag
+								)
+								return false
+							end
+						end
+					end
+
+					log.warn('[fencing] ETCD is down but all upstreams are followed by me. Continuing leadership')
+					return true
+				elseif new_cluster.master == my_name then
+					-- The most common branch: we are registered as the leader.
+					return true
+				elseif new_cluster.switchover then -- new_cluster.master ~= my_name
+					-- Another instance is the leader in ETCD, but we could be the one
+					-- who becomes the next leader (the cluster is being switched right now).
+					-- It is almost impossible to hit this path in production; the only
+					-- protection we have is `fencing_pause` and `fencing_timeout`.
+					-- So we do nothing while the ETCD switchover mutex is present.
+					log.warn('[fencing] It seems that cluster is under switchover right now %s', json.encode(new_cluster))
+					-- (if we are ro -- then we must end the loop)
+					-- (if we are rw -- then we must continue the loop)
+					return not box.info.ro
+				else
+					log.warn('[fencing] ETCD %s/master is %s not us. Stepping down', watch_path, new_cluster.master)
+					return false
+				end
+			end
+
+			if not pcall(refresh_list) then
+				log.warn("etcd list failed")
+			end
+			log.info("etcd_master is %s (index: %s)", json.encode(etcd_cluster), watch_index)
+
+			-- Main fencing loop.
+			-- It is executed on every replica in the shard:
+			-- while the instance is ro it simply waits for it to become rw.
+			while in_my_gen() do
+				-- wait until the instance becomes rw
+				while box.info.ro and in_my_gen() do
+					-- this is just a fancy sleep: if the node becomes rw in less than
+					-- 3 seconds we notice it immediately
+					pcall(box.ctl.wait_rw, 3)
+				end
+
+				-- after waiting to become rw we step into the fencing loop;
+				-- we must check that we are still in our code generation to proceed
+				if not in_my_gen() then return end
+
+				-- we will not step down until the deadline
+				local deadline = fiber.time()+fencing_timeout
+				repeat
+					-- before the ETCD check we pause for a slightly randomized
+					-- interval, so replicas do not spam ETCD in lockstep
+					fiber.sleep(math.random(math.max(0.5, fencing_pause-0.5), fencing_pause+0.5))
+					-- after each yield we have to check that we are still in our generation
+					if not in_my_gen() then return end
+
+					-- someone made us read-only; there is no need to check ETCD,
+					-- we break out of this loop immediately
+					if box.info.ro then break end
+
+					-- if fencing_check(deadline) returns true,
+					-- we renew the leadership lease
+					if fencing_check(deadline) then
+						-- update the deadline:
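+						-- (a successful check grants another fencing_timeout seconds of
+						--  leadership; failed checks simply let the deadline run out)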
+ deadline = fiber.time()+fencing_timeout + end + + if not in_my_gen() then return end + until box.info.ro or fiber.time() > deadline + + -- We have left deadline-loop. It means that fencing is required + if not box.info.ro then + log.warn('[fencing] Performing self fencing (box.cfg{read_only=true})') + box.cfg{read_only=true} + end + end + end) + end + return M end }) diff --git a/test/Dockerfile b/test/Dockerfile new file mode 100644 index 0000000..ac744bc --- /dev/null +++ b/test/Dockerfile @@ -0,0 +1,6 @@ +FROM tarantool/tarantool:2.10 +RUN apk add --no-cache -u iproute2 make bind-tools + +WORKDIR /opt/tarantool + +CMD ["tarantool" "/opt/tarantool/init.lua"] \ No newline at end of file diff --git a/test/app/conf.lua b/test/app/conf.lua new file mode 100644 index 0000000..044e133 --- /dev/null +++ b/test/app/conf.lua @@ -0,0 +1,19 @@ +etcd = { + instance_name = os.getenv("TT_INSTANCE_NAME"), + prefix = '/instance', + endpoints = {"http://etcd:2379"}, + fencing_enabled = true, +} + +box = { + background = false, + log_level = 6, + log_format = 'plain', + + memtx_dir = '/var/lib/tarantool/snaps/', + wal_dir = '/var/lib/tarantool/xlogs', +} + +app = { + +} \ No newline at end of file diff --git a/test/app/init.lua b/test/app/init.lua new file mode 100644 index 0000000..295817d --- /dev/null +++ b/test/app/init.lua @@ -0,0 +1,45 @@ +local fiber = require "fiber" + +require 'config' { + mkdir = true, + print_config = true, + instance_name = os.getenv("TT_INSTANCE_NAME"), + file = 'conf.lua', + master_selection_policy = 'etcd.cluster.master', + + on_after_cfg = function() + if not box.info.ro then + box.schema.user.grant('guest', 'super', nil, nil, { if_not_exists = true }) + + box.schema.space.create('T', {if_not_exists = true}) + box.space.T:create_index('I', { if_not_exists = true }) + end + end, +} + +fiber.create(function() + fiber.name('pusher') + + while true do + repeat + pcall(box.ctl.wait_rw, 3) + fiber.testcancel() + until not box.info.ro + + local fibers = {} + for _ = 1, 10 do + local f = fiber.create(function() + fiber.self():set_joinable(true) + for i = 1, 100 do + box.space.T:replace{i, box.info.id, box.info.vclock} + end + end) + table.insert(fibers, f) + end + + for _, f in ipairs(fibers) do + f:join() + end + end +end) + diff --git a/test/docker-compose.yml b/test/docker-compose.yml new file mode 100644 index 0000000..4833b68 --- /dev/null +++ b/test/docker-compose.yml @@ -0,0 +1,60 @@ +version: "3" + +x-etcd: &etcd + image: quay.io/coreos/etcd:v2.3.8 + container_name: etcd + networks: + - tarantool + environment: + ETCD_LISTEN_PEER_URLS: http://0.0.0.0:2380 + ETCD_LISTEN_CLIENT_URLS: http://0.0.0.0:2379 + ETCDCTL_API: 2 + ETCD_INITIAL_CLUSTER_TOKEN: etcd-cluster + ETCD_INITIAL_CLUSTER: etcd=http://etcd:2380 + ETCD_NAME: etcd + ETCD_ADVERTISE_CLIENT_URLS: http://etcd:2379 + ETCD_INITIAL_ADVERTISE_PEER_URLS: http://etcd:2380 + +x-tt: &tt + build: . 
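+  # the repository root is mounted below as the rocks tree, so the containers
+  # run the working-tree version of config.lua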
+  volumes:
+    - $PWD/../:/opt/tarantool/.rocks/share/tarantool:ro
+    - $PWD/app:/opt/tarantool
+    - $PWD/net:/opt/tarantool/net:ro
+  depends_on:
+    etcd:
+      condition: service_started
+  privileged: true
+  networks:
+    - tarantool
+  command: ["/bin/sh", "-c", "sleep 5 && tarantool /opt/tarantool/init.lua"]
+
+networks:
+  tarantool:
+    name: tt_net
+    driver: bridge
+
+services:
+  etcd:
+    <<: *etcd
+  etcd_load:
+    image: registry.gitlab.com/ochaton/switchover:010a6965
+    networks:
+      - tarantool
+    volumes:
+      - $PWD/instance.etcd.yaml:/instance.etcd.yaml:ro
+    depends_on:
+      etcd:
+        condition: service_started
+    entrypoint: ['']
+    command: ["/bin/sh", "-c", "sleep 3 && switchover -v -e http://etcd:2379 etcd load / /instance.etcd.yaml"]
+  instance_01:
+    <<: *tt
+    container_name: instance_01
+    environment:
+      TT_INSTANCE_NAME: instance_01
+  instance_02:
+    <<: *tt
+    container_name: instance_02
+    environment:
+      TT_INSTANCE_NAME: instance_02
diff --git a/test/instance.etcd.yaml b/test/instance.etcd.yaml
new file mode 100644
index 0000000..3e0768b
--- /dev/null
+++ b/test/instance.etcd.yaml
@@ -0,0 +1,26 @@
+---
+instance:
+  clusters:
+    instance:
+      master: instance_01
+      replicaset_uuid: 91157a11-0001-0000-0000-000000000000
+  common:
+    etcd:
+      fencing_timeout: 5
+      fencing_pause: 3
+    box:
+      replication_connect_quorum: 1
+      log_level: 5
+      memtx_memory: 268435456
+  instances:
+    instance_01:
+      cluster: instance
+      box:
+        instance_uuid: 91157a11-0000-0001-0000-000000000000
+        listen: instance_01:3301
+    instance_02:
+      cluster: instance
+      box:
+        instance_uuid: 91157a11-0000-0002-0000-000000000000
+        listen: instance_02:3302
+...
diff --git a/test/net/Makefile b/test/net/Makefile
new file mode 100644
index 0000000..84df839
--- /dev/null
+++ b/test/net/Makefile
@@ -0,0 +1,22 @@
+setup:
+	tc qdisc add dev eth0 root handle 1: prio
+	tc qdisc add dev eth0 parent 1:3 handle 10: netem loss 100%
+
+offline-dport-%:
+	tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dport $* 0xffff flowid 1:3
+
+offline-dst-%:
+	tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dst $(shell host -T4 $* | cut -f 4 -d' ') flowid 1:3
+
+online:
+	tc filter del dev eth0 parent 1: protocol ip pref 1 u32
+
+filter:
+	tc -s -d filter show dev eth0
+
+qdisc:
+	tc -d -s qdisc show dev eth0
+
+clear:
+	tc filter del dev eth0 parent 1:
+	tc qdisc del dev eth0 root
diff --git a/test/net/README.md b/test/net/README.md
new file mode 100644
index 0000000..cadbba9
--- /dev/null
+++ b/test/net/README.md
@@ -0,0 +1,81 @@
+# Split-Brain test toolchain
+
+## Run
+
+```bash
+$ pwd
+config/test
+
+$ docker compose up --build
+```
+
+## Prepare
+
+### Prepare instance_01
+
+```bash
+docker exec -it instance_01 /bin/sh
+
+# make setup must be executed only once per container
+/opt/tarantool $ make -C net setup
+```
+
+### Prepare instance_02
+
+```bash
+docker exec -it instance_02 /bin/sh
+
+# make setup must be executed only once per container
+/opt/tarantool $ make -C net setup
+```
+
+## Make online
+
+```bash
+docker exec -it instance_01 /bin/sh
+
+/opt/tarantool $ make -C net online
+```
+
+## Isolation
+
+### Isolate instance_01 against instance_02
+
+```bash
+docker exec -it instance_01 /bin/sh
+
+/opt/tarantool $ make -C net offline-dst-instance_02
+```
+
+### Isolate instance_01 against etcd
+
+```bash
+docker exec -it instance_01 /bin/sh
+
+/opt/tarantool $ make -C net offline-dst-etcd
+```
+
+### Total instance_01 isolation
+
+```bash
+docker exec -it instance_01 /bin/sh
+
+/opt/tarantool $ make -C net offline-dst-instance_02
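+# add the etcd filter on top of the previous one for full isolation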
+/opt/tarantool $ make -C net offline-dst-etcd +``` + +### Split brain instance_01 / instance_02 + +```bash +docker exec -it instance_01 /bin/sh + +/opt/tarantool $ make -C net offline-dst-instance_02 +/opt/tarantool $ make -C net offline-dst-autofailover-2 +``` + +```bash +docker exec -it instance_02 /bin/sh + +/opt/tarantool $ make -C net offline-dst-instance_01 +/opt/tarantool $ make -C net offline-dst-autofailover-1 +```
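+
+## Observe fencing
+
+A quick way to check that fencing kicked in (assumes the compose setup above):
+after isolating the current master from etcd, its log should contain
+`[fencing] Performing self fencing (box.cfg{read_only=true})` within a few
+seconds (see `fencing_timeout` in `instance.etcd.yaml`).
+
+```bash
+docker logs -f instance_01 2>&1 | grep fencing
+```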