
auto step down when ETCD is not reachable
	* This patch introduces the background fiber config._fencing_f,
	  which is auto-enabled on rw nodes and watches ETCD
	* It is started after the on_after_cfg callback and watches the
	  /<prefix>/clusters/<shard-name>/master path in ETCD for
	  etcd.fencing_timeout (default: 10s)
	* The fiber is enabled only with the etcd.cluster.master topology and
	  only if the etcd.fencing_enabled flag is set (in conf.lua or in the
	  common config); a sample configuration is sketched below
	* Since network drops are indistinguishable from HTTP timeouts,
	  after each :wait() timeout fencing rechecks ETCD via the :list()
	  method
	* If nothing has changed (the 99% case) the fiber retries after
	  fencing_timeout
	* If another node is registered as master in ETCD, the node
	  automatically steps down (executes box.cfg{read_only=true}) and
	  waits to become master again
	* The fencing fiber never returns the node to rw (it never calls
	  box.cfg{read_only=false} or package.reload())
	* To bring the instance back to rw, manually call package.reload()
	  or use switchover.
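
For reference, a minimal conf.lua sketch that enables fencing, assembled from the test fixtures added in this commit (test/app/conf.lua and test/instance.etcd.yaml); the endpoint, instance name and tuning values below are placeholders, and master_selection_policy = 'etcd.cluster.master' is passed to require 'config' in init.lua:

etcd = {
    instance_name   = os.getenv("TT_INSTANCE_NAME"), -- placeholder
    prefix          = '/instance',
    endpoints       = {"http://etcd:2379"},          -- placeholder
    fencing_enabled = true,  -- start the config/fencing fiber on rw nodes
    fencing_timeout = 5,     -- leadership lease, in seconds (default: 10)
    fencing_pause   = 3,     -- pause before each ETCD check (default: fencing_timeout/2)
}

box = {
    log_level = 5,
}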
Vladislav Grubov committed Jan 17, 2023
1 parent 5c64a4c commit 78326ef
Showing 9 changed files with 967 additions and 33 deletions.
544 changes: 511 additions & 33 deletions README.md

Large diffs are not rendered by default.

197 changes: 197 additions & 0 deletions config.lua
@@ -3,6 +3,7 @@ local fio = require 'fio'
local json = require 'json'
local yaml = require 'yaml'
local digest = require 'digest'
local fiber = require 'fiber'
json.cfg{ encode_invalid_as_nil = true }

local function lookaround(fun)
@@ -922,6 +923,202 @@ local M
end
-- print(string.format("Box configured"))

local msp = config.get('sys.master_selection_policy')
if type(cfg.etcd) == 'table'
and config.get('etcd.fencing_enabled')
and msp == 'etcd.cluster.master'
and type(cfg.cluster) == 'string' and cfg.cluster ~= ''
then
M._fencing_f = fiber.create(function()
fiber.name('config/fencing')
fiber.yield() -- yield execution
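-- in_my_gen() is true only while this fiber is still the registered fencing
-- fiber (config._fencing_f); after a reconfiguration/package.reload() a new
-- fiber is created and this one must exit at its next check.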
local function in_my_gen() fiber.testcancel() return config._fencing_f == fiber.self() end
assert(cfg.cluster, "cfg.cluster must be defined")

local watch_path = fio.pathjoin(
config.get('etcd.prefix'),
'clusters',
cfg.cluster
)

local my_name = assert(config.get('sys.instance_name'), "instance_name is not defined")
local fencing_timeout = config.get('etcd.fencing_timeout', 10)
local fencing_pause = config.get('etcd.fencing_pause', fencing_timeout/2)
local fencing_check_replication = config.get('etcd.fencing_check_replication')
if type(fencing_check_replication) == 'string' then
fencing_check_replication = fencing_check_replication == 'true'
else
fencing_check_replication = fencing_check_replication == true
end

local etcd_cluster, watch_index

local function refresh_list()
local result, resp = config.etcd:list(watch_path)
if resp.status == 200 then
etcd_cluster = result
if type(resp.headers) == 'table'
and tonumber(resp.headers['x-etcd-index'])
and tonumber(resp.headers['x-etcd-index']) > (tonumber(watch_index) or 0)
then
watch_index = tonumber(resp.headers['x-etcd-index'])
end
end
return etcd_cluster, watch_index
end

local function fencing_check(deadline)
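-- Returns true when this node's leadership is confirmed (the caller then
-- extends the lease deadline); false otherwise. A false result does not
-- step the node down by itself - it just lets the lease expire.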
local timeout = math.min(deadline-fiber.time(), fencing_timeout)
local check_started = fiber.time()
local pcall_ok, err_or_resolution, new_cluster = pcall(function()
local not_timed_out, response = config.etcd:wait(watch_path, {
index = watch_index,
timeout = timeout,
})

-- HTTP timed out or our network dropped - we will never know which
if not not_timed_out then return 'timeout' end
local res = json.decode(response.body)

if type(response.headers) == 'table'
and tonumber(response.headers['x-etcd-index'])
and tonumber(response.headers['x-etcd-index']) > watch_index
then
watch_index = tonumber(response.headers['x-etcd-index'])
end

if res.node then
return 'changed', config.etcd:recursive_extract(watch_path, res.node)
end
end)

if not pcall_ok then
log.warn("ETCD watch failed: %s", err_or_resolution)
end

if err_or_resolution ~= 'changed' then
new_cluster = nil
end

if not new_cluster then
deadline = deadline+fencing_timeout
while fiber.time() < deadline and in_my_gen() do
local ok, e_cluster = pcall(refresh_list)
if ok and e_cluster then
new_cluster = e_cluster
break
end
if not in_my_gen() then return end
fiber.sleep(fencing_pause / 10)
end
end

if not in_my_gen() then return end

if type(new_cluster) ~= 'table' then -- ETCD is down
log.warn('[fencing] ETCD %s is not discovered in etcd during %s seconds',
watch_path, fiber.time()-check_started)

if not fencing_check_replication then
return false
end

-- With strict fencing we would have to step down immediately as soon as we
-- discover that the coordinator is down. But in the real world the coordinator
-- can be unreachable for several seconds when someone breaks the network
-- or ETCD itself.
-- We assume it is safe not to step down as long as we are connected to all
-- replicas in the replicaset (etcd.cluster.master is a full-mesh topology).
-- We do not check downstreams here, because downstreams cannot lead to collisions.
-- If at least 1 upstream is not in status "follow"
-- (Tarantool replication health-checks it once per box.cfg.replication_timeout)
-- we step down immediately.
for _, ru in pairs(box.info.replication) do
if ru.id ~= box.info.id and ru.upstream then
if ru.upstream.status ~= "follow" then
log.warn("[fencing] upstream %s is not followed by me %s:%s (idle: %s, lag:%s)",
ru.upstream.peer, ru.upstream.status, ru.upstream.message,
ru.upstream.idle, ru.upstream.lag
)
return false
end
end
end

log.warn('[fencing] ETCD is down but all upstreams are followed by me. Continuing leadership')
return true
elseif new_cluster.master == my_name then
-- The most common branch. We are registered as the leader.
return true
elseif new_cluster.switchover then -- new_cluster.master ~= my_name
-- Another instance is the leader in ETCD. But we could be the one
-- who becomes the next leader (the cluster is being switched right now).
-- It is almost impossible to hit this path in production, and the only
-- protection we have is `fencing_pause` and `fencing_timeout`.
-- So we do nothing while the ETCD switchover mutex is present
log.warn('[fencing] It seems that cluster is under switchover right now %s', json.encode(new_cluster))
-- (if we are ro -- then we must end the loop)
-- (if we are rw -- then we must continue the loop)
return not box.info.ro
else
log.warn('[fencing] ETCD %s/master is %s not us. Stepping down', watch_path, new_cluster.master)
return false
end
end

if not pcall(refresh_list) then
log.warn("etcd list failed")
end
log.info("etcd_master is %s (index: %s)", json.encode(etcd_cluster), watch_index)

-- Main fencing loop.
-- It is executed on every replica in the shard;
-- if the instance is ro it waits until the instance becomes rw
while in_my_gen() do
-- Wait until instance became rw loop
while box.info.ro and in_my_gen() do
-- this is just a fancy sleep:
-- if the node becomes rw in less than 3 seconds we notice it immediately
pcall(box.ctl.wait_rw, 3)
end

-- after waiting to become rw we step into the fencing loop;
-- we must check that we are still in our code generation
-- before proceeding
if not in_my_gen() then return end

-- we will not step down until deadline.
local deadline = fiber.time()+fencing_timeout
repeat
-- Pause a bit before the ETCD check;
-- the sleep is slightly randomized so we do not spam ETCD
fiber.sleep(math.random(math.max(0.5, fencing_pause-0.5), fencing_pause+0.5))
-- After each yield we have to check that we are still in our generation
if not in_my_gen() then return end

-- someone made us read-only; there is no need to check ETCD,
-- we break from this loop immediately
if box.info.ro then break end

-- if fencing_check(deadline) returns true,
-- we extend the leadership lease
if fencing_check(deadline) then
-- update deadline.
deadline = fiber.time()+fencing_timeout
end

if not in_my_gen() then return end
until box.info.ro or fiber.time() > deadline

-- We have left the deadline loop, which means fencing is required
if not box.info.ro then
log.warn('[fencing] Performing self fencing (box.cfg{read_only=true})')
box.cfg{read_only=true}
end
end
end)
end

return M
end
})
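
Since the fencing fiber only ever demotes the node, recovery is a manual step. A hedged sketch of how an operator might bring a fenced instance back to rw from its admin console, assuming ETCD names this instance as master again (for example after switching back with switchover):

-- on the fenced instance's console:
package.reload()
-- assumption: package.reload() re-runs the config module, which re-reads
-- etcd.cluster.master and lifts read_only only if ETCD lists this instance
-- as the current master; otherwise the node simply stays ro.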
6 changes: 6 additions & 0 deletions test/Dockerfile
@@ -0,0 +1,6 @@
FROM tarantool/tarantool:2.10
RUN apk add --no-cache -u iproute2 make bind-tools

WORKDIR /opt/tarantool

CMD ["tarantool" "/opt/tarantool/init.lua"]
19 changes: 19 additions & 0 deletions test/app/conf.lua
@@ -0,0 +1,19 @@
etcd = {
instance_name = os.getenv("TT_INSTANCE_NAME"),
prefix = '/instance',
endpoints = {"http://etcd:2379"},
fencing_enabled = true,
}

box = {
background = false,
log_level = 6,
log_format = 'plain',

memtx_dir = '/var/lib/tarantool/snaps/',
wal_dir = '/var/lib/tarantool/xlogs',
}

app = {

}
45 changes: 45 additions & 0 deletions test/app/init.lua
@@ -0,0 +1,45 @@
local fiber = require "fiber"

require 'config' {
mkdir = true,
print_config = true,
instance_name = os.getenv("TT_INSTANCE_NAME"),
file = 'conf.lua',
master_selection_policy = 'etcd.cluster.master',

on_after_cfg = function()
if not box.info.ro then
box.schema.user.grant('guest', 'super', nil, nil, { if_not_exists = true })

box.schema.space.create('T', {if_not_exists = true})
box.space.T:create_index('I', { if_not_exists = true })
end
end,
}

fiber.create(function()
fiber.name('pusher')

while true do
repeat
pcall(box.ctl.wait_rw, 3)
fiber.testcancel()
until not box.info.ro

local fibers = {}
for _ = 1, 10 do
local f = fiber.create(function()
fiber.self():set_joinable(true)
for i = 1, 100 do
box.space.T:replace{i, box.info.id, box.info.vclock}
end
end)
table.insert(fibers, f)
end

for _, f in ipairs(fibers) do
f:join()
end
end
end)

60 changes: 60 additions & 0 deletions test/docker-compose.yml
@@ -0,0 +1,60 @@
version: "3"

x-etcd: &etcd
image: quay.io/coreos/etcd:v2.3.8
container_name: etcd
networks:
- tarantool
environment:
ETCD_LISTEN_PEER_URLS: http://0.0.0.0:2380
ETCD_LISTEN_CLIENT_URLS: http://0.0.0.0:2379
ETCDCTL_API: 2
ETCD_INITIAL_CLUSTER_TOKEN: etcd-cluster
ETCD_INITIAL_CLUSTER: etcd=http://etcd:2380
ETCD_NAME: etcd
ETCD_ADVERTISE_CLIENT_URLS: http://etcd:2379
ETCD_INITIAL_ADVERTISE_PEER_URLS: http://etcd:2380

x-tt: &tt
build: .
volumes:
- $PWD/../:/opt/tarantool/.rocks/share/tarantool:ro
- $PWD/app:/opt/tarantool
- $PWD/net:/opt/tarantool/net:ro
depends_on:
etcd:
condition: service_started
privileged: true
networks:
- tarantool
command: ["/bin/sh", "-c", "sleep 5 && tarantool /opt/tarantool/init.lua"]

networks:
tarantool:
name: tt_net
driver: bridge

services:
etcd:
<<: *etcd
etcd_load:
image: registry.gitlab.com/ochaton/switchover:010a6965
networks:
- tarantool
volumes:
- $PWD/instance.etcd.yaml:/instance.etcd.yaml:ro
depends_on:
etcd:
condition: service_started
entrypoint: ['']
command: ["/bin/sh", "-c", "sleep 3 && switchover -v -e http://etcd:2379 etcd load / /instance.etcd.yaml"]
instance_01:
<<: *tt
container_name: instance_01
environment:
TT_INSTANCE_NAME: instance_01
instance_02:
<<: *tt
container_name: instance_02
environment:
TT_INSTANCE_NAME: instance_02
26 changes: 26 additions & 0 deletions test/instance.etcd.yaml
@@ -0,0 +1,26 @@
---
instance:
clusters:
instance:
master: instance_01
replicaset_uuid: 91157a11-0001-0000-0000-000000000000
common:
etcd:
fencing_timeout: 5
fencing_pause: 3
box:
replication_connect_quorum: 1
log_level: 5
memtx_memory: 268435456
instances:
instance_01:
cluster: instance
box:
instance_uuid: 91157a11-0000-0001-0000-000000000000
listen: instance_01:3301
instance_02:
cluster: instance
box:
instance_uuid: 91157a11-0000-0002-0000-000000000000
listen: instance_02:3302
...
22 changes: 22 additions & 0 deletions test/net/Makefile
@@ -0,0 +1,22 @@
setup:
tc qdisc add dev eth0 root handle 1: prio
tc qdisc add dev eth0 parent 1:3 handle 10: netem loss 100%

offline-dport-%:
tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dport $* 0xffff flowid 1:3

offline-dst-%:
tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dst $(shell host -T4 $* | cut -f 4 -d' ') flowid 1:3

online:
tc filter del dev eth0 parent 1: protocol ip pref 1 u32

filter:
tc -s -d filter show dev eth0

qdisc:
tc -d -s qdisc show dev eth0

clear:
tc filter del dev eth0 parent 1:
tc qdisc del dev eth0 root
