
auto step down when ETCD is not reachable
	* This patch introduces the background fiber config._fencing_f,
	  which is auto-enabled on rw nodes and watches ETCD
	* It is started after the on_after_cfg callback and watches the
	  /<prefix>/clusters/<shard-name>/master path in ETCD for
	  etcd.fencing_timeout (default: 10s)
	* The fiber is enabled only with the etcd.cluster.master topology and
	  only if the etcd.fencing_enabled flag is set (in conf.lua or in the
	  common config); a sample configuration is sketched below
	* Since network drops are indistinguishable from HTTP timeouts,
	  after each :wait() timeout fencing rechecks ETCD via the :list()
	  method
	* If nothing has changed (the 99% case) the fiber retries after
	  fencing_timeout
	* If another node is registered as master in ETCD, the node
	  automatically steps down (executes box.cfg{read_only=true}) and
	  waits to become master again
	* The fencing fiber never returns the node to rw (it never calls
	  box.cfg{read_only=false} or package.reload())
	* To bring the instance back to rw, manually call package.reload()
	  or use switchover.
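
For reference, a minimal conf.lua sketch that enables fencing, assembled from the test fixtures added in this commit (test/app/conf.lua and test/instance.etcd.yaml); the endpoint, instance name and tuning values below are placeholders, and master_selection_policy = 'etcd.cluster.master' is passed to require 'config' in init.lua:

etcd = {
    instance_name   = os.getenv("TT_INSTANCE_NAME"), -- placeholder
    prefix          = '/instance',
    endpoints       = {"http://etcd:2379"},          -- placeholder
    fencing_enabled = true,  -- start the config/fencing fiber on rw nodes
    fencing_timeout = 5,     -- leadership lease, in seconds (default: 10)
    fencing_pause   = 3,     -- pause before each ETCD check (default: fencing_timeout/2)
}

box = {
    log_level = 5,
}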
Vladislav Grubov committed Jan 17, 2023
1 parent 5c64a4c commit 78326ef
Showing 9 changed files with 967 additions and 33 deletions.
544 changes: 511 additions & 33 deletions README.md

Large diffs are not rendered by default.

197 changes: 197 additions & 0 deletions config.lua
@@ -3,6 +3,7 @@ local fio = require 'fio'
local json = require 'json'
local yaml = require 'yaml'
local digest = require 'digest'
local fiber = require 'fiber'
json.cfg{ encode_invalid_as_nil = true }

local function lookaround(fun)
@@ -922,6 +923,202 @@ local M
end
-- print(string.format("Box configured"))

local msp = config.get('sys.master_selection_policy')
if type(cfg.etcd) == 'table'
and config.get('etcd.fencing_enabled')
and msp == 'etcd.cluster.master'
and type(cfg.cluster) == 'string' and cfg.cluster ~= ''
then
M._fencing_f = fiber.create(function()
fiber.name('config/fencing')
fiber.yield() -- yield execution
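-- in_my_gen() is true only while this fiber is still the registered fencing
-- fiber (config._fencing_f); after a reconfiguration/package.reload() a new
-- fiber is created and this one must exit at its next check.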
local function in_my_gen() fiber.testcancel() return config._fencing_f == fiber.self() end
assert(cfg.cluster, "cfg.cluster must be defined")

local watch_path = fio.pathjoin(
config.get('etcd.prefix'),
'clusters',
cfg.cluster
)

local my_name = assert(config.get('sys.instance_name'), "instance_name is not defined")
local fencing_timeout = config.get('etcd.fencing_timeout', 10)
local fencing_pause = config.get('etcd.fencing_pause', fencing_timeout/2)
local fencing_check_replication = config.get('etcd.fencing_check_replication')
if type(fencing_check_replication) == 'string' then
fencing_check_replication = fencing_check_replication == 'true'
else
fencing_check_replication = fencing_check_replication == true
end

local etcd_cluster, watch_index

local function refresh_list()
local result, resp = config.etcd:list(watch_path)
if resp.status == 200 then
etcd_cluster = result
if type(resp.headers) == 'table'
and tonumber(resp.headers['x-etcd-index'])
and tonumber(resp.headers['x-etcd-index']) > (tonumber(watch_index) or 0)
then
watch_index = tonumber(resp.headers['x-etcd-index'])
end
end
return etcd_cluster, watch_index
end

local function fencing_check(deadline)
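-- Returns true when this node's leadership is confirmed (the caller then
-- extends the lease deadline); false otherwise. A false result does not
-- step the node down by itself - it just lets the lease expire.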
local timeout = math.min(deadline-fiber.time(), fencing_timeout)
local check_started = fiber.time()
local pcall_ok, err_or_resolution, new_cluster = pcall(function()
local not_timed_out, response = config.etcd:wait(watch_path, {
index = watch_index,
timeout = timeout,
})

-- HTTP timed out or our network dropped - we will never know which
if not not_timed_out then return 'timeout' end
local res = json.decode(response.body)

if type(response.headers) == 'table'
and tonumber(response.headers['x-etcd-index'])
and tonumber(response.headers['x-etcd-index']) > watch_index
then
watch_index = tonumber(response.headers['x-etcd-index'])
end

if res.node then
return 'changed', config.etcd:recursive_extract(watch_path, res.node)
end
end)

if not pcall_ok then
log.warn("ETCD watch failed: %s", err_or_resolution)
end

if err_or_resolution ~= 'changed' then
new_cluster = nil
end

if not new_cluster then
deadline = deadline+fencing_timeout
while fiber.time() < deadline and in_my_gen() do
local ok, e_cluster = pcall(refresh_list)
if ok and e_cluster then
new_cluster = e_cluster
break
end
if not in_my_gen() then return end
fiber.sleep(fencing_pause / 10)
end
end

if not in_my_gen() then return end

if type(new_cluster) ~= 'table' then -- ETCD is down
log.warn('[fencing] ETCD %s is not discovered in etcd during %s seconds',
watch_path, fiber.time()-check_started)

if not fencing_check_replication then
return false
end

-- With strict fencing we would have to step down immediately as soon as we
-- discover that the coordinator is down. But in the real world the coordinator
-- can be unreachable for several seconds when someone breaks the network
-- or ETCD itself.
-- We assume it is safe not to step down as long as we are connected to all
-- replicas in the replicaset (etcd.cluster.master is a full-mesh topology).
-- We do not check downstreams here, because downstreams cannot lead to collisions.
-- If at least 1 upstream is not in status "follow"
-- (Tarantool replication health-checks it once per box.cfg.replication_timeout)
-- we step down immediately.
for _, ru in pairs(box.info.replication) do
if ru.id ~= box.info.id and ru.upstream then
if ru.upstream.status ~= "follow" then
log.warn("[fencing] upstream %s is not followed by me %s:%s (idle: %s, lag:%s)",
ru.upstream.peer, ru.upstream.status, ru.upstream.message,
ru.upstream.idle, ru.upstream.lag
)
return false
end
end
end

log.warn('[fencing] ETCD is down but all upstreams are followed by me. Continuing leadership')
return true
elseif new_cluster.master == my_name then
-- The most common branch. We are registered as the leader.
return true
elseif new_cluster.switchover then -- new_cluster.master ~= my_name
-- Another instance is the leader in ETCD. But we could be the one
-- who becomes the next leader (the cluster is being switched right now).
-- It is almost impossible to hit this path in production, and the only
-- protection we have is `fencing_pause` and `fencing_timeout`.
-- So we do nothing while the ETCD switchover mutex is present
log.warn('[fencing] It seems that cluster is under switchover right now %s', json.encode(new_cluster))
-- (if we are ro -- then we must end the loop)
-- (if we are rw -- then we must continue the loop)
return not box.info.ro
else
log.warn('[fencing] ETCD %s/master is %s not us. Stepping down', watch_path, new_cluster.master)
return false
end
end

if not pcall(refresh_list) then
log.warn("etcd list failed")
end
log.info("etcd_master is %s (index: %s)", json.encode(etcd_cluster), watch_index)

-- Main fencing loop.
-- It is executed on every replica in the shard;
-- if the instance is ro it waits until the instance becomes rw
while in_my_gen() do
-- Wait until instance became rw loop
while box.info.ro and in_my_gen() do
-- this is just a fancy sleep:
-- if the node becomes rw in less than 3 seconds we notice it immediately
pcall(box.ctl.wait_rw, 3)
end

-- after waiting to become rw we step into the fencing loop;
-- we must check that we are still in our code generation
-- before proceeding
if not in_my_gen() then return end

-- we will not step down until deadline.
local deadline = fiber.time()+fencing_timeout
repeat
-- Pause a bit before the ETCD check;
-- the sleep is slightly randomized so we do not spam ETCD
fiber.sleep(math.random(math.max(0.5, fencing_pause-0.5), fencing_pause+0.5))
-- After each yield we have to check that we are still in our generation
if not in_my_gen() then return end

-- someone made us read-only; there is no need to check ETCD,
-- we break from this loop immediately
if box.info.ro then break end

-- if fencing_check(deadline) returns true,
-- we extend the leadership lease
if fencing_check(deadline) then
-- update deadline.
deadline = fiber.time()+fencing_timeout
end

if not in_my_gen() then return end
until box.info.ro or fiber.time() > deadline

-- We have left the deadline loop, which means fencing is required
if not box.info.ro then
log.warn('[fencing] Performing self fencing (box.cfg{read_only=true})')
box.cfg{read_only=true}
end
end
end)
end

return M
end
})
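
Since the fencing fiber only ever demotes the node, recovery is a manual step. A hedged sketch of how an operator might bring a fenced instance back to rw from its admin console, assuming ETCD names this instance as master again (for example after switching back with switchover):

-- on the fenced instance's console:
package.reload()
-- assumption: package.reload() re-runs the config module, which re-reads
-- etcd.cluster.master and lifts read_only only if ETCD lists this instance
-- as the current master; otherwise the node simply stays ro.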
6 changes: 6 additions & 0 deletions test/Dockerfile
@@ -0,0 +1,6 @@
FROM tarantool/tarantool:2.10
RUN apk add --no-cache -u iproute2 make bind-tools

WORKDIR /opt/tarantool

CMD ["tarantool" "/opt/tarantool/init.lua"]
19 changes: 19 additions & 0 deletions test/app/conf.lua
@@ -0,0 +1,19 @@
etcd = {
instance_name = os.getenv("TT_INSTANCE_NAME"),
prefix = '/instance',
endpoints = {"http://etcd:2379"},
fencing_enabled = true,
}

box = {
background = false,
log_level = 6,
log_format = 'plain',

memtx_dir = '/var/lib/tarantool/snaps/',
wal_dir = '/var/lib/tarantool/xlogs',
}

app = {

}
45 changes: 45 additions & 0 deletions test/app/init.lua
@@ -0,0 +1,45 @@
local fiber = require "fiber"

require 'config' {
mkdir = true,
print_config = true,
instance_name = os.getenv("TT_INSTANCE_NAME"),
file = 'conf.lua',
master_selection_policy = 'etcd.cluster.master',

on_after_cfg = function()
if not box.info.ro then
box.schema.user.grant('guest', 'super', nil, nil, { if_not_exists = true })

box.schema.space.create('T', {if_not_exists = true})
box.space.T:create_index('I', { if_not_exists = true })
end
end,
}

fiber.create(function()
fiber.name('pusher')

while true do
repeat
pcall(box.ctl.wait_rw, 3)
fiber.testcancel()
until not box.info.ro

local fibers = {}
for _ = 1, 10 do
local f = fiber.create(function()
fiber.self():set_joinable(true)
for i = 1, 100 do
box.space.T:replace{i, box.info.id, box.info.vclock}
end
end)
table.insert(fibers, f)
end

for _, f in ipairs(fibers) do
f:join()
end
end
end)

60 changes: 60 additions & 0 deletions test/docker-compose.yml
@@ -0,0 +1,60 @@
version: "3"

x-etcd: &etcd
image: quay.io/coreos/etcd:v2.3.8
container_name: etcd
networks:
- tarantool
environment:
ETCD_LISTEN_PEER_URLS: http://0.0.0.0:2380
ETCD_LISTEN_CLIENT_URLS: http://0.0.0.0:2379
ETCDCTL_API: 2
ETCD_INITIAL_CLUSTER_TOKEN: etcd-cluster
ETCD_INITIAL_CLUSTER: etcd=http://etcd:2380
ETCD_NAME: etcd
ETCD_ADVERTISE_CLIENT_URLS: http://etcd:2379
ETCD_INITIAL_ADVERTISE_PEER_URLS: http://etcd:2380

x-tt: &tt
build: .
volumes:
- $PWD/../:/opt/tarantool/.rocks/share/tarantool:ro
- $PWD/app:/opt/tarantool
- $PWD/net:/opt/tarantool/net:ro
depends_on:
etcd:
condition: service_started
privileged: true
networks:
- tarantool
command: ["/bin/sh", "-c", "sleep 5 && tarantool /opt/tarantool/init.lua"]

networks:
tarantool:
name: tt_net
driver: bridge

services:
etcd:
<<: *etcd
etcd_load:
image: registry.gitlab.com/ochaton/switchover:010a6965
networks:
- tarantool
volumes:
- $PWD/instance.etcd.yaml:/instance.etcd.yaml:ro
depends_on:
etcd:
condition: service_started
entrypoint: ['']
command: ["/bin/sh", "-c", "sleep 3 && switchover -v -e http://etcd:2379 etcd load / /instance.etcd.yaml"]
instance_01:
<<: *tt
container_name: instance_01
environment:
TT_INSTANCE_NAME: instance_01
instance_02:
<<: *tt
container_name: instance_02
environment:
TT_INSTANCE_NAME: instance_02
26 changes: 26 additions & 0 deletions test/instance.etcd.yaml
@@ -0,0 +1,26 @@
---
instance:
clusters:
instance:
master: instance_01
replicaset_uuid: 91157a11-0001-0000-0000-000000000000
common:
etcd:
fencing_timeout: 5
fencing_pause: 3
box:
replication_connect_quorum: 1
log_level: 5
memtx_memory: 268435456
instances:
instance_01:
cluster: instance
box:
instance_uuid: 91157a11-0000-0001-0000-000000000000
listen: instance_01:3301
instance_02:
cluster: instance
box:
instance_uuid: 91157a11-0000-0002-0000-000000000000
listen: instance_02:3302
...
22 changes: 22 additions & 0 deletions test/net/Makefile
@@ -0,0 +1,22 @@
setup:
tc qdisc add dev eth0 root handle 1: prio
tc qdisc add dev eth0 parent 1:3 handle 10: netem loss 100%

offline-dport-%:
tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dport $* 0xffff flowid 1:3

offline-dst-%:
tc filter add dev eth0 parent 1: protocol ip prio 1 u32 match ip dst $(shell host -T4 $* | cut -f 4 -d' ') flowid 1:3

online:
tc filter del dev eth0 parent 1: protocol ip pref 1 u32

filter:
tc -s -d filter show dev eth0

qdisc:
tc -d -s qdisc show dev eth0

clear:
tc filter del dev eth0 parent 1:
tc qdisc del dev eth0 root
