Skip to content

Commit

Permalink
fix(vault): let shdict secret vault cache presist enough time during …
Browse files Browse the repository at this point in the history
…resurrect_ttl (#13471)

This PR fixes an issue that rotate_secret may flush a secret value with NEGATIVE_CACHED_VALUE when vault backend is down and a secret value stored in the shared dict has passed its ttl and hasn't finished consuming its resurrect_ttl.

TLDR; this issue happens easily when a reference is being used via the vault PDK function in custom codes(serverless functions, custom plugins, etc.), and some of the worker processes may not be triggered via the service/routes that use these custom codes, and these worker processes do not hold a valid LRU cache for the secret value

The issue was first reported in FTI-6137.

---------

Signed-off-by: Aapo Talvensaari <aapo.talvensaari@gmail.com>
Co-authored-by: Aapo Talvensaari <aapo.talvensaari@gmail.com>
(cherry picked from commit 9269195)
  • Loading branch information
windmgc committed Aug 23, 2024
1 parent d190bcf commit ddcfaa4
Show file tree
Hide file tree
Showing 4 changed files with 202 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
message: Fixed an issue where the Vault secret cache got refreshed during `resurrect_ttl` time and could not be fetched by other workers.
type: bugfix
scope: Core
24 changes: 21 additions & 3 deletions kong/pdk/vault.lua
Original file line number Diff line number Diff line change
Expand Up @@ -822,9 +822,20 @@ local function new(self)
-- @tparam table parsed_reference the parsed reference
-- @treturn string|nil the retrieved value from the vault, of `nil`
-- @treturn string|nil a string describing an error if there was one
-- @treturn boolean|nil whether to resurrect value in case vault errors or doesn't return value
-- @usage local value, err = get_from_vault(reference, strategy, config, cache_key, parsed_reference)
local function get_from_vault(reference, strategy, config, cache_key, parsed_reference)
local function get_from_vault(reference, strategy, config, cache_key, parsed_reference, resurrect)
local value, err, ttl = invoke_strategy(strategy, config, parsed_reference)
if resurrect and value == nil then
local resurrected_value = SECRETS_CACHE:get(cache_key)
if resurrected_value then
return resurrected_value

else
return nil, fmt("could not get value from external vault (%s)", err)
end
end

local cache_value, shdict_ttl, lru_ttl = get_cache_value_and_ttl(value, config, ttl)
local ok, cache_err = SECRETS_CACHE:safe_set(cache_key, cache_value, shdict_ttl)
if not ok then
Expand Down Expand Up @@ -1267,18 +1278,25 @@ local function new(self)
-- If the TTL is still greater than the resurrect time
-- we don't have to rotate the secret, except it if it
-- negatively cached.
local resurrect
local ttl = SECRETS_CACHE:ttl(new_cache_key)
if ttl and SECRETS_CACHE:get(new_cache_key) ~= NEGATIVELY_CACHED_VALUE then
local resurrect_ttl = max(config.resurrect_ttl or DAO_MAX_TTL, SECRETS_CACHE_MIN_TTL)
-- the secret is still within ttl, no need to refresh
if ttl > resurrect_ttl then
return true
end

-- the secret is still within resurrect ttl time, so when we try to refresh the secret
-- we do not forciblly override it with a negative value, so that the cached value
-- can be resurrected
resurrect = ttl > SECRETS_CACHE_MIN_TTL
end

strategy = caching_strategy(strategy, config_hash)

-- we should refresh the secret at this point
local ok, err = get_from_vault(reference, strategy, config, new_cache_key, parsed_reference)
-- try to refresh the secret, according to the remaining time the cached value may or may not be refreshed.
local ok, err = get_from_vault(reference, strategy, config, new_cache_key, parsed_reference, resurrect)
if not ok then
return nil, fmt("could not retrieve value for reference %s (%s)", reference, err)
end
Expand Down
139 changes: 139 additions & 0 deletions spec/02-integration/13-vaults/07-resurrect_spec.lua
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ local LUA_PATH = CUSTOM_VAULTS .. "/?.lua;" ..
local DUMMY_HEADER = "Dummy-Plugin"
local fmt = string.format

local json = require "cjson"



--- A vault test harness is a driver for vault backends, which implements
Expand Down Expand Up @@ -48,6 +50,9 @@ local fmt = string.format
--- fixtures() output is passed directly to `helpers.start_kong()`
---@field fixtures fun(self: vault_test_harness):table|nil
---
--- pause() is exactly what you'd expect
---@field pause fun(self: vault_test_harness)
---
---
---@field prefix string # generated by the test suite
---@field host string # generated by the test suite
Expand Down Expand Up @@ -88,6 +93,10 @@ local VAULTS = {
}
}
end,

pause = function(_)
return test_vault.client.pause()
end,
},
}

Expand All @@ -103,6 +112,7 @@ for _, vault in ipairs(VAULTS) do
vault.setup = vault.setup or noop
vault.teardown = vault.teardown or noop
vault.fixtures = vault.fixtures or noop
vault.pause = vault.pause or noop
end


Expand Down Expand Up @@ -232,5 +242,134 @@ describe("vault resurrect_ttl and rotation (#" .. strategy .. ") #" .. vault.nam
end)


describe("#multiworker vault resurrect_ttl and rotation (#" .. strategy .. ") #" .. vault.name, function()
local client, admin_client
local secret = "my-secret"

lazy_setup(function()
helpers.setenv("KONG_LUA_PATH_OVERRIDE", LUA_PATH)
helpers.setenv("KONG_VAULT_ROTATION_INTERVAL", "1")

vault:setup()
vault:create_secret(secret, "init")

local bp = helpers.get_db_utils(strategy,
{ "vaults", "routes", "services", "plugins" },
{ "dummy" },
{ vault.name })


assert(bp.vaults:insert({
name = vault.name,
prefix = vault.prefix,
config = vault.config,
}))

local route = assert(bp.routes:insert({
name = vault.host,
hosts = { vault.host },
paths = { "/" },
service = assert(bp.services:insert()),
}))


assert(bp.plugins:insert({
name = "post-function",
config = {
access = {fmt([[
local value, err = kong.vault.get("{vault://%s/%s?ttl=%d&resurrect_ttl=%d}")
if value then
kong.response.exit(200, {["value"]=value, ["pid"]=ngx.worker.pid()}, {["Content-Type"]="application/json"})
end
]], vault.prefix, secret, 2, 5),}
},
route = { id = route.id },
}))

assert(helpers.start_kong({
database = strategy,
nginx_conf = "spec/fixtures/custom_nginx.template",
vaults = vault.name,
plugins = "post-function",
log_level = "debug",
dedicated_config_processing = false,
-- nginx_worker_processes = 2,
nginx_main_worker_processes = 2,
}, nil, nil, vault:fixtures() ))

client = helpers.proxy_client()
admin_client = helpers.admin_client()
end)


lazy_teardown(function()
if client then
client:close()
end

if admin_client then
admin_client:close()
end

helpers.stop_kong()
vault:teardown()

helpers.unsetenv("KONG_LUA_PATH_OVERRIDE")
end)


it("resurrects secret value from shared dict when secret is deleted (backend: #" .. vault.name .. ")", function()
-- fetch all worker pids
local status_ret = admin_client:get("/")
local body = assert.res_status(200, status_ret)
local json_body = json.decode(body)
assert.truthy(json_body)
local worker_pids = json_body.pids.workers
assert.truthy(#worker_pids == 2)

local worker_secret_hits = {}
for _, worker_pid in ipairs(worker_pids) do
worker_secret_hits[tostring(worker_pid)] = false
end

vault:update_secret(secret, "old", { ttl = 2, resurrect_ttl = 5 })

-- trigger post-function in one of the workers
local res = client:get("/", {headers = {host = assert(vault.host)}})
local body = assert.res_status(200, res)
local json_body = json.decode(body)
assert.same("old", json_body.value)
worker_secret_hits[tostring(json_body.pid)] = true

vault:pause()

-- let ttl pass and try to trigger post-function in all workers
-- check all of them can resurrect the secret from shared dict
ngx.sleep(3)

assert.with_timeout(5).with_step(0.1).eventually(
function()
-- avoid connection reuse so that we can hit all workers
local new_client = helpers.proxy_client()
local res = new_client:get("/", {headers = {host = assert(vault.host)}})
local body = assert.res_status(200, res)
local json_body = json.decode(body)
new_client:close()
assert.same("old", json_body.value)
worker_secret_hits[tostring(json_body.pid)] = true

for k, v in pairs(worker_secret_hits) do
if not v then
return false, "worker pid " .. k .. " did not hit the secret"
end
end

return true
end
).is_truthy("expected all workers to resurrect the secret from shared dict")
end)
end)


end -- each vault backend
end -- each strategy
39 changes: 39 additions & 0 deletions spec/fixtures/custom_vaults/kong/vaults/test/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,24 @@ function test.init()
end


function test.pause()
local shm = ngx.shared[test.SHM_NAME]
shm:set("paused", true)
return kong.response.exit(200, { message = "succeed" })
end


function test.is_running()
local shm = ngx.shared[test.SHM_NAME]
return shm:get("paused") ~= true
end


function test.get(conf, resource, version)
if not test.is_running() then
return nil, "Vault server paused"
end

local secret = get_from_shm(resource, version)

kong.log.inspect({
Expand Down Expand Up @@ -92,6 +109,10 @@ end


function test.api()
if not test.is_running() then
return kong.response.exit(503, { message = "Vault server paused" })
end

local shm = assert(ngx.shared[test.SHM_NAME])
local secret = assert(ngx.var.secret)
local args = assert(kong.request.get_query())
Expand Down Expand Up @@ -200,6 +221,18 @@ function test.client.get(secret, version)
end


function test.client.pause()
local client = assert(http.new())

local uri = fmt("http://127.0.0.1:%d/pause", test.PORT)

local res, err = client:request_uri(uri, { method = "GET" })
assert(err == nil, "failed GET " .. uri .. ": " .. tostring(err))

return cjson.decode(res.body)
end


test.http_mock = [[
lua_shared_dict ]] .. test.SHM_NAME .. [[ 5m;
Expand All @@ -212,6 +245,12 @@ test.http_mock = [[
require("kong.vaults.test").api()
}
}
location ~^/pause {
content_by_lua_block {
require("kong.vaults.test").pause()
}
}
}
]]

Expand Down

0 comments on commit ddcfaa4

Please sign in to comment.