Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apisix/cli/ngx_tpl.lua
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ lua {
lua_shared_dict status-report {* meta.lua_shared_dict["status-report"] *};
{% end %}
lua_shared_dict nacos 10m;
lua_shared_dict consul 10m;
lua_shared_dict upstream-healthcheck {* meta.lua_shared_dict["upstream-healthcheck"] *};
}

Expand Down
153 changes: 92 additions & 61 deletions apisix/discovery/consul/init.lua
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ local ngx_timer_at = ngx.timer.at
local ngx_timer_every = ngx.timer.every
local log = core.log
local json_delay_encode = core.json.delay_encode
local process = require("ngx.process")
local ngx_worker_id = ngx.worker.id
local exiting = ngx.worker.exiting
local thread_spawn = ngx.thread.spawn
Expand All @@ -42,16 +43,22 @@ local null = ngx.null
local type = type
local next = next

local all_services = core.table.new(0, 5)
local consul_dict = ngx.shared.consul
if not consul_dict then
error("lua_shared_dict \"consul\" not configured")
end

local default_service
local default_weight
local sort_type
local skip_service_map = core.table.new(0, 1)
local dump_params

local events
local events_list
local consul_services
-- per-worker cache: service_name -> {raw = "json string", nodes = {decoded table}}
-- returns the same table instance while the shared dict value is unchanged,
-- so that compare_upstream_node() fast-path (old_t == new_t) works
local nodes_cache = {}

local default_skip_services = {"consul"}
local default_random_range = 5
Expand All @@ -66,53 +73,81 @@ local _M = {
}


local function discovery_consul_callback(data, event, source, pid)
all_services = data
log.notice("update local variable all_services, event is: ", event,
"source: ", source, "server pid:", pid,
", all services: ", json_delay_encode(all_services, true))
end


--- Collect every Consul-discovered service from the shared dict.
-- Scans all keys in ngx.shared.consul and JSON-decodes each stored
-- node list. Entries that fail to decode are logged and omitted.
-- @treturn table map of service_name -> array of node tables
function _M.all_nodes()
    -- max_count = 0 means "no limit": fetch every key in the dict
    local keys = consul_dict:get_keys(0)
    local services = core.table.new(0, #keys)
    for _, key in ipairs(keys) do
        -- value may be nil if the key was deleted/evicted between
        -- get_keys() and get(); skip such races silently
        local value = consul_dict:get(key)
        if value then
            local nodes, err = core.json.decode(value)
            if nodes then
                services[key] = nodes
            else
                log.error("failed to decode nodes for service: ", key, ", error: ", err)
            end
        end
    end
    return services
end


function _M.nodes(service_name)
if not all_services then
log.error("all_services is nil, failed to fetch nodes for : ", service_name)
return
local value = consul_dict:get(service_name)
if not value then
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nodes() uses consul_dict:get(service_name). During reloads or refresh patterns that use flush_all(), get() returns nil even though stale values may still exist, causing avoidable 503s. Consider using get_stale() (and only falling back to default when both fresh+stale are missing). Also consider clearing nodes_cache[service_name] when the dict has no value to avoid retaining entries for removed services indefinitely.

Suggested change
if not value then
if value == nil then
-- try to use stale value during reloads or after flush_all()
value = consul_dict:get_stale(service_name)
end
if value == nil then
-- no fresh or stale value in shared dict; clear per-worker cache entry
nodes_cache[service_name] = nil

Copilot uses AI. Check for mistakes.
log.error("consul service not found: ", service_name, ", return default service")
return default_service and {default_service}
end

local resp_list = all_services[service_name]
local cached = nodes_cache[service_name]
if cached and cached.raw == value then
return cached.nodes
end

if not resp_list then
log.error("fetch nodes failed by ", service_name, ", return default service")
local nodes, err = core.json.decode(value)
if not nodes then
log.error("fetch nodes failed by ", service_name, ", error: ", err)
return default_service and {default_service}
end

log.info("process id: ", ngx_worker_id(), ", all_services[", service_name, "] = ",
json_delay_encode(resp_list, true))
nodes_cache[service_name] = {raw = value, nodes = nodes}

log.info("process id: ", ngx_worker_id(), ", [", service_name, "] = ",
json_delay_encode(nodes, true))

return resp_list
return nodes
Comment on lines 94 to +117
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nodes() now reads JSON from ngx.shared.consul and decodes it on every call, which makes it return a fresh table each time. In apisix/upstream.lua, this defeats the compare_upstream_node fast-path (old_t == new_t) and forces the slow path (sorting/comparing) on every request, which is both a performance regression and can change node ordering (the slow path sorts by host in-place). Consider caching decoded nodes per-worker (e.g., LRU keyed by service name + raw JSON string/version) and returning the same table instance while the shared-dict value is unchanged.

Copilot uses AI. Check for mistakes.
end


local function update_all_services(consul_server_url, up_services)
-- clean old unused data
-- write new/updated values first so readers never see a missing service
for k, v in pairs(up_services) do
local content, err = core.json.encode(v)
if content then
local ok, set_err, forcible = consul_dict:set(k, content)
if not ok then
log.error("failed to set nodes for service: ", k, ", error: ", set_err,
", please consider increasing lua_shared_dict consul size")
elseif forcible then
log.warn("consul shared dict is full, forcibly evicting items while ",
"setting nodes for service: ", k,
", please consider increasing lua_shared_dict consul size")
end
else
Comment on lines 121 to +135
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

update_all_services() deletes all previously-known keys before setting the new ones. While this runs, workers can observe a missing service_name in ngx.shared.consul and fall back to the default (or nil), causing transient request failures during an update window. A safer approach is to write/set new values first (optionally under a generation/version namespace) and only then delete obsolete keys, so reads never see an empty/missing service during an update.

Copilot uses AI. Check for mistakes.
log.error("failed to encode nodes for service: ", k, ", error: ", err)
end
end

-- then delete keys that are no longer present
local old_services = consul_services[consul_server_url] or {}
for k, _ in pairs(old_services) do
all_services[k] = nil
if not up_services[k] then
consul_dict:delete(k)
end
end
core.table.clear(old_services)

for k, v in pairs(up_services) do
all_services[k] = v
end
consul_services[consul_server_url] = up_services

log.info("update all services: ", json_delay_encode(all_services, true))
log.info("update all services to shared dict")
end


Expand Down Expand Up @@ -149,14 +184,21 @@ local function read_dump_services()
return
end

all_services = entity.services
log.info("load dump file into memory success")
for k, v in pairs(entity.services) do
local content, json_err = core.json.encode(v)
if content then
consul_dict:set(k, content)
else
log.error("failed to encode dump service: ", k, ", error: ", json_err)
end
end
log.info("load dump file into shared dict success")
Comment on lines +187 to +195
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

read_dump_services() writes services into ngx.shared.consul without clearing existing keys first. Since lua_shared_dict contents survive nginx reloads, this can retain stale keys (or temporarily overwrite newer values) when the dump file is loaded during startup/reload. Consider either clearing the dict (or using a generation/version namespace) before loading the dump so the shared dict reflects exactly the dump contents until Consul updates arrive.

Copilot uses AI. Check for mistakes.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

since the values are set using service keys, I don't think this should be of concern.

end


local function write_dump_services()
local entity = {
services = all_services,
services = _M.all_nodes(),
last_update = ngx.time(),
expire = dump_params.expire, -- later need handle it
Comment on lines 199 to 203
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

write_dump_services() now calls _M.all_nodes() which scans all shdict keys and JSON-decodes every service on each dump write. If dump is enabled and service count is high, this can add noticeable CPU overhead at each Consul update. Consider maintaining a privileged-agent-only in-memory snapshot for dump generation, or passing the updated snapshot into write_dump_services to avoid full scans/decodes.

Copilot uses AI. Check for mistakes.
}
Expand Down Expand Up @@ -556,14 +598,6 @@ function _M.connect(premature, consul_server, retry_delay)

update_all_services(consul_server.consul_server_url, up_services)

--update events
local post_ok, post_err = events:post(events_list._source,
events_list.updating, all_services)
if not post_ok then
log.error("post_event failure with ", events_list._source,
", update all services error: ", post_err)
end

if dump_params then
ngx_timer_at(0, write_dump_services)
end
Expand Down Expand Up @@ -611,35 +645,32 @@ end

function _M.init_worker()
local consul_conf = local_conf.discovery.consul
dump_params = consul_conf.dump

if consul_conf.dump then
local dump = consul_conf.dump
dump_params = dump

if dump.load_on_init then
read_dump_services()
end
end

events = require("apisix.events")
events_list = events:event_list(
"discovery_consul_update_all_services",
"updating"
)

if 0 ~= ngx_worker_id() then
events:register(discovery_consul_callback, events_list._source, events_list.updating)
return
end

log.notice("consul_conf: ", json_delay_encode(consul_conf, true))
default_weight = consul_conf.weight
sort_type = consul_conf.sort_type
-- set default service, used when the server node cannot be found
if consul_conf.default_service then
default_service = consul_conf.default_service
default_service.weight = default_weight
end

if process.type() ~= "privileged agent" then
return
end

-- flush stale data that may persist across reloads,
-- since consul_services is re-initialized empty
consul_dict:flush_all()
Comment on lines +662 to +664
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consul_dict:flush_all() runs in the privileged agent during init_worker, which will clear the shared dict for all workers during an nginx reload. Since nodes() reads via consul_dict:get(), in-flight requests handled by old workers during a graceful reload can suddenly see missing services and return 503s. Consider avoiding a global flush on reload (e.g., use a generation/version keyspace and swap, or rebuild consul_services from existing dict keys), or at minimum ensure readers can still access stale values during refresh and free memory after flushing (see flush_expired).

Suggested change
-- flush stale data that may persist across reloads,
-- since consul_services is re-initialized empty
consul_dict:flush_all()
-- flush expired stale data that may persist across reloads,
-- since consul_services is re-initialized empty but existing
-- unexpired shared dict entries may still be in use by workers
consul_dict:flush_expired()

Copilot uses AI. Check for mistakes.

Comment on lines +662 to +665
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consul_dict:flush_all() runs in the privileged agent during init_worker(), but it executes after read_dump_services() may have populated the shared dict (and workers also call read_dump_services() before the privileged-agent guard). This can wipe the dump-loaded nodes and leave the dict empty until Consul fetch completes, defeating the dump-on-reload/startup mitigation and potentially causing 503s. Consider either (a) moving the flush earlier (before any dump load) and performing dump load only in the privileged agent, or (b) removing flush_all() and instead cleaning stale keys during update_all_services() in a way that survives reloads.

Copilot uses AI. Check for mistakes.
Comment on lines +662 to +665
Copy link

Copilot AI Mar 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR changes the cross-worker data sharing mechanism for Consul nodes to rely on lua_shared_dict, but there isn’t a regression test validating the restart/reload scenario from #12398 (e.g., repeated HUP reloads / restarts while sending requests should not produce intermittent 503s once Consul has data). Adding a Test::Nginx case similar to existing HUP-based tests would help prevent regressions.

Suggested change
-- flush stale data that may persist across reloads,
-- since consul_services is re-initialized empty
consul_dict:flush_all()

Copilot uses AI. Check for mistakes.
if consul_conf.dump then
if consul_conf.dump.load_on_init then
read_dump_services()
end
end

log.notice("consul_conf: ", json_delay_encode(consul_conf, true))

if consul_conf.skip_services then
skip_service_map = core.table.new(0, #consul_conf.skip_services)
for _, v in ipairs(consul_conf.skip_services) do
Expand Down Expand Up @@ -673,7 +704,7 @@ end


--- Expose the discovery state for the control API / debugging.
-- @treturn table { config = consul discovery configuration,
--                  services = snapshot of all services from the shared dict }
function _M.dump_data()
    -- NOTE: _M.all_nodes() scans and decodes the whole shared dict on
    -- every call; acceptable here since dump_data is a debug endpoint
    return {config = local_conf.discovery.consul, services = _M.all_nodes()}
end


Expand Down
1 change: 1 addition & 0 deletions t/APISIX.pm
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,7 @@ lua {
lua_shared_dict standalone-config 10m;
lua_shared_dict status-report 1m;
lua_shared_dict nacos 10m;
lua_shared_dict consul 10m;
lua_shared_dict upstream-healthcheck 10m;
}
_EOC_
Expand Down
2 changes: 1 addition & 1 deletion t/discovery/consul_dump.t
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ GET /hello
--- error_code: 503
--- error_log
connect consul
fetch nodes failed
consul service not found
failed to set upstream


Expand Down
Loading